Random Forest, LASSO, and XGBoost for Classifying School Total Minority

Sun, May 8, 2022 4-minute read

This blog post applies three models, Random Forest, LASSO, and XGBoost, to TidyTuesday data sets on college costs and student diversity, in order to classify schools by their share of total minority enrollment. Some ideas here are inspired by Julia Silge’s blog post (link).

library(tidyverse)
library(tidymodels)
library(geofacet)
library(scales)
theme_set(theme_bw())
tuition_cost <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/tuition_cost.csv") %>%
  rename(school = name) %>%
  filter(type %in% c("Private", "Public"),
         !is.na(state))

diversity_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-10/diversity_school.csv") %>%
  rename(school = name) %>%
  filter(!is.na(state))
tuition_cost %>%
  select(school, state, type, in_state_total, out_of_state_total) %>%
  pivot_longer(cols = 4:5) %>%
  mutate(name = str_replace_all(name, "_", " ")) %>%
  ggplot(aes(value, type, fill = name)) +
  geom_boxplot() +
  facet_geo(~state) +
  scale_x_continuous(n.breaks = 3,
                     labels = dollar) +
  labs(x = NULL,
       y = NULL,
       fill = NULL,
       title = "In/Out State Total Expenses per State")

diversity_raw %>%
  mutate(category = fct_lump(category, n = 5, w = enrollment),
         pct_enrollment = enrollment / total_enrollment) %>%
  filter(category != "Other") %>%
  ggplot(aes(pct_enrollment, category, fill = category)) +
  geom_boxplot() +
  facet_geo(~state) +
  scale_x_continuous(labels = percent) +
  labs(x = "enrollment percentage",
       y = NULL,
       fill = NULL,
       title = "College Enrollment Category per State") +
  theme(axis.text.x = element_text(angle = 90))

Here we categorize each school based on its percentage of Total Minority enrollment: if the percentage is above 30%, total_minority is "high"; otherwise it is "low".

university_df <- diversity_raw %>%
  filter(category == "Total Minority") %>%
  mutate(pct_minority = enrollment/total_enrollment,
         total_minority = if_else(pct_minority > 0.3, "high", "low")) %>%
  inner_join(tuition_cost,
             by = c("school", "state")) %>%
  select(-c(category, school, state, pct_minority, enrollment, contains("_total"))) %>%
  mutate(across(where(is.character), factor))

university_df 
## # A tibble: 2,096 x 8
##    total_enrollment total_minority state_code type  degree_length room_and_board
##               <dbl> <fct>          <fct>      <fct> <fct>                  <dbl>
##  1            81459 low            VA         Priv~ 4 Year                 10478
##  2            66046 high           FL         Publ~ 2 Year                    NA
##  3            60767 high           FL         Publ~ 4 Year                  9617
##  4            57821 low            UT         Priv~ 4 Year                    NA
##  5            51487 high           VA         Publ~ 2 Year                    NA
##  6            51313 high           TX         Publ~ 4 Year                 10804
##  7            50320 high           AZ         Publ~ 4 Year                 12648
##  8            50081 low            MI         Publ~ 4 Year                 10322
##  9            49610 high           FL         Publ~ 4 Year                 10882
## 10            49459 high           FL         Publ~ 4 Year                 10120
## # ... with 2,086 more rows, and 2 more variables: in_state_tuition <dbl>,
## #   out_of_state_tuition <dbl>
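
Before fitting any models, it is worth a quick look at how balanced the two classes are; a minimal sketch:

university_df %>%
  count(total_minority)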

Now let’s apply some machine learning algorithms to university_df to predict if a school is high or low on total_minority.

Split the data:

set.seed(2022)

university_spl <- university_df %>%
  initial_split()

university_train <- training(university_spl)
university_test <- testing(university_spl)

university_folds <- vfold_cv(university_train)

Make a recipe for the models:

university_rec <- recipe(total_minority ~ ., data = university_train) %>%
  step_impute_linear(room_and_board)
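
The only preprocessing step imputes room_and_board, which has missing values (the NAs in the tibble printed above). A quick way to see how many are missing in the training set, as a sketch:

university_train %>%
  summarise(missing_room_and_board = sum(is.na(room_and_board)))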

Random Forest and its workflow:

rand_spec <- rand_forest(trees = tune(),
                         mtry = tune(),
                         min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")

rand_wf <- workflow() %>%
  add_model(rand_spec) %>%
  add_recipe(university_rec)

Tune the random forest model:

rand_res <- rand_wf %>%
  tune_grid(
    university_folds,
    grid = crossing(
      trees = seq(500, 1500, 500),
      mtry = c(2, 3, 4, 5, 6),
      min_n = seq(3, 10, 2)
    )
  )

autoplot(rand_res)
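
The autoplot gives a visual sense of the tuning results; show_best() lists the top hyperparameter combinations by resampled ROC AUC (a quick sketch):

rand_res %>%
  show_best(metric = "roc_auc")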

Final fit and the ROC curve:

rand_last_fit <- rand_wf %>%
  finalize_workflow(rand_res %>% select_best("roc_auc")) %>%
  last_fit(university_spl)

rand_last_fit %>%
  collect_predictions() %>%
  roc_curve(total_minority, .pred_high) %>%
  autoplot()

LASSO model and its workflow (with mixture left unspecified, the glmnet engine defaults to a pure L1, i.e. LASSO, penalty):

log_spec <- logistic_reg(penalty = tune()) %>%
  set_engine("glmnet")

log_wf <- workflow() %>%
  add_model(log_spec) %>%
  add_recipe(university_rec %>%
               step_dummy(all_nominal_predictors())) 

log_res <- log_wf %>%
  tune_grid(
    university_folds,
    grid = crossing(
      penalty = 10 ^ seq(-7, -0.5, 0.5)
    )
  )

autoplot(log_res)

log_last_fit <- log_wf %>%
  finalize_workflow(log_res %>%
                      select_by_one_std_err(metric = "roc_auc", desc(penalty))
                    ) %>%
  last_fit(university_spl)

log_last_fit %>%
  collect_predictions() %>%
  roc_curve(total_minority, .pred_high) %>%
  autoplot()
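
To see which predictors the LASSO keeps at the selected penalty, the fitted glmnet coefficients can be pulled out of the finalized fit (a sketch, assuming a tune version that provides extract_fit_parsnip() for last_fit objects):

log_last_fit %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(estimate != 0)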

XGBoost:

xg_spec <- boost_tree(
  trees = tune(),
  min_n = tune(),
  mtry = tune(),
  learn_rate = 0.02
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xg_wf <- workflow() %>%
  add_model(xg_spec) %>%
  add_recipe(university_rec %>%
               step_dummy(all_nominal_predictors())) 

xg_res <- xg_wf %>%
  tune_grid(
    university_folds,
    grid = crossing(
      trees = seq(500, 1500, 500),
      mtry = c(2, 4, 6, 8, 10),
      min_n = c(3, 5, 7, 9)
    )
  )

autoplot(xg_res)

xg_last_fit <- xg_wf %>%
  finalize_workflow(xg_res %>%
                      select_best("roc_auc")) %>%
  last_fit(university_spl)

xg_last_fit %>%
  collect_predictions() %>%
  roc_curve(total_minority, .pred_high) %>%
  autoplot()
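
Variable importance scores from the boosted trees show which predictors drive the classification (a sketch, assuming the vip package is installed):

library(vip)

xg_last_fit %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")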

Comparing the three models:

select_predictions <- function(tbl){
  tbl %>%
    collect_predictions() %>%
    select(.pred_high, total_minority)
}

rand_last_fit %>%
  select_predictions() %>%
  mutate(mod = "Random Forest") %>%
  bind_rows(
    log_last_fit %>%
      select_predictions() %>%
      mutate(mod = "Logistic Regression")
  ) %>%
  bind_rows(
    xg_last_fit %>%
      select_predictions() %>%
      mutate(mod = "XGBoost")
  ) %>%
  group_by(mod) %>%
  roc_curve(total_minority, .pred_high) %>%
  ungroup() %>%
  ggplot(aes(1 - specificity, sensitivity, color = mod)) +
  geom_path(size = 2, alpha = 0.5) +
  geom_abline(lty = 2, color = "grey50", size = 2) +
  labs(color = "model",
       title = "ROC Curves for Three Tuned Models")

The three models perform very similarly at classifying whether a school is high or low on total minority, with XGBoost slightly better.