US Wind Turbine Data Analysis

Tue, Sep 7, 2021 4-minute read

The dataset for this blog analysis is from TidyTuesday about wind turbine data across different states in the U.S. The dataset contains a number of technical terms that might not be easy to understand, but we can still use data science tools to analyze and visualize the data.

Data Introduction

library(tidyverse)
library(ggthemes)
library(geofacet)
theme_set(theme_bw())
wt <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-06/us_wind.csv",na = c("missing", "-9999", "n/a"))
head(wt)
## # A tibble: 6 x 24
##   case_id faa_ors faa_asn usgs_pr_id t_state t_county    t_fips p_name   p_year
##     <dbl> <chr>   <chr>        <dbl> <chr>   <chr>       <chr>  <chr>     <dbl>
## 1 3073429 NA      NA            4960 CA      Kern County 06029  251 Wind   1987
## 2 3071522 NA      NA            4997 CA      Kern County 06029  251 Wind   1987
## 3 3073425 NA      NA            4957 CA      Kern County 06029  251 Wind   1987
## 4 3071569 NA      NA            5023 CA      Kern County 06029  251 Wind   1987
## 5 3005252 NA      NA            5768 CA      Kern County 06029  251 Wind   1987
## 6 3003862 NA      NA            5836 CA      Kern County 06029  251 Wind   1987
## # ... with 15 more variables: p_tnum <dbl>, p_cap <dbl>, t_manu <chr>,
## #   t_model <chr>, t_cap <dbl>, t_hh <dbl>, t_rd <dbl>, t_rsa <dbl>,
## #   t_ttlh <dbl>, t_conf_atr <dbl>, t_conf_loc <dbl>, t_img_date <chr>,
## #   t_img_srce <chr>, xlong <dbl>, ylat <dbl>
skimr::skim(wt)
Table 1: Data summary
Name wt
Number of rows 58185
Number of columns 24
_______________________
Column type frequency:
character 10
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
faa_ors 8405 0.86 9 9 0 49756 0
faa_asn 8095 0.86 13 17 0 49492 0
t_state 0 1.00 2 2 0 45 0
t_county 0 1.00 4 31 0 521 0
t_fips 0 1.00 5 5 0 631 0
p_name 0 1.00 3 42 0 1479 0
t_manu 3876 0.93 3 72 0 70 0
t_model 4469 0.92 2 14 0 260 0
t_img_date 24174 0.58 8 10 0 555 0
t_img_srce 0 1.00 4 16 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
case_id 0 1.00 3040036.95 21727.20 3000001.00 3023351.00 3038762.00 3053909.00 3087216.00 ▅▇▇▅▃
usgs_pr_id 15272 0.74 26396.60 13915.73 0.00 16301.00 27304.00 38130.00 49135.00 ▆▆▇▇▇
p_year 62 1.00 2007.64 9.22 1981.00 2006.00 2010.00 2014.00 2018.00 ▂▁▂▆▇
p_tnum 0 1.00 158.57 331.74 1.00 50.00 83.00 124.00 1831.00 ▇▁▁▁▁
p_cap 3692 0.94 137.36 86.91 0.05 76.95 129.00 199.50 495.01 ▇▇▅▁▁
t_cap 3694 0.94 1652.56 681.84 40.00 1500.00 1650.00 2000.00 6000.00 ▂▇▁▁▁
t_hh 6625 0.89 77.31 12.87 18.20 80.00 80.00 80.00 130.00 ▁▁▇▁▁
t_rd 4949 0.91 83.66 23.78 11.00 77.00 87.00 100.00 150.00 ▁▂▇▆▁
t_rsa 4949 0.91 5941.47 2696.78 95.03 4656.63 5944.68 7853.98 17671.46 ▃▇▆▁▁
t_ttlh 5062 0.91 120.70 22.83 9.10 118.60 123.40 131.40 200.30 ▁▁▇▇▁
t_conf_atr 0 1.00 2.77 0.56 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
t_conf_loc 0 1.00 2.95 0.31 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
xlong 0 1.00 -101.65 12.36 -171.71 -107.41 -100.25 -95.51 144.72 ▂▇▁▁▁
ylat 0 1.00 38.40 5.32 13.39 34.69 37.87 42.69 66.84 ▁▃▇▁▁

Wind turbine counts across states

wt %>%
  count(t_state, sort = T) %>%
  mutate(t_state = fct_reorder(t_state, n)) %>%
  ggplot(aes(n, t_state, fill = t_state)) +
  geom_col(show.legend = F) +
  labs(x = "# of wind turbines", y = "")

Key turbine metrics faceted by state

state_facet_plot <- function(y, y_title = ""){
  y <- enquo(y)
  wt %>%
    ggplot(aes(p_year, !!y, color = t_state)) +
    geom_point() +
    geom_smooth(method = "lm") +
    facet_geo(~t_state) +
    theme(
      legend.position = "none",
      axis.text.x = element_text(size = 7)
    ) +
    labs(x = "year", y = y_title, title = paste("State-wise", str_to_title(y_title), "Across Time")) +
    scale_x_continuous(n.breaks = 3)
}

state_facet_plot(t_hh, y_title = "turbine hub height (meters)")

state_facet_plot(t_cap, y_title = "turbine capacity (kW)")

state_facet_plot(t_rd, y_title = "turbine rotor diameter (meters)")

state_facet_plot(t_rsa, y_title = "turbine rotor swept area (meters^2)")

Key combined turbine metrics

metric_plot <- function(y, y_title = ""){
  y <- enquo(y)
  wt %>%
    ggplot(aes(p_year, !!y, color = t_state)) +
    geom_point() +
    geom_smooth(aes(group = 1),method = "lm") +
    theme(
      legend.position = "none"
    ) +
    labs(x = "year", y = y_title, title = paste(str_to_title(y_title), "Across Time"))
}


metric_plot(t_hh, y_title = "turbine hub height (meters)")

metric_plot(t_cap, y_title = "turbine capacity (kW)")

metric_plot(t_rd, y_title = "turbine rotor diameter (meters)")

metric_plot(t_rsa, y_title = "turbine rotor swept area (meters^2)")

The U.S. Map with turbine locations

wt %>%
  ggplot(aes(xlong, ylat, color = t_state)) +
    geom_point(show.legend = F) +
    borders("state") +
    coord_map() +
  labs(x = "long", y = "lat", title = "Turbine Locations on the U.S. Map")

# Removing AK and HI
wt %>%
  filter(xlong < 100 & xlong > -130) %>%
  ggplot(aes(xlong, ylat, color = t_state)) +
    geom_point(show.legend = F) +
    borders("state") +
    coord_map()  +
  labs(x = "long", y = "lat", title = "Turbine Locations on the U.S. Map")

The idea of constructing the following code is inspired by this link.

wt %>%
  group_by(t_model, t_state) %>%
  summarize(num_of_turbines = n(),
            long = mean(xlong, na.rm = TRUE),
            lat = mean(ylat, na.rm = TRUE),
            p_year) %>%
  filter(long < 0, long > -130, lat > 20) %>%
  ggplot(aes(long, lat, size = num_of_turbines, color = p_year)) +
  borders("state") +
  geom_point() +
  coord_map() +
  labs(size = "# of turbines", color = "year")
## `summarise()` regrouping output by 't_model', 't_state' (override with `.groups` argument)