US Wind Turbine Data Analysis

Tue, Sep 7, 2021 4-minute read

The dataset for this analysis comes from TidyTuesday and covers wind turbines across different U.S. states. It contains a number of technical terms that may not be easy to understand, but we can still use data science tools to analyze and visualize the data.

Data Introduction

# Load the wind turbine data, treating the sentinel strings "missing",
# "-9999" and "n/a" as NA.
# NOTE(review): the path was an empty string in the source, which cannot be
# read. The URL below is presumably the TidyTuesday 2018-11-06 `us_wind.csv`
# file -- confirm it before relying on it.
wt <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-06/us_wind.csv",
  na = c("missing", "-9999", "n/a")
)
## # A tibble: 6 x 24
##   case_id faa_ors faa_asn usgs_pr_id t_state t_county    t_fips p_name   p_year
##     <dbl> <chr>   <chr>        <dbl> <chr>   <chr>       <chr>  <chr>     <dbl>
## 1 3073429 NA      NA            4960 CA      Kern County 06029  251 Wind   1987
## 2 3071522 NA      NA            4997 CA      Kern County 06029  251 Wind   1987
## 3 3073425 NA      NA            4957 CA      Kern County 06029  251 Wind   1987
## 4 3071569 NA      NA            5023 CA      Kern County 06029  251 Wind   1987
## 5 3005252 NA      NA            5768 CA      Kern County 06029  251 Wind   1987
## 6 3003862 NA      NA            5836 CA      Kern County 06029  251 Wind   1987
## # ... with 15 more variables: p_tnum <dbl>, p_cap <dbl>, t_manu <chr>,
## #   t_model <chr>, t_cap <dbl>, t_hh <dbl>, t_rd <dbl>, t_rsa <dbl>,
## #   t_ttlh <dbl>, t_conf_atr <dbl>, t_conf_loc <dbl>, t_img_date <chr>,
## #   t_img_srce <chr>, xlong <dbl>, ylat <dbl>
Table 1: Data summary
Name wt
Number of rows 58185
Number of columns 24
Column type frequency:
character 10
numeric 14
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
faa_ors 8405 0.86 9 9 0 49756 0
faa_asn 8095 0.86 13 17 0 49492 0
t_state 0 1.00 2 2 0 45 0
t_county 0 1.00 4 31 0 521 0
t_fips 0 1.00 5 5 0 631 0
p_name 0 1.00 3 42 0 1479 0
t_manu 3876 0.93 3 72 0 70 0
t_model 4469 0.92 2 14 0 260 0
t_img_date 24174 0.58 8 10 0 555 0
t_img_srce 0 1.00 4 16 0 4 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
case_id 0 1.00 3040036.95 21727.20 3000001.00 3023351.00 3038762.00 3053909.00 3087216.00 ▅▇▇▅▃
usgs_pr_id 15272 0.74 26396.60 13915.73 0.00 16301.00 27304.00 38130.00 49135.00 ▆▆▇▇▇
p_year 62 1.00 2007.64 9.22 1981.00 2006.00 2010.00 2014.00 2018.00 ▂▁▂▆▇
p_tnum 0 1.00 158.57 331.74 1.00 50.00 83.00 124.00 1831.00 ▇▁▁▁▁
p_cap 3692 0.94 137.36 86.91 0.05 76.95 129.00 199.50 495.01 ▇▇▅▁▁
t_cap 3694 0.94 1652.56 681.84 40.00 1500.00 1650.00 2000.00 6000.00 ▂▇▁▁▁
t_hh 6625 0.89 77.31 12.87 18.20 80.00 80.00 80.00 130.00 ▁▁▇▁▁
t_rd 4949 0.91 83.66 23.78 11.00 77.00 87.00 100.00 150.00 ▁▂▇▆▁
t_rsa 4949 0.91 5941.47 2696.78 95.03 4656.63 5944.68 7853.98 17671.46 ▃▇▆▁▁
t_ttlh 5062 0.91 120.70 22.83 9.10 118.60 123.40 131.40 200.30 ▁▁▇▇▁
t_conf_atr 0 1.00 2.77 0.56 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
t_conf_loc 0 1.00 2.95 0.31 1.00 3.00 3.00 3.00 3.00 ▁▁▁▁▇
xlong 0 1.00 -101.65 12.36 -171.71 -107.41 -100.25 -95.51 144.72 ▂▇▁▁▁
ylat 0 1.00 38.40 5.32 13.39 34.69 37.87 42.69 66.84 ▁▃▇▁▁

Wind turbine counts across states

# Bar chart of the number of turbines per state. `fct_reorder()` orders the
# state factor by its count so the bars are sorted rather than alphabetical.
# `TRUE`/`FALSE` are spelled out because `T`/`F` are ordinary, reassignable
# variables.
wt %>%
  count(t_state, sort = TRUE) %>%
  mutate(t_state = fct_reorder(t_state, n)) %>%
  ggplot(aes(n, t_state, fill = t_state)) +
  geom_col(show.legend = FALSE) +
  labs(x = "# of wind turbines", y = "")

Key turbine metrics faceted by state

# Plot one turbine metric against project year, with one panel per state.
#
# y:       unquoted column of `wt` to put on the y-axis (captured with
#          enquo() and spliced into aes() with `!!`).
# y_title: y-axis label; it is also title-cased into the plot title.
#
# Returns the ggplot object: points plus a per-state linear fit, faceted by
# `t_state` with facet_geo().
state_facet_plot <- function(y, y_title = "") {
  y <- enquo(y)
  wt %>%
    ggplot(aes(p_year, !!y, color = t_state)) +
    geom_point() +
    geom_smooth(method = "lm") +
    facet_geo(~t_state) +
    # The `theme(` opener was missing, which left these arguments dangling
    # and the closing parenthesis unmatched.
    theme(
      legend.position = "none",
      axis.text.x = element_text(size = 7)
    ) +
    labs(
      x = "year",
      y = y_title,
      title = paste("State-wise", str_to_title(y_title), "Across Time")
    ) +
    scale_x_continuous(n.breaks = 3)
}

# State-faceted trend for each key turbine metric.
state_facet_plot(
  y = t_hh,
  y_title = "turbine hub height (meters)"
)

state_facet_plot(
  y = t_cap,
  y_title = "turbine capacity (kW)"
)

state_facet_plot(
  y = t_rd,
  y_title = "turbine rotor diameter (meters)"
)

state_facet_plot(
  y = t_rsa,
  y_title = "turbine rotor swept area (meters^2)"
)

Key combined turbine metrics

# Plot one turbine metric against project year for all states combined.
#
# y:       unquoted column of `wt` to put on the y-axis (captured with
#          enquo() and spliced into aes() with `!!`).
# y_title: y-axis label; it is also title-cased into the plot title.
#
# Returns the ggplot object. Points are colored by state, while
# `aes(group = 1)` makes geom_smooth() fit a single line across all states.
metric_plot <- function(y, y_title = "") {
  y <- enquo(y)
  wt %>%
    ggplot(aes(p_year, !!y, color = t_state)) +
    geom_point() +
    geom_smooth(aes(group = 1), method = "lm") +
    # The `theme(` opener was missing, which left this argument dangling
    # and the closing parenthesis unmatched.
    theme(
      legend.position = "none"
    ) +
    labs(
      x = "year",
      y = y_title,
      title = paste(str_to_title(y_title), "Across Time")
    )
}

# Combined (all-state) trend for each key turbine metric.
metric_plot(
  y = t_hh,
  y_title = "turbine hub height (meters)"
)

metric_plot(
  y = t_cap,
  y_title = "turbine capacity (kW)"
)

metric_plot(
  y = t_rd,
  y_title = "turbine rotor diameter (meters)"
)

metric_plot(
  y = t_rsa,
  y_title = "turbine rotor swept area (meters^2)"
)

The U.S. Map with turbine locations

# Every turbine location drawn over state borders, colored by state.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
wt %>%
  ggplot(aes(xlong, ylat, color = t_state)) +
  geom_point(show.legend = FALSE) +
  borders("state") +
  coord_map() +
  labs(x = "long", y = "lat", title = "Turbine Locations on the U.S. Map")

# Removing AK and HI: keep only turbines with longitude strictly between
# -130 and 100, so the map is not stretched by far-away points.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
wt %>%
  filter(xlong > -130, xlong < 100) %>%
  ggplot(aes(xlong, ylat, color = t_state)) +
  geom_point(show.legend = FALSE) +
  borders("state") +
  coord_map() +
  labs(x = "long", y = "lat", title = "Turbine Locations on the U.S. Map")

The code below is inspired by this link.

# One point per turbine model within each state, placed at the mean location
# of that model's turbines, sized by how many there are and colored by their
# mean project year.
wt %>%
  group_by(t_model, t_state) %>%
  summarize(
    num_of_turbines = n(),
    long = mean(xlong, na.rm = TRUE),
    lat = mean(ylat, na.rm = TRUE),
    # A bare `p_year` here returned one row per turbine, stacking identical
    # points on top of each other; aggregate it so each group is one row.
    p_year = mean(p_year, na.rm = TRUE),
    # Return an ungrouped result; this also silences the regrouping message.
    .groups = "drop"
  ) %>%
  filter(long < 0, long > -130, lat > 20) %>%
  ggplot(aes(long, lat, size = num_of_turbines, color = p_year)) +
  borders("state") +
  geom_point() +
  coord_map() +
  labs(size = "# of turbines", color = "year")