NYC Restaurants Visualization & PCA

Thu, Sep 16, 2021 7-minute read

This blog post analyzes the dataset about the inspections on the restaurants in NYC, and the dataset is obtained from TidyTuesday.

library(tidyverse)
library(lubridate)
library(scales)
library(ggthemes)
library(widyr)
library(broom)
theme_set(theme_tufte())

nyr <- read_csv("https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv") %>% 
        janitor::clean_names() %>%
        select(-phone, -grade_date, -record_date, -building, -street) %>% 
  mutate(inspection_date = mdy(inspection_date),
         cuisine_description = case_when(
           cuisine_description == "Latin (Cuban, Dominican, Puerto Rican, South & Central American)" ~ "Latin",
           TRUE ~ as.character(cuisine_description)
         )) %>%
  filter(inspection_date > 2000)

nyr

## # A tibble: 378,107 x 21
##       camis dba      boro   zipcode cuisine_descript~ inspection_date action    
##       <dbl> <chr>    <chr>    <dbl> <chr>             <date>          <chr>     
##  1 41636061 JACK'S ~ Manha~   10012 American          2017-06-26      Violation~
##  2 50045844 HOT SPO~ Brook~   11204 Chinese/Japanese  2017-04-05      Violation~
##  3 41521159 CITY SA~ Manha~   10036 Sandwiches/Salad~ 2019-06-19      Violation~
##  4 41576944 TRIX     Brook~   11211 French            2017-01-27      Violation~
##  5 50003506 LUPITA ~ Manha~   10029 Mexican           2017-10-25      Violation~
##  6 50098012 FRIENDL~ Brook~   11206 Sandwiches/Salad~ 2021-08-23      Violation~
##  7 50051744 ORCHARD~ Bronx    10464 Coffee/Tea        2016-08-17      Violation~
##  8 50008223 SUBWAY   Bronx    10457 Sandwiches        2019-05-10      Violation~
##  9 41694845 THE WIC~ Brook~   11209 American          2021-08-17      Violation~
## 10 41694845 THE WIC~ Brook~   11209 American          2021-08-17      Violation~
## # ... with 378,097 more rows, and 14 more variables: violation_code <chr>,
## #   violation_description <chr>, critical_flag <chr>, score <dbl>, grade <chr>,
## #   inspection_type <chr>, latitude <dbl>, longitude <dbl>,
## #   community_board <dbl>, council_district <chr>, census_tract <chr>,
## #   bin <dbl>, bbl <dbl>, nta <chr>

nyr <- nyr %>%
  separate(inspection_type, c("inspection_program", "inspection_type"), sep = " / ")

skimr::skim(nyr)

Table 1: Data summary
Name	nyr
Number of rows	378107
Number of columns	22
_______________________
Column type frequency:
character	13
Date	1
numeric	8
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
dba	56	1.00	2	90	20072
boro	0	1.00	1	13	6
cuisine_description	1	1.00	4	30	85
action	0	1.00	33	130	5
violation_code	4652	0.99	3	4	106
violation_description	2217	0.99	19	360	106
critical_flag	0	1.00	8	14	3
grade	184964	0.51	1	1	7
inspection_program	0	1.00	9	28	8
inspection_type	0	1.00	13	28	6
council_district	6626	0.98	2	2	51
census_tract	6626	0.98	6	6	1178
nta	6626	0.98	4	4	193

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
inspection_date	0	1	2009-06-22	2021-09-16	2019-01-22	1486

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
camis	0	1.00	46363746.72	4.369762e+06	30075445.00	41428295.00	50011515.00	50061285.00	5.011492e+07	▁▁▆▁▇
zipcode	5624	0.99	10681.84	6.019200e+02	10000.00	10022.00	10469.00	11229.00	3.033900e+04	▇▁▁▁▁
score	13539	0.96	20.35	1.490000e+01	0.00	11.00	15.00	26.00	1.640000e+02	▇▂▁▁▁
latitude	353	1.00	40.12	4.930000e+00	0.00	40.69	40.73	40.76	4.091000e+01	▁▁▁▁▇
longitude	353	1.00	-72.84	8.940000e+00	-74.25	-73.99	-73.96	-73.90	0.000000e+00	▇▁▁▁▁
community_board	6626	0.98	248.49	1.301500e+02	101.00	105.00	301.00	401.00	5.950000e+02	▇▂▅▅▁
bin	8344	0.98	2510375.18	1.346132e+06	1000000.00	1043896.00	3007533.00	4002366.00	5.799501e+06	▇▂▅▅▁
bbl	1015	1.00	2400511137.26	1.339002e+09	1.00	1010420121.00	3001670003.00	4001630004.00	5.270001e+09	▇▂▅▅▁

Total # of Cuisine counts

nyr %>%
  filter(!is.na(cuisine_description)) %>%
  count(cuisine_description, sort = T) %>%
  mutate(cuisine_description = fct_lump(cuisine_description, n = 10, w = n),
         cuisine_description = fct_reorder(cuisine_description, n)) %>%
  ggplot(aes(n, cuisine_description)) +
  geom_col() +
  labs(x = "# of restaurants", y = "cuisine", title = "NYC Cuisine #")

Top 10 Cuisines and their critical flag counts

top_10_cuisines <- nyr %>%
  count(cuisine_description, sort = T) %>%
  head(10) %>%
  select(cuisine_description) %>%
  pull()

nyr_top10 <- nyr %>%
  filter(cuisine_description %in% top_10_cuisines,
         critical_flag != "Not Applicable")

nyr_top10 %>%
  count(cuisine_description, critical_flag, sort = T) %>%
  ggplot(aes(n, cuisine_description)) +
  geom_col() +
  facet_wrap(~critical_flag) +
  labs(x = "# of restaurants", y = "cuisine",
       title = "Top 10 Cuisine and Their Critical Flag Counts")

Top 10 Cuisine and their critical flag percentage

total_restaurants <- nyr %>%
  filter(cuisine_description %in% top_10_cuisines) %>%
  group_by(cuisine_description) %>%
  summarize(total = n())

nyr_top10 %>%
  left_join(total_restaurants) %>%
  group_by(cuisine_description, critical_flag) %>%
  summarize(critical_percentage = n()/total) %>%
  distinct() %>%
  ggplot(aes(critical_percentage, cuisine_description)) +
  geom_col() +
  facet_wrap(~critical_flag) +
  expand_limits(x = 1) +
  scale_x_continuous(labels = percent) +
  labs(x = "percentage", y = "cuisine",
       title = "Top 10 Cuisine and Their Critical Flag Percentage")

Grades and inspection date

nyr %>%
  filter(year(inspection_date) > 2016,
         !is.na(grade),
         !boro %in% c("Missing", 0),
         !grade %in% c("Not Yet Graded", "G")) %>%
  count(grade, inspection_date, boro) %>%
  ggplot(aes(inspection_date, n, fill = grade)) +
  geom_area(show.legend = F) +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(boro~grade) +
  labs(x = "inspection date", y = "# of restaurants",
       title = "Inspection Date and Grades")

Boro and critical flags

nyr %>%
  mutate(year = year(inspection_date)) %>%
  filter(year > 2015,
         boro != 0) %>%
  count(year, boro, critical_flag, sort = T) %>%
  ggplot(aes(critical_flag, boro, fill = n)) +
  geom_tile() +
  facet_wrap(~year, ncol = 1) +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 10000
  ) +
  labs(x = "Critical Flag", y = NULL, fill = "# of restaurants",
       title = "Yearly Critical Flags among All 5 Boros in NYC")

There is no the same camis that changed its name based on dba.

nyr %>%
  count(camis, dba, sort = T) %>%
  count(camis, sort = T)

## # A tibble: 25,022 x 2
##       camis     n
##       <dbl> <int>
##  1 30075445     1
##  2 30112340     1
##  3 30191841     1
##  4 40356018     1
##  5 40356483     1
##  6 40356731     1
##  7 40357217     1
##  8 40359480     1
##  9 40359705     1
## 10 40360045     1
## # ... with 25,012 more rows

Inspection Type and critical flag

nyr %>%
  filter(boro != 0,
         critical_flag != "Not Critical") %>%
  count(critical_flag, inspection_type, boro) %>%
  ggplot(aes(critical_flag, inspection_type, fill = n)) +
  geom_tile() +
  facet_wrap(~boro) +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 10000
  ) +
  labs(x = "Critical Flag", y = "Inspection Type", fill = "# of restaurants",
       title = "Inspection Types and Critical Flags among All Boros in NYC")

The following code is inspired by David Robinson’s code for learning purposes only, but some chunks of code do not work for some reason.

inspections <- nyr %>%
  group_by(camis,
           dba,
           boro,
           zipcode,
           cuisine_description,
           inspection_date,
           action,
           score,
           grade,
           inspection_type,
           inspection_program) %>%
  summarize(
    critical_violations = sum(critical_flag == "Critical"),
    non_critical_violations = sum(critical_flag == "Not Critical")
  ) %>%
  ungroup()


most_recent_cycle_inspection <- inspections %>% 
  filter(inspection_program == "Cycle Inspection",
         inspection_type == "Initial Inspection") %>%
  arrange(desc(inspection_date)) %>% 
  distinct(camis, .keep_all = TRUE)

by_dba <- most_recent_cycle_inspection %>%
  group_by(dba, cuisine = cuisine_description) %>%
  summarize(locations = n(),
            avg_score = mean(score),
            median_score = median(score)) %>%
  ungroup() %>%
  arrange(desc(locations))

## `summarise()` has grouped output by 'dba'. You can override using the `.groups` argument.

by_dba %>%
  mutate(locations_bin = cut(locations, c(0, 1, 3, 10, Inf), labels = c("1", "2-3", "4-10", ">10"))) %>% 
  ggplot(aes(locations_bin, avg_score + 1, fill = locations_bin)) +
  geom_boxplot(show.legend = F) +
  scale_y_log10() +
  labs(x = "# of location(s)", y = "Average Score (+1)",
       title = "Relation Between # of Locations & Average Score")

For some unknown reason, David’s code strangely does not work, and here is the updated one that works. Note: The original code is commented out.

by_dba %>%
  add_count(cuisine) %>%
  filter(n > 200) %>%
  nest(-cuisine) %>%
  mutate(model = map(data, ~ t.test(.$avg_score)),
         tidied = map(model, tidy)) %>%
  #unnest(map(model, tidy))
  #mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) %>%
  mutate(
    cuisine = fct_reorder(cuisine, estimate)
  ) %>%
  ggplot(aes(estimate, cuisine)) +
  geom_point() +
  geom_errorbar(aes(xmin = conf.low,
                    xmax = conf.high)) +
  labs(x = "estimated average score",
       title = "Estimated Average Score with Confidence Interval")

violations <- nyr %>%
  semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>%
  filter(!is.na(violation_description))

violations %>%
  pairwise_cor(violation_description, camis, sort = TRUE)

## # A tibble: 7,310 x 3
##    item1                            item2                            correlation
##    <chr>                            <chr>                                  <dbl>
##  1 Food not labeled in accordance ~ Records and logs not maintained~       0.866
##  2 Records and logs not maintained~ Food not labeled in accordance ~       0.866
##  3 Required nutritional informatio~ Additional nutritional informat~       0.764
##  4 Additional nutritional informat~ Required nutritional informatio~       0.764
##  5 ROP processing equipment not ap~ HACCP plan not approved or appr~       0.750
##  6 HACCP plan not approved or appr~ ROP processing equipment not ap~       0.750
##  7 Facility not vermin proof. Harb~ Evidence of mice or live mice p~       0.625
##  8 Evidence of mice or live mice p~ Facility not vermin proof. Harb~       0.625
##  9 Filth flies or food/refuse/sewa~ Facility not vermin proof. Harb~       0.532
## 10 Facility not vermin proof. Harb~ Filth flies or food/refuse/sewa~       0.532
## # ... with 7,300 more rows

PCA

principal_components <- violations %>%
  mutate(value = 1) %>%
  widely_svd(violation_description, camis, value, nv = 6)

principal_components %>%
  filter(dimension == 2) %>%
  top_n(10, abs(value)) %>%
  mutate(violation_description = str_sub(violation_description, 1, 60),
         violation_description = fct_reorder(violation_description, value)) %>%
  ggplot(aes(violation_description, value)) +
  geom_col() +
  coord_flip()