Hypergeometric Testing on Seattle Pets
Sat, Oct 16, 2021
4-minute read
The dataset of this blog post comes from TidyTuesday about pet names and the related information. This is a relatively simple dataset that does not have too many columns to explore, but I will analyze the dataset by using hypergeometric testing.
First, load the packages and the dataset with a few cleaning steps.
library(tidyverse)
library(lubridate)
library(tidytext)
pets <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>%
mutate(license_issue_date = mdy(license_issue_date),
primary_breed = str_remove(primary_breed, ",.+"),
year = year(license_issue_date),
month = month(license_issue_date))
pets
## # A tibble: 52,519 x 9
## license_issue_date license_number animals_name species primary_breed
## <date> <chr> <chr> <chr> <chr>
## 1 2018-11-16 8002756 Wall-E Dog Mixed Breed
## 2 2018-11-11 S124529 Andre Dog Terrier
## 3 2018-11-21 903793 Mac Dog Retriever
## 4 2018-11-23 824666 Melb Cat Domestic Shorthair
## 5 2018-12-30 S119138 Gingersnap Cat Domestic Shorthair
## 6 2018-12-16 S138529 Cody Dog Retriever
## 7 2017-10-04 580652 Millie Dog Terrier
## 8 2018-08-09 S142558 Sebastian Cat Domestic Shorthair
## 9 2018-08-20 S142546 Madeline Cat Domestic Shorthair
## 10 2018-12-08 S123830 Cleo Cat Domestic Shorthair
## # ... with 52,509 more rows, and 4 more variables: secondary_breed <chr>,
## # zip_code <chr>, year <dbl>, month <dbl>
Top 30 Names for Cats and Dogs
pets %>%
count(animals_name,species, sort = T) %>%
filter(species %in% c("Cat", "Dog"),
!is.na(animals_name)) %>%
group_by(species) %>%
slice_max(order_by = n, n = 30) %>%
ungroup() %>%
mutate(animals_name = reorder_within(animals_name, n, species)) %>%
ggplot(aes(n, animals_name, fill = animals_name)) +
geom_col(show.legend = F) +
facet_wrap(~species, scales = "free", ncol = 1) +
scale_y_reordered() +
theme(
strip.text = element_text(size = 15, face = "bold")
) +
labs(x = "name count",
y = "pet name",
title = "Top 30 Pet Name Count Faceted by Cat & Dog")
pets %>%
count(year, month, species, sort = T) %>%
mutate(date = make_date(year,month)) %>%
ggplot(aes(date, n, color = species)) +
geom_line() +
labs(x = "year",
y = "species count")
The following dog hypergeometric test is inspired and followed by David Robinson’s code.
Dog Name Hypergeometric Test
dogs <- pets %>%
filter(species == "Dog")
name_counts <- dogs %>%
count(animals_name, name = "name_total") %>%
filter(name_total > 100)
breed_counts <- dogs %>%
count(primary_breed, name = "breed_total") %>%
filter(breed_total >= 200)
total_dogs <- nrow(dogs)
name_breed_counts <- dogs %>%
count(primary_breed, animals_name) %>%
complete(primary_breed, animals_name, fill = list(n = 0)) %>%
inner_join(name_counts, by = "animals_name") %>%
inner_join(breed_counts, by = "primary_breed")
# One-sided hypergeometric p-value
hypergeom_test <- name_breed_counts %>%
mutate(percent_of_breed = n / breed_total,
percent_name_overall = name_total / total_dogs) %>%
mutate(overrepresented_ratio = percent_of_breed / percent_name_overall) %>%
arrange(desc(overrepresented_ratio)) %>%
mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total),
holm_p_value = p.adjust(hypergeom_p_value),
fdr = p.adjust(hypergeom_p_value, method = "fdr"))
hypergeom_test %>%
filter(fdr < .05)
## # A tibble: 4 x 11
## primary_breed animals_name n name_total breed_total percent_of_breed
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Boxer Rocky 8 113 485 0.0165
## 2 Pug Zoe 8 117 554 0.0144
## 3 Beagle Lucy 19 337 542 0.0351
## 4 Retriever Lucy 99 337 7225 0.0137
## # ... with 5 more variables: percent_name_overall <dbl>,
## # overrepresented_ratio <dbl>, hypergeom_p_value <dbl>, holm_p_value <dbl>,
## # fdr <dbl>
ggplot(hypergeom_test, aes(hypergeom_p_value)) +
geom_histogram(binwidth = .1) +
labs(x = "One-sided hypergeometric p-values for overrepresented name")
Based on the output above, Rocky
, Zoe
, Lucy
and Lucy
are pet names that overrepresent Boxer
, Pug
, Beagle
and Retriever
respectively.
Cat Name Hypergeometric Test
By following the same process as what I did above, here is the same process to deal with cat names.
cats <- pets %>%
filter(species == "Cat")
name_counts <- cats %>%
count(animals_name, name = "name_total") %>%
filter(name_total > 100)
breed_counts <- cats %>%
count(primary_breed, name = "breed_total") %>%
filter(breed_total >= 200)
total_cats <- nrow(cats)
name_breed_counts <- cats %>%
count(primary_breed, animals_name) %>%
complete(primary_breed, animals_name, fill = list(n = 0)) %>%
inner_join(name_counts, by = "animals_name") %>%
inner_join(breed_counts, by = "primary_breed")
# One-sided hypergeometric p-value
hypergeom_test <- name_breed_counts %>%
mutate(percent_of_breed = n / breed_total,
percent_name_overall = name_total / total_dogs) %>%
mutate(overrepresented_ratio = percent_of_breed / percent_name_overall) %>%
arrange(desc(overrepresented_ratio)) %>%
mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_cats - name_total, breed_total),
holm_p_value = p.adjust(hypergeom_p_value),
fdr = p.adjust(hypergeom_p_value, method = "fdr"))
hypergeom_test %>%
filter(fdr < .05)
## # A tibble: 1 x 11
## primary_breed animals_name n name_total breed_total percent_of_breed
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Domestic Shorthair <NA> 296 406 10086 0.0293
## # ... with 5 more variables: percent_name_overall <dbl>,
## # overrepresented_ratio <dbl>, hypergeom_p_value <dbl>, holm_p_value <dbl>,
## # fdr <dbl>
ggplot(hypergeom_test, aes(hypergeom_p_value)) +
geom_histogram(binwidth = .1) +
labs(x = "One-sided hypergeometric p-values for overrepresented name")