Hypergeometric Testing on Seattle Pets
Sat, Oct 16, 2021
4-minute read
The dataset of this blog post comes from TidyTuesday about pet names and the related information. This is a relatively simple dataset that does not have too many columns to explore, but I will analyze the dataset by using hypergeometric testing.
First, load the packages and the dataset with a few cleaning steps.
library(tidyverse)
library(lubridate)
library(tidytext)
pets <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>%
mutate(license_issue_date = mdy(license_issue_date),
primary_breed = str_remove(primary_breed, ",.+"),
year = year(license_issue_date),
month = month(license_issue_date))
pets
## # A tibble: 52,519 x 9
## license_issue_date license_number animals_name species primary_breed
## <date> <chr> <chr> <chr> <chr>
## 1 2018-11-16 8002756 Wall-E Dog Mixed Breed
## 2 2018-11-11 S124529 Andre Dog Terrier
## 3 2018-11-21 903793 Mac Dog Retriever
## 4 2018-11-23 824666 Melb Cat Domestic Shorthair
## 5 2018-12-30 S119138 Gingersnap Cat Domestic Shorthair
## 6 2018-12-16 S138529 Cody Dog Retriever
## 7 2017-10-04 580652 Millie Dog Terrier
## 8 2018-08-09 S142558 Sebastian Cat Domestic Shorthair
## 9 2018-08-20 S142546 Madeline Cat Domestic Shorthair
## 10 2018-12-08 S123830 Cleo Cat Domestic Shorthair
## # ... with 52,509 more rows, and 4 more variables: secondary_breed <chr>,
## # zip_code <chr>, year <dbl>, month <dbl>
Top 30 Names for Cats and Dogs
pets %>%
count(animals_name,species, sort = T) %>%
filter(species %in% c("Cat", "Dog"),
!is.na(animals_name)) %>%
group_by(species) %>%
slice_max(order_by = n, n = 30) %>%
ungroup() %>%
mutate(animals_name = reorder_within(animals_name, n, species)) %>%
ggplot(aes(n, animals_name, fill = animals_name)) +
geom_col(show.legend = F) +
facet_wrap(~species, scales = "free", ncol = 1) +
scale_y_reordered() +
theme(
strip.text = element_text(size = 15, face = "bold")
) +
labs(x = "name count",
y = "pet name",
title = "Top 30 Pet Name Count Faceted by Cat & Dog")

pets %>%
count(year, month, species, sort = T) %>%
mutate(date = make_date(year,month)) %>%
ggplot(aes(date, n, color = species)) +
geom_line() +
labs(x = "year",
y = "species count")

The following dog hypergeometric test is inspired and followed by David Robinson’s code.
Dog Name Hypergeometric Test
dogs <- pets %>%
filter(species == "Dog")
name_counts <- dogs %>%
count(animals_name, name = "name_total") %>%
filter(name_total > 100)
breed_counts <- dogs %>%
count(primary_breed, name = "breed_total") %>%
filter(breed_total >= 200)
total_dogs <- nrow(dogs)
name_breed_counts <- dogs %>%
count(primary_breed, animals_name) %>%
complete(primary_breed, animals_name, fill = list(n = 0)) %>%
inner_join(name_counts, by = "animals_name") %>%
inner_join(breed_counts, by = "primary_breed")
# One-sided hypergeometric p-value
hypergeom_test <- name_breed_counts %>%
mutate(percent_of_breed = n / breed_total,
percent_name_overall = name_total / total_dogs) %>%
mutate(overrepresented_ratio = percent_of_breed / percent_name_overall) %>%
arrange(desc(overrepresented_ratio)) %>%
mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total),
holm_p_value = p.adjust(hypergeom_p_value),
fdr = p.adjust(hypergeom_p_value, method = "fdr"))
hypergeom_test %>%
filter(fdr < .05)
## # A tibble: 4 x 11
## primary_breed animals_name n name_total breed_total percent_of_breed
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Boxer Rocky 8 113 485 0.0165
## 2 Pug Zoe 8 117 554 0.0144
## 3 Beagle Lucy 19 337 542 0.0351
## 4 Retriever Lucy 99 337 7225 0.0137
## # ... with 5 more variables: percent_name_overall <dbl>,
## # overrepresented_ratio <dbl>, hypergeom_p_value <dbl>, holm_p_value <dbl>,
## # fdr <dbl>
ggplot(hypergeom_test, aes(hypergeom_p_value)) +
geom_histogram(binwidth = .1) +
labs(x = "One-sided hypergeometric p-values for overrepresented name")

Based on the output above, Rocky, Zoe, Lucy and Lucy are pet names that overrepresent Boxer, Pug, Beagle and Retriever respectively.
Cat Name Hypergeometric Test
By following the same process as what I did above, here is the same process to deal with cat names.
cats <- pets %>%
filter(species == "Cat")
name_counts <- cats %>%
count(animals_name, name = "name_total") %>%
filter(name_total > 100)
breed_counts <- cats %>%
count(primary_breed, name = "breed_total") %>%
filter(breed_total >= 200)
total_cats <- nrow(cats)
name_breed_counts <- cats %>%
count(primary_breed, animals_name) %>%
complete(primary_breed, animals_name, fill = list(n = 0)) %>%
inner_join(name_counts, by = "animals_name") %>%
inner_join(breed_counts, by = "primary_breed")
# One-sided hypergeometric p-value
hypergeom_test <- name_breed_counts %>%
mutate(percent_of_breed = n / breed_total,
percent_name_overall = name_total / total_dogs) %>%
mutate(overrepresented_ratio = percent_of_breed / percent_name_overall) %>%
arrange(desc(overrepresented_ratio)) %>%
mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_cats - name_total, breed_total),
holm_p_value = p.adjust(hypergeom_p_value),
fdr = p.adjust(hypergeom_p_value, method = "fdr"))
hypergeom_test %>%
filter(fdr < .05)
## # A tibble: 1 x 11
## primary_breed animals_name n name_total breed_total percent_of_breed
## <chr> <chr> <dbl> <int> <int> <dbl>
## 1 Domestic Shorthair <NA> 296 406 10086 0.0293
## # ... with 5 more variables: percent_name_overall <dbl>,
## # overrepresented_ratio <dbl>, hypergeom_p_value <dbl>, holm_p_value <dbl>,
## # fdr <dbl>
ggplot(hypergeom_test, aes(hypergeom_p_value)) +
geom_histogram(binwidth = .1) +
labs(x = "One-sided hypergeometric p-values for overrepresented name")
