Kenya Census Visualization (Map Included)

Tue, Mar 8, 2022 3-minute read

In this blog post, I will analyze a few of the TidyTuesday datasets on the Kenya census. You can get the datasets from here.

library(tidyverse)
library(janitor)
library(scales)
# Make theme_bw() the default ggplot2 theme for the plots that follow.
theme_set(theme_bw())

# Gender table: tidy the column names, put a space at every
# lower-case/upper-case boundary in the county name, then drop the "Total" row.
gender <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-19/gender.csv") %>%
  clean_names() %>%
  mutate(county = str_replace_all(county, "([a-z])([A-Z])", "\\1 \\2")) %>%
  filter(county != "Total")

# Crops table: its key column is `sub_county`; rename it to `county`,
# title-case the values and drop the "Kenya" row.
crops <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-19/crops.csv") %>%
  clean_names() %>%
  rename(county = sub_county) %>%
  mutate(county = str_to_title(county)) %>%
  filter(county != "Kenya")

# Households table: same county-name clean-up as `gender`, minus the
# "Kenya" row.
households <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-19/households.csv") %>%
  clean_names() %>%
  mutate(county = str_replace_all(county, "([a-z])([A-Z])", "\\1 \\2")) %>%
  filter(county != "Kenya")

Join households and gender together:

# Keep only the counties that appear in both tables, matched on `county`.
house_gender_joined <- inner_join(households, gender, by = "county")

# Print the joined table.
house_gender_joined

## # A tibble: 47 x 8
##    county    population number_of_house~ average_househo~   male female intersex
##    <chr>          <dbl>            <dbl>            <dbl>  <dbl>  <dbl>    <dbl>
##  1 Mombasa      1190987           378422              3.1 610257 598046       30
##  2 Kwale         858748           173176              5   425121 441681       18
##  3 Kilifi       1440958           298472              4.8 704089 749673       25
##  4 Tana Riv~     314710            68242              4.6 158550 157391        2
##  5 Lamu          141909            37963              3.7  76103  67813        4
##  6 Taita/Ta~     335747            96429              3.5 173337 167327        7
##  7 Garissa       835482           141394              5.9 458975 382344       34
##  8 Wajir         775302           127932              6.1 415374 365840       49
##  9 Mandera       862079           125763              6.9 434976 432444       37
## 10 Marsabit      447150            77495              5.8 243548 216219       18
## # ... with 37 more rows, and 1 more variable: total <dbl>

All of the information in house_gender_joined can be shown compactly on a single plot:

# Scatter plot of male vs. female counts per county on log-log axes.
# Point size and colour both encode the average household size; the colour
# legend is suppressed so only the size legend is shown.
house_gender_joined %>%
  ggplot(aes(male, female)) +
  geom_point(aes(size = average_household_size,
                 color = average_household_size),
             alpha = 0.5) +
  # Reference line with the default slope of 1 and intercept of 0.
  geom_abline(color = "lightblue") +
  # check_overlap drops labels that would be drawn on top of earlier ones.
  geom_text(aes(label = county),
            hjust = 1,
            vjust = 1,
            check_overlap = TRUE) +
  # Spell out `labels` rather than relying on partial argument matching.
  scale_x_log10(labels = comma) +
  scale_y_log10(labels = comma) +
  scale_color_gradient(low = "green",
                       high = "red",
                       guide = "none") +
  theme(legend.position = "bottom") +
  labs(x = "# of male",
       y = "# of female",
       size = "average household size",
       title = "Male and Female Relationship with Average Household Size",
       subtitle = "The smaller average household size, more spacious")

We can see from the above plot that the male-to-female ratio is roughly 1 across all counties, and that counties with mid-range populations tend to be more crowded (larger average household size) than counties at either end of the population range.

# Long-format crop table: one row per county/crop pair with the number of
# households growing that crop (`household`) and its share of the county's
# total households (`pct_crop`).
crop_county <- crops %>%
  # NOTE(review): columns are selected by position (2 to 11); confirm that
  # every one of them is an individual crop and not an aggregate column.
  pivot_longer(cols = 2:11,
               names_to = "crop",
               values_to = "household") %>%
  # Replace every underscore left by clean_names(), not just the first one.
  mutate(crop = str_replace_all(crop, "_", " ")) %>%
  left_join(house_gender_joined %>% select(county, number_of_households),
            by = "county") %>%
  # Order counties by their summed crop share so the bar chart is sorted.
  mutate(pct_crop = household / number_of_households,
         county = fct_reorder(county, pct_crop, sum, na.rm = TRUE))

# Stacked horizontal bars: one bar per county, one coloured segment per crop,
# with the x axis formatted as a percentage.
ggplot(crop_county, aes(x = pct_crop, y = county, fill = crop)) +
  geom_col() +
  scale_x_continuous(labels = percent) +
  labs(
    title = "County-Level Crop Production in Kenya",
    x = "crop percentage",
    y = ""
  )

Some crop percentages go above 100% because households that grow several types of crops are counted once for each crop (double counting).

The following code is inspired by David Robinson. You can find his code here.

Making a map by using the rKenyaCensus package:

#remotes::install_github("Shelmith-Kariuki/rKenyaCensus")
library(rKenyaCensus)
library(sf)

# County geometries from rKenyaCensus, converted to an sf object and
# simplified with dTolerance = 200, then matched to the crop data through a
# title-cased `county` key.
kenya_sf <- st_as_sf(KenyaCounties_SHP) %>%
  st_simplify(dTolerance = 200) %>%
  mutate(county = str_to_title(County)) %>%
  left_join(crop_county, by = "county")

# Small-multiple map of Kenya: one panel per crop, with counties shaded by
# the number of households growing that crop.
kenya_sf %>%
  # Counties without a crop match have NA in `crop` after the left join.
  filter(!is.na(crop)) %>%
  mutate(crop = str_to_title(crop)) %>%
  # Fill by `household`, the per-crop count, instead of
  # `number_of_households`, the county-wide total that is the same in every
  # panel and does not match the legend title.
  ggplot(aes(fill = household)) +
  geom_sf() +
  ggthemes::theme_map() +
  labs(fill = "households growing this crop") +
  facet_wrap(~crop, ncol = 5) +
  scale_fill_gradient(high = "green",
                      low = "red") +
  theme(legend.position = "bottom",
        strip.text = element_text(size = 15),
        plot.title = element_text(size = 18),
        legend.text = element_text(angle = 90, vjust = 0))