GDPR Data Visualization
Fri, Dec 17, 2021
4-minute read
This blog post analyzes the TidyTuesday dataset on the E.U.’s General Data Protection Regulation (GDPR): the text of the regulation and the fines issued for violating it.
library(tidyverse)
library(tidytext)
library(lubridate)
library(scales)
# Apply the black-and-white ggplot2 theme to every plot below.
theme_set(theme_bw())

# Regulation text from the TidyTuesday 2020-04-21 release.
gdpr <- read_tsv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_text.tsv"
)

# Fines data from the same release. All whitespace is stripped from the
# cited articles, and the month-day-year date strings are parsed to Dates.
gdpr_violations <- mutate(
  read_tsv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv"
  ),
  article_violated = str_remove_all(article_violated, "\\s*"),
  date = mdy(date)
)

# Preview the regulation text.
gdpr
## # A tibble: 425 x 7
## chapter chapter_title article article_title sub_article gdpr_text href
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 1 General provi~ 1 Subject-matte~ 1 "This Regul~ http:~
## 2 1 General provi~ 1 Subject-matte~ 2 "This Regul~ http:~
## 3 1 General provi~ 1 Subject-matte~ 3 "The free m~ http:~
## 4 1 General provi~ 2 Material scope 1 "This Regul~ http:~
## 5 1 General provi~ 2 Material scope 2 "This Regul~ http:~
## 6 1 General provi~ 2 Material scope 3 "For the pr~ http:~
## 7 1 General provi~ 2 Material scope 4 "This Regul~ http:~
## 8 1 General provi~ 3 Territorial s~ 1 "This Regul~ http:~
## 9 1 General provi~ 3 Territorial s~ 2 "This Regul~ http:~
## 10 1 General provi~ 3 Territorial s~ 3 "the monito~ http:~
## # ... with 415 more rows
# Preview the violations data.
gdpr_violations
## # A tibble: 250 x 11
## id picture name price authority date controller article_violated
## <dbl> <chr> <chr> <dbl> <chr> <date> <chr> <chr>
## 1 1 https:/~ Poland 9380 Polish N~ 2019-10-18 Polish Ma~ Art.28GDPR
## 2 2 https:/~ Roman~ 2500 Romanian~ 2019-10-17 UTTIS IND~ Art.12GDPR|Art.~
## 3 3 https:/~ Spain 60000 Spanish ~ 2019-10-16 Xfera Mov~ Art.5GDPR|Art.6~
## 4 4 https:/~ Spain 8000 Spanish ~ 2019-10-16 Iberdrola~ Art.31GDPR
## 5 5 https:/~ Roman~ 150000 Romanian~ 2019-10-09 Raiffeise~ Art.32GDPR
## 6 6 https:/~ Roman~ 20000 Romanian~ 2019-10-09 Vreau Cre~ Art.32GDPR|Art.~
## 7 7 https:/~ Greece 200000 Hellenic~ 2019-10-07 Telecommu~ Art.5(1)c)GDPR|~
## 8 8 https:/~ Greece 200000 Hellenic~ 2019-10-07 Telecommu~ Art.21(3)GDPR|A~
## 9 9 https:/~ Spain 30000 Spanish ~ 2019-10-01 Vueling A~ Art.5GDPR|Art.6~
## 10 10 https:/~ Roman~ 9000 Romanian~ 2019-09-26 Inteligo ~ Art.5(1)a)GDPR|~
## # ... with 240 more rows, and 3 more variables: type <chr>, source <chr>,
## # summary <chr>
GDPR word counts
# Ten most frequent words in each GDPR chapter, one facet per chapter.
gdpr %>%
  unnest_tokens(word, gdpr_text) %>%
  # Name the join key explicitly instead of relying on the implicit
  # natural join (which also prints a "Joining, by" message).
  anti_join(stop_words, by = "word") %>%
  # Drop tokens that contain a digit.
  filter(!str_detect(word, "[:digit:]")) %>%
  count(chapter_title, word, chapter, sort = TRUE) %>%
  group_by(chapter_title) %>%
  # slice_max() keeps ties by default, which can put more than ten words
  # in a panel; with_ties = FALSE caps each chapter at exactly ten.
  slice_max(n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # Order words by count within each facet rather than globally.
  mutate(word = reorder_within(word, n, chapter_title)) %>%
  ggplot(aes(n, word, fill = factor(chapter))) +
  geom_col() +
  facet_wrap(~chapter_title, scales = "free") +
  scale_y_reordered() +
  labs(
    x = "# of words",
    y = NULL,
    fill = "chapter",
    title = "Chapter-wise 10 most frequent words"
  )
GDPR violations
# Number of fines citing each GDPR article, stacked by country.
gdpr_violations %>%
  # One row per cited provision; provisions are separated by "|".
  separate_rows(article_violated, sep = "\\|") %>%
  # Collapse each provision to its article ("Art.5(1)a)GDPR" -> "Art.5").
  # The extraction already discards the trailing "GDPR", so no separate
  # removal step is needed.
  mutate(
    article_violated = str_extract(article_violated, "Art\\.\\s?[:digit:]+")
  ) %>%
  filter(!is.na(article_violated)) %>%
  # A fine that cites several sub-paragraphs of the same article should
  # count once for that article (assumes `id` identifies a single fine).
  distinct(id, article_violated, .keep_all = TRUE) %>%
  count(name, article_violated, sort = TRUE) %>%
  mutate(article_violated = fct_reorder(article_violated, n, sum)) %>%
  ggplot(aes(n, article_violated, fill = name)) +
  geom_col() +
  labs(
    x = "# of violations",
    y = "violated article",
    fill = NULL,
    title = "Which articles have been violated most?"
  )
Fines
# Distribution of fine amounts for each violated article (log scale).
gdpr_violations %>%
  # One row per cited provision; provisions are separated by "|".
  separate_rows(article_violated, sep = "\\|") %>%
  # Collapse each provision to its article; the extraction already drops
  # the trailing "GDPR", so no separate removal step is needed.
  mutate(
    article_violated = str_extract(article_violated, "Art\\.\\s?[:digit:]+")
  ) %>%
  filter(!is.na(article_violated)) %>%
  # Keep each fine once per article so that citing several sub-paragraphs
  # of one article does not duplicate its price in the boxplot
  # (assumes `id` identifies a single fine).
  distinct(id, article_violated, .keep_all = TRUE) %>%
  # Zero fines cannot be drawn on a log axis; dropping them up front keeps
  # the median ordering consistent with what is actually plotted.
  filter(price > 0) %>%
  mutate(
    article_violated = fct_reorder(
      article_violated, price, median,
      na.rm = TRUE
    )
  ) %>%
  ggplot(aes(price, article_violated, fill = article_violated)) +
  geom_boxplot() +
  scale_x_log10() +
  theme(legend.position = "none") +
  labs(
    x = "fine in Euros(€)",
    y = "violated article",
    title = "How much does article violation cost?"
  )
TF-IDF
# Words from the violation `type` text that are most distinctive of each
# article, ranked by TF-IDF with articles treated as documents.
gdpr_violations %>%
  # One row per cited provision; provisions are separated by "|".
  separate_rows(article_violated, sep = "\\|") %>%
  # Collapse each provision to its article; the extraction already drops
  # the trailing "GDPR", so no separate removal step is needed.
  mutate(
    article_violated = str_extract(article_violated, "Art\\.\\s?[:digit:]+")
  ) %>%
  filter(!is.na(article_violated)) %>%
  # Keep each fine once per article so its words are not counted several
  # times when it cites multiple sub-paragraphs of the same article
  # (assumes `id` identifies a single fine).
  distinct(id, article_violated, .keep_all = TRUE) %>%
  unnest_tokens(word, type) %>%
  # Name the join key explicitly instead of relying on the natural join.
  anti_join(stop_words, by = "word") %>%
  count(article_violated, word, sort = TRUE) %>%
  bind_tf_idf(word, article_violated, n) %>%
  group_by(article_violated) %>%
  # slice_max() keeps ties by default, which can exceed the five words the
  # title promises; with_ties = FALSE caps each article at five.
  slice_max(tf_idf, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    # Facets ordered by descending total TF-IDF.
    article_violated = fct_reorder(article_violated, -tf_idf, sum),
    # Words ordered by TF-IDF within each facet.
    word = reorder_within(word, tf_idf, article_violated)
  ) %>%
  ggplot(aes(tf_idf, word, fill = article_violated)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~article_violated, scales = "free_y") +
  theme(legend.position = "none") +
  labs(
    x = "TF-IDF",
    y = NULL,
    title = "Top 5 words with the largest TF-IDF values"
  )
Monthly average fine
# Heatmap of the mean fine per country and calendar month.
gdpr_violations %>%
  # Month as a labelled, ordered factor; the year is not part of the key,
  # so the same month from different years is averaged together.
  mutate(month = month(date, label = TRUE)) %>%
  group_by(name, month) %>%
  # .groups = "drop" returns an ungrouped tibble without the regrouping
  # message, replacing the separate ungroup() step.
  summarize(avg_fine = mean(price), .groups = "drop") %>%
  ggplot(aes(month, name, fill = avg_fine)) +
  geom_tile() +
  theme(panel.grid = element_blank()) +
  scale_fill_gradient2(
    low = "green",
    high = "red",
    mid = "pink",
    midpoint = 2e+07,
    # Fines are in euros, so format the legend with a euro prefix rather
    # than the default dollar sign.
    labels = label_dollar(prefix = "€")
  ) +
  labs(
    x = NULL,
    y = NULL,
    fill = "average fine",
    title = "Average Fine Per Country Per Month"
  )
# Fine distributions for the five authorities that issued the most fines.
gdpr_violations %>%
  mutate(
    # Keep the five most frequent authorities; lump the rest into "Other".
    authority = fct_lump(authority, 5),
    # Strip a trailing parenthetical from the name for a shorter axis label
    # (this also converts the factor back to character).
    authority = str_remove(authority, " \\(.+$")
  ) %>%
  # Zero fines cannot be drawn on a log axis, so drop them explicitly
  # instead of letting ggplot discard them with a warning.
  filter(authority != "Other", price > 0) %>%
  ggplot(aes(price, authority, fill = name)) +
  geom_violin() +
  # Spell out `labels` (the original `label` relied on partial argument
  # matching) and use a euro prefix because the fines are in euros.
  scale_x_log10(labels = label_dollar(prefix = "€")) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    x = "fine",
    y = NULL,
    fill = "country",
    title = "Top 5 authorities fine charge",
    subtitle = "Top authorities are defined by the # of fines they gave"
  )