TidyTuesday: Cetaceans Dataset
Dec 18, 2018
Christopher Yee
2 minute read

Analyzing data for #tidytuesday week of 12/18/2018 (source)

# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(tidytext)

# Download the TidyTuesday cetacean CSV straight from GitHub.
cetaceans_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-18/allCetaceanData.csv"
)

# Working copy used by the analyses below; the raw import is left as-is.
cetaceans <- cetaceans_raw

What is the most notable cause of death for male vs. female cetaceans?

# Bar chart of the cause-of-death bigrams with the highest TF-IDF, where each
# sex ("Female" / "Male") is treated as one document.
cetaceans %>%
  select(sex, COD) %>%
  # Drop incomplete rows and the rows whose sex is coded "U".
  na.omit() %>%
  filter(sex != "U") %>%
  # Expand the single-letter codes; "F" is checked first, matching the order
  # of the two sequential replacements this expression stands in for.
  mutate(
    sex = case_when(
      str_detect(sex, "F") ~ "Female",
      str_detect(sex, "M") ~ "Male",
      TRUE ~ sex
    )
  ) %>%
  # Split the cause-of-death text into two-word tokens.
  unnest_tokens(output = bigram, input = COD, token = "ngrams", n = 2) %>%
  count(sex, bigram) %>%
  bind_tf_idf(term = bigram, document = sex, n = n) %>%
  arrange(desc(tf_idf)) %>%
  # Keep only the bigrams above the TF-IDF cut-off so the chart stays readable.
  filter(tf_idf > 0.0011) %>%
  ggplot(aes(x = reorder(bigram, tf_idf), y = tf_idf, fill = sex)) +
  geom_col() +
  coord_flip() +
  theme_bw() +
  scale_fill_brewer(name = "", palette = "Set2") +
  labs(
    title = "Bigrams with highest TF-IDF for cause of death \n between Cetacean genders",
    caption = "Source: The Pudding",
    x = "",
    y = ""
  )

What is the primary cause of death for cetaceans born in captivity vs. those captured?

# Log ratio of how often each (lower-cased) cause of death is reported for
# cetaceans acquired as "Born" versus "Capture".
#
# Result: one row per cause of death with the smoothed share of that cause
# within each acquisition group (`Born`, `Capture`) and
# `logratio = log(Born / Capture)`, sorted from most Born-leaning to most
# Capture-leaning.
cod_acquisition_ratio <- cetaceans %>%
  select(acquisition, COD) %>%
  filter(acquisition %in% c("Born", "Capture")) %>%
  na.omit() %>%
  mutate(COD = tolower(COD)) %>%
  count(COD, acquisition) %>%
  # count() returns an ungrouped tibble, so the threshold below must be
  # evaluated per cause of death; without this grouping sum(n) is the grand
  # total and the filter keeps either every row or none.
  group_by(COD) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  # One column per acquisition type; causes missing from a group get a 0 count.
  spread(acquisition, n, fill = 0) %>%
  # Add-one smoothing, then convert counts to within-group proportions so the
  # log ratio is defined even when a cause never occurs in one group.
  mutate_if(is.numeric, ~ (. + 1) / sum(. + 1)) %>%
  mutate(logratio = log(Born / Capture)) %>%
  arrange(desc(logratio))
  
# Bar chart of the ten most extreme log ratios on each side of zero:
# positive bars lean towards "Born", negative bars towards "Capture".
cod_acquisition_ratio %>%
  arrange(abs(logratio)) %>%
  # Split by the sign of the log ratio and keep the 10 largest magnitudes
  # within each side.
  group_by(logratio < 0) %>%
  top_n(10, abs(logratio)) %>%
  ungroup() %>%
  ggplot(
    aes(x = reorder(COD, logratio), y = logratio, fill = logratio < 0)
  ) +
  geom_col() +
  coord_flip() +
  # Fill is FALSE for ratios >= 0 and TRUE for ratios < 0, so the labels are
  # given in that order.
  scale_fill_brewer(
    name = "",
    palette = "Accent",
    labels = c("Born", "Capture")
  ) +
  labs(
    title = "Comparing the odds ratio of words for cause of death \n between Cetacean's captured from the ocean or \n born in captivity (reported)",
    caption = "Source: The Pudding",
    x = "",
    y = "Log Odds Ratio (Born / Capture)"
  ) +
  theme_bw()