# TidyTuesday: Cetaceans Dataset
# Analyzing data for #tidytuesday week of 12/18/2018 (source)

# LOAD PACKAGES AND PARSE DATA ----
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(tidytext)

cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-18/allCetaceanData.csv")

# Keep the raw download under its own name; the analysis uses `cetaceans`.
cetaceans <- cetaceans_raw

# Most notable cause of death between Male vs Female? ----
#
# Treats each sex as one "document", splits the cause-of-death text (COD)
# into bigrams, and plots the bigrams whose TF-IDF exceeds a fixed cutoff.
cetaceans %>%
  select(sex, COD) %>%
  # Presumably "U" marks animals of unknown sex -- confirm against the data
  # dictionary. This comparison also drops rows where `sex` is NA.
  filter(sex != "U") %>%
  drop_na() %>%
  mutate(
    # "F" is tested first, so the precedence matches the two sequential
    # replace() calls this expression supersedes.
    sex = case_when(
      str_detect(sex, "F") ~ "Female",
      str_detect(sex, "M") ~ "Male",
      TRUE ~ sex
    )
  ) %>%
  unnest_tokens(bigram, COD, token = "ngrams", n = 2) %>%
  # A cause of death with fewer than two words cannot form a bigram and can
  # come back as NA; drop those so they are not counted as a term and do not
  # inflate each sex's total when term frequency is computed.
  filter(!is.na(bigram)) %>%
  count(sex, bigram) %>%
  bind_tf_idf(bigram, sex, n) %>%
  arrange(desc(tf_idf)) %>%
  # Hand-picked cutoff that keeps only the most distinctive bigrams.
  filter(tf_idf > 0.0011) %>%
  ggplot() +
  geom_col(aes(reorder(bigram, tf_idf), tf_idf, fill = sex)) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2", name = "") +
  labs(
    x = "",
    y = "",
    title = "Bigrams with highest TF-IDF for cause of death \n between Cetacean genders",
    caption = "Source: The Pudding"
  ) +
  theme_bw()

# ...