TidyTuesday: rtweet Data

Analyzing data for #tidytuesday week of 01/01/2019 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) library(tidytext) library(topicmodels) tweets_raw <- as_tibble(readRDS("rstats_tweets.rds")) Parse data and identify top users # IDEA BEHIND THIS IS TO FILTER OUT BOTS # FIND TOP USERS top_interactions <- tweets_raw %>% select(screen_name, favorite_count, retweet_count) %>% group_by(screen_name) %>% summarize(favorite = sum(favorite_count), retweet = sum(retweet_count)) %>% group_by(screen_name) %>% mutate(total = sum(favorite, retweet)) %>% arrange(desc(total)) %>% head(12) # JOIN TOP USERS WITH RAW DATASET tweets <- tweets_raw %>% inner_join(top_interactions, by='screen_name') # FINAL DATA PROCESSING tweets_parsed <- tweets %>% select(screen_name, text) %>% group_by(screen_name) %>% unnest_tokens(word, text) %>% anti_join(stop_words) %>% filter(!...

January 1, 2019 · Christopher Yee

TidyTuesday: Cetaceans Dataset

Analyzing data for #tidytuesday week of 12/18/2018 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) library(lubridate) library(tidytext) cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-18/allCetaceanData.csv") cetaceans <- cetaceans_raw What is the most notable cause of death for males vs. females? cetaceans %>% select(sex, COD) %>% filter(sex != "U") %>% na.omit() %>% mutate(sex = replace(sex, str_detect(sex, "F"), "Female"), sex = replace(sex, str_detect(sex, "M"), "Male")) %>% unnest_tokens(bigram, COD, token = "ngrams", n = 2) %>% count(sex, bigram) %>% bind_tf_idf(bigram, sex, n) %>% arrange(desc(tf_idf)) %>% filter(tf_idf > 0....

December 18, 2018 · Christopher Yee

TidyTuesday: NYC Restaurant Inspections

Analyzing data for #tidytuesday week of 12/11/2018 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) library(lubridate) library(ebbr) nyc_restaurants_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-11/nyc_restaurants.csv") nyc_restaurants <- nyc_restaurants_raw %>% filter(inspection_date != '01/01/1900') What is the rate of “A” inspection grades by cuisine type? First step is to compute the relevant statistics cuisine_grades <- nyc_restaurants %>% select(cuisine_description, grade) %>% na.omit() %>% group_by(cuisine_description) %>% count(grade) %>% mutate(total = sum(n), pct_total = n/total) %>% ungroup() Next we apply empirical Bayesian estimation and filter the top 20 results...

December 11, 2018 · Christopher Yee

TidyTuesday: Medium Article Metadata

Analyzing data for #tidytuesday week of 12/4/2018 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) library(tidytext) library(stringr) articles_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-04/medium_datasci.csv") articles <- articles_raw Who are the top 10 authors in terms of total articles published? top_authors <- articles %>% select(author) %>% group_by(author) %>% count() %>% arrange(desc(n)) %>% na.omit() %>% head(10) top_authors %>% ggplot() + geom_col(aes(reorder(author, n), n), fill = "darkslategray4", alpha = 0.8) + coord_flip() + theme_bw() + labs(x = "", y = "", title = "Top 10 authors on Medium in terms of total articles published") Are there differences in words used between the titles and subtitles?...

December 4, 2018 · Christopher Yee

TidyTuesday: Baltimore Bridges

Analyzing data for #tidytuesday week of 11/27/2018 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) bridges_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-27/baltimore_bridges.csv") bridges <- bridges_raw Do bridge conditions get better over time? # REORDER BRIDGE_CONDITION FACTORS x <- bridges x$bridge_condition <- as.factor(x$bridge_condition) x$bridge_condition <- factor(x$bridge_condition, levels = c("Poor", "Fair", "Good")) x %>% filter(yr_built >= 1900) %>% # removing 2017 due to outlier select(lat, long, yr_built, bridge_condition, avg_daily_traffic) %>% group_by(yr_built, bridge_condition) %>% summarize(avg_daily_traffic = mean(avg_daily_traffic)) %>% ggplot() + geom_col(aes(yr_built, avg_daily_traffic, fill = bridge_condition), alpha = 0....

November 27, 2018 · Christopher Yee

TidyTuesday: Thanksgiving Dinner

Analyzing data for #tidytuesday week of 11/20/2018 (source) # LOAD PACKAGES AND PARSE DATA library(tidyverse) library(scales) library(RColorBrewer) library(forcats) thanksgiving_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-20/thanksgiving_meals.csv") thanksgiving <- thanksgiving_raw %>% filter(celebrate != 'No') What are the most popular pies for Thanksgiving? thanksgiving %>% select(pie1:pie13) %>% pivot_longer(pie1:pie13, names_to = "pie_type") %>% filter(value != 'None') %>% select(value) %>% group_by(value) %>% count() %>% filter(n > 10) %>% ungroup() %>% ggplot(aes(reorder(value, n), n, label = n)) + geom_bar(aes(fill = value), alpha = 0....

November 20, 2018 · Christopher Yee