TidyTuesday: Space Launches

Analyzing data for #tidytuesday week of 1/15/2019 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(RColorBrewer)
library(forcats)
library(scales)
library(ebbr)

launches_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv")

launches <- launches_raw %>%
  filter(launch_year >= 1960)
```

Which countries have the most space launches, and how are they distributed over time?

```r
countries <- launches %>%
  count(state_code, sort = TRUE) %>%
  filter(n >= 100)

launches %>%
  inner_join(countries) %>%
  # INCOMING NASTY IFELSE CODE (NEED TO REFACTOR)
  mutate(state_code = ifelse(state_code == 'RU', 'Russia / Soviet Union',
                      ifelse(state_code == 'SU', 'Russia / Soviet Union',
                      ifelse(state_code == 'US', 'United States',
                      ifelse(state_code == 'CN', 'China',
                      ifelse(state_code == 'IN', 'India',
                      ifelse(state_code == 'F', 'France',
                      ifelse(state_code == 'J', 'Japan', state_code)))))))) %>%
  ggplot() +
  geom_density(aes(launch_year, fill = state_code, color = state_code), alpha = 0.2) +
  theme_light() +
  scale_color_brewer(palette = 'Set1') +
  scale_fill_brewer(palette = 'Set1') +
  labs(x = "", y = "",
       title = "Distribution of space launches over time by country",
       subtitle = "Minimum of 100 launches",
       caption = "Source: The Economist",
       fill = "Country", color = "Country") +
  scale_y_continuous(labels = percent_format(accuracy = 1))
```

...
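The comment flags the nested ifelse() chain for refactoring; a minimal sketch of one way to do that with dplyr::case_when (same recoding, purely illustrative):

```r
# ILLUSTRATIVE REFACTOR: collapse the nested ifelse() calls into a single case_when()
launches %>%
  inner_join(countries) %>%
  mutate(state_code = case_when(
    state_code %in% c('RU', 'SU') ~ 'Russia / Soviet Union',
    state_code == 'US' ~ 'United States',
    state_code == 'CN' ~ 'China',
    state_code == 'IN' ~ 'India',
    state_code == 'F'  ~ 'France',
    state_code == 'J'  ~ 'Japan',
    TRUE ~ state_code  # fall through: keep any other code as-is
  ))
```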

January 15, 2019 · Christopher Yee

TidyTuesday: TV Golden Age

Analyzing data for #tidytuesday week of 01/08/2019 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(knitr)
library(tidyverse)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(broom)

tv_data_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv")

tv_data <- tv_data_raw
```

Prepare the data for k-means clustering:

```r
tv_data_summarized <- tv_data %>%
  group_by(title, genres, date) %>%
  summarize(min_rating = min(av_rating),
            avg_rating = mean(av_rating),
            max_rating = max(av_rating),
            min_share = min(share),
            avg_share = mean(share),
            max_share = max(share)) %>%
  ungroup()

kclust_data <- tv_data_summarized %>%
  select(-title, -genres, -date)

kclust_results <- kmeans(kclust_data, centers = 9)
```

Check the output data with a boxplot:

```r
# CHECK OUTPUT DATA
tv_data_summarized %>%
  left_join(augment(kclust_results, kclust_data)) %>%
  mutate(title = factor(title)) %>%
  ggplot() +
  geom_boxplot(aes(.cluster, avg_rating, fill = .cluster),
               show.legend = FALSE, alpha = 0.5) +
  theme_light() +
  labs(x = "Cluster #", y = "Average Rating",
       caption = "Source: The Economist",
       title = "Average rating distribution for each cluster assignment") +
  scale_fill_brewer(palette = 'Paired')
```

...
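The excerpt fixes k at 9 without showing how it was chosen; a minimal elbow-plot sketch (my own addition, assuming the same kclust_data) that could be used to sanity-check that choice:

```r
# ILLUSTRATIVE: total within-cluster sum of squares for k = 1..15 (elbow method)
tibble(k = 1:15) %>%
  mutate(kclust = map(k, ~ kmeans(kclust_data, centers = .x)),
         tot_withinss = map_dbl(kclust, "tot.withinss")) %>%
  ggplot(aes(k, tot_withinss)) +
  geom_line() +
  geom_point() +
  theme_light() +
  labs(x = "Number of clusters (k)", y = "Total within-cluster SS")
```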

January 8, 2019 · Christopher Yee

TidyTuesday: rtweet Data

Analyzing data for #tidytuesday week of 01/01/2019 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(tidytext)
library(topicmodels)

tweets_raw <- as_tibble(readRDS("rstats_tweets.rds"))
```

Parse the data and identify the top users:

```r
# IDEA BEHIND THIS IS TO FILTER OUT BOTS
# FIND TOP USERS
top_interactions <- tweets_raw %>%
  select(screen_name, favorite_count, retweet_count) %>%
  group_by(screen_name) %>%
  summarize(favorite = sum(favorite_count),
            retweet = sum(retweet_count)) %>%
  mutate(total = favorite + retweet) %>%
  arrange(desc(total)) %>%
  head(12)

# JOIN TOP USERS WITH RAW DATASET
tweets <- tweets_raw %>%
  inner_join(top_interactions, by = 'screen_name')

# FINAL DATA PROCESSING
tweets_parsed <- tweets %>%
  select(screen_name, text) %>%
  group_by(screen_name) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  # REMOVE EXTRA STOP WORDS
  filter(!grepl("https|t.co|http|bit.ly|kindly|goo.gl|rstats|amp", word))
```

What are the most significant keywords for each #rstats Twitter user?

```r
tweets_tfidf <- tweets_parsed %>%
  count(screen_name, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, screen_name, n)

tweets_tfidf %>%
  filter(!near(tf, 1)) %>%
  arrange(desc(tf_idf)) %>%
  group_by(screen_name) %>%
  distinct(screen_name, word, .keep_all = TRUE) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  ggplot(aes(word, tf_idf, fill = screen_name)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ screen_name, ncol = 4, scales = "free") +
  coord_flip() +
  theme_light() +
  labs(x = "", y = "",
       title = "Highest TF-IDF words for top #rstats Twitter users",
       caption = "Source: data from {rtweet} package") +
  scale_fill_brewer(palette = 'Paired')
```

...
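The excerpt loads topicmodels but the truncation cuts off before any topic modeling appears; a minimal sketch, assuming the tweets_parsed tibble above, of how those word counts could be cast to a document-term matrix and fed to LDA (k = 4 and the seed are arbitrary illustration values, not from the original post):

```r
# ILLUSTRATIVE: cast per-user word counts to a document-term matrix, then fit LDA
tweets_dtm <- tweets_parsed %>%
  count(screen_name, word) %>%
  cast_dtm(screen_name, word, n)

tweets_lda <- LDA(tweets_dtm, k = 4, control = list(seed = 1234))

# per-topic word probabilities (beta), top 10 words per topic
tidy(tweets_lda, matrix = "beta") %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  arrange(topic, desc(beta))
```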

January 1, 2019 · Christopher Yee

Retrospective Introspection in 2018

With the year coming to a close and 2019 just around the corner, I thought I would try something new and reflect back on the defining moments of my 2018, along with my $0.02. I don't put enough of this (or my personal thoughts) into written format, but I recently migrated my blog from WordPress to Hugo, so this is as good a time as any. Lucky to work with some of the smartest digital marketing folks in fintech. It's rare to find the perfect team. I always thought that was a load of crock from techies writing on Medium, sending everyone on some wild goose chase. ...

December 31, 2018 · Christopher Yee

TidyTuesday: Cetaceans Dataset

Analyzing data for #tidytuesday week of 12/18/2018 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(tidytext)

cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-18/allCetaceanData.csv")

cetaceans <- cetaceans_raw
```

What is the most notable cause of death for male vs. female cetaceans?

```r
cetaceans %>%
  select(sex, COD) %>%
  filter(sex != "U") %>%
  na.omit() %>%
  mutate(sex = replace(sex, str_detect(sex, "F"), "Female"),
         sex = replace(sex, str_detect(sex, "M"), "Male")) %>%
  unnest_tokens(bigram, COD, token = "ngrams", n = 2) %>%
  count(sex, bigram) %>%
  bind_tf_idf(bigram, sex, n) %>%
  arrange(desc(tf_idf)) %>%
  filter(tf_idf > 0.0011) %>%
  ggplot() +
  geom_col(aes(reorder(bigram, tf_idf), tf_idf, fill = sex)) +
  coord_flip() +
  scale_fill_brewer(palette = 'Set2', name = "") +
  labs(x = "", y = "",
       title = "Bigrams with highest TF-IDF for cause of death \n between Cetacean genders",
       caption = "Source: The Pudding") +
  theme_bw()
```

...
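The two replace()/str_detect() calls can be collapsed into one step; a minimal sketch using dplyr::recode (same mapping, a style note rather than the original post's code):

```r
# ILLUSTRATIVE: recode the one-letter sex codes in a single call
cetaceans %>%
  filter(sex != "U") %>%
  mutate(sex = recode(sex, "F" = "Female", "M" = "Male"))
```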

December 18, 2018 · Christopher Yee

TidyTuesday: NYC Restaurant Inspections

Analyzing data for #tidytuesday week of 12/11/2018 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(lubridate)
library(ebbr)

nyc_restaurants_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-11/nyc_restaurants.csv")

nyc_restaurants <- nyc_restaurants_raw %>%
  filter(inspection_date != '01/01/1900')
```

What is the rate of "A" inspection grades by cuisine type? The first step is to compute the relevant statistics:

```r
cuisine_grades <- nyc_restaurants %>%
  select(cuisine_description, grade) %>%
  na.omit() %>%
  group_by(cuisine_description) %>%
  count(grade) %>%
  mutate(total = sum(n),
         pct_total = n / total) %>%
  ungroup()
```

Next we apply empirical Bayesian estimation and filter the top 20 results ...
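The excerpt is truncated before the estimation step; a minimal sketch, assuming the cuisine_grades tibble above, of what that step might look like with ebbr's add_ebb_estimate (the exact call in the original post may differ):

```r
# ILLUSTRATIVE: shrink per-cuisine "A" rates toward a fitted beta prior
cuisine_grades %>%
  filter(grade == "A") %>%
  add_ebb_estimate(n, total) %>%  # adds .fitted, the empirical-Bayes-shrunken rate
  arrange(desc(.fitted)) %>%
  head(20)
```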

December 11, 2018 · Christopher Yee

TidyTuesday: Medium Article Metadata

Analyzing data for #tidytuesday week of 12/4/2018 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)
library(tidytext)
library(stringr)

articles_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-12-04/medium_datasci.csv")

articles <- articles_raw
```

Who are the top 10 authors in terms of total articles published?

```r
top_authors <- articles %>%
  select(author) %>%
  group_by(author) %>%
  count() %>%
  arrange(desc(n)) %>%
  na.omit() %>%
  head(10)

top_authors %>%
  ggplot() +
  geom_col(aes(reorder(author, n), n),
           fill = "darkslategray4", alpha = 0.8) +
  coord_flip() +
  theme_bw() +
  labs(x = "", y = "",
       title = "Top 10 authors on Medium in terms of total articles published")
```

...
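The select/group_by/count/arrange chain can be written more compactly; a minimal equivalent sketch using count(sort = TRUE), offered purely as a style note:

```r
# ILLUSTRATIVE: same top-10 table in a single count() call
top_authors <- articles %>%
  count(author, sort = TRUE) %>%  # counts and sorts descending in one step
  drop_na(author) %>%
  head(10)
```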

December 4, 2018 · Christopher Yee

TidyTuesday: Baltimore Bridges

Analyzing data for #tidytuesday week of 11/27/2018 (source)

```r
# LOAD PACKAGES AND PARSE DATA
library(tidyverse)
library(scales)
library(RColorBrewer)
library(forcats)

bridges_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018/2018-11-27/baltimore_bridges.csv")

bridges <- bridges_raw
```

Do bridge conditions get better over time?

```r
# REORDER BRIDGE_CONDITION FACTORS
x <- bridges
x$bridge_condition <- factor(x$bridge_condition, levels = c("Poor", "Fair", "Good"))

x %>%
  filter(yr_built >= 1900) %>% # removing pre-1900 outliers
  select(lat, long, yr_built, bridge_condition, avg_daily_traffic) %>%
  group_by(yr_built, bridge_condition) %>%
  summarize(avg_daily_traffic = mean(avg_daily_traffic)) %>%
  ggplot() +
  geom_col(aes(yr_built, avg_daily_traffic, fill = bridge_condition), alpha = 0.3) +
  scale_y_continuous(labels = comma_format(), limits = c(0, 223000)) +
  scale_fill_brewer(palette = 'Set1') +
  scale_color_brewer(palette = 'Set1') +
  geom_smooth(aes(yr_built, avg_daily_traffic, color = bridge_condition), se = FALSE) +
  theme_bw() +
  labs(x = "", y = "",
       title = "Baltimore bridges: average daily traffic by year built",
       subtitle = "Applied smoothing to highlight differences in bridge conditions and dampen outliers",
       fill = "Bridge Condition", color = "Bridge Condition")
```

...
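Since forcats is already loaded, the base-R factor releveling could also be folded into the pipe; a minimal sketch using fct_relevel (equivalent ordering, a style note only):

```r
# ILLUSTRATIVE: relevel inside the pipe with forcats instead of base factor()
bridges %>%
  mutate(bridge_condition = fct_relevel(bridge_condition, "Poor", "Fair", "Good"))
```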

November 27, 2018 · Christopher Yee