Data from #tidytuesday week of 2020-05-26 (source)
If you are looking for the R script then you can find it here
Load packages
library(tidyverse)
library(ggrepel)
library(FactoMineR)
Download data
# Location of the Boston cocktails CSV in the tidytuesday repository.
bc_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv"

# Read the raw data straight from the URL.
bc_raw <- read_csv(bc_url)
Data processing
Standardize cases
# Tally each spelling of the pepper sauce ingredient in the raw data;
# more than one row in the result means the same ingredient appears
# under different letter cases.
bc_raw %>%
  filter(str_detect(ingredient, "red pepper sauce")) %>%
  count(ingredient, sort = TRUE)
## # A tibble: 2 x 2
## ingredient n
## <chr> <int>
## 1 Hot red pepper sauce 4
## 2 hot red pepper sauce 1
Let’s fix that by converting all ingredient values to lower case:
# Lower-case every ingredient so spellings that differ only by case agree.
bc_lowered <- mutate(bc_raw, ingredient = str_to_lower(ingredient))

# Drop fully duplicated rows first, then keep only the cocktail name and
# its ingredient.
bc <- select(distinct(bc_lowered), name, ingredient)
And to make sure it works…
# Repeat the tally on the cleaned data; a single row confirms that the
# case variants were merged.
bc %>%
  filter(str_detect(ingredient, "red pepper sauce")) %>%
  count(ingredient, sort = TRUE)
## # A tibble: 1 x 2
## ingredient n
## <chr> <int>
## 1 hot red pepper sauce 5
Fix untidy data
To follow “tidy” principles, we need one row per observation.
# Show the rows whose ingredient field contains a comma, i.e. rows that
# pack several items into one value.
filter(bc, str_detect(ingredient, ","))
## # A tibble: 85 x 2
## name ingredient
## <chr> <chr>
## 1 John Collins orange and lemon wheels, maraschino cherry
## 2 Irish Shillelagh fresh raspberries and strawberries, 2 peach slices, ma…
## 3 Underneath The Mango… lime wedge, sweet chili powder
## 4 Emperor Norton's Mis… fresh strawberries, cut in halves
## 5 Toasted Drop lemon wedge, cinnamon sugar
## 6 Stockholm 75 lemon wedge, superfine sugar
## 7 Salty Dog lemon wedge, coarse salt
## 8 Rouxby Red for glass lemon wedge, coarse salt
## 9 Redhead Martini strawberries, cut into halves
## 10 Canadian Breeze pineapple wedge, maraschino cherry
## # … with 75 more rows
We can reformat this by splitting each ingredient value on the commas, then adding a new row for each ingredient in a cocktail.
# Rows that already hold a single ingredient need no reshaping.
bc_tidy <- filter(bc, !str_detect(ingredient, ","))

# Rows holding a comma-separated list: split each list into its pieces,
# then expand the list-column so every piece gets its own row.
# NOTE: every fragment between commas becomes an "ingredient", so
# preparation notes such as "cut in halves" come through as rows too.
bc_multi <- filter(bc, str_detect(ingredient, ","))
bc_multi$ingredient <- str_split(bc_multi$ingredient, ", ")
bc_untidy <- unnest(bc_multi, cols = ingredient)

# Stack both pieces and drop repeated name/ingredient pairs.
bc_clean <- distinct(bind_rows(bc_tidy, bc_untidy))

# Print the expanded rows for inspection.
bc_untidy
## # A tibble: 193 x 2
## name ingredient
## <chr> <chr>
## 1 John Collins orange and lemon wheels
## 2 John Collins maraschino cherry
## 3 Irish Shillelagh fresh raspberries and strawberries
## 4 Irish Shillelagh 2 peach slices
## 5 Irish Shillelagh maraschino cherry
## 6 Underneath The Mango Tree lime wedge
## 7 Underneath The Mango Tree sweet chili powder
## 8 Emperor Norton's Mistress fresh strawberries
## 9 Emperor Norton's Mistress cut in halves
## 10 Toasted Drop lemon wedge
## # … with 183 more rows
Reduce cardinality
Our dataset has more than 550 unique ingredients, so let’s trim that down to the ingredients that are used in more than ten cocktails.
# Number of unique ingredient values in the cleaned data.
summarise(bc_clean, n = n_distinct(ingredient))
## # A tibble: 1 x 1
## n
## <int>
## 1 553
# Rows per ingredient, most frequent first. bc_clean holds distinct
# name/ingredient pairs, so n is the number of cocktail names that use it.
ingredient_counts <- count(bc_clean, ingredient, sort = TRUE)

# Keep only the ingredients that appear more than ten times.
n_ingredients <- filter(ingredient_counts, n > 10)
Normalize ingredients
Similar to the “Standardize cases” section above, let’s make sure our ingredients are consolidated to the same format.
# Keep only the common ingredients and normalize their names into
# underscore-separated identifiers.
df <- bc_clean %>%
  # A filtering join with an explicit key: no extra `n` column to drop and
  # no "Joining, by = ..." message.
  semi_join(n_ingredients, by = "ingredient") %>%
  mutate(
    # Hyphens and spaces both become underscores.
    ingredient = str_replace_all(ingredient, "[- ]", "_"),
    # Strip brand prefixes. fixed() matches the text literally, so the "."
    # in "mr." is not treated as a regex wildcard.
    ingredient = str_remove_all(ingredient, fixed("old_mr._boston_")),
    ingredient = str_remove_all(ingredient, fixed("old_thompson_"))
  ) %>%
  # Removing a brand prefix can make two ingredients of one cocktail
  # identical; keep a single row per name/ingredient pair so the later
  # pivot to wide format gets exactly one value per cell.
  distinct()
Multiple Correspondence Analysis (MCA)
Our dataset is mostly categorical so MCA can help identify and highlight any underlying structures.
Format data for MCA
# Build a cocktail x ingredient indicator table: one row per cocktail, one
# column per ingredient, 1 when the cocktail uses the ingredient, else 0.
df_mca_processed <- df %>%
  mutate(value = 1) %>%
  pivot_wider(
    names_from = ingredient,
    values_from = value,
    # Fill absent name/ingredient combinations with 0 during the pivot
    # rather than patching NAs afterwards.
    values_fill = list(value = 0)
  ) %>%
  # The cocktail name is an identifier, not a variable for the analysis.
  select(-name) %>%
  # Convert the 0/1 indicator columns to factors before passing them to MCA().
  mutate(across(where(is.double), as.factor))

# Run the analysis without drawing FactoMineR's default plots.
mca_results <- MCA(df_mca_processed, graph = FALSE)
Shape data to tidy structure
# Coordinates of every variable category on the MCA dimensions.
# NOTE(review): data.frame() is expected to turn the dimension names into
# syntactic ones (the plot refers to Dim.1 / Dim.2) - confirm.
mca_df <- data.frame(mca_results$var$coord)

mca_final <- mca_df %>%
  rownames_to_column(var = "rowname") %>%
  as_tibble() %>%
  # Keep only the "ingredient present" categories.
  # Assumes categories are labelled <ingredient>_<level> - confirm against
  # the MCA output. The pattern is anchored with "$" so that a "_1" inside
  # an ingredient name neither selects its "_0" row nor gets stripped.
  filter(str_detect(rowname, "_1$")) %>%
  mutate(variable = str_remove(rowname, "_1$")) %>%
  select(variable, everything(), -rowname) %>%
  # Tag ingredients by base spirit for colouring. Each pattern matches a
  # whole underscore-delimited token, so "ginger_ale" is not tagged "gin".
  # The first matching branch wins; ingredients matching none stay NA.
  mutate(highlight = case_when(
    str_detect(variable, "(^|_)gin(_|$)") ~ "gin",
    str_detect(variable, "(^|_)rum(_|$)") ~ "rum",
    str_detect(variable, "(^|_)vodka(_|$)") ~ "vodka",
    str_detect(variable, "(^|_)whiskey(_|$)") ~ "whiskey",
    str_detect(variable, "(^|_)brandy(_|$)") ~ "brandy",
    str_detect(variable, "(^|_)bourbon(_|$)") ~ "bourbon",
    str_detect(variable, "(^|_)tequila(_|$)") ~ "tequila"
  ))
Final plot
# Scatter the ingredients on the first two MCA dimensions, coloured by base
# spirit, over light-gray density contours.
ggplot(
  mca_final,
  aes(x = Dim.1, y = Dim.2, label = variable, color = highlight)
) +
  geom_density_2d(color = "gray90") +
  geom_point(show.legend = FALSE) +
  # Repelled labels keep the ingredient names from overlapping.
  geom_text_repel(show.legend = FALSE) +
  labs(
    title = "Multiple correspondence analysis (MCA) on the most common cocktail ingredients",
    subtitle = "Closer points suggest they are typically mixed together",
    caption = "by: @eeysirhc\nsource: Mr. Boston Bartender's Guide",
    x = "D1",
    y = "D2"
  ) +
  theme_minimal(base_size = 15) +
  # Hide the axis tick labels and the grid lines.
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  )
Future work
- Part 2: cocktail recommendation system based on the input of favorite drink
- Calculate and use dissimilarity to recommend a drink to someone based on what they dislike