This is part 2 of TidyTuesday: Cocktails.

This post shows how we can use #rstats to write a cocktail recommendation system that takes in a drink and returns a few other cocktails based on similarly mixed ingredients.

Load libraries

library(tidyverse)
library(recommenderlab)

Download and parse data

Note: please check out part 1 for details on the processing steps.

# Read the raw Boston cocktails table directly from the TidyTuesday repository.
bc_raw <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-26/boston_cocktails.csv"
)

# Lower-case the ingredient text, drop rows that are exact duplicates across
# all columns, then keep only the cocktail name and its ingredient.
bc <- select(
  distinct(mutate(bc_raw, ingredient = str_to_lower(ingredient))),
  name, ingredient
)

# Rows whose ingredient field contains no comma, i.e. a single ingredient.
bc_tidy <- filter(bc, str_detect(ingredient, ",", negate = TRUE))

# Rows whose ingredient field lists several ingredients separated by commas;
# expand them so that each ingredient gets its own row.
bc_untidy <- bc %>%
  filter(str_detect(ingredient, ",")) %>%
  # Split on a comma plus any whitespace after it. The filter above keeps
  # every row containing ",", so splitting only on ", " would leave values
  # such as "a,b" unsplit and "a,  b" with a stray leading space.
  mutate(ingredient = str_split(ingredient, ",\\s*")) %>%
  unnest(cols = ingredient)

# Stack the single-ingredient and expanded multi-ingredient rows, then drop
# repeated name/ingredient pairs.
bc_clean <- distinct(bind_rows(bc_tidy, bc_untidy))

# Normalise ingredient labels: underscores become the only word separator and
# the two brand prefixes are stripped so branded and unbranded variants of an
# ingredient share one label.
df <- bc_clean %>%
  mutate(
    # Hyphens and spaces both become underscores.
    ingredient = str_replace_all(ingredient, "[- ]", "_"),
    # Match the prefixes literally: as a regex, the "." in "old_mr._boston_"
    # would match any character, not just a full stop.
    ingredient = str_remove_all(ingredient, fixed("old_mr._boston_")),
    ingredient = str_remove_all(ingredient, fixed("old_thompson_"))
  )

# Build a wide 0/1 incidence table: one row per ingredient, one column per
# cocktail, 1 when the cocktail uses the ingredient and 0 otherwise.
df_processed <- df %>%
  mutate(value = 1) %>%
  pivot_wider(
    names_from = name,
    # Name the value column explicitly instead of relying on the default.
    values_from = value,
    # Fill absent ingredient/cocktail pairs with 0 in the value cells only,
    # leaving the `ingredient` key column untouched.
    values_fill = 0
  )

Recommendation algorithm

Transform data to binary rating matrix

# Drop the ingredient labels so only the 0/1 cocktail columns remain, then
# coerce the numeric matrix to recommenderlab's binaryRatingMatrix class.
cocktails_matrix <- as(
  as.matrix(select(df_processed, -ingredient)),
  "binaryRatingMatrix"
)

Create evaluation scheme

# Set up a 5-fold cross-validation scheme over the rating matrix;
# given = -1 requests an all-but-one scheme.
# NOTE(review): `train` is presumably ignored when method = "cross", and the
# fold assignment is presumably random with no seed set here — confirm against
# the recommenderlab documentation.
scheme <- evaluationScheme(
  cocktails_matrix,
  method = "cross",
  k = 5,
  train = 0.8,
  given = -1
)

Input customer cocktail preference

Let’s check the ingredients for a very simple cocktail:

# Show the ingredient rows recorded for the Screwdriver.
filter(df, name == "Screwdriver")
## # A tibble: 2 x 2
##   name        ingredient  
##   <chr>       <chr>       
## 1 Screwdriver vodka       
## 2 Screwdriver orange_juice

Perfect. We can now use our recommendation model and predict what cocktail the customer might like based on similarly mixed ingredients.

# The cocktail the customer already likes.
favorite_cocktail <- "Screwdriver"

# Build a one-row 0/1 profile with one column per cocktail (1 marks the
# favourite, 0 everything else) and coerce it to a binaryRatingMatrix so it
# can be passed to predict() as new data.
recco_matrix <- df %>%
  distinct(name) %>%
  mutate(value = if_else(name %in% favorite_cocktail, 1, 0)) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  replace(is.na(.), 0) %>%
  as.matrix() %>%
  as("binaryRatingMatrix")

# Fit an item-based collaborative filtering (IBCF) recommender on the
# training portion of the evaluation scheme, with k = 5 passed as the
# method parameter.
# NOTE(review): this trains only on what getData(scheme, "train") returns,
# not on the full cocktails_matrix — confirm that is intended.
recco <- Recommender(
  data = getData(scheme, "train"),
  method = "IBCF",
  # Spell the argument out in full instead of relying on partial matching
  # of `param` to `parameter`.
  parameter = list(k = 5)
)

# Ask the fitted recommender for the top 5 cocktails for the customer profile.
cocktails_predict <- predict(
  object = recco,
  newdata = recco_matrix,
  n = 5
)

Cocktail recommendation output

# Convert the prediction to a plain list, then to a tibble for display.
cocktails_results <- as_tibble(as(cocktails_predict, "list"))

print(cocktails_results)
## # A tibble: 5 x 1
##   `1`               
##   <chr>             
## 1 Handball Cooler   
## 2 Top Banana        
## 3 Madras            
## 4 Creamy Screwdriver
## 5 Petit Zinc

Finally, let’s make sure each recommended cocktail has at least vodka or orange juice among its ingredients:

# List the ingredients of every recommended cocktail; the recommendations are
# in the column named "1" of cocktails_results.
filter(df, name %in% cocktails_results[["1"]])
## # A tibble: 16 x 2
##    name               ingredient     
##    <chr>              <chr>          
##  1 Top Banana         vodka          
##  2 Top Banana         creme_de_banana
##  3 Top Banana         orange_juice   
##  4 Creamy Screwdriver vodka          
##  5 Creamy Screwdriver egg_yolk       
##  6 Creamy Screwdriver orange_juice   
##  7 Creamy Screwdriver sugar          
##  8 Petit Zinc         vodka          
##  9 Petit Zinc         triple_sec     
## 10 Petit Zinc         sweet_vermouth 
## 11 Petit Zinc         orange_juice   
## 12 Madras             vodka          
## 13 Madras             cranberry_juice
## 14 Madras             orange_juice   
## 15 Handball Cooler    vodka          
## 16 Handball Cooler    orange_juice

Not bad at all!

Out of scope

  • Model evaluation
  • Dynamic web app built in Shiny