Class Demo: Recipe generator#

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import re
import sys
from collections import Counter, defaultdict
from urllib.request import urlopen
import math

Data#

This is a demo of recipe generation using PyTorch and Transformers. For the purposes of this demo, we'll sample 10,000 recipe titles from the corpus.

orig_recipes_df = pd.read_csv("../data/RAW_recipes.csv")
orig_recipes_df = orig_recipes_df.dropna()
recipes_df = orig_recipes_df.sample(10_000)
recipes_df
name id minutes contributor_id submitted tags nutrition n_steps steps description ingredients n_ingredients
124299 lime cilantro grilled steak 29651 140 27783 2002-05-29 ['weeknight', 'time-to-make', 'course', 'main-... [6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] 7 ['combine lime juice , cilantro , water , jala... i found this recipe in a recent issue of gemma... ['boneless beef chuck steaks', 'fresh lime jui... 8
158654 persian rhubarb syrup 240370 60 414666 2007-07-14 ['60-minutes-or-less', 'time-to-make', 'course... [778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0] 10 ['put the rhubarb in a large stainless steel p... add some seltzer or ginger ale and ice and thi... ['rhubarb', 'fresh lime juice', 'sugar', 'mint... 4
151872 pampered chef style apple crisp for microwave... 63314 65 27783 2003-05-28 ['weeknight', 'time-to-make', 'course', 'main-... [461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0] 9 ['you will need a 12" round stoneware baking p... this is one of the recipes that was served at ... ['tart baking apples', 'graham cracker squares... 9
176661 rocky road sundae pie 10409 40 8728 2001-07-25 ['60-minutes-or-less', 'time-to-make', 'course... [620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0] 33 ['preheat oven to 325', 'butter a 9-inch pan',... great summer pie! ['chocolate ice cream', 'walnuts', 'chocolate ... 8
163677 pork taco meat 85631 70 107334 2004-03-03 ['time-to-make', 'course', 'preparation', 'ver... [729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0] 7 ['cube the pork steaks , cutting off much of t... this is the best pork taco meat i've ever had.... ['pork steak', 'oil', 'chili powder', 'salt', ... 7
... ... ... ... ... ... ... ... ... ... ... ... ...
98092 grill a juicy burger with the only rub you ll ... 371492 30 37449 2009-05-11 ['30-minutes-or-less', 'time-to-make', 'course... [282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0] 7 ['combine sirloin with the ground chuck and cr... want to ensure juicy great tasting burgers for... ['ground sirloin', 'ground chuck', 'ice', 'sea... 12
48942 chipotle tortilla casserole 205081 30 83093 2007-01-12 ['30-minutes-or-less', 'time-to-make', 'course... [439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0] 10 ['preheat oven to 400f', 'arrange chips in bot... this is a recipe attributed to rick bayless, f... ['corn tortilla chips', 'cooked chicken', 'tom... 11
184473 shamakia 491426 50 2548229 2012-12-07 ['60-minutes-or-less', 'time-to-make', 'course... [430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0] 7 ['pre-heat oven to 350 degrees', 'cook sausage... delicious casserole recipe that has been in my... ['fresh spinach', 'spicy sausage', 'sliced mus... 8
147699 omelette 431785 10 1627002 2010-07-02 ['15-minutes-or-less', 'time-to-make', 'prepar... [370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0] 10 ['break the eggs into a jug and add the water'... taken from 'food for health and vitality' - se... ['eggs', 'water', 'dried thyme', 'black pepper... 9
223230 vichyssoise cream of leek 47763 40 20480 2002-12-07 ['60-minutes-or-less', 'time-to-make', 'course... [173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0] 6 ['in a 2 qt glass casserole place leeks , onio... wonderful soup made in the microwave. it can b... ['leeks', 'onion', 'potatoes', 'butter', 'chic... 8

10000 rows × 12 columns

# Set the appropriate device depending upon your hardware. 

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') 
print(device)
mps



Tokenization#

Our data isn't that big and the vocabulary is quite small, so we'll write a wrapper class to map token ids from the pretrained model's vocabulary to custom token ids.

from transformers import AutoTokenizer

class TokenizerWrapper():
    """
    A wrapper class for the AutoTokenizer to handle tokenization and provide
    custom token-vocabulary mappings. 
    """
    def __init__(self, model_name="bert-base-cased"):        
        """
        Initializes the TokenizerWrapper with a specified model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The wrapper class creates a token-to-vocab mapping
        # Let's keep the ids corresponding to special tokens.  
        # 0 --> [PAD], 101 --> [CLS], 102 --> [SEP]  
        self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}
        self.vocab_id_to_token_id = {0: 0, 1: 101, 2:102}
        self.vocab_id = len(self.vocab_id_to_token_id)
        self.padding_len = None 

    def build_dictionary(self, list_of_recipes: list):
        """
        Processes a list of recipes to build and update the vocabulary based on the tokens found in the recipes.
        This function also finds the maximum length of the tokenized recipes to set the padding length.
    
        """
        # Tokenize all recipes to find the unique tokens and the maximum length
        tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)
        all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)
    
        # Update the custom token-vocabulary mapping
        for token_id in all_token_ids:
            if token_id not in self.token_id_to_vocab_id:
                self.token_id_to_vocab_id[token_id] = self.vocab_id
                self.vocab_id_to_token_id[self.vocab_id] = token_id
                self.vocab_id += 1
    
        # Set the padding length to the length of the longest tokenized recipe
        self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)
    
    
    def get_vocab_size(self):
        """
        Returns the size of the custom vocabulary.
        """
        assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)
        return len(self.token_id_to_vocab_id)


    def tokenize(self, text: str) -> list:
        """
        Tokenizes a text string into custom vocabulary IDs, using the built dictionary. 
        Requires the dictionary to be built first.
    
        Parameters:
            text (str): The text to tokenize.
    
        Returns:
            list of int: A list of custom vocabulary IDs corresponding to the text tokens.
        """
        assert self.padding_len is not None, 'Call build_dictionary first.'
        # Tokenize the text with the maximum length set to the previously found maximum padding length

        # Tokenize the text. You don't need `truncation=True` for your lab 
        tokenized_output = self.tokenizer(text, padding="max_length", max_length=self.padding_len, truncation=True)
        
        return [self.token_id_to_vocab_id.get(token_id, 0)  # Default to [PAD] if token_id is not found
                for token_id in tokenized_output.input_ids]
        
    
    def decode(self, vocab_list: list) -> str:
        """
        Decodes a list of custom vocabulary IDs back into the original text string.

        Parameters:
            vocab_list (list of int): A list of custom vocabulary IDs to decode.

        Returns:
            str: The decoded text string.
        """        
        token_list = [self.vocab_id_to_token_id[vocab_id] for vocab_id in vocab_list]
        # decode the string
        decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)
        return decoded_string.strip()
# Build the dictionary for our tokenizer  
from tqdm import tqdm, trange 
tokenizer_wrapper = TokenizerWrapper()
tokenizer_wrapper.build_dictionary(recipes_df["name"].to_list())
recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])
decoded_recipe = tokenizer_wrapper.decode(recipe_tokens)
print('Recipe:', recipes_df['name'].iloc[10])
print('Tokens:', recipe_tokens)
print('Decoded recipe:', decoded_recipe)
Recipe: wild rice and chicken soup
Tokens: [1, 2079, 3515, 376, 402, 2406, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded recipe: wild rice and chicken soup
vocab_size = tokenizer_wrapper.get_vocab_size()
vocab_size
3692



Preparing the data#

Now that we have a class to tokenize our text, let's prepare our data.

def build_data(data_df, tokenizer_wrapper):    
    dataset = []
    for row_id in trange(len(data_df)):
        recipe_tokens = torch.tensor(tokenizer_wrapper.tokenize(data_df['name'].iloc[row_id]))
        dataset.append({'token': recipe_tokens})
    return dataset 

Let’s create train and test datasets by calling build_data on train and test splits.

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=123)
train_data = build_data(train_df, tokenizer_wrapper)
test_data = build_data(test_df, tokenizer_wrapper)
100%|████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 19025.24it/s]
100%|████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20211.81it/s]
# What's the vocab size? 
vocab_size = tokenizer_wrapper.get_vocab_size()
print(f'The vocab size is {vocab_size}.')
The vocab size is 3692.
class PytorchDataset(Dataset):
    def __init__(self, data, pad_vocab_id=0):
        self.data = data
        self.pad_tensor = torch.tensor([pad_vocab_id])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        # Retrieve the next sequence of tokens from the current index
        # by excluding the first token of the current sequence and appending a padding token at the end.        
        target_sequence = torch.cat([self.data[ind]['token'][1:], self.pad_tensor]) 
        return self.data[ind]['token'], target_sequence
train_dataset = PytorchDataset(train_data)
test_dataset = PytorchDataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)
# Now let's get a batch of data from DataLoader
train_text, train_target = next(iter(train_dataloader))
train_text = train_text.to(device)
train_text.shape
torch.Size([64, 23])
train_text[11]
tensor([   1,   82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188,
        3348,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       device='mps:0')
train_target[11]
tensor([  82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188, 3348,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
tokenizer_wrapper.decode(train_text[11].tolist())
'szechuan lamb with peppers and spinach'
tokenizer_wrapper.decode(train_target[11].tolist())
'szechuan lamb with peppers and spinach'

This is called autoregressive training: the target sequence is simply the input sequence shifted left by one token, so at each position the model is trained to predict the next token given everything before it.
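Here is a minimal illustration of that shift with a toy sequence (the word token ids are made up):

toy_tokens = torch.tensor([1, 50, 60, 70, 2, 0])  # [CLS] w1 w2 w3 [SEP] [PAD] (made-up ids)
toy_input = toy_tokens
toy_target = torch.cat([toy_tokens[1:], torch.tensor([0])])  # shift left, pad at the end
# At position i, the model sees toy_input[:i+1] and is trained to predict toy_target[i].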





Transformer model architecture#

To define our transformer model architecture for text generation, we will focus on constructing a decoder-only model. Here are the essential components:

  • Embedding Layer: This layer transforms our input token indices into embeddings that the transformer can process.

  • Transformer Decoder Layers: These are the core of our decoder-only architecture, where each layer processes the input embeddings to generate the next token predictions.

  • Linear Layer: This final output layer maps the decoder’s output to the size of our vocabulary, facilitating the prediction of the next token.

For implementing these components in PyTorch, we use the following classes:

Required parameters of TransformerDecoderLayer

  • d_model (int): the number of expected features in the input (required).

  • nhead (int): the number of heads in the multiheadattention models (required).

Required parameters of TransformerDecoder

  • decoder_layer: an instance of the TransformerDecoderLayer() class (required).

  • num_layers: the number of sub-decoder-layers in the decoder (required).
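As a quick sketch of how these two classes fit together (shapes only; the dimensions below are arbitrary, and the all-zeros memory mirrors the decoder-only trick we use in the model below):

decoder_layer = nn.TransformerDecoderLayer(d_model=256, nhead=4)
decoder = nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=2)
tgt = torch.rand(23, 64, 256)          # [seq_len, batch_size, d_model]
memory = torch.zeros_like(tgt)         # dummy memory, since we have no encoder
decoder(tgt=tgt, memory=memory).shape  # torch.Size([23, 64, 256])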

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
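A quick sanity check (the dimensions below are arbitrary): the module just adds a fixed sin/cos pattern to its input, so the shape is unchanged.

pe_demo = PositionalEncoding(d_model=8, dropout=0.0, max_len=16)
x = torch.zeros(5, 2, 8)   # [seq_len, batch_size, d_model]
pe_demo(x).shape           # torch.Size([5, 2, 8])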
class RecipeGenerator(nn.Module):
    def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):
        """
        Initialize the RecipeGenerator which uses a transformer decoder architecture
        for generating recipes.

        Parameters:
            d_model (int): The number of expected features in the encoder/decoder inputs.
            n_heads (int): The number of heads in the multiheadattention models.
            num_layers (int): The number of sub-decoder-layers in the transformer.
            vocab_size (int): The size of the vocabulary.
            device (torch.device): The device on which the model will be trained.
            dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.
        """        
        super(RecipeGenerator, self).__init__()
        self.d_model = d_model
        self.device = device
        # Positional Encoding to add position information to input embeddings
        self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)

        # Transformer decoder
        self.TransformerDecoder = nn.TransformerDecoder(
            decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout), 
            num_layers = num_layers
        )

        # Embedding layer for converting input text tokens into vectors
        self.text_embedding = nn.Embedding(vocab_size, d_model)

        # Final linear layer to map the output of the transformer decoder to vocabulary size        
        self.linear_layer = nn.Linear(d_model, vocab_size)
        
        # Initialize the weights of the model
        self.init_weights()
        
    def init_weights(self):
        """
        Initialize weights of the model to small random values.
        """
        initrange = 0.1
        self.text_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear_layer.bias.data.zero_()
        self.linear_layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, text):
        # Get the embedded input
        encoded_text = self.embed_text(text)

        # Get transformer output
        transformer_output = self.decode(encoded_text)

        # Final linear layer (unembedding layer)
        
        return self.linear_layer(transformer_output)
    
    def embed_text(self, text):
        embedding = self.text_embedding(text) * math.sqrt(self.d_model)
        # Changing [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]
        return self.pos_encoding(embedding.permute(1, 0, 2))
    
    def decode(self, encoded_text):
        # Get the length of the sequences to be decoded. This is needed to generate the causal masks
        seq_len = encoded_text.size(0)
        causal_mask = self.generate_mask(seq_len)
        dummy_memory = torch.zeros_like(encoded_text)
        return self.TransformerDecoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)
    
    def generate_mask(self, size):
        # Make the upper triangular part of the matrix to -inf
        mask = torch.triu(torch.ones(size, size, device=self.device), 1)
        return mask.float().masked_fill(mask == 1, float('-inf'))
import torch 
size = 10
mask = torch.triu(torch.ones(size, size), 1)
mask.float().masked_fill(mask == 1, float('-inf'))
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
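Why -inf? Inside attention, this mask is added to the raw scores before the softmax, so masked positions end up with exactly zero probability. A tiny worked example (the scores are made up):

scores = torch.tensor([0.5, 1.2, float('-inf')])
torch.softmax(scores, dim=-1)  # tensor([0.3318, 0.6682, 0.0000])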
# Now let's try your model. 
# Define the hyperparameters and initialize the model. Feel free to change these hyperparameters. 
d_model = 256 
n_heads = 4
num_layers = 8
model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)
train_text
tensor([[   1,  256,  377,  ...,    0,    0,    0],
        [   1, 2230,   83,  ...,    0,    0,    0],
        [   1,  378,  430,  ...,    0,    0,    0],
        ...,
        [   1, 2683,   81,  ...,    0,    0,    0],
        [   1, 1519,  587,  ...,    0,    0,    0],
        [   1, 3362, 3515,  ...,    0,    0,    0]], device='mps:0')
# pass inputs to your model
output = model(train_text)
output.shape
torch.Size([23, 64, 3692])
vocab_size
3692
train_text.shape
torch.Size([64, 23])
output.shape
torch.Size([23, 64, 3692])
def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0):
    train_losses, test_losses = [], []
    consec_increases, verbose = 0, True
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for train_text, target_seq in train_dataloader:
            train_text, target_seq = train_text.to(device), target_seq.to(device)
            optimizer.zero_grad()
            output = model(train_text).permute(1, 2, 0)  # Ensure output is in correct shape for loss calculation
            loss = criterion(output, target_seq)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        test_loss = 0
        with torch.no_grad():
            for test_text, target_seq in test_dataloader:
                test_text, target_seq = test_text.to(device), target_seq.to(device)
                output = model(test_text).permute(1, 2, 0)
                test_loss += criterion(output, target_seq).item()

        train_losses.append(train_loss / len(train_dataloader))
        test_losses.append(test_loss / len(test_dataloader))
        if verbose:
            print(f"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}")

        if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):
            consec_increases += 1
        else:
            consec_increases = 0

        if consec_increases >= patience:
            print(f"Stopped early at epoch {epoch + 1}")
            break

    return train_losses, test_losses
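A note on the permute(1, 2, 0) above: CrossEntropyLoss expects class scores of shape [batch_size, num_classes, seq_len] and targets of shape [batch_size, seq_len], while our model returns [seq_len, batch_size, vocab_size]. A minimal shape check with random tensors (the values are meaningless):

logits = torch.rand(23, 64, vocab_size).permute(1, 2, 0)    # -> [64, vocab_size, 23]
targets = torch.randint(1, vocab_size, (64, 23))            # random non-pad targets
torch.nn.CrossEntropyLoss(ignore_index=0)(logits, targets)  # scalar loss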
# Define the optimizer and the loss function. Feel free to change the hyperparameters. 

num_epoch = 20
clip_norm = 1.0
lr = 5e-5

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index
train_losses, test_losses = trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=num_epoch)
Epoch 1: Train Loss 6.9428, Test Loss 6.3931
Epoch 2: Train Loss 6.0120, Test Loss 5.5934
Epoch 3: Train Loss 5.3771, Test Loss 5.1153
Epoch 4: Train Loss 4.9798, Test Loss 4.8317
Epoch 5: Train Loss 4.7009, Test Loss 4.6426
Epoch 6: Train Loss 4.4967, Test Loss 4.5094
Epoch 7: Train Loss 4.3364, Test Loss 4.4099
Epoch 8: Train Loss 4.2019, Test Loss 4.3313
Epoch 9: Train Loss 4.0903, Test Loss 4.2596
Epoch 10: Train Loss 3.9926, Test Loss 4.2018
Epoch 11: Train Loss 3.9047, Test Loss 4.1619
Epoch 12: Train Loss 3.8245, Test Loss 4.1265
Epoch 13: Train Loss 3.7520, Test Loss 4.0932
Epoch 14: Train Loss 3.6853, Test Loss 4.0584
Epoch 15: Train Loss 3.6224, Test Loss 4.0359
Epoch 16: Train Loss 3.5630, Test Loss 4.0141
Epoch 17: Train Loss 3.5078, Test Loss 3.9946
Epoch 18: Train Loss 3.4531, Test Loss 3.9735
Epoch 19: Train Loss 3.4047, Test Loss 3.9687
Epoch 20: Train Loss 3.3578, Test Loss 3.9533
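Both losses are still decreasing at epoch 20, so training longer would likely help. To visualize the curves, here is a minimal sketch (assuming matplotlib is installed):

import matplotlib.pyplot as plt
plt.plot(train_losses, label="train")
plt.plot(test_losses, label="test")
plt.xlabel("epoch")
plt.ylabel("cross-entropy loss")
plt.legend();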
def generate_recipe(model, device, max_recipe_length=39, seed=10, end_vocab=2):
    """
    Generates a recipe using the specified model and device.

    Parameters:
        model (torch.nn.Module): The trained model used for generating recipes.
        device (torch.device): The device (e.g., CPU or GPU) to which tensors will be sent for model execution.
        max_recipe_length (int, optional): The maximum length of the generated recipe. Defaults to 39.
        seed (int, optional): The vocabulary index used as the first token of the generated sequence. Defaults to 10.
        end_vocab (int, optional): The vocabulary index that signifies the end of a recipe ([SEP]). Defaults to 2.
        
    Returns:
        numpy.ndarray: An array containing the sequence of vocabulary indices representing the generated recipe.
        
    """    
    context = torch.tensor([[seed]]).to(device)
    for _ in range(max_recipe_length):
        logits = model(context)[-1]
        probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)
        next_vocab = torch.multinomial(probabilities, num_samples=1)
        context = torch.cat([context, next_vocab], dim=1)
        if next_vocab.item() == end_vocab:
            break
    return context.cpu().numpy().flatten()
recipe = generate_recipe(model, device, max_recipe_length=20, seed=10)
generated_recipe = tokenizer_wrapper.decode(recipe)
generated_recipe
'ban challengerakas wings'

The generated recipe titles aren't great, but with more data, more training epochs, and some hyperparameter optimization, they are likely to improve.
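One cheap knob worth experimenting with (not part of the demo above) is temperature scaling: dividing the logits by a temperature below 1 before the softmax sharpens the distribution, which often yields more coherent, if less diverse, samples. A hypothetical helper:

def sample_next_token(logits, temperature=0.8):
    # T < 1 sharpens the distribution; T > 1 flattens it
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1)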