Class Demo: Recipe generator#

import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import re
import sys
from collections import Counter, defaultdict
from urllib.request import urlopen
import math

Data#

This is a demo of recipe generation using PyTorch and Transformers. For the purposes of this demo, we'll sample 10,000 recipe titles from the corpus.

orig_recipes_df = pd.read_csv("../data/RAW_recipes.csv")
orig_recipes_df = orig_recipes_df.dropna()
recipes_df = orig_recipes_df.sample(10_000)
recipes_df
name id minutes contributor_id submitted tags nutrition n_steps steps description ingredients n_ingredients
124299 lime cilantro grilled steak 29651 140 27783 2002-05-29 ['weeknight', 'time-to-make', 'course', 'main-... [6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] 7 ['combine lime juice , cilantro , water , jala... i found this recipe in a recent issue of gemma... ['boneless beef chuck steaks', 'fresh lime jui... 8
158654 persian rhubarb syrup 240370 60 414666 2007-07-14 ['60-minutes-or-less', 'time-to-make', 'course... [778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0] 10 ['put the rhubarb in a large stainless steel p... add some seltzer or ginger ale and ice and thi... ['rhubarb', 'fresh lime juice', 'sugar', 'mint... 4
151872 pampered chef style apple crisp for microwave... 63314 65 27783 2003-05-28 ['weeknight', 'time-to-make', 'course', 'main-... [461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0] 9 ['you will need a 12" round stoneware baking p... this is one of the recipes that was served at ... ['tart baking apples', 'graham cracker squares... 9
176661 rocky road sundae pie 10409 40 8728 2001-07-25 ['60-minutes-or-less', 'time-to-make', 'course... [620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0] 33 ['preheat oven to 325', 'butter a 9-inch pan',... great summer pie! ['chocolate ice cream', 'walnuts', 'chocolate ... 8
163677 pork taco meat 85631 70 107334 2004-03-03 ['time-to-make', 'course', 'preparation', 'ver... [729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0] 7 ['cube the pork steaks , cutting off much of t... this is the best pork taco meat i've ever had.... ['pork steak', 'oil', 'chili powder', 'salt', ... 7
... ... ... ... ... ... ... ... ... ... ... ... ...
98092 grill a juicy burger with the only rub you ll ... 371492 30 37449 2009-05-11 ['30-minutes-or-less', 'time-to-make', 'course... [282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0] 7 ['combine sirloin with the ground chuck and cr... want to ensure juicy great tasting burgers for... ['ground sirloin', 'ground chuck', 'ice', 'sea... 12
48942 chipotle tortilla casserole 205081 30 83093 2007-01-12 ['30-minutes-or-less', 'time-to-make', 'course... [439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0] 10 ['preheat oven to 400f', 'arrange chips in bot... this is a recipe attributed to rick bayless, f... ['corn tortilla chips', 'cooked chicken', 'tom... 11
184473 shamakia 491426 50 2548229 2012-12-07 ['60-minutes-or-less', 'time-to-make', 'course... [430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0] 7 ['pre-heat oven to 350 degrees', 'cook sausage... delicious casserole recipe that has been in my... ['fresh spinach', 'spicy sausage', 'sliced mus... 8
147699 omelette 431785 10 1627002 2010-07-02 ['15-minutes-or-less', 'time-to-make', 'prepar... [370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0] 10 ['break the eggs into a jug and add the water'... taken from 'food for health and vitality' - se... ['eggs', 'water', 'dried thyme', 'black pepper... 9
223230 vichyssoise cream of leek 47763 40 20480 2002-12-07 ['60-minutes-or-less', 'time-to-make', 'course... [173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0] 6 ['in a 2 qt glass casserole place leeks , onio... wonderful soup made in the microwave. it can b... ['leeks', 'onion', 'potatoes', 'butter', 'chic... 8

10000 rows × 12 columns

# Set the appropriate device depending upon your hardware. 

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') 
print(device)
mps



Tokenization#

Our data isn't that big and the vocabulary is quite small, so we'll write a wrapper class to map token ids from the pretrained model's vocabulary to custom token ids.

from transformers import AutoTokenizer

class TokenizerWrapper():
    """
    A wrapper class for the AutoTokenizer to handle tokenization and provide
    custom token-vocabulary mappings. 
    """
    def __init__(self, model_name="bert-base-cased"):        
        """
        Initializes the TokenizerWrapper with a specified model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The wrapper class creates a token-to-vocab mapping
        # Let's keep the ids corresponding to special tokens.  
        # 0 --> [PAD], 101 --> [CLS], 102 --> [SEP]  
        self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}
        self.vocab_id_to_token_id = {0: 0, 1: 101, 2:102}
        self.vocab_id = len(self.vocab_id_to_token_id)
        self.padding_len = None 

    def build_dictionary(self, list_of_recipes: list):
        """
        Processes a list of recipes to build and update the vocabulary based on the tokens found in the recipes.
        This function also finds the maximum length of the tokenized recipes to set the padding length.
    
        """
        # Tokenize all recipes to find the unique tokens and the maximum length
        tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)
        all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)
    
        # Update the custom token-vocabulary mapping
        for token_id in all_token_ids:
            if token_id not in self.token_id_to_vocab_id:
                self.token_id_to_vocab_id[token_id] = self.vocab_id
                self.vocab_id_to_token_id[self.vocab_id] = token_id
                self.vocab_id += 1
    
        # Set the padding length to the length of the longest tokenized recipe
        self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)
    
    
    def get_vocab_size(self):
        """
        Returns the size of the custom vocabulary.
        """
        assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)
        return len(self.token_id_to_vocab_id)


    def tokenize(self, text: str) -> list:
        """
        Tokenizes a text string into custom vocabulary IDs, using the built dictionary. 
        Requires the dictionary to be built first.
    
        Parameters:
            text (str): The text to tokenize.
    
        Returns:
            list of int: A list of custom vocabulary IDs corresponding to the text tokens.
        """
        assert self.padding_len is not None, 'Call build_dictionary first.'
        # Tokenize the text with the maximum length set to the previously found maximum padding length

        # Tokenize the text. You don't need `truncation=True` for your lab 
        tokenized_output = self.tokenizer(text, padding="max_length", max_length=self.padding_len, truncation=True)
        
        return [self.token_id_to_vocab_id.get(token_id, 0)  # Default to [PAD] if token_id is not found
                for token_id in tokenized_output.input_ids]
        
    
    def decode(self, vocab_list: list) -> str:
        """
        Decodes a list of custom vocabulary IDs back into the original text string.

        Parameters:
            vocab_list (list of int): A list of custom vocabulary IDs to decode.

        Returns:
            str: The decoded text string.
        """        
        token_list = [self.vocab_id_to_token_id[vocab_id] for vocab_id in vocab_list]
        # decode the string
        decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)
        return decoded_string.strip()
# Build the dictionary for our tokenizer  
from tqdm import tqdm, trange 
tokenizer_wrapper = TokenizerWrapper()
tokenizer_wrapper.build_dictionary(recipes_df["name"].to_list())
recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])
decoded_recipe = tokenizer_wrapper.decode(recipe_tokens)
print('Recipe:', recipes_df['name'].iloc[10])
print('Tokens:', recipe_tokens)
print('Decoded recipe:', decoded_recipe)
Recipe: wild rice and chicken soup
Tokens: [1, 2079, 3515, 376, 402, 2406, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded recipe: wild rice and chicken soup
vocab_size = tokenizer_wrapper.get_vocab_size()
vocab_size
3692



Preparing the data#

Now that we have a class to tokenize our text, let's prepare our data.

def build_data(data_df, tokenizer_wrapper):    
    dataset = []
    for row_id in trange(len(data_df)):
        recipe_tokens = torch.tensor(tokenizer_wrapper.tokenize(data_df['name'].iloc[row_id]))
        dataset.append({'token': recipe_tokens})
    return dataset 

Let’s create train and test datasets by calling build_data on train and test splits.

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=123)
train_data = build_data(train_df, tokenizer_wrapper)
test_data = build_data(test_df, tokenizer_wrapper)
100%|████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 19025.24it/s]
100%|████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20211.81it/s]
# What's the vocab size? 
vocab_size = tokenizer_wrapper.get_vocab_size()
print(f'The vocab size is {vocab_size}.')
The vocab size is 3692.
class PytorchDataset(Dataset):
    def __init__(self, data, pad_vocab_id=0):
        self.data = data
        self.pad_tensor = torch.tensor([pad_vocab_id])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, ind):
        # Retrieve the next sequence of tokens from the current index
        # by excluding the first token of the current sequence and appending a padding token at the end.        
        target_sequence = torch.cat([self.data[ind]['token'][1:], self.pad_tensor]) 
        return self.data[ind]['token'], target_sequence
train_dataset = PytorchDataset(train_data)
test_dataset = PytorchDataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)
# Now let's get a batch of data from DataLoader
train_text, train_target = next(iter(train_dataloader))
train_text = train_text.to(device)
train_text.shape
torch.Size([64, 23])
train_text[11]
tensor([   1,   82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188,
        3348,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       device='mps:0')
train_target[11]
tensor([  82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188, 3348,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
tokenizer_wrapper.decode(train_text[11].tolist())
'szechuan lamb with peppers and spinach'
tokenizer_wrapper.decode(train_target[11].tolist())
'szechuan lamb with peppers and spinach'

This is called autoregressive training: the target sequence is simply the input sequence shifted left by one token, so at each position the model is trained to predict the next token given everything before it.
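Here is a minimal illustration of that shift with a toy sequence (the word token ids are made up):

toy_tokens = torch.tensor([1, 50, 60, 70, 2, 0])  # [CLS] w1 w2 w3 [SEP] [PAD] (made-up ids)
toy_input = toy_tokens
toy_target = torch.cat([toy_tokens[1:], torch.tensor([0])])  # shift left, pad at the end
# At position i, the model sees toy_input[:i+1] and is trained to predict toy_target[i].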





Transformer model architecture#

To define our transformer model architecture for text generation, we will focus on constructing a decoder-only model. Here are the essential components:

  • Embedding Layer: This layer transforms our input token indices into embeddings that the transformer can process.

  • Transformer Decoder Layers: These are the core of our decoder-only architecture, where each layer processes the input embeddings to generate the next token predictions.

  • Linear Layer: This final output layer maps the decoder’s output to the size of our vocabulary, facilitating the prediction of the next token.

For implementing these components in PyTorch, we use the following classes:

Required parameters of TransformerDecoderLayer

  • d_model (int): the number of expected features in the input (required).

  • nhead (int): the number of heads in the multiheadattention models (required).

Required parameters of TransformerDecoder

  • decoder_layer: an instance of the TransformerDecoderLayer() class (required).

  • num_layers: the number of sub-decoder-layers in the decoder (required).
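As a quick sketch of how these two classes fit together (shapes only; the dimensions below are arbitrary, and the all-zeros memory mirrors the decoder-only trick we use in the model below):

decoder_layer = nn.TransformerDecoderLayer(d_model=256, nhead=4)
decoder = nn.TransformerDecoder(decoder_layer=decoder_layer, num_layers=2)
tgt = torch.rand(23, 64, 256)          # [seq_len, batch_size, d_model]
memory = torch.zeros_like(tgt)         # dummy memory, since we have no encoder
decoder(tgt=tgt, memory=memory).shape  # torch.Size([23, 64, 256])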

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
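A quick sanity check (the dimensions below are arbitrary): the module just adds a fixed sin/cos pattern to its input, so the shape is unchanged.

pe_demo = PositionalEncoding(d_model=8, dropout=0.0, max_len=16)
x = torch.zeros(5, 2, 8)   # [seq_len, batch_size, d_model]
pe_demo(x).shape           # torch.Size([5, 2, 8])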
class RecipeGenerator(nn.Module):
    def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):
        """
        Initialize the RecipeGenerator which uses a transformer decoder architecture
        for generating recipes.

        Parameters:
            d_model (int): The number of expected features in the encoder/decoder inputs.
            n_heads (int): The number of heads in the multiheadattention models.
            num_layers (int): The number of sub-decoder-layers in the transformer.
            vocab_size (int): The size of the vocabulary.
            device (torch.device): The device on which the model will be trained.
            dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.
        """        
        super(RecipeGenerator, self).__init__()
        self.d_model = d_model
        self.device = device
        # Positional Encoding to add position information to input embeddings
        self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)

        # Transformer decoder
        self.TransformerDecoder = nn.TransformerDecoder(
            decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout), 
            num_layers = num_layers
        )

        # Embedding layer for converting input text tokens into vectors
        self.text_embedding = nn.Embedding(vocab_size, d_model)

        # Final linear layer to map the output of the transformer decoder to vocabulary size        
        self.linear_layer = nn.Linear(d_model, vocab_size)
        
        # Initialize the weights of the model
        self.init_weights()
        
    def init_weights(self):
        """
        Initialize weights of the model to small random values.
        """
        initrange = 0.1
        self.text_embedding.weight.data.uniform_(-initrange, initrange)
        self.linear_layer.bias.data.zero_()
        self.linear_layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, text):
        # Get the embedded input
        encoded_text = self.embed_text(text)

        # Get transformer output
        transformer_output = self.decode(encoded_text)

        # Final linear layer (unembedding layer)
        
        return self.linear_layer(transformer_output)
    
    def embed_text(self, text):
        embedding = self.text_embedding(text) * math.sqrt(self.d_model)
        # Changing [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]
        return self.pos_encoding(embedding.permute(1, 0, 2))
    
    def decode(self, encoded_text):
        # Get the length of the sequences to be decoded. This is needed to generate the causal masks
        seq_len = encoded_text.size(0)
        causal_mask = self.generate_mask(seq_len)
        dummy_memory = torch.zeros_like(encoded_text)
        return self.TransformerDecoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)
    
    def generate_mask(self, size):
        # Make the upper triangular part of the matrix to -inf
        mask = torch.triu(torch.ones(size, size, device=self.device), 1)
        return mask.float().masked_fill(mask == 1, float('-inf'))
import torch 
size = 10
mask = torch.triu(torch.ones(size, size), 1)
mask.float().masked_fill(mask == 1, float('-inf'))
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
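Why -inf? Inside attention, this mask is added to the raw scores before the softmax, so masked positions end up with exactly zero probability. A tiny worked example (the scores are made up):

scores = torch.tensor([0.5, 1.2, float('-inf')])
torch.softmax(scores, dim=-1)  # tensor([0.3318, 0.6682, 0.0000])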
# Now let's try your model. 
# Define the hyperparameters and initialize the model. Feel free to change these hyperparameters. 
d_model = 256 
n_heads = 4
num_layers = 8
model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)
train_text
tensor([[   1,  256,  377,  ...,    0,    0,    0],
        [   1, 2230,   83,  ...,    0,    0,    0],
        [   1,  378,  430,  ...,    0,    0,    0],
        ...,
        [   1, 2683,   81,  ...,    0,    0,    0],
        [   1, 1519,  587,  ...,    0,    0,    0],
        [   1, 3362, 3515,  ...,    0,    0,    0]], device='mps:0')
# pass inputs to your model
output = model(train_text)
output.shape
torch.Size([23, 64, 3692])
vocab_size
3692
train_text.shape
torch.Size([64, 23])
output.shape
torch.Size([23, 64, 3692])
def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0):
    train_losses, test_losses = [], []
    consec_increases, verbose = 0, True
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for train_text, target_seq in train_dataloader:
            train_text, target_seq = train_text.to(device), target_seq.to(device)
            optimizer.zero_grad()
            output = model(train_text).permute(1, 2, 0)  # Ensure output is in correct shape for loss calculation
            loss = criterion(output, target_seq)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        test_loss = 0
        with torch.no_grad():
            for test_text, target_seq in test_dataloader:
                test_text, target_seq = test_text.to(device), target_seq.to(device)
                output = model(test_text).permute(1, 2, 0)
                test_loss += criterion(output, target_seq).item()

        train_losses.append(train_loss / len(train_dataloader))
        test_losses.append(test_loss / len(test_dataloader))
        if verbose:
            print(f"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}")

        if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):
            consec_increases += 1
        else:
            consec_increases = 0

        if consec_increases >= patience:
            print(f"Stopped early at epoch {epoch + 1}")
            break

    return train_losses, test_losses
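A note on the permute(1, 2, 0) above: CrossEntropyLoss expects class scores of shape [batch_size, num_classes, seq_len] and targets of shape [batch_size, seq_len], while our model returns [seq_len, batch_size, vocab_size]. A minimal shape check with random tensors (the values are meaningless):

logits = torch.rand(23, 64, vocab_size).permute(1, 2, 0)    # -> [64, vocab_size, 23]
targets = torch.randint(1, vocab_size, (64, 23))            # random non-pad targets
torch.nn.CrossEntropyLoss(ignore_index=0)(logits, targets)  # scalar loss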
# Define the optimizer and the loss function. Feel free to change the hyperparameters. 

num_epoch = 20
clip_norm = 1.0
lr = 5e-5

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index
train_losses, test_losses = trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=num_epoch)
Epoch 1: Train Loss 6.9428, Test Loss 6.3931
Epoch 2: Train Loss 6.0120, Test Loss 5.5934
Epoch 3: Train Loss 5.3771, Test Loss 5.1153
Epoch 4: Train Loss 4.9798, Test Loss 4.8317
Epoch 5: Train Loss 4.7009, Test Loss 4.6426
Epoch 6: Train Loss 4.4967, Test Loss 4.5094
Epoch 7: Train Loss 4.3364, Test Loss 4.4099
Epoch 8: Train Loss 4.2019, Test Loss 4.3313
Epoch 9: Train Loss 4.0903, Test Loss 4.2596
Epoch 10: Train Loss 3.9926, Test Loss 4.2018
Epoch 11: Train Loss 3.9047, Test Loss 4.1619
Epoch 12: Train Loss 3.8245, Test Loss 4.1265
Epoch 13: Train Loss 3.7520, Test Loss 4.0932
Epoch 14: Train Loss 3.6853, Test Loss 4.0584
Epoch 15: Train Loss 3.6224, Test Loss 4.0359
Epoch 16: Train Loss 3.5630, Test Loss 4.0141
Epoch 17: Train Loss 3.5078, Test Loss 3.9946
Epoch 18: Train Loss 3.4531, Test Loss 3.9735
Epoch 19: Train Loss 3.4047, Test Loss 3.9687
Epoch 20: Train Loss 3.3578, Test Loss 3.9533
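Both losses are still decreasing at epoch 20, so training longer would likely help. To visualize the curves, here is a minimal sketch (assuming matplotlib is installed):

import matplotlib.pyplot as plt
plt.plot(train_losses, label="train")
plt.plot(test_losses, label="test")
plt.xlabel("epoch")
plt.ylabel("cross-entropy loss")
plt.legend();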
def generate_recipe(model, device, max_recipe_length=39, seed=10, end_vocab=2):
    """
    Generates a recipe using the specified model and device.

    Parameters:
        model (torch.nn.Module): The trained model used for generating recipes.
        device (torch.device): The device (e.g., CPU or GPU) to which tensors will be sent for model execution.
        max_recipe_length (int, optional): The maximum length of the generated recipe. Defaults to 39.
        seed (int, optional): The vocabulary index used as the first token of the generated sequence. Defaults to 10.
        end_vocab (int, optional): The vocabulary index that signifies the end of a recipe ([SEP]). Defaults to 2.
        
    Returns:
        numpy.ndarray: An array containing the sequence of vocabulary indices representing the generated recipe.
        
    """    
    context = torch.tensor([[seed]]).to(device)
    for _ in range(max_recipe_length):
        logits = model(context)[-1]
        probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)
        next_vocab = torch.multinomial(probabilities, num_samples=1)
        context = torch.cat([context, next_vocab], dim=1)
        if next_vocab.item() == end_vocab:
            break
    return context.cpu().numpy().flatten()
recipe = generate_recipe(model, device, max_recipe_length=20, seed=10)
generated_recipe = tokenizer_wrapper.decode(recipe)
generated_recipe
'ban challengerakas wings'

The generated recipe titles aren't great, but with more data, more training epochs, and some hyperparameter optimization, they are likely to improve.
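One cheap knob worth experimenting with (not part of the demo above) is temperature scaling: dividing the logits by a temperature below 1 before the softmax sharpens the distribution, which often yields more coherent, if less diverse, samples. A hypothetical helper:

def sample_next_token(logits, temperature=0.8):
    # T < 1 sharpens the distribution; T > 1 flattens it
    probs = torch.softmax(logits / temperature, dim=-1)
    return torch.multinomial(probs, num_samples=1)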