Class Demo: Recipe generator#
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import re
import sys
from collections import Counter, defaultdict
from urllib.request import urlopen
import math
Data#
This is a demo of recipe generation using PyTorch and Transformers. For the purposes of this demo, we'll sample 10,000 recipe titles from the corpus.
orig_recipes_df = pd.read_csv("../data/RAW_recipes.csv")
orig_recipes_df = orig_recipes_df.dropna()
recipes_df = orig_recipes_df.sample(10_000)
recipes_df
 | name | id | minutes | contributor_id | submitted | tags | nutrition | n_steps | steps | description | ingredients | n_ingredients
---|---|---|---|---|---|---|---|---|---|---|---|---
124299 | lime cilantro grilled steak | 29651 | 140 | 27783 | 2002-05-29 | ['weeknight', 'time-to-make', 'course', 'main-... | [6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] | 7 | ['combine lime juice , cilantro , water , jala... | i found this recipe in a recent issue of gemma... | ['boneless beef chuck steaks', 'fresh lime jui... | 8 |
158654 | persian rhubarb syrup | 240370 | 60 | 414666 | 2007-07-14 | ['60-minutes-or-less', 'time-to-make', 'course... | [778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0] | 10 | ['put the rhubarb in a large stainless steel p... | add some seltzer or ginger ale and ice and thi... | ['rhubarb', 'fresh lime juice', 'sugar', 'mint... | 4 |
151872 | pampered chef style apple crisp for microwave... | 63314 | 65 | 27783 | 2003-05-28 | ['weeknight', 'time-to-make', 'course', 'main-... | [461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0] | 9 | ['you will need a 12" round stoneware baking p... | this is one of the recipes that was served at ... | ['tart baking apples', 'graham cracker squares... | 9 |
176661 | rocky road sundae pie | 10409 | 40 | 8728 | 2001-07-25 | ['60-minutes-or-less', 'time-to-make', 'course... | [620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0] | 33 | ['preheat oven to 325', 'butter a 9-inch pan',... | great summer pie! | ['chocolate ice cream', 'walnuts', 'chocolate ... | 8 |
163677 | pork taco meat | 85631 | 70 | 107334 | 2004-03-03 | ['time-to-make', 'course', 'preparation', 'ver... | [729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0] | 7 | ['cube the pork steaks , cutting off much of t... | this is the best pork taco meat i've ever had.... | ['pork steak', 'oil', 'chili powder', 'salt', ... | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
98092 | grill a juicy burger with the only rub you ll ... | 371492 | 30 | 37449 | 2009-05-11 | ['30-minutes-or-less', 'time-to-make', 'course... | [282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0] | 7 | ['combine sirloin with the ground chuck and cr... | want to ensure juicy great tasting burgers for... | ['ground sirloin', 'ground chuck', 'ice', 'sea... | 12 |
48942 | chipotle tortilla casserole | 205081 | 30 | 83093 | 2007-01-12 | ['30-minutes-or-less', 'time-to-make', 'course... | [439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0] | 10 | ['preheat oven to 400f', 'arrange chips in bot... | this is a recipe attributed to rick bayless, f... | ['corn tortilla chips', 'cooked chicken', 'tom... | 11 |
184473 | shamakia | 491426 | 50 | 2548229 | 2012-12-07 | ['60-minutes-or-less', 'time-to-make', 'course... | [430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0] | 7 | ['pre-heat oven to 350 degrees', 'cook sausage... | delicious casserole recipe that has been in my... | ['fresh spinach', 'spicy sausage', 'sliced mus... | 8 |
147699 | omelette | 431785 | 10 | 1627002 | 2010-07-02 | ['15-minutes-or-less', 'time-to-make', 'prepar... | [370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0] | 10 | ['break the eggs into a jug and add the water'... | taken from 'food for health and vitality' - se... | ['eggs', 'water', 'dried thyme', 'black pepper... | 9 |
223230 | vichyssoise cream of leek | 47763 | 40 | 20480 | 2002-12-07 | ['60-minutes-or-less', 'time-to-make', 'course... | [173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0] | 6 | ['in a 2 qt glass casserole place leeks , onio... | wonderful soup made in the microwave. it can b... | ['leeks', 'onion', 'potatoes', 'butter', 'chic... | 8 |
10000 rows × 12 columns
# Set the appropriate device depending upon your hardware.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(device)
mps
Tokenization#
Our data isn't that big and the vocabulary is quite small, so we'll write a wrapper class that maps token IDs from the pretrained tokenizer to compact custom token IDs.
from transformers import AutoTokenizer
class TokenizerWrapper():
"""
A wrapper class for the AutoTokenizer to handle tokenization and provide
custom token-vocabulary mappings.
"""
def __init__(self, model_name="bert-base-cased"):
"""
Initializes the TokenizerWrapper with a specified model.
"""
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# The wrapper class creates a token-to-vocab mapping
# Let's keep the ids corresponding to special tokens.
# 0 --> [PAD], 101 --> [CLS], 102 --> [SEP]
self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}
self.vocab_id_to_token_id = {0: 0, 1: 101, 2:102}
self.vocab_id = len(self.vocab_id_to_token_id)
self.padding_len = None
def build_dictionary(self, list_of_recipes: list):
"""
        Processes a list of recipe names to build and update the vocabulary based on the tokens found in them.
        This function also finds the maximum length of the tokenized recipes to set the padding length.
"""
# Tokenize all recipes to find the unique tokens and the maximum length
tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)
all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)
# Update the custom token-vocabulary mapping
for token_id in all_token_ids:
if token_id not in self.token_id_to_vocab_id:
self.token_id_to_vocab_id[token_id] = self.vocab_id
self.vocab_id_to_token_id[self.vocab_id] = token_id
self.vocab_id += 1
# Set the padding length to the length of the longest tokenized recipe
self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)
def get_vocab_size(self):
"""
Returns the size of the custom vocabulary.
"""
assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)
return len(self.token_id_to_vocab_id)
def tokenize(self, text: str) -> list:
"""
Tokenizes a text string into custom vocabulary IDs, using the built dictionary.
Requires the dictionary to be built first.
Parameters:
text (str): The text to tokenize.
Returns:
list of int: A list of custom vocabulary IDs corresponding to the text tokens.
"""
assert self.padding_len is not None, 'Call build_dictionary first.'
        # Tokenize and pad the text to the padding length found when building the dictionary.
        # (You don't need `truncation=True` for your lab.)
tokenized_output = self.tokenizer(text, padding="max_length", max_length=self.padding_len, truncation=True)
return [self.token_id_to_vocab_id.get(token_id, 0) # Default to [PAD] if token_id is not found
for token_id in tokenized_output.input_ids]
def decode(self, vocab_list: list) -> str:
"""
Decodes a list of custom vocabulary IDs back into the original text string.
Parameters:
vocab_list (list of int): A list of custom vocabulary IDs to decode.
Returns:
str: The decoded text string.
"""
token_list = [self.vocab_id_to_token_id[vocab_id] for vocab_id in vocab_list]
# decode the string
decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)
return decoded_string.strip()
# Build the dictionary for our tokenizer
from tqdm import tqdm, trange
tokenizer_wrapper = TokenizerWrapper()
tokenizer_wrapper.build_dictionary(recipes_df["name"].to_list())
recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])
decoded_recipe = tokenizer_wrapper.decode(recipe_tokens)
print('Recipe:', recipes_df['name'].iloc[10])
print('Tokens:', recipe_tokens)
print('Decoded recipe:', decoded_recipe)
Recipe: wild rice and chicken soup
Tokens: [1, 2079, 3515, 376, 402, 2406, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Decoded recipe: wild rice and chicken soup
vocab_size = tokenizer_wrapper.get_vocab_size()
vocab_size
3692
Preparing the data#
Now that we have a class to tokenize our text, let's prepare our data.
def build_data(data_df, tokenizer_wrapper):
dataset = []
for row_id in trange(len(data_df)):
        recipe_tokens = torch.tensor(tokenizer_wrapper.tokenize(data_df['name'].iloc[row_id]))
        dataset.append({'token': recipe_tokens})
return dataset
Let's create train and test datasets by calling build_data on train and test splits.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=123)
train_data = build_data(train_df, tokenizer_wrapper)
test_data = build_data(test_df, tokenizer_wrapper)
100%|████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 19025.24it/s]
100%|████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20211.81it/s]
# What's the vocab size?
vocab_size = tokenizer_wrapper.get_vocab_size()
print(f'The vocab size is {vocab_size}.')
The vocab size is 3692.
class PytorchDataset():
def __init__(self, data, pad_vocab_id=0):
self.data = data
self.pad_tensor = torch.tensor([pad_vocab_id])
def __len__(self):
return len(self.data)
def __getitem__(self, ind):
        # Build the target sequence by shifting the input left by one position:
        # drop the first token and append a padding token at the end.
target_sequence = torch.cat([self.data[ind]['token'][1:], self.pad_tensor])
return self.data[ind]['token'], target_sequence
train_dataset = PytorchDataset(train_data)
test_dataset = PytorchDataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)
# Now let's get a batch of data from DataLoader
train_text, train_target = next(iter(train_dataloader))
train_text = train_text.to(device)
train_text.shape
torch.Size([64, 23])
train_text[11]
tensor([ 1, 82, 1641, 271, 604, 1282, 2318, 386, 1183, 388, 376, 3188,
3348, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
device='mps:0')
train_target[11]
tensor([ 82, 1641, 271, 604, 1282, 2318, 386, 1183, 388, 376, 3188, 3348,
2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tokenizer_wrapper.decode(train_text[11].tolist())
'szechuan lamb with peppers and spinach'
tokenizer_wrapper.decode(train_target[11].tolist())
'szechuan lamb with peppers and spinach'
Notice that the target sequence is just the input sequence shifted left by one token (compare the two tensors above): at every position, the model is trained to predict the next token given all the tokens before it. This is called autoregressive training.
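To make this concrete, here is a quick check (a sketch using the batch above) showing that at each position t the model sees the input tokens up to t and is asked to predict target[t], i.e., the next token:
# At position t the model sees input tokens [0..t] and must predict target[t] (the next token)
tokens = train_text[11].tolist()
targets = train_target[11].tolist()
for t in range(4):
    context_so_far = tokenizer_wrapper.decode(tokens[:t + 1])
    next_token = tokenizer_wrapper.decode([targets[t]])
    print(f"context: '{context_so_far}' -> next token: '{next_token}'")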
Transformer model architecture#
To define our transformer model architecture for text generation, we will focus on constructing a decoder-only model. Here are the essential components:
- Embedding Layer: This layer transforms our input token indices into embeddings that the transformer can process.
- Transformer Decoder Layers: These are the core of our decoder-only architecture, where each layer processes the input embeddings to generate the next-token predictions.
- Linear Layer: This final output layer maps the decoder's output to the size of our vocabulary, facilitating the prediction of the next token.
For implementing these components in PyTorch, we use the following classes:
- nn.Embedding to create embeddings of tokens
- nn.Linear for the final unembedding layer
- nn.TransformerDecoderLayer: This class allows us to define individual transformer blocks with self-attention and a feedforward network.
- nn.TransformerDecoder: This class is used to stack multiple TransformerDecoderLayer objects into a full decoder.

Required parameters of TransformerDecoderLayer:
- d_model (int): the number of expected features in the input (required).
- nhead (int): the number of heads in the multi-head attention models (required).

Required parameters of TransformerDecoder:
- decoder_layer: an instance of the TransformerDecoderLayer() class (required).
- num_layers: the number of sub-decoder-layers in the decoder (required).
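Before defining the full model, here is a minimal sketch of how these two classes fit together (the sizes below are arbitrary, for illustration only; like our model further down, it passes a dummy memory, since a decoder-only model has no encoder):
# Stack 2 decoder layers into a decoder (arbitrary illustrative sizes)
layer = nn.TransformerDecoderLayer(d_model=64, nhead=4)
decoder = nn.TransformerDecoder(layer, num_layers=2)
x = torch.zeros(12, 8, 64)  # [seq_len, batch_size, d_model] (sequence-first by default)
out = decoder(tgt=x, memory=torch.zeros_like(x))  # dummy memory for a decoder-only setup
print(out.shape)  # torch.Size([12, 8, 64])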
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to the input embeddings."""
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Precompute the sinusoidal encodings for all positions up to max_len
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)  # even dimensions use sine
        pe[:, 0, 1::2] = torch.cos(position * div_term)  # odd dimensions use cosine
        # Register as a buffer so it moves with the model but isn't a trainable parameter
        self.register_buffer('pe', pe)
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x has shape [seq_len, batch_size, d_model]; add the encoding for each position
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
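PositionalEncoding expects its input with the sequence dimension first, i.e., [seq_len, batch_size, d_model]. A quick shape check (with arbitrary small sizes) makes that convention explicit before we use it in the model:
# Shape check for PositionalEncoding (arbitrary small sizes)
pe = PositionalEncoding(d_model=8, dropout=0.0)
x = torch.zeros(5, 2, 8)  # [seq_len, batch_size, d_model]
print(pe(x).shape)  # torch.Size([5, 2, 8])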
class RecipeGenerator(nn.Module):
def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):
"""
Initialize the RecipeGenerator which uses a transformer decoder architecture
for generating recipes.
Parameters:
d_model (int): The number of expected features in the encoder/decoder inputs.
n_heads (int): The number of heads in the multiheadattention models.
num_layers (int): The number of sub-decoder-layers in the transformer.
vocab_size (int): The size of the vocabulary.
device (torch.device): The device on which the model will be trained.
dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.
"""
super(RecipeGenerator, self).__init__()
self.d_model = d_model
self.device = device
# Positional Encoding to add position information to input embeddings
self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)
# Transformer decoder
self.TransformerDecoder = nn.TransformerDecoder(
decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout),
num_layers = num_layers
)
# Embedding layer for converting input text tokens into vectors
self.text_embedding = nn.Embedding(vocab_size, d_model)
# Final linear layer to map the output of the transformer decoder to vocabulary size
self.linear_layer = nn.Linear(d_model, vocab_size)
# Initialize the weights of the model
self.init_weights()
def init_weights(self):
"""
Initialize weights of the model to small random values.
"""
initrange = 0.1
self.text_embedding.weight.data.uniform_(-initrange, initrange)
self.linear_layer.bias.data.zero_()
self.linear_layer.weight.data.uniform_(-initrange, initrange)
def forward(self, text):
        # Get the embedded input tokens
encoded_text = self.embed_text(text)
# Get transformer output
transformer_output = self.decode(encoded_text)
# Final linear layer (unembedding layer)
return self.linear_layer(transformer_output)
def embed_text(self, text):
embedding = self.text_embedding(text) * math.sqrt(self.d_model)
# Changing [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]
return self.pos_encoding(embedding.permute(1, 0, 2))
def decode(self, encoded_text):
# Get the length of the sequences to be decoded. This is needed to generate the causal masks
seq_len = encoded_text.size(0)
causal_mask = self.generate_mask(seq_len)
        # nn.TransformerDecoder expects an encoder "memory"; since this is a decoder-only model, pass zeros as a dummy
        dummy_memory = torch.zeros_like(encoded_text)
return self.TransformerDecoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)
def generate_mask(self, size):
        # Build a causal mask: set the upper-triangular part (future positions) to -inf so attention cannot look ahead
mask = torch.triu(torch.ones(size, size, device=self.device), 1)
return mask.float().masked_fill(mask == 1, float('-inf'))
import torch
size = 10
mask = torch.triu(torch.ones(size, size), 1)
mask.float().masked_fill(mask == 1, float('-inf'))
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
[0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
[0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
[0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
[0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
# Now let's try your model.
# Define the hyperparameters and initialize the model. Feel free to change these hyperparameters.
d_model = 256
n_heads = 4
num_layers = 8
model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)
train_text
tensor([[ 1, 256, 377, ..., 0, 0, 0],
[ 1, 2230, 83, ..., 0, 0, 0],
[ 1, 378, 430, ..., 0, 0, 0],
...,
[ 1, 2683, 81, ..., 0, 0, 0],
[ 1, 1519, 587, ..., 0, 0, 0],
[ 1, 3362, 3515, ..., 0, 0, 0]], device='mps:0')
# pass inputs to your model
output = model(train_text)
output.shape
torch.Size([23, 64, 3692])
vocab_size
3692
train_text.shape
torch.Size([64, 23])
output.shape
torch.Size([23, 64, 3692])
def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0):
train_losses, test_losses = [], []
consec_increases, verbose = 0, True
for epoch in range(epochs):
model.train()
train_loss = 0
for train_text, target_seq in train_dataloader:
train_text, target_seq = train_text.to(device), target_seq.to(device)
optimizer.zero_grad()
            output = model(train_text).permute(1, 2, 0)  # [seq_len, batch, vocab] -> [batch, vocab, seq_len] as expected by CrossEntropyLoss
loss = criterion(output, target_seq)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)
optimizer.step()
train_loss += loss.item()
model.eval()
test_loss = 0
with torch.no_grad():
for test_text, target_seq in test_dataloader:
test_text, target_seq = test_text.to(device), target_seq.to(device)
output = model(test_text).permute(1, 2, 0)
test_loss += criterion(output, target_seq).item()
train_losses.append(train_loss / len(train_dataloader))
test_losses.append(test_loss / len(test_dataloader))
if verbose:
print(f"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}")
        # Early stopping check: count consecutive epochs where the test loss increased
        if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):
consec_increases += 1
else:
consec_increases = 0
if consec_increases >= patience:
print(f"Stopped early at epoch {epoch + 1}")
break
return train_losses, test_losses
# Define the optimizer and the loss function. Feel free to change the hyperparameters.
num_epoch = 20
clip_norm = 1.0
lr = 5e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index
train_losses, test_losses = trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=num_epoch, clip_norm=clip_norm)
Epoch 1: Train Loss 6.9428, Test Loss 6.3931
Epoch 2: Train Loss 6.0120, Test Loss 5.5934
Epoch 3: Train Loss 5.3771, Test Loss 5.1153
Epoch 4: Train Loss 4.9798, Test Loss 4.8317
Epoch 5: Train Loss 4.7009, Test Loss 4.6426
Epoch 6: Train Loss 4.4967, Test Loss 4.5094
Epoch 7: Train Loss 4.3364, Test Loss 4.4099
Epoch 8: Train Loss 4.2019, Test Loss 4.3313
Epoch 9: Train Loss 4.0903, Test Loss 4.2596
Epoch 10: Train Loss 3.9926, Test Loss 4.2018
Epoch 11: Train Loss 3.9047, Test Loss 4.1619
Epoch 12: Train Loss 3.8245, Test Loss 4.1265
Epoch 13: Train Loss 3.7520, Test Loss 4.0932
Epoch 14: Train Loss 3.6853, Test Loss 4.0584
Epoch 15: Train Loss 3.6224, Test Loss 4.0359
Epoch 16: Train Loss 3.5630, Test Loss 4.0141
Epoch 17: Train Loss 3.5078, Test Loss 3.9946
Epoch 18: Train Loss 3.4531, Test Loss 3.9735
Epoch 19: Train Loss 3.4047, Test Loss 3.9687
Epoch 20: Train Loss 3.3578, Test Loss 3.9533
def generate_recipe(model, device, max_recipe_length=39, seed=10, end_vocab=2):
    """
    Generates a recipe title using the specified model and device.
    Parameters:
        model (torch.nn.Module): The trained model used for generating recipes.
        device (torch.device): The device (e.g., CPU or GPU) to which tensors will be sent for model execution.
        max_recipe_length (int, optional): The maximum length of the generated recipe. Defaults to 39.
        seed (int, optional): The vocabulary ID used as the first token of the generated recipe. Defaults to 10.
        end_vocab (int, optional): The vocabulary ID used to signify the end of a recipe. Defaults to 2.
    Returns:
        numpy.ndarray: An array containing the sequence of vocabulary IDs representing the generated recipe.
    """
context = torch.tensor([[seed]]).to(device)
for _ in range(max_recipe_length):
logits = model(context)[-1]
probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)
next_vocab = torch.multinomial(probabilities, num_samples=1)
context = torch.cat([context, next_vocab], dim=1)
if next_vocab.item() == end_vocab:
break
return context.cpu().numpy().flatten()
recipe = generate_recipe(model, device, max_recipe_length=20, seed = 10)
generated_recipe = tokenizer_wrapper.decode(recipe)
generated_recipe
'ban challengerakas wings'
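Since the first token is given by seed and the rest is sampled from the model's predicted distribution, we can draw a few more titles with different (arbitrarily chosen) seed vocabulary IDs:
# Sample a few more titles; the seed values are arbitrary vocabulary IDs for the first token
for seed in [5, 50, 500]:
    tokens = generate_recipe(model, device, max_recipe_length=20, seed=seed)
    print(tokenizer_wrapper.decode(tokens.tolist()))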
The generated recipes are not good. But with more data, epochs, and some hyperparameter optimization, they are likely to improve.