{ "cells": [ { "cell_type": "markdown", "id": "49fecb8b-833e-4f1d-979c-29de9ebc3e37", "metadata": {}, "source": [ "## Class Demo: Recipe generator" ] }, { "cell_type": "code", "execution_count": 1, "id": "73190c22-c0fe-4a74-972d-03543b7b9959", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n", "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n" ] } ], "source": [ "import torch\n", "import torch.nn as nn\n", "import numpy as np\n", "from torch.utils.data import Dataset, DataLoader\n", "import pandas as pd\n", "import os\n", "import re\n", "import sys\n", "from collections import Counter, defaultdict\n", "from urllib.request import urlopen\n", "import math" ] }, { "cell_type": "markdown", "id": "053617b4-3a67-4f7b-bf32-0a7dba9a15f6", "metadata": {}, "source": [ "### Data" ] }, { "cell_type": "markdown", "id": "5bc90cc6-d9bc-4747-bc91-ed5dcc065c47", "metadata": {}, "source": [ "This is a demo for recipe generation using PyTorch and Transformers. \n", "For the purpose of this demo, we'll sample 10_000 recipe titles from the corpus " ] }, { "cell_type": "code", "execution_count": 2, "id": "765fad1d-0cc9-4d88-a4d8-357a767fc072", "metadata": {}, "outputs": [], "source": [ "orig_recipes_df = pd.read_csv(\"../data/RAW_recipes.csv\")\n", "orig_recipes_df = orig_recipes_df.dropna()\n", "recipes_df = orig_recipes_df.sample(10_000)" ] }, { "cell_type": "code", "execution_count": 3, "id": "7839bff9-07e0-4e3d-be61-2aba64b9fefd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameidminutescontributor_idsubmittedtagsnutritionn_stepsstepsdescriptioningredientsn_ingredients
124299lime cilantro grilled steak29651140277832002-05-29['weeknight', 'time-to-make', 'course', 'main-...[6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]7['combine lime juice , cilantro , water , jala...i found this recipe in a recent issue of gemma...['boneless beef chuck steaks', 'fresh lime jui...8
158654persian rhubarb syrup240370604146662007-07-14['60-minutes-or-less', 'time-to-make', 'course...[778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0]10['put the rhubarb in a large stainless steel p...add some seltzer or ginger ale and ice and thi...['rhubarb', 'fresh lime juice', 'sugar', 'mint...4
151872pampered chef style apple crisp for microwave...6331465277832003-05-28['weeknight', 'time-to-make', 'course', 'main-...[461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0]9['you will need a 12\" round stoneware baking p...this is one of the recipes that was served at ...['tart baking apples', 'graham cracker squares...9
176661rocky road sundae pie104094087282001-07-25['60-minutes-or-less', 'time-to-make', 'course...[620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0]33['preheat oven to 325', 'butter a 9-inch pan',...great summer pie!['chocolate ice cream', 'walnuts', 'chocolate ...8
163677pork taco meat85631701073342004-03-03['time-to-make', 'course', 'preparation', 'ver...[729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0]7['cube the pork steaks , cutting off much of t...this is the best pork taco meat i've ever had....['pork steak', 'oil', 'chili powder', 'salt', ...7
.......................................
98092grill a juicy burger with the only rub you ll ...37149230374492009-05-11['30-minutes-or-less', 'time-to-make', 'course...[282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0]7['combine sirloin with the ground chuck and cr...want to ensure juicy great tasting burgers for...['ground sirloin', 'ground chuck', 'ice', 'sea...12
48942chipotle tortilla casserole20508130830932007-01-12['30-minutes-or-less', 'time-to-make', 'course...[439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0]10['preheat oven to 400f', 'arrange chips in bot...this is a recipe attributed to rick bayless, f...['corn tortilla chips', 'cooked chicken', 'tom...11
184473shamakia4914265025482292012-12-07['60-minutes-or-less', 'time-to-make', 'course...[430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0]7['pre-heat oven to 350 degrees', 'cook sausage...delicious casserole recipe that has been in my...['fresh spinach', 'spicy sausage', 'sliced mus...8
147699omelette4317851016270022010-07-02['15-minutes-or-less', 'time-to-make', 'prepar...[370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0]10['break the eggs into a jug and add the water'...taken from 'food for health and vitality' - se...['eggs', 'water', 'dried thyme', 'black pepper...9
223230vichyssoise cream of leek4776340204802002-12-07['60-minutes-or-less', 'time-to-make', 'course...[173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0]6['in a 2 qt glass casserole place leeks , onio...wonderful soup made in the microwave. it can b...['leeks', 'onion', 'potatoes', 'butter', 'chic...8
\n", "

10000 rows × 12 columns

\n", "
" ], "text/plain": [ " name id minutes \\\n", "124299 lime cilantro grilled steak 29651 140 \n", "158654 persian rhubarb syrup 240370 60 \n", "151872 pampered chef style apple crisp for microwave... 63314 65 \n", "176661 rocky road sundae pie 10409 40 \n", "163677 pork taco meat 85631 70 \n", "... ... ... ... \n", "98092 grill a juicy burger with the only rub you ll ... 371492 30 \n", "48942 chipotle tortilla casserole 205081 30 \n", "184473 shamakia 491426 50 \n", "147699 omelette 431785 10 \n", "223230 vichyssoise cream of leek 47763 40 \n", "\n", " contributor_id submitted \\\n", "124299 27783 2002-05-29 \n", "158654 414666 2007-07-14 \n", "151872 27783 2003-05-28 \n", "176661 8728 2001-07-25 \n", "163677 107334 2004-03-03 \n", "... ... ... \n", "98092 37449 2009-05-11 \n", "48942 83093 2007-01-12 \n", "184473 2548229 2012-12-07 \n", "147699 1627002 2010-07-02 \n", "223230 20480 2002-12-07 \n", "\n", " tags \\\n", "124299 ['weeknight', 'time-to-make', 'course', 'main-... \n", "158654 ['60-minutes-or-less', 'time-to-make', 'course... \n", "151872 ['weeknight', 'time-to-make', 'course', 'main-... \n", "176661 ['60-minutes-or-less', 'time-to-make', 'course... \n", "163677 ['time-to-make', 'course', 'preparation', 'ver... \n", "... ... \n", "98092 ['30-minutes-or-less', 'time-to-make', 'course... \n", "48942 ['30-minutes-or-less', 'time-to-make', 'course... \n", "184473 ['60-minutes-or-less', 'time-to-make', 'course... \n", "147699 ['15-minutes-or-less', 'time-to-make', 'prepar... \n", "223230 ['60-minutes-or-less', 'time-to-make', 'course... \n", "\n", " nutrition n_steps \\\n", "124299 [6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] 7 \n", "158654 [778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0] 10 \n", "151872 [461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0] 9 \n", "176661 [620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0] 33 \n", "163677 [729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0] 7 \n", "... ... ... \n", "98092 [282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0] 7 \n", "48942 [439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0] 10 \n", "184473 [430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0] 7 \n", "147699 [370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0] 10 \n", "223230 [173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0] 6 \n", "\n", " steps \\\n", "124299 ['combine lime juice , cilantro , water , jala... \n", "158654 ['put the rhubarb in a large stainless steel p... \n", "151872 ['you will need a 12\" round stoneware baking p... \n", "176661 ['preheat oven to 325', 'butter a 9-inch pan',... \n", "163677 ['cube the pork steaks , cutting off much of t... \n", "... ... \n", "98092 ['combine sirloin with the ground chuck and cr... \n", "48942 ['preheat oven to 400f', 'arrange chips in bot... \n", "184473 ['pre-heat oven to 350 degrees', 'cook sausage... \n", "147699 ['break the eggs into a jug and add the water'... \n", "223230 ['in a 2 qt glass casserole place leeks , onio... \n", "\n", " description \\\n", "124299 i found this recipe in a recent issue of gemma... \n", "158654 add some seltzer or ginger ale and ice and thi... \n", "151872 this is one of the recipes that was served at ... \n", "176661 great summer pie! \n", "163677 this is the best pork taco meat i've ever had.... \n", "... ... \n", "98092 want to ensure juicy great tasting burgers for... \n", "48942 this is a recipe attributed to rick bayless, f... \n", "184473 delicious casserole recipe that has been in my... \n", "147699 taken from 'food for health and vitality' - se... \n", "223230 wonderful soup made in the microwave. it can b... 
\n", "\n", " ingredients n_ingredients \n", "124299 ['boneless beef chuck steaks', 'fresh lime jui... 8 \n", "158654 ['rhubarb', 'fresh lime juice', 'sugar', 'mint... 4 \n", "151872 ['tart baking apples', 'graham cracker squares... 9 \n", "176661 ['chocolate ice cream', 'walnuts', 'chocolate ... 8 \n", "163677 ['pork steak', 'oil', 'chili powder', 'salt', ... 7 \n", "... ... ... \n", "98092 ['ground sirloin', 'ground chuck', 'ice', 'sea... 12 \n", "48942 ['corn tortilla chips', 'cooked chicken', 'tom... 11 \n", "184473 ['fresh spinach', 'spicy sausage', 'sliced mus... 8 \n", "147699 ['eggs', 'water', 'dried thyme', 'black pepper... 9 \n", "223230 ['leeks', 'onion', 'potatoes', 'butter', 'chic... 8 \n", "\n", "[10000 rows x 12 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "recipes_df" ] }, { "cell_type": "code", "execution_count": 4, "id": "bf1f1886-8a5c-42c0-97c6-43561ced1dcd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mps\n" ] } ], "source": [ "# Set the appropriate device depending upon your hardware. \n", "\n", "# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') \n", "print(device)" ] }, { "cell_type": "markdown", "id": "6d4c5b60-70ac-49d7-a590-9c02c2757df5", "metadata": {}, "source": [ "
<br><br>
" ] }, { "cell_type": "markdown", "id": "83dbf9e7-1830-42f2-86fb-4e24e0524621", "metadata": {}, "source": [ "### Tokenization \n", "\n", "Our data isn't that big. The vocabulary is quite small. So we'll write a wrapper class to map token ids of from pretrained model to custom token ids. " ] }, { "cell_type": "code", "execution_count": 25, "id": "93d5db3f-e718-4c8f-bbae-26efe1290950", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "class TokenizerWrapper():\n", " \"\"\"\n", " A wrapper class for the AutoTokenizer to handle tokenization and provide\n", " custom token-vocabulary mappings. \n", " \"\"\"\n", " def __init__(self, model_name=\"bert-base-cased\"): \n", " \"\"\"\n", " Initializes the TokenizerWrapper with a specified model.\n", " \"\"\"\n", " self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n", " # The wrapper class creates a token-to-vocab mapping\n", " # Let's keep the ids corresponding to special tokens. \n", " # 0 --> [PAD], 101 --> [CLS], 102 --> [SEP] \n", " self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}\n", " self.vocab_id_to_token_id = {0: 0, 1: 101, 2:102}\n", " self.vocab_id = len(self.vocab_id_to_token_id)\n", " self.padding_len = None \n", "\n", " def build_dictionary(self, list_of_recipes: list):\n", " \"\"\"\n", " Processes a list of captions to build and update the vocabulary based on the tokens found in the captions.\n", " This function also finds the maximum length of the tokenized captions to set the padding length.\n", " \n", " \"\"\"\n", " # Tokenize all recipes to find the unique tokens and the maximum length\n", " tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)\n", " all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)\n", " \n", " # Update the custom token-vocabulary mapping\n", " for token_id in all_token_ids:\n", " if token_id not in self.token_id_to_vocab_id:\n", " self.token_id_to_vocab_id[token_id] = self.vocab_id\n", " self.vocab_id_to_token_id[self.vocab_id] = token_id\n", " self.vocab_id += 1\n", " \n", " # Set the padding length to the length of the longest tokenized recipe\n", " self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)\n", " \n", " \n", " def get_vocab_size(self):\n", " \"\"\"\n", " Returns the size of the custom vocabulary.\n", " \"\"\"\n", " assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)\n", " return len(self.token_id_to_vocab_id)\n", "\n", "\n", " def tokenize(self, text: str) -> list:\n", " \"\"\"\n", " Tokenizes a text string into custom vocabulary IDs, using the built dictionary. \n", " Requires the dictionary to be built first.\n", " \n", " Parameters:\n", " text (str): The text to tokenize.\n", " \n", " Returns:\n", " list of int: A list of custom vocabulary IDs corresponding to the text tokens.\n", " \"\"\"\n", " assert self.padding_len is not None, 'Call build_dictionary first.'\n", " # Tokenize the text with the maximum length set to the previously found maximum padding length\n", "\n", " # Tokenize the text. 
You don't need `truncation=True` for your lab \n", " tokenized_output = self.tokenizer(text, padding=\"max_length\", max_length=self.padding_len, truncation=True)\n", " \n", " return [self.token_id_to_vocab_id.get(token_id, 0) # Default to [PAD] if token_id is not found\n", " for token_id in tokenized_output.input_ids]\n", " \n", " \n", " def decode(self, vocab_list: list) -> str:\n", " \"\"\"\n", " Decodes a list of custom vocabulary IDs back into the original text string.\n", "\n", " Parameters:\n", " vocab_list (list of int): A list of custom vocabulary IDs to decode.\n", "\n", " Returns:\n", " str: The decoded text string.\n", " \"\"\" \n", " token_list = [self.vocab_id_to_token_id[vocab_id] for vocab_id in vocab_list]\n", " # decode the string\n", " decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)\n", " return decoded_string.strip()\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "ab21b95b-135d-4b98-ad63-7e48e9b89ef0", "metadata": {}, "outputs": [], "source": [ "# Build the dictionary for our tokenizer \n", "from tqdm import tqdm, trange \n", "tokenizer_wrapper = TokenizerWrapper()\n", "tokenizer_wrapper.build_dictionary(recipes_df[\"name\"].to_list())" ] }, { "cell_type": "code", "execution_count": 7, "id": "57a5e6de-63b0-438f-be17-bb8b61e3c0c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recipe: wild rice and chicken soup\n", "Tokens: [1, 2079, 3515, 376, 402, 2406, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n", "Decoded recipe: wild rice and chicken soup\n" ] } ], "source": [ "recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])\n", "decoded_recipe = tokenizer_wrapper.decode(recipe_tokens)\n", "print('Recipe:', recipes_df['name'].iloc[10])\n", "print('Tokens:', recipe_tokens)\n", "print('Decoded recipe:', decoded_recipe)" ] }, { "cell_type": "code", "execution_count": 8, "id": "6dd9a66e-40f9-4dd4-a581-b7f7804ea1d5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3692" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab_size = tokenizer_wrapper.get_vocab_size()\n", "vocab_size" ] }, { "cell_type": "markdown", "id": "c5903521-939a-444c-9901-56ffa1b5599e", "metadata": {}, "source": [ "
<br><br>
" ] }, { "cell_type": "markdown", "id": "e21f57cd-e64c-4f52-b9ee-263233369576", "metadata": {}, "source": [ "### Preparing the data " ] }, { "cell_type": "markdown", "id": "70c0a514-ecf0-4475-a8c6-e371d440cc7e", "metadata": {}, "source": [ "Now that we have a class to tokenize our text let's prepare our data" ] }, { "cell_type": "code", "execution_count": 9, "id": "d127ebca-f8f7-4397-ac11-33336966e1d0", "metadata": {}, "outputs": [], "source": [ "def build_data(data_df, tokenizer_wrapper): \n", " dataset = []\n", " for row_id in trange(len(data_df)):\n", " reicpe_tokens = torch.tensor(tokenizer_wrapper.tokenize(data_df['name'].iloc[row_id])) \n", " dataset.append({'token': reicpe_tokens})\n", " return dataset " ] }, { "cell_type": "markdown", "id": "be52cda2-aec7-4e96-b653-b0115b86f646", "metadata": {}, "source": [ "Let's create train and test datasets by calling `build_data` on train and test splits. " ] }, { "cell_type": "code", "execution_count": 10, "id": "54299ff0-a919-4453-84c7-c3b4b417fd06", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", " 0%| | 0/8000 [00:00
] } ], "source": [ "# NOTE: the source of this cell was lost in extraction; reconstructed from the surrounding text.\n", "# The 8000-row progress bar above implies an 8000/2000 split; the exact split method is an assumption.\n", "train_df = recipes_df.iloc[:8000]\n", "test_df = recipes_df.iloc[8000:]\n", "train_dataset = build_data(train_df, tokenizer_wrapper)\n", "test_dataset = build_data(test_df, tokenizer_wrapper)" ] },
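{ "cell_type": "markdown", "id": "f1e2d3c4-0000-4a5b-8c6d-7e8f90a1b2c3", "metadata": {}, "source": [ "The cells that batched the data and grabbed a sample batch are missing from this copy of the notebook. Below is a minimal sketch of what they plausibly looked like: the batch size of 64 is inferred from `train_text.shape` further down, and the collate function that shifts each sequence left by one token to form (input, target) pairs is an assumption consistent with how `trainer` consumes the batches." ] }, { "cell_type": "code", "execution_count": null, "id": "f1e2d3c4-0005-4a5b-8c6d-7e8f90a1b2c3", "metadata": {}, "outputs": [], "source": [ "# Sketch (assumed, not the original cells): batch the datasets into (input, target) pairs\n", "def collate_fn(batch):\n", "    # Stack the per-recipe token tensors into a [batch_size, seq_len] tensor\n", "    tokens = torch.stack([example['token'] for example in batch])\n", "    # Target is the input shifted left by one position, with [PAD] (0) at the end\n", "    target = torch.roll(tokens, shifts=-1, dims=1)\n", "    target[:, -1] = 0\n", "    return tokens, target\n", "\n", "batch_size = 64  # assumption, inferred from train_text.shape below\n", "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)\n", "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)\n", "\n", "# Grab one batch for the shape checks below\n", "train_text, train_target = next(iter(train_dataloader))\n", "train_text, train_target = train_text.to(device), train_target.to(device)" ] },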
" ] }, { "cell_type": "markdown", "id": "de883b9e-2e53-41f6-8e45-db7825d936c3", "metadata": {}, "source": [ "### Transformer model architecture \n", "\n", "To define our transformer model architecture for text generation, we will focus on constructing a decoder-only model. Here are the essential components:\n", "\n", "- **Embedding Layer**: This layer transforms our input token indices into embeddings that the transformer can process.\n", " \n", "- **Transformer Decoder Layers**: These are the core of our decoder-only architecture, where each layer processes the input embeddings to generate the next token predictions.\n", " \n", "- **Linear Layer**: This final output layer maps the decoder's output to the size of our vocabulary, facilitating the prediction of the next token.\n", "\n", "For implementing these components in PyTorch, we use the following classes:\n", "- [nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) to create embeddings of tokens\n", "- [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) for the final unembedding layer\n", "- [nn.TransformerDecoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoderLayer.html): This class allows us to define individual transformer blocks with self-attention and a feedforward network.\n", "- [nn.TransformerDecoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html) This class is used to stack multiple `TransformerDecoderLayer` objects into a full decoder. \n", "\n", "**Required parameters of TransformerDecoderLayer**\n", "\n", "\n", "- `d_model` (int): the number of expected features in the input (required).\n", "\n", "- `nhead` (int): the number of heads in the multiheadattention models (required).\n", "\n", "\n", "**Required parameters of TransformerDecoder**\n", "\n", "- decoder_layer: an instance of the TransformerDecoderLayer() class (required).\n", "\n", "- num_layers: the number of sub-decoder-layers in the decoder (required)." 
] }, { "cell_type": "code", "execution_count": 19, "id": "426427fd-d3b9-4484-b3da-db0cfedaa6f2", "metadata": {}, "outputs": [], "source": [ "class PositionalEncoding(nn.Module):\n", " def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):\n", " super().__init__()\n", " self.dropout = nn.Dropout(p=dropout)\n", "\n", " position = torch.arange(max_len).unsqueeze(1)\n", " div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))\n", " pe = torch.zeros(max_len, 1, d_model)\n", " pe[:, 0, 0::2] = torch.sin(position * div_term)\n", " pe[:, 0, 1::2] = torch.cos(position * div_term)\n", " self.register_buffer('pe', pe)\n", "\n", " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", " x = x + self.pe[:x.size(0)]\n", " return self.dropout(x)" ] }, { "cell_type": "code", "execution_count": 31, "id": "d9be74fc-4b38-4f60-bc2b-45d5818314d9", "metadata": {}, "outputs": [], "source": [ "class RecipeGenerator(nn.Module):\n", " def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):\n", " \"\"\"\n", " Initialize the RecipeGenerator which uses a transformer decoder architecture\n", " for generating recipes.\n", "\n", " Parameters:\n", " d_model (int): The number of expected features in the encoder/decoder inputs.\n", " n_heads (int): The number of heads in the multiheadattention models.\n", " num_layers (int): The number of sub-decoder-layers in the transformer.\n", " vocab_size (int): The size of the vocabulary.\n", " device (torch.device): The device on which the model will be trained.\n", " dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.\n", " \"\"\" \n", " super(RecipeGenerator, self).__init__()\n", " self.d_model = d_model\n", " self.device = device\n", " # Positional Encoding to add position information to input embeddings\n", " self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)\n", "\n", " # Transformer decoder\n", " self.TransformerDecoder = nn.TransformerDecoder(\n", " decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout), \n", " num_layers = num_layers\n", " )\n", "\n", " # Embedding layer for converting input text tokens into vectors\n", " self.text_embedding = nn.Embedding(vocab_size, d_model)\n", "\n", " # Final linear layer to map the output of the transformer decoder to vocabulary size \n", " self.linear_layer = nn.Linear(d_model, vocab_size)\n", " \n", " # Initialize the weights of the model\n", " self.init_weights()\n", " \n", " def init_weights(self):\n", " \"\"\"\n", " Initialize weights of the model to small random values.\n", " \"\"\"\n", " initrange = 0.1\n", " self.text_embedding.weight.data.uniform_(-initrange, initrange)\n", " self.linear_layer.bias.data.zero_()\n", " self.linear_layer.weight.data.uniform_(-initrange, initrange)\n", "\n", " def forward(self, text):\n", " # Get the embeded input\n", " encoded_text = self.embed_text(text)\n", "\n", " # Get transformer output\n", " transformer_output = self.decode(encoded_text)\n", "\n", " # Final linear layer (unembedding layer)\n", " \n", " return self.linear_layer(transformer_output)\n", " \n", " def embed_text(self, text):\n", " embedding = self.text_embedding(text) * math.sqrt(self.d_model)\n", " # Changing [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]\n", " return self.pos_encoding(embedding.permute(1, 0, 2))\n", " \n", " def decode(self, encoded_text):\n", " # Get the length of the sequences to be decoded. 
{ "cell_type": "code", "execution_count": 31, "id": "d9be74fc-4b38-4f60-bc2b-45d5818314d9", "metadata": {}, "outputs": [], "source": [ "class RecipeGenerator(nn.Module):\n", "    def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):\n", "        \"\"\"\n", "        Initialize the RecipeGenerator, which uses a transformer decoder architecture\n", "        for generating recipes.\n", "\n", "        Parameters:\n", "        d_model (int): The number of expected features in the decoder inputs.\n", "        n_heads (int): The number of heads in the multi-head attention models.\n", "        num_layers (int): The number of sub-decoder-layers in the transformer.\n", "        vocab_size (int): The size of the vocabulary.\n", "        device (torch.device): The device on which the model will be trained.\n", "        dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.\n", "        \"\"\"\n", "        super().__init__()\n", "        self.d_model = d_model\n", "        self.device = device\n", "        # Positional encoding to add position information to input embeddings\n", "        self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)\n", "\n", "        # Transformer decoder: a stack of num_layers identical decoder blocks\n", "        self.transformer_decoder = nn.TransformerDecoder(\n", "            decoder_layer=nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout),\n", "            num_layers=num_layers\n", "        )\n", "\n", "        # Embedding layer for converting input text tokens into vectors\n", "        self.text_embedding = nn.Embedding(vocab_size, d_model)\n", "\n", "        # Final linear layer to map the output of the transformer decoder to vocabulary size\n", "        self.linear_layer = nn.Linear(d_model, vocab_size)\n", "\n", "        # Initialize the weights of the model\n", "        self.init_weights()\n", "\n", "    def init_weights(self):\n", "        \"\"\"\n", "        Initialize weights of the model to small random values.\n", "        \"\"\"\n", "        initrange = 0.1\n", "        self.text_embedding.weight.data.uniform_(-initrange, initrange)\n", "        self.linear_layer.bias.data.zero_()\n", "        self.linear_layer.weight.data.uniform_(-initrange, initrange)\n", "\n", "    def forward(self, text):\n", "        # Get the embedded input\n", "        encoded_text = self.embed_text(text)\n", "\n", "        # Get the transformer output\n", "        transformer_output = self.decode(encoded_text)\n", "\n", "        # Final linear (unembedding) layer\n", "        return self.linear_layer(transformer_output)\n", "\n", "    def embed_text(self, text):\n", "        embedding = self.text_embedding(text) * math.sqrt(self.d_model)\n", "        # Change [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]\n", "        return self.pos_encoding(embedding.permute(1, 0, 2))\n", "\n", "    def decode(self, encoded_text):\n", "        # Get the length of the sequences to be decoded; this is needed to generate the causal mask\n", "        seq_len = encoded_text.size(0)\n", "        causal_mask = self.generate_mask(seq_len)\n", "        # Decoder-only model: there is no encoder, so pass an all-zeros dummy memory\n", "        dummy_memory = torch.zeros_like(encoded_text)\n", "        return self.transformer_decoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)\n", "\n", "    def generate_mask(self, size):\n", "        # Set the strictly upper-triangular part of the matrix to -inf\n", "        mask = torch.triu(torch.ones(size, size, device=self.device), 1)\n", "        return mask.float().masked_fill(mask == 1, float('-inf'))" ] }, { "cell_type": "code", "execution_count": 32, "id": "3fb26974-ebd5-45f0-95ec-9389cfa1ef8f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n", "        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n", "        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n", "        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],\n", "        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],\n", "        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],\n", "        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],\n", "        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],\n", "        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],\n", "        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A quick look at a 10 x 10 causal mask\n", "import torch\n", "size = 10\n", "mask = torch.triu(torch.ones(size, size), 1)\n", "mask.float().masked_fill(mask == 1, float('-inf'))" ] }, { "cell_type": "code", "execution_count": 33, "id": "aec3e665-1988-4c7a-8337-41a81d0a792c", "metadata": {}, "outputs": [], "source": [ "# Now let's try your model.\n", "# Define the hyperparameters and initialize the model. Feel free to change these hyperparameters.\n", "d_model = 256\n", "n_heads = 4\n", "num_layers = 8\n", "model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)" ] }, { "cell_type": "code", "execution_count": 34, "id": "ca2660f0-5eae-463e-9ea2-48f090ac5bbd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[   1,  256,  377,  ...,    0,    0,    0],\n", "        [   1, 2230,   83,  ...,    0,    0,    0],\n", "        [   1,  378,  430,  ...,    0,    0,    0],\n", "        ...,\n", "        [   1, 2683,   81,  ...,    0,    0,    0],\n", "        [   1, 1519,  587,  ...,    0,    0,    0],\n", "        [   1, 3362, 3515,  ...,    0,    0,    0]], device='mps:0')" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_text" ] }, { "cell_type": "code", "execution_count": 35, "id": "e55dbf9a-f866-40e7-a934-a4a48ef0de90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([23, 64, 3692])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pass inputs to your model\n", "output = model(train_text)\n", "output.shape" ] }, { "cell_type": "code", "execution_count": 36, "id": "39544a5d-b332-4cff-807c-04d3a2eb08ed", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3692" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vocab_size" ] }, { "cell_type": "code", "execution_count": 37, "id": "76417d4a-6e75-4c60-9fa7-433798366f13", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([64, 23])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_text.shape" ] }, { "cell_type": "code", "execution_count": 38, "id": "e1c1cc1a-765f-4ed4-89ff-d900cd5a25a3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([23, 64, 3692])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output.shape" ] },
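{ "cell_type": "markdown", "id": "f1e2d3c4-0003-4a5b-8c6d-7e8f90a1b2c3", "metadata": {}, "source": [ "Note the shapes above: `train_text` is `[batch_size, seq_len] = [64, 23]`, while the model output is `[seq_len, batch_size, vocab_size] = [23, 64, 3692]`. This is because `nn.TransformerDecoder` works with sequence-first tensors unless `batch_first=True` is set; `embed_text` permutes the input accordingly, and the last dimension of the output holds one logit per vocabulary item." ] },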
}, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output.shape" ] }, { "cell_type": "code", "execution_count": 39, "id": "52a533ea-631e-4fc6-a24a-14ec898fff48", "metadata": {}, "outputs": [], "source": [ "def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0):\n", " train_losses, test_losses = [], []\n", " consec_increases, verbose = 0, True\n", " \n", " for epoch in range(epochs):\n", " model.train()\n", " train_loss = 0\n", " for train_text, target_seq in train_dataloader:\n", " train_text, target_seq = train_text.to(device), target_seq.to(device)\n", " optimizer.zero_grad()\n", " output = model(train_text).permute(1, 2, 0) # Ensure output is in correct shape for loss calculation\n", " loss = criterion(output, target_seq)\n", " loss.backward()\n", " torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)\n", " optimizer.step()\n", " train_loss += loss.item()\n", "\n", " model.eval()\n", " test_loss = 0\n", " with torch.no_grad():\n", " for test_text, target_seq in test_dataloader:\n", " test_text, target_seq = test_text.to(device), target_seq.to(device)\n", " output = model(test_text).permute(1, 2, 0)\n", " test_loss += criterion(output, target_seq).item()\n", "\n", " train_losses.append(train_loss / len(train_dataloader))\n", " test_losses.append(test_loss / len(test_dataloader))\n", " if verbose:\n", " print(f\"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}\")\n", "\n", " if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):\n", " consec_increases += 1\n", " else:\n", " consec_increases = 0\n", "\n", " if consec_increases >= patience:\n", " print(f\"Stopped early at epoch {epoch + 1}\")\n", " break\n", "\n", " return train_losses, test_losses\n" ] }, { "cell_type": "code", "execution_count": 40, "id": "895144a0-8d32-4841-b284-6f059a8f78a8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1: Train Loss 6.9428, Test Loss 6.3931\n", "Epoch 2: Train Loss 6.0120, Test Loss 5.5934\n", "Epoch 3: Train Loss 5.3771, Test Loss 5.1153\n", "Epoch 4: Train Loss 4.9798, Test Loss 4.8317\n", "Epoch 5: Train Loss 4.7009, Test Loss 4.6426\n", "Epoch 6: Train Loss 4.4967, Test Loss 4.5094\n", "Epoch 7: Train Loss 4.3364, Test Loss 4.4099\n", "Epoch 8: Train Loss 4.2019, Test Loss 4.3313\n", "Epoch 9: Train Loss 4.0903, Test Loss 4.2596\n", "Epoch 10: Train Loss 3.9926, Test Loss 4.2018\n", "Epoch 11: Train Loss 3.9047, Test Loss 4.1619\n", "Epoch 12: Train Loss 3.8245, Test Loss 4.1265\n", "Epoch 13: Train Loss 3.7520, Test Loss 4.0932\n", "Epoch 14: Train Loss 3.6853, Test Loss 4.0584\n", "Epoch 15: Train Loss 3.6224, Test Loss 4.0359\n", "Epoch 16: Train Loss 3.5630, Test Loss 4.0141\n", "Epoch 17: Train Loss 3.5078, Test Loss 3.9946\n", "Epoch 18: Train Loss 3.4531, Test Loss 3.9735\n", "Epoch 19: Train Loss 3.4047, Test Loss 3.9687\n", "Epoch 20: Train Loss 3.3578, Test Loss 3.9533\n" ] } ], "source": [ "# Define the optimizer and the loss function. Feel free to change the hyperparameters. 
\n", "\n", "num_epoch = 20\n", "clip_norm = 1.0\n", "lr = 5e-5\n", "\n", "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", "criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index\n", "train_losses, test_losses = trainer(model, criterion, optimizer,train_dataloader, test_dataloader, epochs= num_epoch)" ] }, { "cell_type": "code", "execution_count": 41, "id": "6eb1aa4e-8584-4dc3-893c-c35a53b3b99d", "metadata": {}, "outputs": [], "source": [ "def generate_recipe(model, device, max_recipe_length=39, seed = 10, end_vocab=2):\n", " \"\"\"\n", " Generates a recipe using the specified model and device.\n", "\n", " Parameters:\n", " model (torch.nn.Module): The trained model used for generating reci[es.\n", " device (torch.device): The device (e.g., CPU or GPU) to which tensors will be sent for model execution.\n", " max_recipe_length (int, optional): The maximum length of the generated recipe. Defaults to 100.\n", " start_vocab (int, optional): The vocabulary index used to signify the start of a recipe. Defaults to 1.\n", " end_vocab (int, optional): The vocabulary index used to signify the end of a recipe. Defaults to 2.\n", " \n", " Returns:\n", " numpy.ndarray: An array containing the sequence of vocabulary indices representing the generated caption.\n", " \n", " \"\"\" \n", " context = torch.tensor([[seed]]).to(device)\n", " for _ in range(max_recipe_length):\n", " logits = model(context)[-1]\n", " probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)\n", " next_vocab = torch.multinomial(probabilities, num_samples=1)\n", " context = torch.cat([context, next_vocab], dim=1)\n", " if next_vocab.item() == end_vocab:\n", " break\n", " return context.cpu().numpy().flatten()" ] }, { "cell_type": "code", "execution_count": 42, "id": "229871ab-e2dd-460e-9f00-3a8d74f27f18", "metadata": {}, "outputs": [], "source": [ "recipe = generate_recipe(model, device, max_recipe_length=20, seed = 10)" ] }, { "cell_type": "code", "execution_count": 43, "id": "f308b243-b184-45cb-91e2-cd22f67e4d40", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ban challengerakas wings'" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "generated_recipe = tokenizer_wrapper.decode(recipe)\n", "generated_recipe" ] }, { "cell_type": "markdown", "id": "90f93d66-1128-4ed5-9979-5608bc21b7fd", "metadata": {}, "source": [ "The generated recipes are not good. But with more data, epochs, and some hyperparameter optimization, they are likely to improve. " ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:575]", "language": "python", "name": "conda-env-575-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }