{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "49fecb8b-833e-4f1d-979c-29de9ebc3e37",
   "metadata": {},
   "source": [
    "## Class Demo: Recipe generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "73190c22-c0fe-4a74-972d-03543b7b9959",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n",
      "Intel MKL WARNING: Support of Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) enabled only processors has been deprecated. Intel oneAPI Math Kernel Library 2025.0 will require Intel(R) Advanced Vector Extensions (Intel(R) AVX) instructions.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import numpy as np\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "import pandas as pd\n",
    "import os\n",
    "import re\n",
    "import sys\n",
    "from collections import Counter, defaultdict\n",
    "from urllib.request import urlopen\n",
    "import math"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "053617b4-3a67-4f7b-bf32-0a7dba9a15f6",
   "metadata": {},
   "source": [
    "### Data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5bc90cc6-d9bc-4747-bc91-ed5dcc065c47",
   "metadata": {},
   "source": [
    "This is a demo for recipe generation using PyTorch and Transformers. \n",
    "For the purpose of this demo, we'll sample 10_000 recipe titles from the corpus "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "765fad1d-0cc9-4d88-a4d8-357a767fc072",
   "metadata": {},
   "outputs": [],
   "source": [
    "orig_recipes_df = pd.read_csv(\"../data/RAW_recipes.csv\")\n",
    "orig_recipes_df = orig_recipes_df.dropna()\n",
    "recipes_df = orig_recipes_df.sample(10_000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7839bff9-07e0-4e3d-be61-2aba64b9fefd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>id</th>\n",
       "      <th>minutes</th>\n",
       "      <th>contributor_id</th>\n",
       "      <th>submitted</th>\n",
       "      <th>tags</th>\n",
       "      <th>nutrition</th>\n",
       "      <th>n_steps</th>\n",
       "      <th>steps</th>\n",
       "      <th>description</th>\n",
       "      <th>ingredients</th>\n",
       "      <th>n_ingredients</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>124299</th>\n",
       "      <td>lime cilantro grilled steak</td>\n",
       "      <td>29651</td>\n",
       "      <td>140</td>\n",
       "      <td>27783</td>\n",
       "      <td>2002-05-29</td>\n",
       "      <td>['weeknight', 'time-to-make', 'course', 'main-...</td>\n",
       "      <td>[6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>['combine lime juice , cilantro , water , jala...</td>\n",
       "      <td>i found this recipe in a recent issue of gemma...</td>\n",
       "      <td>['boneless beef chuck steaks', 'fresh lime jui...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158654</th>\n",
       "      <td>persian rhubarb syrup</td>\n",
       "      <td>240370</td>\n",
       "      <td>60</td>\n",
       "      <td>414666</td>\n",
       "      <td>2007-07-14</td>\n",
       "      <td>['60-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0]</td>\n",
       "      <td>10</td>\n",
       "      <td>['put the rhubarb in a large stainless steel p...</td>\n",
       "      <td>add some seltzer or ginger ale and ice and thi...</td>\n",
       "      <td>['rhubarb', 'fresh lime juice', 'sugar', 'mint...</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>151872</th>\n",
       "      <td>pampered chef style apple crisp  for microwave...</td>\n",
       "      <td>63314</td>\n",
       "      <td>65</td>\n",
       "      <td>27783</td>\n",
       "      <td>2003-05-28</td>\n",
       "      <td>['weeknight', 'time-to-make', 'course', 'main-...</td>\n",
       "      <td>[461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0]</td>\n",
       "      <td>9</td>\n",
       "      <td>['you will need a 12\" round stoneware baking p...</td>\n",
       "      <td>this is one of the recipes that was served at ...</td>\n",
       "      <td>['tart baking apples', 'graham cracker squares...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176661</th>\n",
       "      <td>rocky road sundae pie</td>\n",
       "      <td>10409</td>\n",
       "      <td>40</td>\n",
       "      <td>8728</td>\n",
       "      <td>2001-07-25</td>\n",
       "      <td>['60-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0]</td>\n",
       "      <td>33</td>\n",
       "      <td>['preheat oven to 325', 'butter a 9-inch pan',...</td>\n",
       "      <td>great summer pie!</td>\n",
       "      <td>['chocolate ice cream', 'walnuts', 'chocolate ...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163677</th>\n",
       "      <td>pork taco meat</td>\n",
       "      <td>85631</td>\n",
       "      <td>70</td>\n",
       "      <td>107334</td>\n",
       "      <td>2004-03-03</td>\n",
       "      <td>['time-to-make', 'course', 'preparation', 'ver...</td>\n",
       "      <td>[729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>['cube the pork steaks , cutting off much of t...</td>\n",
       "      <td>this is the best pork taco meat i've ever had....</td>\n",
       "      <td>['pork steak', 'oil', 'chili powder', 'salt', ...</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98092</th>\n",
       "      <td>grill a juicy burger with the only rub you ll ...</td>\n",
       "      <td>371492</td>\n",
       "      <td>30</td>\n",
       "      <td>37449</td>\n",
       "      <td>2009-05-11</td>\n",
       "      <td>['30-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>['combine sirloin with the ground chuck and cr...</td>\n",
       "      <td>want to ensure juicy great tasting burgers for...</td>\n",
       "      <td>['ground sirloin', 'ground chuck', 'ice', 'sea...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48942</th>\n",
       "      <td>chipotle tortilla casserole</td>\n",
       "      <td>205081</td>\n",
       "      <td>30</td>\n",
       "      <td>83093</td>\n",
       "      <td>2007-01-12</td>\n",
       "      <td>['30-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0]</td>\n",
       "      <td>10</td>\n",
       "      <td>['preheat oven to 400f', 'arrange chips in bot...</td>\n",
       "      <td>this is a recipe attributed to rick bayless, f...</td>\n",
       "      <td>['corn tortilla chips', 'cooked chicken', 'tom...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>184473</th>\n",
       "      <td>shamakia</td>\n",
       "      <td>491426</td>\n",
       "      <td>50</td>\n",
       "      <td>2548229</td>\n",
       "      <td>2012-12-07</td>\n",
       "      <td>['60-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0]</td>\n",
       "      <td>7</td>\n",
       "      <td>['pre-heat oven to 350 degrees', 'cook sausage...</td>\n",
       "      <td>delicious casserole recipe that has been in my...</td>\n",
       "      <td>['fresh spinach', 'spicy sausage', 'sliced mus...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>147699</th>\n",
       "      <td>omelette</td>\n",
       "      <td>431785</td>\n",
       "      <td>10</td>\n",
       "      <td>1627002</td>\n",
       "      <td>2010-07-02</td>\n",
       "      <td>['15-minutes-or-less', 'time-to-make', 'prepar...</td>\n",
       "      <td>[370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0]</td>\n",
       "      <td>10</td>\n",
       "      <td>['break the eggs into a jug and add the water'...</td>\n",
       "      <td>taken from 'food for health and vitality' - se...</td>\n",
       "      <td>['eggs', 'water', 'dried thyme', 'black pepper...</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>223230</th>\n",
       "      <td>vichyssoise cream of leek</td>\n",
       "      <td>47763</td>\n",
       "      <td>40</td>\n",
       "      <td>20480</td>\n",
       "      <td>2002-12-07</td>\n",
       "      <td>['60-minutes-or-less', 'time-to-make', 'course...</td>\n",
       "      <td>[173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0]</td>\n",
       "      <td>6</td>\n",
       "      <td>['in a 2 qt glass casserole place leeks , onio...</td>\n",
       "      <td>wonderful soup made in the microwave. it can b...</td>\n",
       "      <td>['leeks', 'onion', 'potatoes', 'butter', 'chic...</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10000 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     name      id  minutes  \\\n",
       "124299                        lime cilantro grilled steak   29651      140   \n",
       "158654                              persian rhubarb syrup  240370       60   \n",
       "151872  pampered chef style apple crisp  for microwave...   63314       65   \n",
       "176661                              rocky road sundae pie   10409       40   \n",
       "163677                                     pork taco meat   85631       70   \n",
       "...                                                   ...     ...      ...   \n",
       "98092   grill a juicy burger with the only rub you ll ...  371492       30   \n",
       "48942                         chipotle tortilla casserole  205081       30   \n",
       "184473                                           shamakia  491426       50   \n",
       "147699                                           omelette  431785       10   \n",
       "223230                          vichyssoise cream of leek   47763       40   \n",
       "\n",
       "        contributor_id   submitted  \\\n",
       "124299           27783  2002-05-29   \n",
       "158654          414666  2007-07-14   \n",
       "151872           27783  2003-05-28   \n",
       "176661            8728  2001-07-25   \n",
       "163677          107334  2004-03-03   \n",
       "...                ...         ...   \n",
       "98092            37449  2009-05-11   \n",
       "48942            83093  2007-01-12   \n",
       "184473         2548229  2012-12-07   \n",
       "147699         1627002  2010-07-02   \n",
       "223230           20480  2002-12-07   \n",
       "\n",
       "                                                     tags  \\\n",
       "124299  ['weeknight', 'time-to-make', 'course', 'main-...   \n",
       "158654  ['60-minutes-or-less', 'time-to-make', 'course...   \n",
       "151872  ['weeknight', 'time-to-make', 'course', 'main-...   \n",
       "176661  ['60-minutes-or-less', 'time-to-make', 'course...   \n",
       "163677  ['time-to-make', 'course', 'preparation', 'ver...   \n",
       "...                                                   ...   \n",
       "98092   ['30-minutes-or-less', 'time-to-make', 'course...   \n",
       "48942   ['30-minutes-or-less', 'time-to-make', 'course...   \n",
       "184473  ['60-minutes-or-less', 'time-to-make', 'course...   \n",
       "147699  ['15-minutes-or-less', 'time-to-make', 'prepar...   \n",
       "223230  ['60-minutes-or-less', 'time-to-make', 'course...   \n",
       "\n",
       "                                          nutrition  n_steps  \\\n",
       "124299          [6.3, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]        7   \n",
       "158654     [778.0, 0.0, 737.0, 0.0, 5.0, 0.0, 66.0]       10   \n",
       "151872   [461.9, 27.0, 192.0, 4.0, 7.0, 50.0, 24.0]        9   \n",
       "176661  [620.9, 58.0, 193.0, 9.0, 16.0, 92.0, 22.0]       33   \n",
       "163677   [729.8, 73.0, 4.0, 13.0, 137.0, 81.0, 0.0]        7   \n",
       "...                                             ...      ...   \n",
       "98092    [282.4, 23.0, 44.0, 62.0, 44.0, 29.0, 4.0]        7   \n",
       "48942   [439.5, 35.0, 19.0, 31.0, 45.0, 30.0, 12.0]       10   \n",
       "184473   [430.0, 47.0, 10.0, 32.0, 28.0, 49.0, 8.0]        7   \n",
       "147699    [370.0, 48.0, 4.0, 12.0, 37.0, 50.0, 0.0]       10   \n",
       "223230    [173.4, 15.0, 9.0, 31.0, 12.0, 28.0, 5.0]        6   \n",
       "\n",
       "                                                    steps  \\\n",
       "124299  ['combine lime juice , cilantro , water , jala...   \n",
       "158654  ['put the rhubarb in a large stainless steel p...   \n",
       "151872  ['you will need a 12\" round stoneware baking p...   \n",
       "176661  ['preheat oven to 325', 'butter a 9-inch pan',...   \n",
       "163677  ['cube the pork steaks , cutting off much of t...   \n",
       "...                                                   ...   \n",
       "98092   ['combine sirloin with the ground chuck and cr...   \n",
       "48942   ['preheat oven to 400f', 'arrange chips in bot...   \n",
       "184473  ['pre-heat oven to 350 degrees', 'cook sausage...   \n",
       "147699  ['break the eggs into a jug and add the water'...   \n",
       "223230  ['in a 2 qt glass casserole place leeks , onio...   \n",
       "\n",
       "                                              description  \\\n",
       "124299  i found this recipe in a recent issue of gemma...   \n",
       "158654  add some seltzer or ginger ale and ice and thi...   \n",
       "151872  this is one of the recipes that was served at ...   \n",
       "176661                                  great summer pie!   \n",
       "163677  this is the best pork taco meat i've ever had....   \n",
       "...                                                   ...   \n",
       "98092   want to ensure juicy great tasting burgers for...   \n",
       "48942   this is a recipe attributed to rick bayless, f...   \n",
       "184473  delicious casserole recipe that has been in my...   \n",
       "147699  taken from 'food for health and vitality' - se...   \n",
       "223230  wonderful soup made in the microwave. it can b...   \n",
       "\n",
       "                                              ingredients  n_ingredients  \n",
       "124299  ['boneless beef chuck steaks', 'fresh lime jui...              8  \n",
       "158654  ['rhubarb', 'fresh lime juice', 'sugar', 'mint...              4  \n",
       "151872  ['tart baking apples', 'graham cracker squares...              9  \n",
       "176661  ['chocolate ice cream', 'walnuts', 'chocolate ...              8  \n",
       "163677  ['pork steak', 'oil', 'chili powder', 'salt', ...              7  \n",
       "...                                                   ...            ...  \n",
       "98092   ['ground sirloin', 'ground chuck', 'ice', 'sea...             12  \n",
       "48942   ['corn tortilla chips', 'cooked chicken', 'tom...             11  \n",
       "184473  ['fresh spinach', 'spicy sausage', 'sliced mus...              8  \n",
       "147699  ['eggs', 'water', 'dried thyme', 'black pepper...              9  \n",
       "223230  ['leeks', 'onion', 'potatoes', 'butter', 'chic...              8  \n",
       "\n",
       "[10000 rows x 12 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "recipes_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bf1f1886-8a5c-42c0-97c6-43561ced1dcd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mps\n"
     ]
    }
   ],
   "source": [
    "# Set the appropriate device depending upon your hardware. \n",
    "\n",
    "# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu') \n",
    "print(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6d4c5b60-70ac-49d7-a590-9c02c2757df5",
   "metadata": {},
   "source": [
    "<br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83dbf9e7-1830-42f2-86fb-4e24e0524621",
   "metadata": {},
   "source": [
    "### Tokenization \n",
    "\n",
    "Our data isn't that big. The vocabulary is quite small. So we'll write a wrapper class to map token ids of from pretrained model to custom token ids. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "93d5db3f-e718-4c8f-bbae-26efe1290950",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "class TokenizerWrapper():\n",
    "    \"\"\"\n",
    "    A wrapper class for the AutoTokenizer to handle tokenization and provide\n",
    "    custom token-vocabulary mappings. \n",
    "    \"\"\"\n",
    "    def __init__(self, model_name=\"bert-base-cased\"):        \n",
    "        \"\"\"\n",
    "        Initializes the TokenizerWrapper with a specified model.\n",
    "        \"\"\"\n",
    "        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "        # The wrapper class creates a token-to-vocab mapping\n",
    "        # Let's keep the ids corresponding to special tokens.  \n",
    "        # 0 --> [PAD], 101 --> [CLS], 102 --> [SEP]  \n",
    "        self.token_id_to_vocab_id = {0: 0, 101: 1, 102: 2}\n",
    "        self.vocab_id_to_token_id = {0: 0, 1: 101, 2:102}\n",
    "        self.vocab_id = len(self.vocab_id_to_token_id)\n",
    "        self.padding_len = None \n",
    "\n",
    "    def build_dictionary(self, list_of_recipes: list):\n",
    "        \"\"\"\n",
    "        Processes a list of captions to build and update the vocabulary based on the tokens found in the captions.\n",
    "        This function also finds the maximum length of the tokenized captions to set the padding length.\n",
    "    \n",
    "        \"\"\"\n",
    "        # Tokenize all recipes to find the unique tokens and the maximum length\n",
    "        tokenized_outputs = self.tokenizer(list_of_recipes, add_special_tokens=False)\n",
    "        all_token_ids = set(token for sublist in tokenized_outputs.input_ids for token in sublist)\n",
    "    \n",
    "        # Update the custom token-vocabulary mapping\n",
    "        for token_id in all_token_ids:\n",
    "            if token_id not in self.token_id_to_vocab_id:\n",
    "                self.token_id_to_vocab_id[token_id] = self.vocab_id\n",
    "                self.vocab_id_to_token_id[self.vocab_id] = token_id\n",
    "                self.vocab_id += 1\n",
    "    \n",
    "        # Set the padding length to the length of the longest tokenized recipe\n",
    "        self.padding_len = max(len(tokens) for tokens in tokenized_outputs.input_ids)\n",
    "    \n",
    "    \n",
    "    def get_vocab_size(self):\n",
    "        \"\"\"\n",
    "        Returns the size of the custom vocabulary.\n",
    "        \"\"\"\n",
    "        assert len(self.token_id_to_vocab_id) == len(self.vocab_id_to_token_id)\n",
    "        return len(self.token_id_to_vocab_id)\n",
    "\n",
    "\n",
    "    def tokenize(self, text: str) -> list:\n",
    "        \"\"\"\n",
    "        Tokenizes a text string into custom vocabulary IDs, using the built dictionary. \n",
    "        Requires the dictionary to be built first.\n",
    "    \n",
    "        Parameters:\n",
    "            text (str): The text to tokenize.\n",
    "    \n",
    "        Returns:\n",
    "            list of int: A list of custom vocabulary IDs corresponding to the text tokens.\n",
    "        \"\"\"\n",
    "        assert self.padding_len is not None, 'Call build_dictionary first.'\n",
    "        # Tokenize the text with the maximum length set to the previously found maximum padding length\n",
    "\n",
    "        # Tokenize the text. You don't need `truncation=True` for your lab \n",
    "        tokenized_output = self.tokenizer(text, padding=\"max_length\", max_length=self.padding_len, truncation=True)\n",
    "        \n",
    "        return [self.token_id_to_vocab_id.get(token_id, 0)  # Default to [PAD] if token_id is not found\n",
    "                for token_id in tokenized_output.input_ids]\n",
    "        \n",
    "    \n",
    "    def decode(self, vocab_list: list) -> str:\n",
    "        \"\"\"\n",
    "        Decodes a list of custom vocabulary IDs back into the original text string.\n",
    "\n",
    "        Parameters:\n",
    "            vocab_list (list of int): A list of custom vocabulary IDs to decode.\n",
    "\n",
    "        Returns:\n",
    "            str: The decoded text string.\n",
    "        \"\"\"        \n",
    "        token_list = [self.vocab_id_to_token_id[vocab_id] for vocab_id in vocab_list]\n",
    "        # decode the string\n",
    "        decoded_string = self.tokenizer.decode(token_list, skip_special_tokens=True)\n",
    "        return decoded_string.strip()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ab21b95b-135d-4b98-ad63-7e48e9b89ef0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the dictionary for our tokenizer  \n",
    "from tqdm import tqdm, trange \n",
    "tokenizer_wrapper = TokenizerWrapper()\n",
    "tokenizer_wrapper.build_dictionary(recipes_df[\"name\"].to_list())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "57a5e6de-63b0-438f-be17-bb8b61e3c0c5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recipe: wild rice and chicken soup\n",
      "Tokens: [1, 2079, 3515, 376, 402, 2406, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
      "Decoded recipe: wild rice and chicken soup\n"
     ]
    }
   ],
   "source": [
    "recipe_tokens = tokenizer_wrapper.tokenize(recipes_df['name'].iloc[10])\n",
    "decoded_recipe = tokenizer_wrapper.decode(recipe_tokens)\n",
    "print('Recipe:', recipes_df['name'].iloc[10])\n",
    "print('Tokens:', recipe_tokens)\n",
    "print('Decoded recipe:', decoded_recipe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6dd9a66e-40f9-4dd4-a581-b7f7804ea1d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3692"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_size = tokenizer_wrapper.get_vocab_size()\n",
    "vocab_size"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5903521-939a-444c-9901-56ffa1b5599e",
   "metadata": {},
   "source": [
    "<br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e21f57cd-e64c-4f52-b9ee-263233369576",
   "metadata": {},
   "source": [
    "### Preparing the data "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70c0a514-ecf0-4475-a8c6-e371d440cc7e",
   "metadata": {},
   "source": [
    "Now that we have a class to tokenize our text let's prepare our data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d127ebca-f8f7-4397-ac11-33336966e1d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_data(data_df, tokenizer_wrapper):    \n",
    "    dataset = []\n",
    "    for row_id in trange(len(data_df)):\n",
    "        reicpe_tokens = torch.tensor(tokenizer_wrapper.tokenize(data_df['name'].iloc[row_id])) \n",
    "        dataset.append({'token': reicpe_tokens})\n",
    "    return dataset "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "be52cda2-aec7-4e96-b653-b0115b86f646",
   "metadata": {},
   "source": [
    "Let's create train and test datasets by calling `build_data` on train and test splits. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "54299ff0-a919-4453-84c7-c3b4b417fd06",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
      "To disable this warning, you can either:\n",
      "\t- Avoid using `tokenizers` before the fork if possible\n",
      "  0%|                                                                  | 0/8000 [00:00<?, ?it/s]TOKENIZERS_PARALLELISM=(true | false)\n",
      "100%|████████████████████████████████████████████████████| 8000/8000 [00:00<00:00, 19025.24it/s]\n",
      "100%|████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20211.81it/s]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train_df, test_df = train_test_split(recipes_df, test_size=0.2, random_state=123)\n",
    "train_data = build_data(train_df, tokenizer_wrapper)\n",
    "test_data = build_data(test_df, tokenizer_wrapper)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "140dabb0-d937-49c8-aa19-11cb67b4dcdd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The vocab size is 3692.\n"
     ]
    }
   ],
   "source": [
    "# What's the vocab size? \n",
    "vocab_size = tokenizer_wrapper.get_vocab_size()\n",
    "print(f'The vocab size is {vocab_size}.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "3afe702f-0349-44ce-82f1-4f4f5d4f5254",
   "metadata": {},
   "outputs": [],
   "source": [
    "class PytorchDataset():\n",
    "    def __init__(self, data, pad_vocab_id=0):\n",
    "        self.data = data\n",
    "        self.pad_tensor = torch.tensor([pad_vocab_id])\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.data)\n",
    "\n",
    "    def __getitem__(self, ind):\n",
    "        # Retrieve the next sequence of tokens from the current index\n",
    "        # by excluding the first token of the current sequence and appending a padding token at the end.        \n",
    "        target_sequence = torch.cat([self.data[ind]['token'][1:], self.pad_tensor]) \n",
    "        return self.data[ind]['token'], target_sequence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "5d993084-a56d-45fe-8438-48d6f88c6a0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = PytorchDataset(train_data)\n",
    "test_dataset = PytorchDataset(test_data)\n",
    "train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)\n",
    "test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0f735587-d16d-4e5d-aaf9-0ba8bf29b533",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([64, 23])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now let's get a batch of data from DataLoader\n",
    "train_text, train_target = next(iter(train_dataloader))\n",
    "train_text = train_text.to(device)\n",
    "train_text.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "103cff6c-b65c-4c5f-bf05-b1ddc3efbe9b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([   1,   82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188,\n",
       "        3348,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0],\n",
       "       device='mps:0')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_text[11]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4a019a4e-38b1-4d93-9a06-9b9da5c9faed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([  82, 1641,  271,  604, 1282, 2318,  386, 1183,  388,  376, 3188, 3348,\n",
       "           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_target[11]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4bd67dac-e5e7-4a68-9b05-3f901673b1e1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'szechuan lamb with peppers and spinach'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_wrapper.decode(train_text[11].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "149533c9-778d-443e-8080-723c95bfb8d5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'szechuan lamb with peppers and spinach'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer_wrapper.decode(train_target[11].tolist())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02094388-1b47-4f8e-9218-8167425fe48d",
   "metadata": {},
   "source": [
    "This is called autoregressive training.  "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa877d35-0e3a-44f0-a875-33e72a50a597",
   "metadata": {},
   "source": [
    "<br><br><br><br>"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de883b9e-2e53-41f6-8e45-db7825d936c3",
   "metadata": {},
   "source": [
    "### Transformer model architecture \n",
    "\n",
    "To define our transformer model architecture for text generation, we will focus on constructing a decoder-only model. Here are the essential components:\n",
    "\n",
    "- **Embedding Layer**: This layer transforms our input token indices into embeddings that the transformer can process.\n",
    "  \n",
    "- **Transformer Decoder Layers**: These are the core of our decoder-only architecture, where each layer processes the input embeddings to generate the next token predictions.\n",
    "  \n",
    "- **Linear Layer**: This final output layer maps the decoder's output to the size of our vocabulary, facilitating the prediction of the next token.\n",
    "\n",
    "For implementing these components in PyTorch, we use the following classes:\n",
    "- [nn.Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) to create embeddings of tokens\n",
    "- [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) for the final unembedding layer\n",
    "- [nn.TransformerDecoderLayer](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoderLayer.html): This class allows us to define individual transformer blocks with self-attention and a feedforward network.\n",
    "- [nn.TransformerDecoder](https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html) This class is used to stack multiple `TransformerDecoderLayer` objects into a full decoder.  \n",
    "\n",
    "**Required parameters of TransformerDecoderLayer**\n",
    "\n",
    "\n",
    "- `d_model` (int): the number of expected features in the input (required).\n",
    "\n",
    "- `nhead` (int): the number of heads in the multiheadattention models (required).\n",
    "\n",
    "\n",
    "**Required parameters of TransformerDecoder**\n",
    "\n",
    "- decoder_layer: an instance of the TransformerDecoderLayer() class (required).\n",
    "\n",
    "- num_layers: the number of sub-decoder-layers in the decoder (required)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "426427fd-d3b9-4484-b3da-db0cfedaa6f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "class PositionalEncoding(nn.Module):\n",
    "    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):\n",
    "        super().__init__()\n",
    "        self.dropout = nn.Dropout(p=dropout)\n",
    "\n",
    "        position = torch.arange(max_len).unsqueeze(1)\n",
    "        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))\n",
    "        pe = torch.zeros(max_len, 1, d_model)\n",
    "        pe[:, 0, 0::2] = torch.sin(position * div_term)\n",
    "        pe[:, 0, 1::2] = torch.cos(position * div_term)\n",
    "        self.register_buffer('pe', pe)\n",
    "\n",
    "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
    "        x = x + self.pe[:x.size(0)]\n",
    "        return self.dropout(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d9be74fc-4b38-4f60-bc2b-45d5818314d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "class RecipeGenerator(nn.Module):\n",
    "    def __init__(self, d_model, n_heads, num_layers, vocab_size, device, dropout=0.1):\n",
    "        \"\"\"\n",
    "        Initialize the RecipeGenerator which uses a transformer decoder architecture\n",
    "        for generating recipes.\n",
    "\n",
    "        Parameters:\n",
    "            d_model (int): The number of expected features in the encoder/decoder inputs.\n",
    "            n_heads (int): The number of heads in the multiheadattention models.\n",
    "            num_layers (int): The number of sub-decoder-layers in the transformer.\n",
    "            vocab_size (int): The size of the vocabulary.\n",
    "            device (torch.device): The device on which the model will be trained.\n",
    "            dropout (float): The dropout value used in PositionalEncoding and TransformerDecoderLayer.\n",
    "        \"\"\"        \n",
    "        super(RecipeGenerator, self).__init__()\n",
    "        self.d_model = d_model\n",
    "        self.device = device\n",
    "        # Positional Encoding to add position information to input embeddings\n",
    "        self.pos_encoding = PositionalEncoding(d_model=d_model, dropout=dropout)\n",
    "\n",
    "        # Transformer decoder\n",
    "        self.TransformerDecoder = nn.TransformerDecoder(\n",
    "            decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout), \n",
    "            num_layers = num_layers\n",
    "        )\n",
    "\n",
    "        # Embedding layer for converting input text tokens into vectors\n",
    "        self.text_embedding = nn.Embedding(vocab_size, d_model)\n",
    "\n",
    "        # Final linear layer to map the output of the transformer decoder to vocabulary size        \n",
    "        self.linear_layer = nn.Linear(d_model, vocab_size)\n",
    "        \n",
    "        # Initialize the weights of the model\n",
    "        self.init_weights()\n",
    "        \n",
    "    def init_weights(self):\n",
    "        \"\"\"\n",
    "        Initialize weights of the model to small random values.\n",
    "        \"\"\"\n",
    "        initrange = 0.1\n",
    "        self.text_embedding.weight.data.uniform_(-initrange, initrange)\n",
    "        self.linear_layer.bias.data.zero_()\n",
    "        self.linear_layer.weight.data.uniform_(-initrange, initrange)\n",
    "\n",
    "    def forward(self, text):\n",
    "        # Get the embeded input\n",
    "        encoded_text = self.embed_text(text)\n",
    "\n",
    "        # Get transformer output\n",
    "        transformer_output = self.decode(encoded_text)\n",
    "\n",
    "        # Final linear layer (unembedding layer)\n",
    "        \n",
    "        return self.linear_layer(transformer_output)\n",
    "    \n",
    "    def embed_text(self, text):\n",
    "        embedding = self.text_embedding(text) * math.sqrt(self.d_model)\n",
    "        # Changing [batch_size, seq_len, d_model] to [seq_len, batch_size, d_model]\n",
    "        return self.pos_encoding(embedding.permute(1, 0, 2))\n",
    "    \n",
    "    def decode(self, encoded_text):\n",
    "        # Get the length of the sequences to be decoded. This is needed to generate the causal masks\n",
    "        seq_len = encoded_text.size(0)\n",
    "        causal_mask = self.generate_mask(seq_len)\n",
    "        dummy_memory = torch.zeros_like(encoded_text)\n",
    "        return self.TransformerDecoder(tgt=encoded_text, memory=dummy_memory, tgt_mask=causal_mask)\n",
    "    \n",
    "    def generate_mask(self, size):\n",
    "        # Make the upper triangular part of the matrix to -inf\n",
    "        mask = torch.triu(torch.ones(size, size, device=self.device), 1)\n",
    "        return mask.float().masked_fill(mask == 1, float('-inf'))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "3fb26974-ebd5-45f0-95ec-9389cfa1ef8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n",
       "        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n",
       "        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],\n",
       "        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],\n",
       "        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],\n",
       "        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],\n",
       "        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],\n",
       "        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],\n",
       "        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],\n",
       "        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch \n",
    "size = 10\n",
    "mask = torch.triu(torch.ones(size, size), 1)\n",
    "mask.float().masked_fill(mask == 1, float('-inf'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "aec3e665-1988-4c7a-8337-41a81d0a792c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now let's try your model. \n",
    "# Define the hyperparameters and initalize the model. Feel free to change these hyperparameters. \n",
    "d_model = 256 \n",
    "n_heads = 4\n",
    "num_layers = 8\n",
    "model = RecipeGenerator(d_model=d_model, n_heads=n_heads, num_layers=num_layers, vocab_size=vocab_size, device=device).to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "ca2660f0-5eae-463e-9ea2-48f090ac5bbd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[   1,  256,  377,  ...,    0,    0,    0],\n",
       "        [   1, 2230,   83,  ...,    0,    0,    0],\n",
       "        [   1,  378,  430,  ...,    0,    0,    0],\n",
       "        ...,\n",
       "        [   1, 2683,   81,  ...,    0,    0,    0],\n",
       "        [   1, 1519,  587,  ...,    0,    0,    0],\n",
       "        [   1, 3362, 3515,  ...,    0,    0,    0]], device='mps:0')"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "e55dbf9a-f866-40e7-a934-a4a48ef0de90",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([23, 64, 3692])"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# pass inputs to your model\n",
    "output = model(train_text)\n",
    "output.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "39544a5d-b332-4cff-807c-04d3a2eb08ed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3692"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "76417d4a-6e75-4c60-9fa7-433798366f13",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([64, 23])"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_text.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "e1c1cc1a-765f-4ed4-89ff-d900cd5a25a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([23, 64, 3692])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "52a533ea-631e-4fc6-a24a-14ec898fff48",
   "metadata": {},
   "outputs": [],
   "source": [
    "def trainer(model, criterion, optimizer, train_dataloader, test_dataloader, epochs=5, patience=5, clip_norm=1.0):\n",
    "    train_losses, test_losses = [], []\n",
    "    consec_increases, verbose = 0, True\n",
    "    \n",
    "    for epoch in range(epochs):\n",
    "        model.train()\n",
    "        train_loss = 0\n",
    "        for train_text, target_seq in train_dataloader:\n",
    "            train_text, target_seq = train_text.to(device), target_seq.to(device)\n",
    "            optimizer.zero_grad()\n",
    "            output = model(train_text).permute(1, 2, 0)  # Ensure output is in correct shape for loss calculation\n",
    "            loss = criterion(output, target_seq)\n",
    "            loss.backward()\n",
    "            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)\n",
    "            optimizer.step()\n",
    "            train_loss += loss.item()\n",
    "\n",
    "        model.eval()\n",
    "        test_loss = 0\n",
    "        with torch.no_grad():\n",
    "            for test_text, target_seq in test_dataloader:\n",
    "                test_text, target_seq = test_text.to(device), target_seq.to(device)\n",
    "                output = model(test_text).permute(1, 2, 0)\n",
    "                test_loss += criterion(output, target_seq).item()\n",
    "\n",
    "        train_losses.append(train_loss / len(train_dataloader))\n",
    "        test_losses.append(test_loss / len(test_dataloader))\n",
    "        if verbose:\n",
    "            print(f\"Epoch {epoch+1}: Train Loss {train_losses[-1]:.4f}, Test Loss {test_losses[-1]:.4f}\")\n",
    "\n",
    "        if epoch > 0 and test_losses[-1] > test_losses[-2] * (1 + 1e-5):\n",
    "            consec_increases += 1\n",
    "        else:\n",
    "            consec_increases = 0\n",
    "\n",
    "        if consec_increases >= patience:\n",
    "            print(f\"Stopped early at epoch {epoch + 1}\")\n",
    "            break\n",
    "\n",
    "    return train_losses, test_losses\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "895144a0-8d32-4841-b284-6f059a8f78a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1: Train Loss 6.9428, Test Loss 6.3931\n",
      "Epoch 2: Train Loss 6.0120, Test Loss 5.5934\n",
      "Epoch 3: Train Loss 5.3771, Test Loss 5.1153\n",
      "Epoch 4: Train Loss 4.9798, Test Loss 4.8317\n",
      "Epoch 5: Train Loss 4.7009, Test Loss 4.6426\n",
      "Epoch 6: Train Loss 4.4967, Test Loss 4.5094\n",
      "Epoch 7: Train Loss 4.3364, Test Loss 4.4099\n",
      "Epoch 8: Train Loss 4.2019, Test Loss 4.3313\n",
      "Epoch 9: Train Loss 4.0903, Test Loss 4.2596\n",
      "Epoch 10: Train Loss 3.9926, Test Loss 4.2018\n",
      "Epoch 11: Train Loss 3.9047, Test Loss 4.1619\n",
      "Epoch 12: Train Loss 3.8245, Test Loss 4.1265\n",
      "Epoch 13: Train Loss 3.7520, Test Loss 4.0932\n",
      "Epoch 14: Train Loss 3.6853, Test Loss 4.0584\n",
      "Epoch 15: Train Loss 3.6224, Test Loss 4.0359\n",
      "Epoch 16: Train Loss 3.5630, Test Loss 4.0141\n",
      "Epoch 17: Train Loss 3.5078, Test Loss 3.9946\n",
      "Epoch 18: Train Loss 3.4531, Test Loss 3.9735\n",
      "Epoch 19: Train Loss 3.4047, Test Loss 3.9687\n",
      "Epoch 20: Train Loss 3.3578, Test Loss 3.9533\n"
     ]
    }
   ],
   "source": [
    "# Define the optimizer and the loss function. Feel free to change the hyperparameters. \n",
    "\n",
    "num_epoch = 20\n",
    "clip_norm = 1.0\n",
    "lr = 5e-5\n",
    "\n",
    "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n",
    "criterion = torch.nn.CrossEntropyLoss(ignore_index=0) # Ignore the padding index\n",
    "train_losses, test_losses = trainer(model, criterion, optimizer,train_dataloader, test_dataloader, epochs= num_epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "6eb1aa4e-8584-4dc3-893c-c35a53b3b99d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_recipe(model, device, max_recipe_length=39, seed = 10, end_vocab=2):\n",
    "    \"\"\"\n",
    "    Generates a recipe using the specified model and device.\n",
    "\n",
    "    Parameters:\n",
    "        model (torch.nn.Module): The trained model used for generating reci[es.\n",
    "        device (torch.device): The device (e.g., CPU or GPU) to which tensors will be sent for model execution.\n",
    "        max_recipe_length (int, optional): The maximum length of the generated recipe. Defaults to 100.\n",
    "        start_vocab (int, optional): The vocabulary index used to signify the start of a recipe. Defaults to 1.\n",
    "        end_vocab (int, optional): The vocabulary index used to signify the end of a recipe. Defaults to 2.\n",
    "        \n",
    "    Returns:\n",
    "        numpy.ndarray: An array containing the sequence of vocabulary indices representing the generated caption.\n",
    "        \n",
    "    \"\"\"    \n",
    "    context = torch.tensor([[seed]]).to(device)\n",
    "    for _ in range(max_recipe_length):\n",
    "        logits = model(context)[-1]\n",
    "        probabilities = torch.softmax(logits, dim=-1).flatten(start_dim=1)\n",
    "        next_vocab = torch.multinomial(probabilities, num_samples=1)\n",
    "        context = torch.cat([context, next_vocab], dim=1)\n",
    "        if next_vocab.item() == end_vocab:\n",
    "            break\n",
    "    return context.cpu().numpy().flatten()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "229871ab-e2dd-460e-9f00-3a8d74f27f18",
   "metadata": {},
   "outputs": [],
   "source": [
    "recipe = generate_recipe(model, device, max_recipe_length=20, seed = 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "f308b243-b184-45cb-91e2-cd22f67e4d40",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ban challengerakas wings'"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generated_recipe = tokenizer_wrapper.decode(recipe)\n",
    "generated_recipe"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "90f93d66-1128-4ed5-9979-5608bc21b7fd",
   "metadata": {},
   "source": [
    "The generated recipes are not good. But with more data, epochs, and some hyperparameter optimization, they are likely to improve.  "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:575]",
   "language": "python",
   "name": "conda-env-575-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}