Appendix B: Skip-gram demo with toy data

import os
import random
import sys
import time

sys.path.append(os.path.join(os.path.abspath(".."), "code"))

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as pyo
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from plotly.subplots import make_subplots
from scipy import spatial
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from comat import CooccurrenceMatrix
from preprocessing import MyPreprocessor

torch.manual_seed(1)
plt.rcParams["font.size"] = 16
pd.set_option("display.max_colwidth", 0)
%matplotlib inline
class SkipgramModel(nn.Module):
    """A minimal skip-gram model: embed the center word and predict its context words."""

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)  # word vectors (the embeddings we care about)
        self.out = nn.Linear(embedding_dim, vocab_size)            # maps an embedding to scores over the vocabulary

    def forward(self, inputs):
        embeds = self.embeddings(inputs)              # look up the center-word vector(s)
        scores = self.out(embeds)                     # unnormalized scores for every word in the vocabulary
        log_probs = torch.log_softmax(scores, dim=1)  # log-probabilities over possible context words
        return log_probs
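As a quick sanity check of the forward pass, here is a minimal sketch, not part of the original demo, using placeholder sizes rather than the demo's vocabulary:

# Hypothetical sanity check with toy numbers: one word index in, a log-distribution out.
toy_model = SkipgramModel(vocab_size=5, embedding_dim=3)
toy_log_probs = toy_model(torch.tensor([0], dtype=torch.long))
toy_log_probs.shape  # torch.Size([1, 5]): log-probabilities over the 5-word toy vocabulary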
def create_input_pairs(pp_corpus, word2idx, context_size=2):
    """
    Generate training pairs of word indices for the SkipGram model.
    
    Args:
        pp_corpus (list of list of str): Preprocessed corpus where each sub-list represents a tokenized sentence.
        word2idx (dict): A dictionary mapping words to their respective indices in the vocabulary.
        context_size (int): The size of the context window around the target word.
        
    Returns:
        np.array: An array of tuples, each containing a pair of indices (target_word_index, context_word_index).
    """    
    idx_pairs = [(word2idx[sentence[i]], word2idx[sentence[j]])
                 for sentence in pp_corpus
                 for i in range(len(sentence))
                 for j in range(max(i - context_size, 0), min(i + context_size + 1, len(sentence)))
                 if i != j]
    return np.array(idx_pairs)

def get_vocab(tokenized_corpus):
    """
    Create a vocabulary list from a tokenized corpus.
    
    Args:
        tokenized_corpus (list of list of str): The corpus where each sub-list represents a tokenized sentence.
        
    Returns:
        list: A list of unique tokens found in the corpus, representing the vocabulary.
    """    
    return list(set(token for sentence in tokenized_corpus for token in sentence))    
def train_skipgram(model, pairs, epochs=10, learning_rate=0.01, verbose=False):
    """Train the skip-gram model on (center word, context word) index pairs with SGD."""
    loss_function = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        total_loss = 0
        for center, context in pairs:
            center_var = torch.tensor([center], dtype=torch.long)
            log_probs = model(center_var)  # predict a distribution over possible context words
            loss = loss_function(log_probs, torch.tensor([context], dtype=torch.long))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        if verbose:
            print(f"Epoch {epoch}: Total Loss: {total_loss}")
def plot_word2_vec_pca(
    data,
    words,
    show_labels=False,
    size=40,
    title="PCA visualization",
):
    """
    Carry out dimensionality reduction using PCA and plot 2-dimensional clusters.

    Parameters
    -----------
    data : numpy array
        data as a numpy array
    words : list
        the original raw sentences for labeling datapoints
    show_labels : boolean
        whether you want to show labels for points or not (default: False)
    size : int
        size of points in the scatterplot
    title : str
        title for the visualization plot

    Returns
    -----------
    None. Shows the clusters.
    """

    pca = PCA(n_components=2)
    principal_comp = pca.fit_transform(data)
    pca_df = pd.DataFrame(data=principal_comp, columns=["pca1", "pca2"])

    plt.figure(figsize=(6, 4))
    plt.title(title)
    ax = sns.scatterplot(
        x="pca1", y="pca2", data=pca_df, palette="tab10", s=size
    )

    x = pca_df["pca1"].tolist()
    y = pca_df["pca2"].tolist()
    if show_labels:
        for i, txt in enumerate(words):
            plt.annotate(txt, (x[i]+0.05, y[i]+0.05), size=10)

    plt.show()   
    
def plot_embeddings(fig, n_epochs, model, idx_pairs, vocab, word2idx,
                    show_labels=True, word1="hockey", word2="football", word3="mango"):
    """Train for n_epochs, then plot a 2D PCA projection of the learned word embeddings."""
    train_skipgram(model, idx_pairs, n_epochs)
    emb_mat = model.embeddings.weight.detach().numpy()

    pca = PCA(n_components=2)
    Z_pca = pca.fit_transform(emb_mat)

    plt.scatter(Z_pca[:, 0], Z_pca[:, 1])
    x = Z_pca[:, 0]
    y = Z_pca[:, 1]
    if show_labels:
        for i, txt in enumerate(vocab):
            plt.annotate(txt, (x[i] + 0.05, y[i] + 0.05), size=15)

    # Cosine similarities between a few reference words, shown in the plot title.
    vec1 = emb_mat[word2idx[word1]].reshape(1, -1)
    vec2 = emb_mat[word2idx[word2]].reshape(1, -1)
    vec3 = emb_mat[word2idx[word3]].reshape(1, -1)
    sim1 = np.round(cosine_similarity(vec1, vec2)[0][0], 2)
    sim2 = np.round(cosine_similarity(vec1, vec3)[0][0], 2)
    plt.title(
        f"\nNumber of epochs: {n_epochs}\n"
        f"Similarity({word1}, {word2}) = {sim1}\n"
        f"Similarity({word1}, {word3}) = {sim2}",
        fontsize=9,
    )
    plt.close()
    return fig
        

For the purpose of your lab or quiz, you do not have to understand the code in this demo.

toy_corpus = [
    "drink mango juice",
    "drink pineapple juice",
    "drink apple juice",
    "drink squeezed pineapple juice",
    "drink squeezed mango juice",
    "drink apple tea",
    "drink mango tea",
    "drink mango water",
    "drink apple water",
    "drink pineapple water",
    "drink juice",
    "drink water",
    "drink tea",
    "play hockey",
    "play football",
    "play piano",
    "piano play",
    "play hockey game",
    "play football game",
]
sents = MyPreprocessor(toy_corpus)  # memory-efficient generator over tokenized sentences
EMBEDDING_DIM = 10
CONTEXT_SIZE = 2
EPOCHS = 10

vocab = get_vocab(sents)
word2idx = {w: idx for (idx, w) in enumerate(vocab)}
idx2word = {idx: w for (idx, w) in enumerate(vocab)}

vocab_size = len(vocab)
vocab
['piano',
 'play',
 'football',
 'mango',
 'water',
 'tea',
 'pineapple',
 'squeezed',
 'hockey',
 'game',
 'apple',
 'juice',
 'drink']
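The word2idx and idx2word dictionaries connect these strings to the integer indices the model works with. A quick way to inspect the mapping (the exact indices vary between runs because the vocabulary is built from an unordered set):

# Look up the indices of a couple of words and map one index back to its word.
word2idx["drink"], word2idx["juice"], idx2word[0]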
idx_pairs = create_input_pairs(sents, word2idx)    
idx_pairs[:10] # training examples
array([[12,  3],
       [12, 11],
       [ 3, 12],
       [ 3, 11],
       [11, 12],
       [11,  3],
       [12,  6],
       [12, 11],
       [ 6, 12],
       [ 6, 11]])
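Each row is a (center word, context word) pair of indices. To sanity-check them, we can decode the first few pairs back into words; a rough sketch (the pairs will differ between runs because the index assignment is not deterministic):

# Decode the first few (center, context) index pairs back into words.
[(idx2word[center], idx2word[context]) for center, context in idx_pairs[:6]]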
model = SkipgramModel(len(vocab), EMBEDDING_DIM)
train_skipgram(model, idx_pairs, epochs=10)
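As a quick check that training is pulling co-occurring words together (this snippet is not part of the original demo), we can compare cosine similarities directly on the learned embedding matrix. With only 10 epochs and a random initialization the gap may still be small; the interactive plot below shows how it grows with more epochs.

emb = model.embeddings.weight.detach().numpy()
sim_related = cosine_similarity(emb[[word2idx["mango"]]], emb[[word2idx["juice"]]])[0, 0]
sim_unrelated = cosine_similarity(emb[[word2idx["mango"]]], emb[[word2idx["hockey"]]])[0, 0]
print(f"similarity(mango, juice)  = {sim_related:.2f}")
print(f"similarity(mango, hockey) = {sim_unrelated:.2f}")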
import panel as pn
from panel import widgets
from panel.interact import interact

pn.extension()

def f(n_epochs):
    fig = plt.figure(figsize=(6, 4))
    return plot_embeddings(fig, n_epochs, model, idx_pairs, vocab, word2idx)

interact(f, n_epochs=widgets.IntSlider(start=0, end=1000, step=100, value=0))





Skip-gram before the weights are trained

../../_images/word2vec-toy-epochs0.png

Skip-gram after 900 epochs

../../_images/word2vec-toy-epochs900.png