import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from seaborn import heatmap as hm
import numpy as np

class SparseAttention(nn.Module):
    """Single-head self-attention restricted to a causal sliding window.

    Position ``i`` may attend only to itself and to the ``stride - 1``
    positions immediately before it; every other score is masked out before
    the softmax.

    Args:
        d_model: Size of the input features and of the Q/K/V projections.
        stride: Width of the attention window (current position included).
    """

    def __init__(self, d_model, stride=3):
        super().__init__()
        self.d_model = d_model
        self.stride = stride
        self.Q = nn.Linear(d_model, d_model, bias=False)
        self.K = nn.Linear(d_model, d_model, bias=False)
        self.V = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        """Apply windowed scaled dot-product self-attention to ``x``.

        Args:
            x: Tensor of shape ``(seq_len, d_model)``; leading batch
                dimensions ``(..., seq_len, d_model)`` are also accepted.

        Returns:
            Tuple ``(h, att)``: the attended values, same shape as ``x``, and
            the attention weights of shape ``(..., seq_len, seq_len)``.
        """
        query, key, value = self.Q(x), self.K(x), self.V(x)
        # Transpose only the last two axes so batched inputs work as well
        # (``.T`` reverses every axis and is only meant for 2-D tensors).
        scores = torch.matmul(query, key.transpose(-2, -1)) / self.d_model ** 0.5
        # Mask the scores: positions outside the window get a very negative
        # value so that they receive zero weight after the softmax.
        mask = self.masking(x)
        scores = scores.masked_fill(~mask, -1e9)
        att = nn.functional.softmax(scores, dim=-1)
        h = torch.matmul(att, value)

        return h, att

    def masking(self, x):
        """Build the boolean window mask for the sequence in ``x``.

        Args:
            x: Tensor whose second-to-last dimension is the sequence length.

        Returns:
            Bool tensor of shape ``(seq_len, seq_len)`` on the device of
            ``x``; entry ``[i, j]`` is True when ``i - stride < j <= i``,
            i.e. when position ``i`` is allowed to attend to position ``j``.
        """
        n = x.size(-2)
        # Built directly in torch on the input's device: no Python double
        # loop and no CPU/GPU mismatch when the module runs on an accelerator.
        positions = torch.arange(n, device=x.device)
        offset = positions.unsqueeze(1) - positions.unsqueeze(0)

        return (offset >= 0) & (offset < self.stride)

# Sanity check: run a single sparse-attention head on random inputs and plot
# its attention matrix as an annotated heatmap.
model = SparseAttention(128, stride=3)

x = torch.rand(5, 128)
# One LaTeX-formatted tick label per input position: $w_1$ ... $w_5$.
labels = [f'$w_{k}$' for k in range(1, 6)]
h, att = model(x)
hm(
    att.detach().numpy(),
    annot=True,
    xticklabels=labels,
    yticklabels=labels,
)
plt.show()

import copy

class MultiHeadMaskAttention(nn.Module):
    """Encoder block that combines several sparse-attention heads.

    Head number ``k`` (0-based) is a ``SparseAttention`` with window
    ``stride = k + 1``, so the heads look at progressively longer contexts.
    The head outputs are concatenated, projected back to ``d_model`` and
    followed by a feed-forward network, with residual sums and dropout.

    Args:
        in_size: First argument forwarded to ``Encoding``.
        d_model: Model dimension used by the encoding, the heads, the
            projection and the feed-forward output.
        hidden: Width of the hidden layer of the feed-forward network.
        heads: Number of attention heads.
        dropout: Dropout probability shared by the three dropout layers.
    """

    def __init__(self, in_size, d_model, hidden=128, heads=3, dropout=0.3):
        super().__init__()
        self.d_model = d_model
        self.enc = Encoding(in_size, d_model)
        # Sparse attention with an incremental stride: 1, 2, ..., heads.
        self.att = nn.ModuleList(
            [SparseAttention(d_model, stride=k + 1) for k in range(heads)]
        )
        self.lin = nn.Linear(heads * d_model, d_model, bias=True)
        # A single normalisation layer, applied twice in forward().
        self.norm = LayerNorm(d_model)
        self.ffw = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
        )
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.drop3 = nn.Dropout(p=dropout)

    def forward(self, x):
        """Encode ``x`` and run it through the attention block.

        As a side effect, the attention matrices of all heads from this call
        are stored in ``self.att_weights`` (one entry per head).

        Args:
            x: Input passed to the ``Encoding`` layer.

        Returns:
            The block output after the final dropout.
        """
        embedded = self.drop1(self.enc(x))

        # Every head returns a (values, attention weights) pair.
        per_head = [head(embedded) for head in self.att]
        self.att_weights = [weights for _, weights in per_head]
        contexts = [values for values, _ in per_head]

        h = self.lin(torch.cat(contexts, dim=-1))
        h_norm = self.drop2(embedded + self.norm(h))
        # NOTE(review): the feed-forward network consumes ``h`` (the projected
        # heads), not the residual ``h_norm`` -- confirm this is intended.
        out = self.ffw(h)

        return self.drop3(h_norm + self.norm(out))

import pandas as pd
from tqdm import tqdm
from transformers import *

# Toy corpus; every sentence is split on whitespace into a list of tokens.
corpus = [
    sentence.split()
    for sentence in [
        'el perro come un hueso',
        'un muchacho jugaba',
        'el muchacho saltaba la cuerda',
        'un perro come croquetas',
        'el perro come',
        'el gato come croquetas',
        'un gato come',
        'un muchacho jugaba con la cuerda',
        'el muchacho jugaba con la cuerda',
    ]
]

# Vocabulary, with the sentence-boundary symbols pinned to ids 0 and 1.
voc = vocab()
voc['[bos]'] = 0
voc['[eos]'] = 1

# Index the sentences with the vocabulary.
# NOTE(review): `index` is defined elsewhere; the code below assumes it
# yields one 1-D tensor of token ids per sentence -- confirm.
sents = list(index(corpus, voc))

# Training pairs for next-token prediction: the input is the sentence
# preceded by [bos], the target is the same sentence followed by [eos].
bos = torch.tensor([voc['[bos]']])
eos = torch.tensor([voc['[eos]']])
x = [torch.cat((bos, s), dim=0) for s in sents]
y = [torch.cat((s, eos), dim=0) for s in sents]
print(x[0], y[0])

# Output: tensor([0, 2, 3, 4, 5, 6]) tensor([2, 3, 4, 5, 6, 1])

len_voc = len(voc)

# Language model: sparse multi-head attention block, a linear projection onto
# the vocabulary, and a softmax over dim 1, so the model outputs
# probabilities rather than logits.
model = nn.Sequential(
    MultiHeadMaskAttention(len_voc, 128, heads=4),
    nn.Linear(128, len_voc),
    nn.Softmax(dim=1),
)

# Load the saved weights and switch to evaluation mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load('sparse.model'))
model.eval()

# Sequential(
#   (0): MultiHeadMaskAttention(
#     (enc): Encoding(
#       (emb): Embedding(15, 128)
#       (pe): PositionalEncoding()
#     )
#     (att): ModuleList(
#       (0-3): 4 x SparseAttention(
#         (Q): Linear(in_features=128, out_features=128, bias=False)
#         (K): Linear(in_features=128, out_features=128, bias=False)
#         (V): Linear(in_features=128, out_features=128, bias=False)
#       )
#     )
#     (lin): Linear(in_features=512, out_features=128, bias=True)
#     (norm): LayerNorm()
#     (ffw): Sequential(
#       (0): Linear(in_features=128, out_features=128, bias=True)
#       (1): ReLU()
#       (2): Linear(in_features=128, out_features=128, bias=True)
#     )
#     (drop1): Dropout(p=0.3, inplace=False)
#     (drop2): Dropout(p=0.3, inplace=False)
#     (drop3): Dropout(p=0.3, inplace=False)
#   )
#   (1): Linear(in_features=128, out_features=15, bias=True)
#   (2): Softmax(dim=1)
# )

# The model ends in a Softmax layer, so it emits probabilities, whereas
# nn.CrossEntropyLoss expects raw logits and applies log-softmax itself.
# Feeding it probabilities normalises twice and flattens the gradients, so
# the negative log-likelihood is computed directly on the log-probabilities.
criterion = nn.NLLLoss()
optimizer = NoamOptimizer(model.parameters(), model[0].d_model, decay=0.01)
epochs = range(100)

# Training: one optimisation step per sentence, visiting the sentences in a
# fresh random order every epoch.
model.train()
for epoch in tqdm(epochs):
    for i in torch.randperm(len(x)).tolist():
        prediction = model(x[i])
        optimizer.zero_grad()
        # Clamp before the log so a probability that underflowed to 0 cannot
        # produce an infinite loss.
        log_probs = torch.log(prediction.clamp_min(1e-12))
        loss_value = criterion(log_probs, y[i])
        loss_value.backward()
        optimizer.step()

#torch.save(model.state_dict(), 'model.model')

# /home/cienciasia/anaconda3/lib/python3.11/site-packages/torch/cuda/__init__.py:619: UserWarning: Can't initialize NVML
#   warnings.warn("Can't initialize NVML")
# 100%|█████████████████████████████████████████| 100/100 [00:09<00:00, 10.02it/s]

# Inverse vocabulary (token id -> token string), used to decode predictions.
devoc = {idx: token for token, idx in voc.items()}
def result(text, model):
    """Predict the most likely next token after every position of ``text``.

    The forward pass runs in evaluation mode and without gradient tracking,
    so dropout cannot randomise the prediction; the training/evaluation mode
    the model had on entry is restored before returning.

    Args:
        text: Whitespace-separated tokens; each one is looked up in the
            module-level ``voc``.
        model: ``nn.Module`` that maps a 1-D tensor of token ids to a 2-D
            tensor of per-position scores over the vocabulary.

    Returns:
        Tuple ``(pred, decoded)``: the model output as a NumPy array, and the
        highest-scoring token of every position, decoded with ``devoc`` and
        joined with spaces.
    """
    tokens = text.split()
    x = torch.tensor([voc[t] for t in tokens])

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = model(x)
    finally:
        # Leave the model in the mode the caller had set.
        model.train(was_training)

    pred = pred.detach().numpy()
    max_token = pred.argmax(axis=1)

    return pred, ' '.join(devoc[i] for i in max_token)

# Predict the word that follows the beginning-of-sentence marker.
p, pred_text = result('[bos]', model)
print(f'Palabra siguiente con mayor prob: {pred_text}')

# Bar chart of the scores at the last position, ordered from the most to the
# least probable token.
args = np.argsort(p[-1])[::-1]
probs = p[-1][args]
pd.DataFrame(
    data=probs,
    columns=['prob. tóken'],
    index=[devoc[j] for j in args],
).plot.bar()
plt.show()

# Output: Palabra siguiente con mayor prob: el

text = '[bos] un gato come'
# The return value is discarded: the call is made so that the attention
# block stores the weights of this sentence in model[0].att_weights.
result(text, model)

# One heatmap per attention head, with the input tokens on both axes.
tokens = text.split()
for i, att_w in enumerate(model[0].att_weights):
    hm(
        att_w.detach().numpy(),
        xticklabels=tokens,
        yticklabels=tokens,
        vmin=0,
        vmax=1,
    )
    plt.title(f'Atención en cabeza {i}')
    plt.show()

# Atención dispersa

# Aplicación de atención dispersa

# Datos para el entrenamiento

# Entrenamiento del modelo

# Exploración del modelo

# Referencias