import torch
import torch.nn as nn
import numpy as np
import copy
from tqdm import tqdm
from transformers import *

class Attention(nn.Module):
    """Single-head scaled dot-product attention between two sequences.

    Queries are projected from ``x``; keys and values are projected from
    ``encode``. The three projections are bias-free ``d_model -> d_model``
    linear layers.
    """

    def __init__(self, d_model):
        """Create the query, key and value projections.

        Args:
            d_model: Size of the last dimension of both inputs; also used to
                scale the attention scores by ``sqrt(d_model)``.
        """
        super(Attention, self).__init__()
        # Projection layers
        self.d_model = d_model
        self.Q = nn.Linear(d_model, d_model, bias=False)
        self.K = nn.Linear(d_model, d_model, bias=False)
        self.V = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x, encode):
        """Attend from ``x`` over ``encode``.

        Args:
            x: Tensor of shape ``(..., n_queries, d_model)``.
            encode: Tensor of shape ``(..., n_keys, d_model)``; leading
                dimensions must be broadcastable with those of ``x``.

        Returns:
            A tuple ``(Vs, p_attn)``: the attended values with shape
            ``(..., n_queries, d_model)`` and the attention weights with
            shape ``(..., n_queries, n_keys)`` (each row sums to 1).
        """
        # Project the inputs
        query, key, value = self.Q(x), self.K(encode), self.V(encode)
        # Swap only the last two axes: ``key.T`` reverses *every* axis, which
        # is wrong (and deprecated) for anything other than a 2-D tensor.
        scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(self.d_model)
        p_attn = torch.nn.functional.softmax(scores, dim=-1)
        # The product already has the shape of ``x``; no reshape is needed.
        Vs = torch.matmul(p_attn, value)

        return Vs, p_attn

class Encoder(nn.Module):
    """Encoder layer: embedding, multi-head self-attention and feed-forward.

    Sub-modules (``Encoding``, ``SelfAttention``, ``LayerNorm``) are defined
    outside this file; each attention head is expected to return a sequence
    whose first item is the attended output and whose second item is the
    attention weights.
    """

    def __init__(self, in_size, d_model, hidden=128, heads=3, dropout=0.3):
        """Build the layer.

        Args:
            in_size: First argument forwarded to ``Encoding`` (presumably the
                source vocabulary size -- confirm against ``Encoding``).
            d_model: Model dimension used by every sub-module.
            hidden: Width of the feed-forward hidden layer.
            heads: Number of independent self-attention heads.
            dropout: Drop probability shared by the three dropout layers.
        """
        super().__init__()
        self.d_model = d_model
        self.enc = Encoding(in_size, d_model)
        # Every head is a freshly constructed module with its own parameters.
        self.att = nn.ModuleList([SelfAttention(d_model) for _ in range(heads)])
        # Maps the concatenated head outputs back to d_model.
        self.lin = nn.Linear(heads * d_model, d_model, bias=True)
        self.norm = LayerNorm(d_model)
        self.ffw = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
        )
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.drop3 = nn.Dropout(p=dropout)

    def forward(self, x):
        """Encode ``x`` and remember the per-head attention weights.

        Side effect: ``self.att_weights`` is set to the list of attention
        weights returned by each head.

        Returns:
            The encoded representation after the final dropout.
        """
        embedded = self.drop1(self.enc(x))

        contexts, weights = [], []
        for head in self.att:
            result = head(embedded)
            contexts.append(result[0])
            weights.append(result[1])
        self.att_weights = weights

        h = self.lin(torch.cat(contexts, dim=-1))
        residual = self.drop2(embedded + self.norm(h))
        # NOTE(review): the feed-forward block consumes ``h`` (the merged
        # heads), not ``residual``. Kept as is because stored checkpoints
        # depend on this wiring -- confirm whether it is intentional.
        out = self.ffw(h)

        return self.drop3(residual + self.norm(out))

class Decoder(nn.Module):
    """Decoder layer: embedding, multi-head masked self-attention, a
    feed-forward block, encoder-decoder attention and a second feed-forward.

    ``Encoding``, ``MaskAttention`` and ``LayerNorm`` are defined outside this
    file; each self-attention head is expected to return a sequence whose
    first item is the attended output and whose second item is the weights.
    """

    def __init__(self, in_size, d_model, hidden=128, heads=3, dropout=0.3):
        """Build the layer.

        Args:
            in_size: First argument forwarded to ``Encoding`` (presumably the
                target vocabulary size -- confirm against ``Encoding``).
            d_model: Model dimension used by every sub-module.
            hidden: Width of both feed-forward hidden layers.
            heads: Number of independent masked self-attention heads.
            dropout: Drop probability shared by the three dropout layers.
        """
        super().__init__()
        self.d_model = d_model
        self.enc = Encoding(in_size, d_model)
        # Every head is a freshly constructed module with its own parameters.
        self.self_att = nn.ModuleList([MaskAttention(d_model) for _ in range(heads)])
        # Single-head attention from decoder states over the encoder output.
        self.att = Attention(d_model)
        self.lin = nn.Linear(heads * d_model, d_model, bias=True)
        self.norm = LayerNorm(d_model)
        self.ffw1 = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
        )
        self.ffw2 = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.ReLU(),
            nn.Linear(hidden, d_model),
        )
        self.drop1 = nn.Dropout(p=dropout)
        self.drop2 = nn.Dropout(p=dropout)
        self.drop3 = nn.Dropout(p=dropout)

    def forward(self, x, encode):
        """Decode ``x`` conditioned on the encoder output ``encode``.

        Side effects: ``self.att_weights`` receives the per-head
        self-attention weights and ``self.enc_dec_att`` the encoder-decoder
        attention weights of this call.

        Returns:
            The decoder representation after the final dropout.
        """
        embedded = self.drop1(self.enc(x))

        contexts, weights = [], []
        for head in self.self_att:
            result = head(embedded)
            contexts.append(result[0])
            weights.append(result[1])
        self.att_weights = weights

        h = self.lin(torch.cat(contexts, dim=-1))
        # First feed-forward block; its residual connection is taken from
        # ``h`` (the merged heads), exactly as in the trained model.
        ff_input = embedded + self.norm(h)
        state = self.drop2(h + self.norm(self.ffw1(ff_input)))

        # Encoder-decoder attention followed by the second feed-forward.
        attended, self.enc_dec_att = self.att(state, encode)
        context = state + self.norm(attended)
        out = self.ffw2(context)

        return self.drop3(state + self.norm(out))

class EncoderDecoder(nn.Module):
    """Full translation model: encoder, decoder and an output generator.

    The generator is a linear projection to ``out_size`` followed by a
    softmax over dimension 1, so the model emits probabilities, not logits.
    """

    def __init__(self, in_size, out_size, d_model, hidden=128, heads=5, dropout=0.3):
        """Build the encoder, the decoder and the output generator.

        Args:
            in_size: Size argument forwarded to the encoder.
            out_size: Size argument forwarded to the decoder; also the number
                of output classes of the generator.
            d_model: Model dimension shared by all parts.
            hidden: Feed-forward hidden width for encoder and decoder.
            heads: Number of self-attention heads for encoder and decoder.
            dropout: Dropout probability for encoder and decoder.
        """
        super().__init__()
        self.d_model = d_model
        shared = dict(hidden=hidden, heads=heads, dropout=dropout)
        self.encoder = Encoder(in_size, d_model, **shared)
        self.decoder = Decoder(out_size, d_model, **shared)
        projection = nn.Linear(d_model, out_size)
        self.generator = nn.Sequential(projection, nn.Softmax(dim=1))

    def encode(self, x):
        """Return the encoder output for the source input ``x``."""
        return self.encoder(x)

    def decode(self, x, encode):
        """Run the decoder on ``x`` given ``encode`` and apply the generator."""
        hidden_states = self.decoder(x, encode)
        return self.generator(hidden_states)

    def forward(self, x, y):
        """Encode the source ``x`` and decode the target input ``y`` against it."""
        return self.decode(y, self.encode(x))

import elotl.corpus
import matplotlib.pyplot as plt
from seaborn import heatmap as hm

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the 'tsunkua' parallel corpus and build two aligned tuples of
# lower-cased, whitespace-tokenized sentences from the first two fields of
# every corpus entry (field 0 -> source, field 1 -> target).
corpus_ot = elotl.corpus.load('tsunkua')
src, tgt = zip(*[(sents[0].lower().split(), sents[1].lower().split()) for sents in corpus_ot])

# Reserved indices for the begin/end-of-sentence symbols.
# `vocab` and `index` are not defined in this file; presumably vocab()
# returns a mapping that assigns a fresh index to unseen keys and index()
# yields one index tensor per sentence -- TODO confirm.
bos = 0
eos = 1
src_voc = vocab()
tgt_voc = vocab()
src_voc['bos'], tgt_voc['bos'] = bos, bos
src_voc['eos'], tgt_voc['eos'] = eos, eos

# Convert the sentences to index tensors and wrap each one with the
# begin/end symbols before moving it to the selected device.
# NOTE(review): the keys looked up here are '[bos]'/'[eos]', while the keys
# registered above are 'bos'/'eos'. If the vocabulary auto-assigns indices,
# the training sequences use different boundary ids than `bos`/`eos` (0/1).
# Changing this would alter the vocabulary sizes and therefore the shapes
# expected by the stored checkpoint -- confirm before touching.
src_sents = list(index(src, src_voc))
tgt_sents = list(index(tgt, tgt_voc))
x = [torch.cat((torch.tensor([src_voc['[bos]']]),s, torch.tensor([src_voc['[eos]']])), axis=0).to(device) for s in src_sents]
y = [torch.cat((torch.tensor([tgt_voc['[bos]']]),s, torch.tensor([tgt_voc['[eos]']])), axis=0).to(device) for s in tgt_sents]

#print(src_voc, tgt_voc)

# Vocabulary sizes are read only after all lookups above, so they include
# every key created so far. The model uses the default number of heads (5).
len_src = len(src_voc)
len_tgt = len(tgt_voc)
model = EncoderDecoder(in_size=len_src, out_size=len_tgt, d_model=128, hidden=256).to(device)

# Load the stored model weights (deserialized onto the CPU, then copied into
# the model's parameters) and switch to evaluation mode.
model.load_state_dict(torch.load('transformer.model', map_location=torch.device('cpu')))
model.eval()

EncoderDecoder(
  (encoder): Encoder(
    (enc): Encoding(
      (emb): Embedding(4044, 128)
      (pe): PositionalEncoding()
    )
    (att): ModuleList(
      (0-4): 5 x SelfAttention(
        (Q): Linear(in_features=128, out_features=128, bias=False)
        (K): Linear(in_features=128, out_features=128, bias=False)
        (V): Linear(in_features=128, out_features=128, bias=False)
      )
    )
    (lin): Linear(in_features=640, out_features=128, bias=True)
    (norm): LayerNorm()
    (ffw): Sequential(
      (0): Linear(in_features=128, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
    )
    (drop1): Dropout(p=0.3, inplace=False)
    (drop2): Dropout(p=0.3, inplace=False)
    (drop3): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (enc): Encoding(
      (emb): Embedding(3599, 128)
      (pe): PositionalEncoding()
    )
    (self_att): ModuleList(
      (0-4): 5 x MaskAttention(
        (Q): Linear(in_features=128, out_features=128, bias=False)
        (K): Linear(in_features=128, out_features=128, bias=False)
        (V): Linear(in_features=128, out_features=128, bias=False)
      )
    )
    (att): Attention(
      (Q): Linear(in_features=128, out_features=128, bias=False)
      (K): Linear(in_features=128, out_features=128, bias=False)
      (V): Linear(in_features=128, out_features=128, bias=False)
    )
    (lin): Linear(in_features=640, out_features=128, bias=True)
    (norm): LayerNorm()
    (ffw1): Sequential(
      (0): Linear(in_features=128, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
    )
    (ffw2): Sequential(
      (0): Linear(in_features=128, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=128, bias=True)
    )
    (drop1): Dropout(p=0.3, inplace=False)
    (drop2): Dropout(p=0.3, inplace=False)
    (drop3): Dropout(p=0.3, inplace=False)
  )
  (generator): Sequential(
    (0): Linear(in_features=128, out_features=3599, bias=True)
    (1): Softmax(dim=1)
  )
)

# The model's generator ends in a softmax, so `model(...)` returns
# probabilities. nn.CrossEntropyLoss expects unnormalized logits and applies
# log-softmax itself; feeding it probabilities normalizes twice and flattens
# the gradients. Negative log-likelihood on the log of the probabilities is
# the matching loss for this model output.
criterion = nn.NLLLoss()
optimizer = NoamOptimizer(model.parameters(), model.d_model, decay=0)
epochs = range(100)

# Training: one sentence pair per optimization step, visiting the pairs in a
# new random order every epoch.
# NOTE(review): the decoder input and the loss target are the same sequence
# y[i] (no one-position shift) -- confirm this is intended given how
# MaskAttention masks the current position.
model.train()
for t in tqdm(epochs):
    for i in torch.randperm(len(x)):
        prediction = model(x[i], y[i])
        optimizer.zero_grad()
        # Clamp before the log so an exact zero probability cannot yield -inf.
        loss_value = criterion(torch.log(prediction.clamp_min(1e-12)), y[i])
        loss_value.backward()
        optimizer.step()

# Uncomment to persist the trained weights:
#torch.save(model.state_dict(), 'transformer.model')

# Back to evaluation mode (disables dropout) for inference.
model.eval()

def greedy_decode(model, x, max_len, start_symbol, end_symbol=None):
    """Generate an output sequence by always picking the most probable token.

    Args:
        model: Object exposing ``encode(x)`` and ``decode(ys, encode)``;
            ``decode`` must return one row of scores per position of ``ys``.
        x: Source index tensor; the output has its dtype and device.
        max_len: Maximum length of the returned sequence, start symbol
            included. Values below 2 return only the start symbol.
        start_symbol: Index placed at the first position of the output.
        end_symbol: Optional index that stops decoding as soon as it is
            produced (it is kept in the output). ``None`` (default) always
            decodes up to ``max_len``.

    Returns:
        1-D tensor of token indices starting with ``start_symbol``.
    """
    # Inference only: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        encode = model.encode(x)
        ys = torch.tensor([start_symbol], dtype=x.dtype, device=x.device)
        for _ in range(max_len - 1):
            prob = model.decode(ys, encode)
            # Only the scores of the last position decide the next token.
            next_word = prob[-1].argmax(dim=-1).reshape(1)
            ys = torch.cat((ys, next_word), dim=0)
            if end_symbol is not None and next_word.item() == end_symbol:
                break

    return ys

# Reverse lookup for the target vocabulary: index -> token.
tgt_voc_rev = {idx: token for token, idx in tgt_voc.items()}
def translate(sent, model, max_len=10):
    """Translate a whitespace-tokenized sentence with greedy decoding.

    The words of ``sent`` are looked up in ``src_voc``, wrapped with the
    ``bos``/``eos`` indices and decoded with ``greedy_decode``; the decoded
    indices are mapped back through ``tgt_voc_rev``.

    Args:
        sent: Source sentence as a single string.
        model: Model passed on to ``greedy_decode``.
        max_len: Maximum decoded length, start symbol included.

    Returns:
        The decoded tokens joined by spaces, without the leading start token.
    """
    # NOTE(review): how src_voc treats words it has not seen is not visible
    # here -- confirm unknown words cannot produce out-of-range indices.
    token_ids = [bos] + [src_voc[word] for word in sent.split()] + [eos]
    source = torch.tensor(token_ids).to(device)
    decoded = greedy_decode(model, source, max_len, bos)
    tokens = [tgt_voc_rev[idx] for idx in decoded.cpu().detach().tolist()]

    return ' '.join(tokens[1:])

# Translate a sample sentence, allowing one output token per source word
# (max_len also counts the start symbol).
text = 'se calcina su piel'
result = translate(
    text,
    model,
    max_len=len(text.split()) + 1,
)
print('Original: {}\nTraducción: {}'.format(text, result))

# Heatmap of the encoder-decoder attention weights stored by the last decode
# call: source tokens on the x axis, generated tokens on the y axis.
hm(
    model.decoder.enc_dec_att.cpu().detach().numpy(),
    xticklabels=['bos', *text.split(), 'eos'],
    yticklabels=result.split(),
    vmin=0,
    vmax=1,
)
plt.title('Pesos de atención en transformador')
plt.show()

Original: se calcina su piel
Traducción: xbi hñähñu. hokagihe nge'u̱

# One heatmap per encoder self-attention head; both axes show the source
# tokens including the boundary symbols.
for att in model.encoder.att_weights:
    hm(
        att.cpu().detach().numpy(),
        xticklabels=['bos', *text.split(), 'eos'],
        yticklabels=['bos', *text.split(), 'eos'],
        vmin=0,
        vmax=1,
    )
    plt.title('Auto-atención del encoder')
    plt.show()

# One heatmap per decoder self-attention head; both axes show the generated
# tokens.
for att in model.decoder.att_weights:
    hm(
        att.cpu().detach().numpy(),
        xticklabels=result.split(),
        yticklabels=result.split(),
        vmin=0,
        vmax=1,
    )
    plt.title('Auto-atención del decoder')
    plt.show()

Transformador¶

Atención encoder-decoder¶

Encoder¶

Decoder¶

Encoder-decoder¶

Aplicación del transformador¶

Entrenamiento¶

Inferencia¶

Referencias¶