import torch
from transformers import BertForMaskedLM, BertTokenizer

# Directory handed to from_pretrained for both the model and the tokenizer.
BETO_DIR = "BETOpytorch/"

# Load the masked-LM model and its tokenizer (lower-casing enabled).
model = BertForMaskedLM.from_pretrained(BETO_DIR)
tokenizer = BertTokenizer.from_pretrained(BETO_DIR, do_lower_case=True)

2026-01-11 13:31:48.466243: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-11 13:31:48.832837: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-01-11 13:31:49.574721: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT

# Print the module structure of the loaded model.
print(model)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (cls): BertOnlyMLMHead(
    (predictions): BertLMPredictionHead(
      (transform): BertPredictionHeadTransform(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (transform_act_fn): GELUActivation()
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      )
      (decoder): Linear(in_features=768, out_features=31002, bias=True)
    )
  )
)

text = "[CLS] Cuando yo [MASK] al [MASK], entonces la fábrica comenzó a [MASK] [SEP]"

# Split the text into tokens.
tokens = tokenizer.tokenize(text)

# Positions of the masked tokens inside the token sequence.
masked_indxs = tuple(i for i, w in enumerate(tokens) if w == '[MASK]')

# Map the tokens to vocabulary ids and wrap them as a batch of size 1.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor.shape)

# Apply the model. This is inference only, so skip building an autograd graph.
with torch.no_grad():
    predictions = model(tokens_tensor)[0]

print(predictions.shape)

for i, mask_idx in enumerate(masked_indxs):
    # Ids of the 5 highest-scoring vocabulary entries, best first.
    # topk avoids sorting (and later converting) the whole vocabulary.
    top_ids = torch.topk(predictions[0, mask_idx], k=5).indices.tolist()
    # Convert only those ids back into words or subwords.
    predicted_token = tokenizer.convert_ids_to_tokens(top_ids)
    print('MASK', i, ':', predicted_token)

torch.Size([1, 15])
torch.Size([1, 15, 31002])
MASK 0 : ['llegue', 'estaba', 'iba', 'fui', 'regrese']
MASK 1 : ['trabajo', 'mercado', 'colegio', 'pueblo', 'negocio']
MASK 2 : ['funcionar', 'ser', 'trabajar', 'producir', 'fabricar']

# Vocabulary: plain-dict copy of the tokenizer's vocabulary.
model_voc = dict(tokenizer.vocab)

print('Número de tipos:', len(model_voc))

# Static embedding tables read from the BERT embedding layer.
embedding_layer = model.bert.embeddings
embs = embedding_layer.word_embeddings.weight.data
pos = embedding_layer.position_embeddings.weight.data

for table in (embs, pos):
    print(table.shape)

Número de tipos: 31002
torch.Size([31002, 768])
torch.Size([512, 768])

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
#from sklearn.manifold import TSNE

def plot_words(Z, ids):
    """Scatter-plot the rows of Z in two dimensions and label each point.

    Z is reduced to two components with PCA. Every projected row is drawn
    as a blue dot and annotated with the matching entry of ids. Nothing is
    returned and the figure is not shown here; the caller does that.

    Args:
        Z: array-like with one row per item, passed to PCA.fit_transform.
        ids: labels, one per row of Z, in the same order.
    """
    # Reduce the dimensionality to 2.
    Z = PCA(2).fit_transform(Z)

    # Draw the points with the chosen marker and colour.
    plt.scatter(Z[:, 0], Z[:, 1], marker='o', c='blue')
    # Attach each label just above its point.
    for label, x, y in zip(ids, Z[:, 0], Z[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(-1, 1),
                     textcoords='offset points', ha='center', va='bottom')

# Plot embedding rows 1000-1099, labelled with vocabulary keys 1000-1099.
sample = slice(1000, 1100)
plot_words(embs[sample], list(model_voc)[sample])
plt.show()

import pandas as pd
from tqdm import tqdm
from transformers_functions import *

import string

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, since the function runs per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation_translate(text):
    """Return text with every string.punctuation character removed.

    Only ASCII punctuation is stripped. Letters, digits, whitespace and
    non-ASCII marks such as '¿' or '¡' are left unchanged.

    Args:
        text: the string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Read the text data and keep the first n reviews with their sentiment labels.
n = 11047
data = pd.read_csv('IMDBSpanish/IMDB Dataset SPANISH.csv', encoding='utf-8')
dataX = list(map(remove_punctuation_translate, data['review_es'].tolist()[:n]))
dataY = data['sentimiento'].tolist()[:n]

# Labels: map each sentiment string to its class index.
labels = {'negativo': 0, 'positivo': 1}
y = [labels[name] for name in dataY]

# Tokenise each review, prepend the class symbol and keep at most
# 511 review tokens (512 ids in total per review).
x = []
for review in dataX:
    pieces = ['[CLS]'] + tokenizer.tokenize(review)[:511]
    x.append(torch.tensor(tokenizer.convert_tokens_to_ids(pieces)))

print(len(x), len(y))

# Loaders for training and test.
train_loader, test_loader = get_dataset(x, y, pad=1, batch_size=8)

train_set = train_loader.dataset
print(train_set.x.shape, train_set.y.shape)

11047 11047
torch.Size([7732, 512]) torch.Size([7732])

/home/cienciasia/Documentos/Proyectos/BERT_Prueba/transformers_functions.py:10: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  self.x = torch.tensor(nn.utils.rnn.pad_sequence(x, padding_value=pad)).T #x

# Classification head applied to one 768-dimensional BERT hidden vector.
# It emits raw logits, with no final Softmax: nn.CrossEntropyLoss applies
# log-softmax itself, so normalising here too would squash the loss signal.
# argmax over logits selects the same class as argmax over probabilities.
classifier = nn.Sequential(nn.Linear(768, 512), nn.ReLU(),
                           nn.Linear(512, len(labels)))

# Restore previously saved weights. Softmax has no parameters, so a
# checkpoint saved from a head that ended in nn.Softmax still loads here:
# the state-dict keys are those of the two Linear layers only.
model.load_state_dict(torch.load('beto.model', weights_only=True))
classifier.load_state_dict(torch.load('classifier_for_beto.model', weights_only=True))

<All keys matched successfully>

# NOTE(review): nn.CrossEntropyLoss applies log-softmax internally, so
# `classifier` should feed it unnormalised logits.
criterion = nn.CrossEntropyLoss()
# One optimizer over the BERT encoder and the classification head together.
optimizer = NoamOptimizer(
    list(model.bert.parameters()) + list(classifier.parameters()),
    768, init_lr=0.01, decay=1e-5, warmup=40000)
epochs = 1

model.bert.train()
classifier.train()
for epoch in range(epochs):
    for xi, yi in tqdm(train_loader):
        optimizer.zero_grad()
        # The hidden vector at sequence position 4 is used as the input
        # to the classifier.
        # NOTE(review): the inputs are built with '[CLS]' as their first
        # token, so position 0 may have been intended -- confirm. The
        # evaluation loop reads the same position; change both together.
        cls = model.bert(xi)[0][:, 4]
        output = classifier(cls)

        # yi is already a tensor. Wrapping it in torch.tensor() only made
        # a copy and raised a UserWarning on every batch.
        loss = criterion(output, yi)
        loss.backward()
        optimizer.step()

    # Loss of the last batch of the epoch, not an epoch average.
    print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

  0%|                                                   | 0/967 [00:00<?, ?it/s]/tmp/ipykernel_5132/1325448251.py:13: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  loss = criterion(output, torch.tensor(yi))
100%|███████████████████████████████████████| 967/967 [3:29:15<00:00, 12.98s/it]

Epoch 0, Loss: 0.3404

from sklearn.metrics import classification_report

model.eval()
classifier.eval()

x_pred = []    # predicted class index per test example
y_labels = []  # gold class index per test example
# Inference only: disable autograd so no graph is kept for each batch.
with torch.no_grad():
    for xi, yi in tqdm(test_loader):
        # Same sequence position (4) that the training loop feeds to the classifier.
        scores = classifier(model.bert(xi)[0][:, 4])
        x_pred.extend(scores.argmax(1).tolist())
        y_labels.extend(yi.tolist())

# classification_report takes (y_true, y_pred). Passing the gold labels
# first keeps precision, recall and support attached to the right quantity.
print(classification_report(y_labels, x_pred, target_names=['negativo', 'positivo']))

100%|█████████████████████████████████████████| 332/332 [22:08<00:00,  4.00s/it]

              precision    recall  f1-score   support

    negativo       0.55      0.98      0.70       951
    positivo       0.99      0.68      0.80      2364

    accuracy                           0.76      3315
   macro avg       0.77      0.83      0.75      3315
weighted avg       0.86      0.76      0.77      3315

# Save the state dicts of the BERT model and of the classifier.
for module, path in ((model, 'beto.model'),
                     (classifier, 'classifier_for_beto.model')):
    torch.save(module.state_dict(), path)

Modelo de BETO (BERT en español)

Modelo

Modelos del lenguaje enmascarados

Embeddings estáticos

Visualización de los embeddings estáticos

Clasificación

Clasificación de opiniones

Ajuste fino del modelo

Evaluación del modelo

Referencias