import dataclasses

import datasets
import torch
import torch.nn as nn
import tqdm
@dataclasses.dataclass
class BertConfig:
    """Configuration for the BERT model.

    Attributes:
        vocab_size: Number of rows in the word-embedding table and width of
            the MLM output layer.
        num_layers: Number of stacked transformer blocks.
        hidden_size: Width of every embedding and hidden state.
        num_heads: Number of attention heads per transformer block.
        dropout_prob: Dropout probability used in embeddings and blocks.
        pad_id: Token id of the padding token (``padding_idx`` of the word
            embedding).
        max_seq_len: Number of rows in the position-embedding table.
        num_types: Number of rows in the token-type (segment) embedding table.
    """

    vocab_size: int = 30522
    num_layers: int = 12
    hidden_size: int = 768
    num_heads: int = 12
    dropout_prob: float = 0.1
    pad_id: int = 0
    max_seq_len: int = 512
    num_types: int = 2
class BertBlock(nn.Module):
    """One transformer block of BERT (self-attention + feed-forward, post-norm).

    Args:
        hidden_size: Width of the input and output hidden states.
        num_heads: Number of attention heads.
        dropout_prob: Dropout probability used inside the attention layer and
            on the feed-forward output.
    """

    def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, hidden).
        self.attention = nn.MultiheadAttention(
            hidden_size, num_heads, dropout=dropout_prob, batch_first=True
        )
        self.attn_norm = nn.LayerNorm(hidden_size)
        self.ff_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x: torch.Tensor, pad_mask: torch.Tensor) -> torch.Tensor:
        """Apply the block to a batch of sequences.

        Args:
            x: Hidden states, batch-first.
            pad_mask: Passed as ``key_padding_mask``; positions marked True
                are ignored as attention keys.

        Returns:
            Hidden states with the same shape as ``x``.
        """
        # Self-attention with padding mask, then residual + post-norm.
        attn_output, _ = self.attention(x, x, x, key_padding_mask=pad_mask)
        x = self.attn_norm(x + attn_output)
        # Feed-forward with GELU activation, then residual + post-norm.
        ff_output = self.feed_forward(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x
class BertPooler(nn.Module):
    """Pooler layer for BERT to process the [CLS] token output.

    Args:
        hidden_size: Width of the input and output vectors.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` with a linear layer and squash it with tanh."""
        return self.activation(self.dense(x))
class BertModel(nn.Module):
    """Backbone of the BERT model: embeddings, transformer blocks and pooler.

    Args:
        config: Model hyper-parameters (vocabulary size, hidden size, number
            of layers/heads, dropout, padding id, maximum sequence length and
            number of token types).
    """

    def __init__(self, config: BertConfig):
        super().__init__()
        # Embedding layers
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_id
        )
        self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)
        self.embeddings_norm = nn.LayerNorm(config.hidden_size)
        self.embeddings_dropout = nn.Dropout(config.dropout_prob)
        # Transformer blocks
        self.blocks = nn.ModuleList([
            BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)
            for _ in range(config.num_layers)
        ])
        # [CLS] pooler layer
        self.pooler = BertPooler(config.hidden_size)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of token sequences.

        Args:
            input_ids: Integer token ids of shape (batch, seq_len).
            token_type_ids: Integer token-type ids, indexed into the type
                embedding table and added to the token embeddings.
            pad_id: Token id treated as padding when building the attention
                mask.

        Returns:
            A tuple ``(x, pooled_output)``: the hidden states of every
            position, and the pooled hidden state of the first (``[CLS]``)
            position.
        """
        # Attention mask: True where the token is padding.
        pad_mask = input_ids == pad_id
        # Convert integer tokens to embedding vectors.
        _, seq_len = input_ids.shape
        # Shape (1, seq_len) so the position embeddings broadcast over the batch.
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)
        type_embeddings = self.type_embeddings(token_type_ids)
        token_embeddings = self.word_embeddings(input_ids)
        x = token_embeddings + type_embeddings + position_embeddings
        x = self.embeddings_norm(x)
        x = self.embeddings_dropout(x)
        # Process the sequence with the transformer blocks.
        for block in self.blocks:
            x = block(x, pad_mask)
        # Pool the hidden state of the `[CLS]` token (first position).
        pooled_output = self.pooler(x[:, 0, :])
        return x, pooled_output
class BertPretrainingModel(nn.Module):
    """BERT backbone with MLM and NSP prediction heads for pre-training.

    Args:
        config: Model hyper-parameters; ``hidden_size`` and ``vocab_size``
            determine the shapes of the two heads.
    """

    def __init__(self, config: BertConfig):
        super().__init__()
        self.bert = BertModel(config)
        # MLM head: per-position projection to vocabulary logits.
        self.mlm_head = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size),
            nn.Linear(config.hidden_size, config.vocab_size),
        )
        # NSP head: binary classifier on the pooled output.
        self.nsp_head = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the backbone and both heads.

        Args:
            input_ids: Integer token ids, forwarded to the backbone.
            token_type_ids: Integer token-type ids, forwarded to the backbone.
            pad_id: Padding token id, forwarded to the backbone.

        Returns:
            A tuple ``(mlm_logits, nsp_logits)``: vocabulary logits for every
            position, and 2-way logits computed from the pooled output.
        """
        # Process the sequence with the BERT backbone.
        x, pooled_output = self.bert(input_ids, token_type_ids, pad_id)
        # Predict masked tokens for the MLM task and the class for the NSP task.
        mlm_logits = self.mlm_head(x)
        nsp_logits = self.nsp_head(pooled_output)
        return mlm_logits, nsp_logits
# Training parameters
epochs = 10
learning_rate = 1e-4
batch_size = 32

# Load the dataset from a local parquet file; the data loader is configured
# further down.
dataset = datasets.Dataset.from_parquet("wikitext-2_train_data.parquet")
def collate_fn(batch: list[dict]):
    """Custom collate function to handle variable-length fields in the dataset.

    Args:
        batch: Samples, each a dict with keys ``tokens``, ``segment_ids``,
            ``is_random_next``, ``masked_positions`` and ``masked_labels``.

    Returns:
        A tuple ``(input_ids, token_type_ids, is_random_next, masked_pos,
        masked_labels)`` where ``masked_pos`` is a flat list of
        ``(sample_index, position)`` pairs and ``masked_labels`` is the flat
        tensor of labels in the same order.
    """
    # Always at max length: tokens, segment_ids; always singleton: is_random_next.
    input_ids = torch.tensor([item["tokens"] for item in batch])
    # abs() maps negative segment ids to valid embedding indices
    # (presumably padding is marked with -1 -- confirm against the dataset).
    token_type_ids = torch.tensor([item["segment_ids"] for item in batch]).abs()
    is_random_next = torch.tensor(
        [item["is_random_next"] for item in batch]
    ).to(torch.long)
    # Variable length: masked positions and masked labels are flattened
    # across the batch, keeping the sample index next to each position.
    masked_pos = [
        (idx, pos)
        for idx, item in enumerate(batch)
        for pos in item["masked_positions"]
    ]
    # Explicit dtype keeps an integer tensor even when no label is present.
    masked_labels = torch.tensor(
        [label for item in batch for label in item["masked_labels"]],
        dtype=torch.long,
    )
    return input_ids, token_type_ids, is_random_next, masked_pos, masked_labels
# Batches are assembled by collate_fn so the variable-length masked
# positions/labels can be flattened per batch.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                         collate_fn=collate_fn, num_workers=8)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertPretrainingModel(BertConfig()).to(device)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Iterating the tqdm wrapper advances the bar by itself, so no explicit
    # update() call is needed; the context manager closes it.
    with tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}") as pbar:
        for batch in pbar:
            # Get batched data
            input_ids, token_type_ids, is_random_next, masked_pos, masked_labels = batch
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            is_random_next = is_random_next.to(device)
            masked_labels = masked_labels.to(device)

            # Extract the output from the model
            mlm_logits, nsp_logits = model(input_ids, token_type_ids)

            # MLM loss: masked_pos is a list of (batch index, position) tuples.
            # Pick the corresponding rows from mlm_logits of shape (B, S, V).
            batch_indices, token_positions = zip(*masked_pos)
            mlm_logits = mlm_logits[list(batch_indices), list(token_positions)]
            mlm_loss = loss_fn(mlm_logits, masked_labels)

            # Compute the loss of the NSP task
            nsp_loss = loss_fn(nsp_logits, is_random_next)

            # Backward with the total loss
            total_loss = mlm_loss + nsp_loss
            pbar.set_postfix(MLM=mlm_loss.item(), NSP=nsp_loss.item(), Total=total_loss.item())
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
    # Decay the learning rate once per epoch; with step_size=1 and gamma=0.1
    # a per-batch step would drive it to ~0 within a few iterations.
    scheduler.step()

# Save the model: the full pre-training model and the backbone alone.
torch.save(model.state_dict(), "bert_pretraining_model.pth")
torch.save(model.bert.state_dict(), "bert_model.pth")

