Pretrain a BERT model from scratch

by root December 16, 2025

written by root December 16, 2025 0 comment 84 views

import dataclasses

import datasets
import torch
import torch.nn as nn
import tqdm

@dataclasses.dataclass
class BertConfig:
    """Configuration for the BERT model."""

    # Number of entries in the word-embedding table.
    vocab_size: int = 30522
    # Number of stacked transformer blocks.
    num_layers: int = 12
    # Width of every embedding and hidden representation.
    hidden_size: int = 768
    # Number of attention heads per transformer block.
    num_heads: int = 12
    # Dropout probability used in embeddings, attention and feed-forward.
    dropout_prob: float = 0.1
    # Token id used for padding (passed as ``padding_idx`` to the embedding).
    pad_id: int = 0
    # Size of the position-embedding table.
    max_seq_len: int = 512
    # Size of the token-type (segment) embedding table.
    num_types: int = 2

class BertBlock(nn.Module):
    """One transformer block of BERT.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network. Each sub-layer is wrapped in a residual connection
    and followed by layer normalization (post-norm).

    Args:
        hidden_size: Width of the input and output representations.
        num_heads: Number of attention heads.
        dropout_prob: Dropout probability for the attention weights and the
            feed-forward output.
    """

    def __init__(self, hidden_size: int, num_heads: int, dropout_prob: float):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, hidden).
        self.attention = nn.MultiheadAttention(hidden_size, num_heads,
                                               dropout=dropout_prob, batch_first=True)
        self.attn_norm = nn.LayerNorm(hidden_size)
        self.ff_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.feedforward = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size),
            nn.GELU(),
            nn.Linear(4 * hidden_size, hidden_size),
        )

    def forward(self, x: torch.Tensor, pad_mask: torch.Tensor) -> torch.Tensor:
        """Run the block on a batch of sequences.

        Args:
            x: Input tensor laid out as (batch, seq, hidden).
            pad_mask: Mask passed to the attention layer as
                ``key_padding_mask``; positions marked True are ignored as
                attention keys.

        Returns:
            A tensor with the same layout as ``x``.
        """
        # Self-attention with padding mask and post-norm
        attn_output, _ = self.attention(x, x, x, key_padding_mask=pad_mask)
        x = self.attn_norm(x + attn_output)
        # Feed-forward with GELU activation and post-norm
        ff_output = self.feedforward(x)
        x = self.ff_norm(x + self.dropout(ff_output))
        return x

class BertPooler(nn.Module):
    """Pooler layer for BERT that processes the [CLS] token output.

    Args:
        hidden_size: Width of the input and output vectors.
    """

    def __init__(self, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.Tanh()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply a linear projection followed by tanh.

        Args:
            x: Tensor whose last dimension is ``hidden_size``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        x = self.dense(x)
        x = self.activation(x)
        return x

class BertModel(nn.Module):
    """Backbone of the BERT model.

    Sums word, token-type and position embeddings, normalizes them, and runs
    the result through a stack of transformer blocks. The hidden state of the
    first token of each sequence is additionally passed through a pooler.

    Args:
        config: Model hyperparameters.
    """

    def __init__(self, config: BertConfig):
        super().__init__()
        # Embedding layers
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size,
                                            padding_idx=config.pad_id)
        self.type_embeddings = nn.Embedding(config.num_types, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_seq_len, config.hidden_size)
        self.embeddings_norm = nn.LayerNorm(config.hidden_size)
        self.embeddings_dropout = nn.Dropout(config.dropout_prob)
        # Transformer blocks
        self.blocks = nn.ModuleList([
            BertBlock(config.hidden_size, config.num_heads, config.dropout_prob)
            for _ in range(config.num_layers)
        ])
        # [CLS] pooler layer
        self.pooler = BertPooler(config.hidden_size)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode a batch of token sequences.

        Args:
            input_ids: 2-D integer tensor of token ids, (batch, seq).
            token_type_ids: Integer tensor of token-type ids, looked up in the
                type-embedding table and added to the token embeddings.
            pad_id: Token id treated as padding when building the attention
                mask.

        Returns:
            A tuple ``(x, pooled_output)``: the per-token hidden states and
            the pooled hidden state of the first token of each sequence.
        """
        # Create the attention mask for padding tokens
        pad_mask = input_ids == pad_id
        # Convert integer tokens to embedding vectors
        _batch_size, seq_len = input_ids.shape
        # Shape (1, seq): broadcasts against the (batch, seq, hidden) embeddings.
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)
        type_embeddings = self.type_embeddings(token_type_ids)
        token_embeddings = self.word_embeddings(input_ids)
        x = token_embeddings + type_embeddings + position_embeddings
        x = self.embeddings_norm(x)
        x = self.embeddings_dropout(x)
        # Process the sequence with the transformer blocks
        for block in self.blocks:
            x = block(x, pad_mask)
        # Pool the hidden state of the `[CLS]` token
        pooled_output = self.pooler(x[:, 0, :])
        return x, pooled_output

class BertPretrainingModel(nn.Module):
    """BERT backbone with heads for the MLM and NSP pretraining tasks.

    Args:
        config: Model hyperparameters.
    """

    def __init__(self, config: BertConfig):
        super().__init__()
        self.bert = BertModel(config)
        # Masked-language-model head: projects each hidden state to the vocabulary.
        self.mlm_head = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size),
            nn.Linear(config.hidden_size, config.vocab_size),
        )
        # Next-sentence-prediction head: two-way classifier on the pooled output.
        self.nsp_head = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, pad_id: int = 0
                ) -> tuple[torch.Tensor, torch.Tensor]:
        """Compute MLM and NSP logits for a batch.

        Args:
            input_ids: Integer tensor of token ids, forwarded to the backbone.
            token_type_ids: Integer tensor of token-type ids, forwarded to the
                backbone.
            pad_id: Token id treated as padding, forwarded to the backbone.

        Returns:
            A tuple ``(mlm_logits, nsp_logits)``: vocabulary logits for every
            token position and two-way logits per sequence.
        """
        # Process the sequence with the BERT model backbone
        x, pooled_output = self.bert(input_ids, token_type_ids, pad_id)
        # Predict the masked tokens for the MLM task and the classification for the NSP task
        mlm_logits = self.mlm_head(x)
        nsp_logits = self.nsp_head(pooled_output)
        return mlm_logits, nsp_logits

# Training parameters
epochs = 10
learning_rate = 1e-4
batch_size = 32

# Load the dataset that the data loader will iterate over
dataset = datasets.Dataset.from_parquet("wikitext-2_train_data.parquet")

def collate_fn(batch: list[dict]):
    """Custom collate function to handle variable-length sequences in the dataset.

    Args:
        batch: List of samples. Each sample is a dict with the keys
            ``"tokens"``, ``"segment_ids"``, ``"is_random_next"``,
            ``"masked_positions"`` and ``"masked_labels"``.

    Returns:
        A tuple ``(input_ids, token_type_ids, is_random_next, masked_pos,
        masked_labels)``. ``masked_pos`` is a flat list of
        ``(sample_index, position)`` tuples; ``masked_labels`` is a flat
        tensor in the same order.
    """
    # Always at max length: tokens, segment_ids; always singleton: is_random_next
    input_ids = torch.tensor([item["tokens"] for item in batch])
    # abs() folds negative segment ids to non-negative values so they are
    # valid embedding indices.
    token_type_ids = torch.tensor([item["segment_ids"] for item in batch]).abs()
    # Integer class labels for the cross-entropy loss.
    is_random_next = torch.tensor([item["is_random_next"] for item in batch]).to(torch.long)
    # Variable length: masked_positions, masked_labels
    masked_pos = [(idx, pos) for idx, item in enumerate(batch) for pos in item["masked_positions"]]
    masked_labels = torch.tensor([label for item in batch for label in item["masked_labels"]])
    return input_ids, token_type_ids, is_random_next, masked_pos, masked_labels

dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                         collate_fn=collate_fn, num_workers=8)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertPretrainingModel(BertConfig()).to(device)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    pbar = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch in pbar:
        # Get the batched data; masked_pos stays on the host as a list of tuples
        input_ids, token_type_ids, is_random_next, masked_pos, masked_labels = batch
        input_ids = input_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        is_random_next = is_random_next.to(device)
        masked_labels = masked_labels.to(device)
        # Extract the output from the model
        mlm_logits, nsp_logits = model(input_ids, token_type_ids)
        # MLM loss: masked_pos is a list of (batch_index, token_position) tuples;
        # pick the corresponding logits from mlm_logits of shape (B, S, V)
        batch_indices, token_positions = zip(*masked_pos)
        mlm_logits = mlm_logits[batch_indices, token_positions]
        mlm_loss = loss_fn(mlm_logits, masked_labels)
        # Compute the loss of the NSP task
        nsp_loss = loss_fn(nsp_logits, is_random_next)
        # Backward with the total loss
        total_loss = mlm_loss + nsp_loss
        pbar.set_postfix(MLM=mlm_loss.item(), NSP=nsp_loss.item(), Total=total_loss.item())
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()
        # NOTE(review): statement order puts this inside the batch loop, so with
        # step_size=1 and gamma=0.1 the learning rate shrinks 10x per batch.
        # Confirm whether a per-epoch step was intended.
        scheduler.step()
        # NOTE(review): iterating over `pbar` already advances the bar; this
        # extra update presumably double-counts progress - verify.
        pbar.update(1)
    pbar.close()

# Save the full pretraining model and the bare BERT backbone
torch.save(model.state_dict(), "bert_pretraining_model.pth")
torch.save(model.bert.state_dict(), "bert_model.pth")

Welcome to Ivugangingo!

At Ivugangingo, we're passionate about delivering insightful content that empowers and informs our readers across a spectrum of crucial topics. Whether you're delving into the world of insurance, navigating the complexities of cryptocurrency, or seeking wellness tips in health and fitness, we've got you covered.

Pretrain a BERT model from scratch

SBI Holdings, Startale to issue yen-backed stablecoin

Hoka Coupon Code: 10% Off | December 2025

Converter

Editors Pick

Newsletter

Categories

Related Posts

Leave a Comment Cancel Reply

Latest

Best selling

Top rated

Products

Latest Posts

Welcome to Ivugangingo!

Random Picks