Solega Co. Done For Your E-Commerce solutions.
  • Home
  • E-commerce
  • Start Ups
  • Project Management
  • Artificial Intelligence
  • Investment
  • More
    • Cryptocurrency
    • Finance
    • Real Estate
    • Travel
No Result
View All Result
  • Home
  • E-commerce
  • Start Ups
  • Project Management
  • Artificial Intelligence
  • Investment
  • More
    • Cryptocurrency
    • Finance
    • Real Estate
    • Travel
No Result
View All Result
No Result
View All Result
Home Artificial Intelligence

Train Your Large Model on Multiple GPUs with Pipeline Parallelism

Solega Team by Solega Team
January 9, 2026
in Artificial Intelligence
Reading Time: 57 mins read
0
Train Your Large Model on Multiple GPUs with Pipeline Parallelism
0
SHARES
1
VIEWS
Share on Facebook | Share on Twitter


import dataclasses
import os

import datasets
import tokenizers
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import torch.utils.data
import tqdm
from torch import Tensor
from torch.distributed.checkpoint import load, save
from torch.distributed.checkpoint.state_dict import StateDictOptions, get_state_dict, set_state_dict
from torch.distributed.pipelining import PipelineStage, ScheduleGPipe


# Build the model

@dataclasses.dataclass
class LlamaConfig:
    """Define Llama model hyperparameters."""

    vocab_size: int = 50000  # Size of the tokenizer vocabulary
    max_position_embeddings: int = 2048  # Maximum sequence length
    hidden_size: int = 768  # Dimension of hidden layers
    intermediate_size: int = 4 * 768  # Dimension of MLP's hidden layer
    num_hidden_layers: int = 12  # Number of transformer layers
    num_attention_heads: int = 12  # Number of attention heads
    num_key_value_heads: int = 3  # Number of key-value heads for GQA


class RotaryPositionEncoding(nn.Module):
    """Rotary position encoding.

    The cosine and sine tables are stored as buffers. Because buffers are
    left uninitialized by ``nn.Module.to_empty()``, this module also
    provides ``reset_parameters()`` to recompute them after the module has
    been materialized from the meta device.
    """

    def __init__(self, dim: int, max_position_embeddings: int) -> None:
        """Initialize the RotaryPositionEncoding module.

        Args:
            dim: The hidden dimension of the input tensor to which RoPE is applied
            max_position_embeddings: The maximum sequence length of the input tensor
        """
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        cos, sin = self._build_tables()
        # save cosine and sine matrices as buffers, not parameters
        self.register_buffer("cos", cos)
        self.register_buffer("sin", sin)

    def _build_tables(self, device=None) -> tuple[Tensor, Tensor]:
        """Compute the (max_position_embeddings, dim) cosine and sine tables."""
        # compute a matrix of n\theta_i
        N = 10_000.0
        inv_freq = 1.0 / (N ** (torch.arange(0, self.dim, 2, device=device) / self.dim))
        inv_freq = torch.cat((inv_freq, inv_freq), dim=-1)
        # positions use the same dtype as inv_freq so torch.outer never
        # depends on integer/float type promotion
        position = torch.arange(
            self.max_position_embeddings, dtype=inv_freq.dtype, device=device,
        )
        sinusoid_inp = torch.outer(position, inv_freq)
        return sinusoid_inp.cos(), sinusoid_inp.sin()

    @torch.no_grad()
    def reset_parameters(self) -> None:
        """Recompute the cosine and sine buffers in place on their current device."""
        cos, sin = self._build_tables(device=self.cos.device)
        self.cos.copy_(cos)
        self.sin.copy_(sin)

    def forward(self, x: Tensor) -> Tensor:
        """Apply RoPE to tensor x.

        Args:
            x: Input tensor of shape (batch_size, seq_length, num_heads, head_dim)

        Returns:
            Output tensor of shape (batch_size, seq_length, num_heads, head_dim)
        """
        _, seq_len, _, _ = x.shape
        dtype = x.dtype
        # transform the cosine and sine matrices to 4D tensor and the same dtype as x
        cos = self.cos.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        sin = self.sin.to(dtype)[:seq_len].view(1, seq_len, 1, -1)
        # apply RoPE to x
        x1, x2 = x.chunk(2, dim=-1)
        rotated = torch.cat((-x2, x1), dim=-1)
        return (x * cos) + (rotated * sin)


class LlamaAttention(nn.Module):

Ā Ā Ā Ā “”“Grouped-query attention with rotary embeddings.”“”

Ā 

Ā Ā Ā Ā def __init__(self, config: LlamaConfig) -> None:

Ā Ā Ā Ā Ā Ā Ā Ā super().__init__()

Ā Ā Ā Ā Ā Ā Ā Ā self.hidden_size = config.hidden_size

Ā Ā Ā Ā Ā Ā Ā Ā self.num_heads = config.num_attention_heads

Ā Ā Ā Ā Ā Ā Ā Ā self.head_dim = self.hidden_size // self.num_heads

Ā Ā Ā Ā Ā Ā Ā Ā self.num_kv_heads = config.num_key_value_headsĀ Ā # GQA: H_kv < H_q

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # hidden_size must be divisible by num_heads

Ā Ā Ā Ā Ā Ā Ā Ā assert (self.head_dim * self.num_heads) == self.hidden_size

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Linear layers for Q, K, V projections

Ā Ā Ā Ā Ā Ā Ā Ā self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)

Ā Ā Ā Ā Ā Ā Ā Ā self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)

Ā Ā Ā Ā Ā Ā Ā Ā self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False)

Ā Ā Ā Ā Ā Ā Ā Ā self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

Ā 

Ā Ā Ā Ā def forward(self, hidden_states: Tensor, rope: RotaryPositionEncoding) -> Tensor:

Ā Ā Ā Ā Ā Ā Ā Ā bs, seq_len, dim = hidden_states.size()

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Project inputs to Q, K, V

Ā Ā Ā Ā Ā Ā Ā Ā query_states = self.q_proj(hidden_states).view(bs, seq_len, self.num_heads, self.head_dim)

Ā Ā Ā Ā Ā Ā Ā Ā key_states = self.k_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

Ā Ā Ā Ā Ā Ā Ā Ā value_states = self.v_proj(hidden_states).view(bs, seq_len, self.num_kv_heads, self.head_dim)

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Apply rotary position embeddings

Ā Ā Ā Ā Ā Ā Ā Ā query_states = rope(query_states)

Ā Ā Ā Ā Ā Ā Ā Ā key_states = rope(key_states)

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Transpose tensors from BSHD to BHSD dimension for scaled_dot_product_attention

Ā Ā Ā Ā Ā Ā Ā Ā query_states = query_states.transpose(1, 2)

Ā Ā Ā Ā Ā Ā Ā Ā key_states = key_states.transpose(1, 2)

Ā Ā Ā Ā Ā Ā Ā Ā value_states = value_states.transpose(1, 2)

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Use PyTorch’s optimized attention implementation

Ā Ā Ā Ā Ā Ā Ā Ā # setting is_causal=True is incompatible with setting explicit attention mask

Ā Ā Ā Ā Ā Ā Ā Ā attn_output = F.scaled_dot_product_attention(

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā query_states,

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā key_states,

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā value_states,

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā is_causal=True,

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā dropout_p=0.0,

Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā Ā enable_gqa=True,

Ā Ā Ā Ā Ā Ā Ā Ā )

Ā 

Ā Ā Ā Ā Ā Ā Ā Ā # Transpose output tensor from BHSD to BSHD dimension, reshape to 3D, and then project output

Ā Ā Ā Ā Ā Ā Ā Ā attn_output = attn_output.transpose(1, 2).reshape(bs, seq_len, self.hidden_size)

Ā Ā Ā Ā Ā Ā Ā Ā attn_output = self.o_proj(attn_output)

Ā Ā Ā Ā Ā Ā Ā Ā return attn_output

Ā 

Ā 

class LlamaMLP(nn.Module):
    """Feed-forward network with SwiGLU activation."""

    def __init__(self, config: "LlamaConfig") -> None:
        """Create the projections from ``config.hidden_size`` and ``config.intermediate_size``."""
        super().__init__()
        # Two parallel projections for SwiGLU
        self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
        self.act_fn = F.silu  # SwiGLU activation function
        # Project back to hidden size
        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``down_proj(silu(gate_proj(x)) * up_proj(x))``."""
        # SwiGLU activation: multiply gate and up-projected inputs
        gate = self.act_fn(self.gate_proj(x))
        up = self.up_proj(x)
        return self.down_proj(gate * up)


class LlamaDecoderLayer(nn.Module):
    """Single transformer layer for a Llama model."""

    def __init__(self, config: "LlamaConfig") -> None:
        """Build the two norm layers, the attention block and the MLP block."""
        super().__init__()
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.self_attn = LlamaAttention(config)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=1e-5)
        self.mlp = LlamaMLP(config)

    def forward(self, hidden_states: Tensor, rope: "RotaryPositionEncoding") -> Tensor:
        """Apply pre-norm self-attention and pre-norm MLP, each with a residual connection.

        Args:
            hidden_states: Input tensor for this layer
            rope: Rotary position encoding forwarded to the attention block

        Returns:
            Tensor with the same shape as ``hidden_states``
        """
        # First residual block: Self-attention
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        attn_outputs = self.self_attn(hidden_states, rope=rope)
        hidden_states = attn_outputs + residual

        # Second residual block: MLP
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states) + residual
        return hidden_states


class LlamaModel(nn.Module):
    """The full Llama model without any pretraining heads.

    ``embed_tokens``, individual entries of ``layers`` and ``norm`` may be
    replaced with ``None`` after construction; ``forward`` skips whatever
    is ``None``, so one instance can act as a single pipeline stage.
    """

    def __init__(self, config: "LlamaConfig") -> None:
        """Build the RoPE tables, token embedding, decoder layers and final norm."""
        super().__init__()
        self.rope = RotaryPositionEncoding(
            config.hidden_size // config.num_attention_heads,
            config.max_position_embeddings,
        )

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        # keyed by the layer index as a string, in layer order
        self.layers = nn.ModuleDict({
            str(i): LlamaDecoderLayer(config) for i in range(config.num_hidden_layers)
        })
        self.norm = nn.RMSNorm(config.hidden_size, eps=1e-5)

    def forward(self, input_ids: Tensor) -> Tensor:
        """Run the sub-modules that are present on this instance.

        Args:
            input_ids: Token IDs when ``embed_tokens`` is present; otherwise
                used directly as the hidden states

        Returns:
            The hidden states after the remaining layers (and the final
            norm, if present)
        """
        # Convert input token IDs to embeddings
        if self.embed_tokens is not None:
            hidden_states = self.embed_tokens(input_ids)
        else:
            hidden_states = input_ids
        # Process through the transformer layers that are present, in
        # insertion (= layer index) order, then the final norm layer
        for layer in self.layers.values():
            if layer is not None:
                hidden_states = layer(hidden_states, self.rope)
        if self.norm is not None:
            hidden_states = self.norm(hidden_states)
        # Return the final hidden states
        return hidden_states


class LlamaForPretraining(nn.Module):
    """Llama base model followed by a linear language-model head.

    ``lm_head`` may be set to ``None`` after construction, in which case
    ``forward`` returns the base model's output unchanged.
    """

    def __init__(self, config: "LlamaConfig") -> None:
        """Build the base model and a bias-free hidden_size -> vocab_size projection."""
        super().__init__()
        self.base_model = LlamaModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, input_ids: Tensor) -> Tensor:
        """Run the base model, then the LM head if it is present."""
        hidden_states = self.base_model(input_ids)
        if self.lm_head is not None:
            hidden_states = self.lm_head(hidden_states)
        return hidden_states


# Map-style dataset that returns padded (input, target) sequences of fixed length
class PretrainingDataset(torch.utils.data.Dataset):
    """Tokenize text records into fixed-length next-token-prediction pairs."""

    def __init__(self, dataset: "datasets.Dataset", tokenizer: "tokenizers.Tokenizer",
                 seq_length: int, device: "torch.device | None" = None):
        """Store the data source and look up the special token IDs.

        Args:
            dataset: Indexable collection whose items have a ``"text"`` field
            tokenizer: Tokenizer providing ``token_to_id()`` and ``encode().ids``
            seq_length: Length of each returned input and target sequence
            device: Device on which the returned tensors are created

        Raises:
            ValueError: If ``[BOT]``, ``[EOT]`` or ``[PAD]`` is not known to
                the tokenizer (``token_to_id`` returned ``None``)
        """
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.device = device
        self.seq_length = seq_length
        self.bot = tokenizer.token_to_id("[BOT]")
        self.eot = tokenizer.token_to_id("[EOT]")
        self.pad = tokenizer.token_to_id("[PAD]")
        # fail early with a clear message instead of later inside torch.tensor()
        special = {"[BOT]": self.bot, "[EOT]": self.eot, "[PAD]": self.pad}
        missing = [name for name, token_id in special.items() if token_id is None]
        if missing:
            raise ValueError(f"Tokenizer is missing special tokens: {missing}")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        """Get a sequence of token ids from the dataset. [BOT] and [EOT] tokens
        are added. Clipped and padded to the sequence length.

        Returns:
            A pair ``(x, y)`` of int64 tensors of length ``seq_length``,
            where ``y`` is ``x`` shifted left by one token.
        """
        seq = self.dataset[index]["text"]
        tokens: list[int] = [self.bot] + self.tokenizer.encode(seq).ids + [self.eot]
        # pad to target sequence length (+1 because x and y are offset by one token)
        target_len = self.seq_length + 1
        if len(tokens) < target_len:
            tokens += [self.pad] * (target_len - len(tokens))
        # return the sequence; slicing also clips over-long sequences
        x = torch.tensor(tokens[:self.seq_length], dtype=torch.int64, device=self.device)
        y = torch.tensor(tokens[1:target_len], dtype=torch.int64, device=self.device)
        return x, y


def load_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer,
                    checkpoint_id: str = "checkpoint-dist") -> None:
    """Load model and optimizer state from a distributed checkpoint, in place.

    All ranks must call this together; it synchronizes with a barrier
    before and after loading.

    Args:
        model: Module whose state is overwritten
        optimizer: Optimizer whose state is overwritten
        checkpoint_id: Identifier of the checkpoint to read
    """
    dist.barrier()
    # obtain state dicts of the right structure to load the checkpoint into
    model_state, optimizer_state = get_state_dict(
        model, optimizer, options=StateDictOptions(full_state_dict=True),
    )
    load(
        {"model": model_state, "optimizer": optimizer_state},
        checkpoint_id=checkpoint_id,
    )
    # NOTE(review): broadcast_from_rank0=True distributes rank 0's state dict
    # to the other ranks; when every rank holds a different part of the model
    # this may not be what is wanted -- confirm against the
    # torch.distributed.checkpoint.state_dict documentation.
    set_state_dict(
        model, optimizer,
        model_state_dict=model_state, optim_state_dict=optimizer_state,
        options=StateDictOptions(broadcast_from_rank0=True, full_state_dict=True),
    )
    dist.barrier()


def save_checkpoint(model: nn.Module, optimizer: torch.optim.Optimizer,
                    checkpoint_id: str = "checkpoint-dist") -> None:
    """Save model and optimizer state as a distributed checkpoint.

    All ranks must call this together; it synchronizes with a barrier
    before and after saving.

    Args:
        model: Module whose state is saved under the ``"model"`` key
        optimizer: Optimizer whose state is saved under the ``"optimizer"`` key
        checkpoint_id: Identifier of the checkpoint to write
    """
    dist.barrier()
    model_state, optimizer_state = get_state_dict(
        model, optimizer, options=StateDictOptions(full_state_dict=True),
    )
    save(
        {"model": model_state, "optimizer": optimizer_state},
        checkpoint_id=checkpoint_id,
    )
    dist.barrier()


# Load the tokenizer and dataset
tokenizer = tokenizers.Tokenizer.from_file("bpe_50K.json")
dataset = datasets.load_dataset("HuggingFaceFW/fineweb", "sample-10BT", split="train")

# Initialize the distributed environment
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
local_rank = int(os.environ["LOCAL_RANK"])
world_size = dist.get_world_size()
device = torch.device(f"cuda:{local_rank}")
# make this process's GPU the current CUDA device so that NCCL collectives
# (e.g. dist.barrier()) run on it rather than on the default device
torch.cuda.set_device(device)
print(f"World size {world_size}, rank {rank}, local rank {local_rank}. Using {device}")
# the model partitioning below hard-codes three pipeline stages
assert world_size == 3, f"This script is designed for 3 GPUs, got {world_size}"


# Create pretraining model with default config on meta device to prevent OOM
with torch.device("meta"):
    model_config = LlamaConfig()
    model = LlamaForPretraining(model_config)
    # Partition the model by removing some layers: every rank builds the full
    # model, then sets the sub-modules owned by other ranks to None
    num_layers = model_config.num_hidden_layers
    partition = [num_layers // 3, 2 * num_layers // 3, num_layers]
    if rank == 0:
        # from embedding to 1/3 of the decoder layers
        for n in range(partition[0], partition[2]):
            model.base_model.layers[str(n)] = None
        model.base_model.norm = None
        model.lm_head = None
    elif rank == 1:
        # from 1/3 to 2/3 of the decoder layers
        model.base_model.embed_tokens = None
        for n in range(0, partition[0]):
            model.base_model.layers[str(n)] = None
        for n in range(partition[1], partition[2]):
            model.base_model.layers[str(n)] = None
        model.base_model.norm = None
        model.lm_head = None
    elif rank == 2:
        # from 2/3 to the end of the decoder layers and the final norm layer, LM head
        model.base_model.embed_tokens = None
        for n in range(partition[1]):
            model.base_model.layers[str(n)] = None
    else:
        raise ValueError(f"Invalid rank: {rank}")


# Move model from meta device to CUDA device, then initialize the weights
def reset_all_weights(model: nn.Module) -> None:
    """Re-initialize every sub-module of ``model`` that defines ``reset_parameters()``.

    Modules that do not define a callable ``reset_parameters`` are left
    untouched.
    """
    @torch.no_grad()
    def weight_reset(m: nn.Module):
        reset_parameters = getattr(m, "reset_parameters", None)
        if callable(reset_parameters):
            reset_parameters()

    # Applies fn recursively to model itself and all of model.children()
    model.apply(fn=weight_reset)


# to_empty() only allocates storage on the target device; the values are
# uninitialized until reset_all_weights() re-initializes them
model.to_empty(device=device)
reset_all_weights(model)
model.train()
# one pipeline stage per rank
stage = PipelineStage(model, stage_index=rank, num_stages=world_size, device=device)


# Training parameters
epochs = 3
learning_rate = 1e-3
batch_size = 64
seq_length = 512
num_warmup_steps = 1000
PAD_TOKEN_ID = tokenizer.token_to_id("[PAD]")


# DataLoader, optimizer, scheduler, and loss function
dataset = PretrainingDataset(dataset, tokenizer, seq_length, device)
dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
)
num_training_steps = len(dataloader) * epochs
print(f"Number of training steps: {num_training_steps} = {len(dataloader)} * {epochs}")

optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, betas=(0.9, 0.99), eps=1e-8, weight_decay=0.1,
)
# linear warmup from 10% to 100% of the learning rate over num_warmup_steps,
# followed by cosine annealing to 0 over the remaining steps
warmup_scheduler = lr_scheduler.LinearLR(
    optimizer,
    start_factor=0.1, end_factor=1.0, total_iters=num_warmup_steps,
)
cosine_scheduler = lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=num_training_steps - num_warmup_steps,
    eta_min=0,
)
scheduler = lr_scheduler.SequentialLR(
    optimizer,
    schedulers=[warmup_scheduler, cosine_scheduler],
    milestones=[num_warmup_steps],
)


# if checkpoint-dist dir exists, load the checkpoint to model and optimizer
# Note: You should implement how to reset the epoch and step to allow correct resume
if os.path.exists("checkpoint-dist"):
    load_checkpoint(model, optimizer)


# Create pipeline schedule

def loss_fn(logits: Tensor, target_ids: Tensor) -> Tensor:
    """Cross-entropy over all positions, ignoring padding targets.

    Args:
        logits: Tensor whose last dimension indexes the classes
        target_ids: Target class indices, one per leading position of ``logits``

    Returns:
        The cross-entropy loss from ``F.cross_entropy``; targets equal to
        ``PAD_TOKEN_ID`` are ignored
    """
    # flatten all leading dimensions; reshape also accepts non-contiguous input
    logits = logits.reshape(-1, logits.size(-1))
    target_ids = target_ids.reshape(-1)
    return F.cross_entropy(logits, target_ids, ignore_index=PAD_TOKEN_ID)


n_microbatches = 4  # num split per batch
schedule = ScheduleGPipe(stage, n_microbatches=n_microbatches, loss_fn=loss_fn)


# start training
for epoch in range(epochs):
    # only the last rank shows the progress bar, because only it sees the loss
    pbar = tqdm.tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}", disable=(rank != world_size - 1))
    for batch_id, batch in enumerate(pbar):
        if batch_id % 1000 == 0:
            save_checkpoint(model, optimizer)
        # zero grad before forward pass, since no explicit backward pass is called
        optimizer.zero_grad(set_to_none=True)
        # get batched data
        input_ids, target_ids = batch
        if rank == 0:
            # first stage feeds the inputs into the pipeline
            schedule.step(input_ids)
        elif rank == world_size - 1:
            # last stage receives the targets and collects the losses
            losses = []  # expects one loss per microbatch
            schedule.step(target=target_ids, losses=losses)
            with torch.no_grad():
                pbar.set_postfix(loss=sum(losses).item() / len(losses))
        else:
            schedule.step()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # no manual pbar.update(): iterating over pbar already advances it
    pbar.close()


# Save the model
save_checkpoint(model, optimizer)

# Clean up the distributed environment
dist.destroy_process_group()



Source link

Tags: GPUs, large, Model, multiple, Parallelism, Pipeline, train
Previous Post

Client Challenge

Next Post

Harrison Polites names his top 3 games of 2025

Next Post
Harrison Polites names his top 3 games of 2025

Harrison Polites names his top 3 games of 2025

Leave a Reply Cancel reply

Your email address will not be published. Required fields are marked *

POPULAR POSTS

  • Health-specific embedding tools for dermatology and pathology

    Health-specific embedding tools for dermatology and pathology

    0 shares
    Share 0 Tweet 0
  • 20 Best Resource Management Software of 2025 (Free & Paid)

    0 shares
    Share 0 Tweet 0
  • 10 Ways To Get a Free DoorDash Gift Card

    0 shares
    Share 0 Tweet 0
  • How To Save for a Baby in 9 Months

    0 shares
    Share 0 Tweet 0
  • How to Make a Stakeholder Map

    0 shares
    Share 0 Tweet 0
Solega Blog

Categories

  • Artificial Intelligence
  • Cryptocurrency
  • E-commerce
  • Finance
  • Investment
  • Project Management
  • Real Estate
  • Start Ups
  • Travel

Connect With Us

Recent Posts

3 ways to make power-hungry data centres more efficient

3 ways to make power-hungry data centres more efficient

January 11, 2026
Why AI predictions are getting harder to make

Why AI predictions are getting harder to make

January 10, 2026

© 2024 Solega, LLC. All Rights Reserved | Solega.co

No Result
View All Result
  • Home
  • E-commerce
  • Start Ups
  • Project Management
  • Artificial Intelligence
  • Investment
  • More
    • Cryptocurrency
    • Finance
    • Real Estate
    • Travel

© 2024 Solega, LLC. All Rights Reserved | Solega.co