Commit 71e79847 authored by chenzk

v1.0.3
import pytest
import torch
from torch.nn import LayerNorm
from nanotron.nn.layer_norm import TritonLayerNorm
@pytest.mark.fa2
@pytest.mark.parametrize(
"hidden_size",
[1024, 1025], # fused layer norm supports 1024 as hidden size but not 1025
)
def test_fused_layer_norm(hidden_size):
BATCH_SIZE = 5
SEQ_LEN = 128
DEVICE, DTYPE = torch.device("cuda:0"), torch.float16
inputs = torch.rand(BATCH_SIZE, SEQ_LEN, hidden_size, device=DEVICE, dtype=DTYPE)
layer_norm = LayerNorm(normalized_shape=inputs.size(-1), device=DEVICE, dtype=DTYPE)
ref_outputs = layer_norm(inputs)
fused_layer_norm = TritonLayerNorm(
normalized_shape=inputs.size(-1),
device=DEVICE,
dtype=DTYPE,
)
outputs = fused_layer_norm(inputs)
# NOTE: with torch.float16, FA2 uses an atol of 1e-2
# https://github.com/Dao-AILab/flash-attention/blob/87a1277653fc55cd615f5341255e00c69d5c00a1/tests/ops/triton/test_layer_norm.py#L63-L64
torch.testing.assert_close(outputs, ref_outputs, rtol=1e-3, atol=1e-2)
outputs.sum().backward()
ref_outputs.sum().backward()
# NOTE: same as above
torch.testing.assert_close(fused_layer_norm.weight.grad, layer_norm.weight.grad, rtol=1e-3, atol=1e-2)
torch.testing.assert_close(fused_layer_norm.bias.grad, layer_norm.bias.grad, rtol=1e-3, atol=1e-2)
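# NOTE (illustrative sketch, not part of the test above): the atol of 1e-2 reflects
# float16's limited precision rather than a defect in the fused kernel. A minimal way
# to see the error scale, assuming a CUDA device like the test does:
def _illustrate_fp16_layer_norm_error(hidden_size: int = 1024) -> float:
    x = torch.rand(4, hidden_size, device="cuda")
    ref = torch.nn.functional.layer_norm(x, (hidden_size,))          # float32 reference
    half = torch.nn.functional.layer_norm(x.half(), (hidden_size,))  # float16 compute
    return (ref - half.float()).abs().max().item()                   # typically on the order of 1e-3 to 1e-2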
import sys
from math import isclose
from pathlib import Path
from typing import List
package_path = Path(__file__).parent.parent
sys.path.append(str(package_path))
import numpy as np
import pytest
from helpers.context import TestContext
from helpers.data import (
assert_batch_dataloader,
assert_nanoset_sync_across_all_ranks,
compute_batch_hash,
create_dataset_paths,
create_dummy_json_dataset,
preprocess_dummy_dataset,
)
from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.data.nanoset import Nanoset
from nanotron.data.utils import count_dataset_indexes, normalize
from nanotron.parallel import ParallelContext
from nanotron.utils import main_rank_first
from transformers import AutoTokenizer
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("train_steps", [500, 10000])
@pytest.mark.parametrize("sequence_length", [512, 8192])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_build_nanoset_dataloader(
tp: int, dp: int, pp: int, train_steps: int, sequence_length: int, tokenizer_name_or_path: str
):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
train_steps=train_steps,
sequence_length=sequence_length,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_build_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
train_steps: int,
sequence_length: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
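    # e.g. GPT-2's vocab (50257 tokens) fits in uint16 -> 2 bytes per token,
    # while Llama-3's (~128k tokens) does not -> 4 bytes per token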
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Assert we have the same Nanoset in all ranks
assert_nanoset_sync_across_all_ranks(train_dataset, parallel_context)
dataset_sample_count = count_dataset_indexes(train_dataset.dataset_index, len(train_dataset.dataset_folders))
for idx, ds_length in enumerate(train_dataset.dataset_lengths):
# Assert Nanoset doesn't sample indexes greater than the datasets
assert (
np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index == idx, initial=-1)
< ds_length
), f"Error building Nanoset Indexes: Tryng to access sample {np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index==idx, initial = -1)} of a {ds_length} sample dataset"
# Assert Nanoset builds up the correct blend WRT the dataset_weights
assert isclose(
normalize(dataset_sample_count).tolist()[idx], train_dataset.dataset_weights[idx], abs_tol=0.05
), f"Requested Nanoset to contain {round(train_dataset.dataset_weights[idx]*100, 2)}% of samples from {train_dataset.dataset_folders[idx]} but got {round(normalize(dataset_sample_count).tolist()[idx]*100, 2)}%"
# Create Dataloaders
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=sequence_length,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Check a batch produced by the Dataloader
batch = next(iter(dataloader))
assert_batch_dataloader(
batch=batch,
parallel_context=parallel_context,
micro_batch_size=MICRO_BATCH_SIZE,
sequence_length=sequence_length,
)
parallel_context.destroy()
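# NOTE (worked example for the blend check above): with dataset_weights=[8, 2],
# normalize() yields [0.8, 0.2], so with abs_tol=0.05 the first dataset must provide
# between 75% and 85% of the sampled indexes and the second between 15% and 25%.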
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("skipped_batches", [20, 5555])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int, tokenizer_name_or_path: str):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
skipped_batches=skipped_batches,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_recover_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
skipped_batches: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
SEQUENCE_LENGTH = 1024
TRAIN_STEPS = 10000
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Create initial Dataloader
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Recover from failures
dataloader = iter(dataloader)
for _ in range(skipped_batches + 1): # In order to compare with the first batch of the recovered DataLoader
batch = next(dataloader)
# Create recover Dataloader
recovered_dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
# NOTE: The dataloader serves batches of micro_batch_size regardless of batch_accumulation_per_replica
consumed_train_samples=skipped_batches * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(),
)
recovered_first_batch = next(iter(recovered_dataloader))
assert compute_batch_hash(batch) == compute_batch_hash(recovered_first_batch)
parallel_context.destroy()
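# NOTE (worked example for consumed_train_samples above): each dataloader batch holds
# MICRO_BATCH_SIZE samples per DP replica, so skipping e.g. 20 batches with
# MICRO_BATCH_SIZE=4 and dp_size=2 means 20 * 4 * 2 = 160 samples were already consumed,
# and the recovered dataloader's first batch lines up with batch 21 of the original one.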
[pytest]
addopts=-n 35
markers =
fa2: FA2-related
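# NOTE: "-n 35" comes from pytest-xdist and distributes the suite across 35 workers;
# the "fa2" marker gates the FlashAttention-2-dependent tests above, so they can be
# selected with `pytest -m fa2` or excluded with `pytest -m "not fa2"`.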
import pytest
import torch
import torch.distributed as dist
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import Config, ModelArgs, RandomInit
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from torch import nn
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 2, 2)])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_named_modules_in_pp_rank(tp: int, dp: int, pp: int):
model_args = ModelArgs(init_method=RandomInit(std=1.0), model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_named_modules_in_pp_rank)(config=config)
def _test_get_named_modules_in_pp_rank(
parallel_context: ParallelContext,
config: Config,
):
model = create_llama_from_config(
model_config=config.model.model_config,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
model.init_model_randomly(config=config)
modules_that_not_in_current_pp_rank = {}
current_pp_rank = dist.get_rank(group=parallel_context.pp_pg)
for name, module in model.named_modules():
if isinstance(module, PipelineBlock) and module.rank != current_pp_rank:
modules_that_not_in_current_pp_rank[name] = module
named_modules_in_pp_rank = model.named_modules_in_pp_rank
for name, module in named_modules_in_pp_rank.items():
# NOTE: if a module is in the current rank, we expect it to be an initialized module
# not PipelineBlock
assert isinstance(module, nn.Module)
assert name not in modules_that_not_in_current_pp_rank
from typing import Union
import torch
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.utils import checkpoint_method
from torch import nn
class CheckpointedModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense1 = nn.Linear(10, 10)
self.dense2 = nn.Linear(10, 10)
self.dropout = nn.Dropout(0.1)
self.is_checkpointed = is_checkpointed
self.fwd_counter = 0
@checkpoint_method("is_checkpointed")
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense1(x)
if self.is_checkpointed and self.fwd_counter == 0:
assert not x.requires_grad, "x should not require grad when checkpointed, because fwd runs in no_grad mode"
assert (
x.grad_fn is None
), "x should not store any activation when checkpointed, because fwd runs in no_grad mode"
x = self.dense2(x)
x = self.dropout(x)
self.fwd_counter += 1
return x
class DummyModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense0 = nn.Linear(10, 10)
self.checkpointed_model = CheckpointedModel(is_checkpointed=is_checkpointed)
self.dense3 = nn.Linear(10, 10)
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense0(x)
x = self.checkpointed_model(x)
assert x.requires_grad # inside forward, x should require grad even if calculated in no_grad mode
x = self.dense3(x)
return x
def test_activation_checkpointing():
dtype = torch.float16
device = torch.device("cuda")
test_model = DummyModel(is_checkpointed=True)
ref_model = DummyModel(is_checkpointed=False)
for model in [test_model, ref_model]:
model.to(device=device, dtype=dtype)
# copy weights
test_model.load_state_dict(ref_model.state_dict())
assert test_model.checkpointed_model.is_checkpointed is True
assert ref_model.checkpointed_model.is_checkpointed is False
# generate random input
x = torch.randn(10, 10, device=device, dtype=dtype)
# Forward pass
with torch.random.fork_rng(devices=["cuda"]):
ref_output = ref_model(x)
checkpointed_output = test_model(x)
assert test_model.checkpointed_model.fwd_counter == 1
torch.testing.assert_close(checkpointed_output, ref_output)
# Backward pass (check that fwd is called twice, and that we don't store the activations)
ref_output.sum().backward()
assert ref_model.checkpointed_model.fwd_counter == 1, "ref_model fwd should not be called twice"
# make sure grads are not synced between test_model and ref_model
assert ref_model.dense0.weight.grad is not None
assert test_model.dense0.weight.grad is None
assert test_model.checkpointed_model.fwd_counter == 1
checkpointed_output.sum().backward()
assert test_model.checkpointed_model.fwd_counter == 2, "test_model fwd should be called twice"
# compare all models grads
for ref_param, checkpointed_param in zip(ref_model.parameters(), test_model.parameters()):
torch.testing.assert_close(ref_param.grad, checkpointed_param.grad)
# TODO @nouamanetazi: test `checkpoint_method` vs `torch.utils.checkpoint.checkpoint`
# TODO @nouamanetazi: test a method with kwargs values
# TODO @nouamanetazi: test `checkpoint_method` in a distributed setting
# TODO @nouamanetazi: test BatchNorm layers with checkpointing
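# NOTE (illustrative sketch, related to the first TODO above; the helper name is made up):
# torch's built-in checkpointing shows the same "forward runs twice" behaviour that
# test_activation_checkpointing asserts for `checkpoint_method`.
def _illustrate_torch_checkpoint_recompute():
    import torch.utils.checkpoint as checkpoint
    counter = {"fwd": 0}
    linear = nn.Linear(10, 10)
    def fwd(x):
        counter["fwd"] += 1
        return linear(x).relu()
    x = torch.randn(2, 10, requires_grad=True)
    out = checkpoint.checkpoint(fwd, x, use_reentrant=False)
    out.sum().backward()
    assert counter["fwd"] == 2  # forward is re-run during backward to rebuild activations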
import math
import os
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.optim.clip_grads import clip_grad_norm
from nanotron.optim.gradient_accumulator import (
FP32GradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelColumnLinear,
)
from nanotron.parallel.tied_parameters import (
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_pp requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_pp(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_with_pp)(norm_type=norm_type)
def _test_clip_grads_with_pp(parallel_context: ParallelContext, norm_type: float):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
# Check that gradient are the same as reference
pp_rank = dist.get_rank(parallel_context.pp_pg)
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(non_linear.bias.grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[model.mlp[pp_rank].linear.pp_block.weight.grad, model.mlp[pp_rank].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
non_linear = model.mlp[current_pp_rank].linear.pp_block
old_weight_grad = non_linear.weight.grad.clone()
old_bias_grad = non_linear.bias.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
reference_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
torch.testing.assert_close(total_norm, reference_total_norm, atol=1e-6, rtol=1e-7)
# Check that grad changed
assert not torch.allclose(old_weight_grad, non_linear.weight.grad), "Grad should have changed"
assert not torch.allclose(old_bias_grad, non_linear.bias.grad), "Grad should have changed"
# Check that gradient are the same as reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_tp requires at least 2 gpus")
@pytest.mark.parametrize(
"tp_mode,async_communication",
[
pytest.param(TensorParallelLinearMode.ALL_REDUCE, False),
pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, True),
],
)
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_tp(tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float):
init_distributed(tp=2, dp=1, pp=1)(_test_clip_grads_with_tp)(
tp_mode=tp_mode, async_communication=async_communication, norm_type=norm_type
)
def _test_clip_grads_with_tp(
parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
in_features = 4
out_features_per_tp_rank = 8
out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
# Sharded
column_linear = TensorParallelColumnLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
)
# Un-sharded
reference_linear = nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_gather(
tensor_list=list(reference_linear.weight.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.weight,
group=parallel_context.tp_pg,
)
dist.all_gather(
tensor_list=list(reference_linear.bias.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.bias,
group=parallel_context.tp_pg,
)
# Generate random input
random_input: torch.Tensor
sharded_random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
sharded_random_input = random_input
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
sharded_batch_size = 5
sharded_random_input = torch.randn(sharded_batch_size, in_features, device="cuda")
random_input = torch.empty(
sharded_batch_size * parallel_context.tp_pg.size(),
*(sharded_random_input.shape[1:]),
device=sharded_random_input.device,
dtype=sharded_random_input.dtype,
)
dist.all_gather_into_tensor(random_input, sharded_random_input, group=parallel_context.tp_pg)
else:
ValueError(f"Unsupported mode: {tp_mode}")
# Test that we get the same output after forward pass
sharded_output = column_linear(sharded_random_input)
reference_output = reference_linear(random_input)
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(
sharded_output,
reference_output[
:,
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank,
],
atol=1e-6,
rtol=1e-7,
)
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
old_grad = column_linear.weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=column_linear.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_(reference_linear.parameters(), max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, column_linear.weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(total_norm, ref_total_norm)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_tied_weights requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_tied_weights(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_tied_weights)(norm_type=norm_type)
def _test_clip_grads_tied_weights(parallel_context: ParallelContext, norm_type: float):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda")})
else:
model = nn.ModuleDict({"dense1": nn.Linear(10, 10, device="cuda")})
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Sync tied weights: basic assumption
initial_sync(model=model, parallel_context=parallel_context)
# Check that weights are now synced
assert_tensor_synced_across_pg(weight, group)
assert_tensor_synced_across_pg(bias, group)
# Compute gradient
input_ = torch.randn(13, 10, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0:
out = model.dense0(input_)
else:
out = model.dense1(input_)
out.sum().backward()
# sync gradients
sync_tied_weights_gradients(model, parallel_context=parallel_context, grad_accumulator=None)
# We check that both gradients are synchronized
assert_tensor_synced_across_pg(weight.grad, group)
assert_tensor_synced_across_pg(bias.grad, group)
# Save grads as reference
ref_weight = weight.clone()
ref_weight.grad = weight.grad.clone()
ref_bias = bias.clone()
ref_bias.grad = bias.grad.clone()
old_grad = weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_([ref_weight, ref_bias], max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
assert torch.allclose(weight.grad, ref_weight.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(bias.grad, ref_bias.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(total_norm, ref_total_norm, rtol=0, atol=0), f"Got {total_norm} and {ref_total_norm}"
parallel_context.destroy()
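# NOTE (illustrative sketch of the reference semantics used above, for norm_type=2):
# torch.nn.utils.clip_grad_norm_ computes the global norm over all grads and rescales
# them by min(1, max_norm / (total_norm + 1e-6)); a minimal reimplementation:
def _manual_clip_grad_norm_(parameters, max_norm: float) -> torch.Tensor:
    grads = [p.grad for p in parameters if p.grad is not None]
    total_norm = torch.norm(torch.stack([g.norm(2) for g in grads]), 2)
    clip_coef = min(1.0, max_norm / (total_norm.item() + 1e-6))
    for g in grads:
        g.mul_(clip_coef)
    return total_norm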
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_fp32_accumulator(norm_type: float, half_precision: torch.dtype):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_fp32_accumulator)(
norm_type=norm_type, half_precision=half_precision
)
def _test_clip_grads_fp32_accumulator(
parallel_context: ParallelContext, norm_type: float, half_precision: torch.dtype
):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p).to(torch.float)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# model goes to half precision
model = model.to(half_precision)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Add gradient accumulator
grad_accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(
pp_pg=parallel_context.pp_pg, dtype=half_precision
) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=grad_accumulator,
)
# We copy the model gradients to the reference model gradients.
# We do this instead of computing the backward pass with autograd because of numerical precision.
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
reference_non_linear.weight.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.weight").clone()
reference_non_linear.bias.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.bias").clone()
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.grad = weight_grad
reference_non_linear.bias.grad = bias_grad
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
old_fp32_grads = {
name: grad_accumulator.get_grad_buffer(name=name).clone() for name, _ in model.named_parameters()
}
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=grad_accumulator,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
ref_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
# Check that the gradients have changed
for name, _ in model.named_parameters():
new_fp32_grad = grad_accumulator.get_grad_buffer(name=name)
assert not torch.allclose(old_fp32_grads[name], new_fp32_grad), "Gradients should have changed after clipping"
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
if has_reference_model:
torch.testing.assert_close(
total_norm.view(1),
ref_total_norm.view(1),
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Expected {total_norm} to match {ref_total_norm}.\n{msg}",
)
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
torch.testing.assert_close(
reference_non_linear.weight.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.weight"),
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.bias"),
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
reference_non_linear.weight.grad,
weight_grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
bias_grad,
atol=1e-6,
rtol=1e-7,
)
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
parallel_context.destroy()
from contextlib import nullcontext
import pytest
import torch
from helpers.exception import assert_fail_except_rank_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd
from nanotron.parallel.parameters import NanotronParameter
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
from torch.distributed import GradBucket
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("accumulation_steps", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_afab(accumulation_steps):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_afab)(accumulation_steps=accumulation_steps)
def _test_ddp_with_afab(parallel_context: ParallelContext, accumulation_steps: int):
half_precision = torch.float16
def allreduce_hook(process_group: dist.ProcessGroup, bucket: GradBucket):
# DDP groups grads in GradBuckets. This hook is called throughout the bwd pass, once each bucket is ready to overlap communication with computation.
# See https://pytorch.org/docs/stable/ddp_comm_hooks.html#what-does-a-communication-hook-operate-on for more details.
half_flat_bucket_buffer = bucket.buffer()
group_to_use = process_group if process_group is not None else parallel_context.dp_pg
return (
dist.all_reduce(half_flat_bucket_buffer, group=group_to_use, async_op=True, op=dist.ReduceOp.AVG)
.get_future()
.then(lambda fut: fut.value()[0])
)
model_hook = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model_hook.weight = NanotronParameter(model_hook.weight)
model_ddp_hook = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Register DDP hook
model_ddp_hook.register_comm_hook(state=None, hook=allreduce_hook)
activations = []
# All forward
for i in range(accumulation_steps):
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
with model_ddp_hook.no_sync():
loss_hook = model_ddp_hook(input).sum()
activations.append(loss_hook)
# All backward
for i in range(accumulation_steps):
context = nullcontext()
if i == accumulation_steps - 1:
context = ddp_trigger_sync_in_bwd(model_ddp_hook) # triggers a sync for the final backward
loss_hook = activations[i]
with context:
loss_hook.backward()
grad_hook = model_ddp_hook.module.weight.grad.clone()
# Check that the gradients are synchronized across DP
if i == accumulation_steps - 1:
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
else:
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
parallel_context.destroy()
import numpy as np
import pytest
import torch.distributed as dist
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
rerun_if_address_is_in_use,
)
from nanotron.parallel import ParallelContext
from torch.distributed import ProcessGroup
def _test_init_parallel_context(parallel_context: ParallelContext):
assert dist.is_initialized() is True
assert isinstance(parallel_context.world_pg, ProcessGroup)
assert isinstance(parallel_context.tp_pg, ProcessGroup) if parallel_context.tensor_parallel_size > 1 else True
assert isinstance(parallel_context.pp_pg, ProcessGroup) if parallel_context.pipeline_parallel_size > 1 else True
assert isinstance(parallel_context.dp_pg, ProcessGroup) if parallel_context.data_parallel_size > 1 else True
world_rank = dist.get_rank(parallel_context.world_pg)
ranks3d = parallel_context.get_local_ranks(world_rank)
assert isinstance(ranks3d, tuple) and len(ranks3d)
assert isinstance(parallel_context.world_rank_matrix, np.ndarray)
assert isinstance(parallel_context.world_ranks_to_pg, dict)
local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank))
global_rank = parallel_context.get_global_rank(*local_rank)
assert isinstance(global_rank, np.int64), f"The type of global_rank is {type(global_rank)}"
assert global_rank == dist.get_rank()
parallel_context.destroy()
assert dist.is_initialized() is False
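# NOTE (generic illustration, not nanotron's actual rank layout): a world-rank matrix
# simply enumerates global ranks over the 3D parallel grid, so the local<->global
# mapping checked above is an index lookup; the axis order here is arbitrary.
def _toy_world_rank_matrix(pp_size: int, dp_size: int, tp_size: int) -> np.ndarray:
    return np.arange(pp_size * dp_size * tp_size).reshape(pp_size, dp_size, tp_size)
# e.g. _toy_world_rank_matrix(2, 2, 2)[1, 0, 1] == 5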
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_init_parallel_context(tp: int, dp: int, pp: int):
init_distributed(tp=tp, dp=dp, pp=pp)(_test_init_parallel_context)()
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.helpers import get_custom_lr_for_named_parameters
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 1, 1), (1, 1, 2), (2, 1, 2)])
@pytest.mark.parametrize(
"parametrization_method", [ParametrizationMethod.STANDARD, ParametrizationMethod.SPECTRAL_MUP]
)
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_custom_lr(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
LR = 1e-3
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_custom_lr)(
lr=LR,
init_method=init_method,
parametrization_method=parametrization_method,
)
def _test_get_custom_lr(
parallel_context: ParallelContext,
lr: float,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
named_parameters = list(llama.get_named_params_with_correct_tied())
if len(named_parameters) == 0:
# NOTE: some pp ranks don't have any parameters
return
named_param_groups = get_custom_lr_for_named_parameters(
parametrization_method=parametrization_method, lr=lr, named_parameters=named_parameters, model=llama
)
assert len(named_param_groups) == len(named_parameters)
assert all(isinstance(named_param_group["lr"], float) for named_param_group in named_param_groups)
assert all(isinstance(named_param_group["named_params"], list) for named_param_group in named_param_groups)
is_all_lr_the_same = parametrization_method == ParametrizationMethod.STANDARD
assert all(named_param_group["lr"] == lr for named_param_group in named_param_groups) is is_all_lr_the_same
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import OptimizerFromGradientAccumulator
from nanotron.parallel.context import ParallelContext
from nanotron.parallel.parameters import NanotronParameter
from nanotron.random import set_random_seed
class DummyModel(nn.Module):
def __init__(self, dtype=torch.float32):
super(DummyModel, self).__init__()
self.fc1 = nn.Linear(10, 20, bias=False).to(dtype=dtype)
self.fc2 = nn.Linear(20, 2, bias=False).to(dtype=dtype)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return x
def test_optimizer_lr_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
# compute gradient manually
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_weight_decay_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
weight_decay = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
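# NOTE (worked equation for the manual reference above): SGD with weight decay updates
# w <- w - lr * (grad + weight_decay * w), which expands to
# (1 - lr * weight_decay) * w - lr * grad, the exact form used for expected_fc1_weight
# and expected_fc2_weight.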
def test_optimizer_lr_weight_decay_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
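# NOTE (illustrative sketch, independent of FP32GradientAccumulator's internals):
# accumulating half-precision grads into a float32 buffer avoids the rounding error
# of summing many small fp16/bf16 values directly.
def _accumulate_grad_fp32(fp32_buffer: torch.Tensor, half_grad: torch.Tensor) -> None:
    fp32_buffer.add_(half_grad.float())  # upcast each micro-batch grad before summing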
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_multiple_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
weight_decay = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the expected weight update for reference
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
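# NOTE: DDP variant of the test above. Wrapping the model in DistributedDataParallel prefixes parameter
# names with "module.", so the grad buffers are looked up as "module.fc1.weight" / "module.fc2.weight".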
@pytest.mark.skipif(available_gpus() < 2, reason="Testing requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@rerun_if_address_is_in_use()
def test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
)
def _test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
parallel_context: ParallelContext, half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
# Making it bigger so that the difference is more visible during update
lr1, lr2 = 0.04, 0.05
weight_decay1, weight_decay2 = 0.5, 0.2
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the expected weight update for reference
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="module.fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="module.fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
import contextlib
import pytest
import torch
from helpers.exception import assert_fail_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.p2p import P2P
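# NOTE: This test covers the four combinations of (contiguous vs non-contiguous) x (full vs sliced) tensors.
# Sending a non-contiguous slice is expected to fail early with an assertion; the other cases must round-trip
# the tensor between the two PP ranks unchanged.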
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("send_contiguous", [True, False])
@pytest.mark.parametrize("full", [True, False])
@rerun_if_address_is_in_use()
def test_check_send_recv_tensor(send_contiguous: bool, full: bool):
init_distributed(tp=1, dp=1, pp=2)(_test_check_send_recv_tensor)(send_contiguous=send_contiguous, full=full)
def _test_check_send_recv_tensor(parallel_context: ParallelContext, send_contiguous: bool, full: bool):
p2p = P2P(pg=parallel_context.pp_pg, device=torch.device("cuda"))
if dist.get_rank(p2p.pg) == 0:
tensor_to_send = torch.randn(3, 5, dtype=torch.float, device=torch.device("cuda"))
if send_contiguous is True:
assert tensor_to_send.is_contiguous()
else:
tensor_to_send = tensor_to_send.transpose(0, 1)
assert not tensor_to_send.is_contiguous()
# `full` defines whether we take a non-trivial slice of the tensor
if full is False:
tensor_to_send = tensor_to_send[1:3]
if send_contiguous is False and full is False:
# This is supposed to raise an AssertionError telling you that you should have sent a smaller storage by calling `contiguous` beforehand.
send_first_context = assert_fail_with(
AssertionError,
error_msg="Expect storage_size to be smaller than tensor size. It might not be true, when you use slicing for example though. We probably don't want to support it in our P2P system",
)
fail_at_first_send = True
else:
send_first_context = contextlib.nullcontext()
fail_at_first_send = False
# Send tensor back and forth through p2p protocol and check that we get the same thing.
if dist.get_rank(p2p.pg) == 0:
with send_first_context:
handles = p2p.isend_tensors([tensor_to_send], to_rank=1)
if fail_at_first_send is True:
# We early return if we caught an error
return
for handle in handles:
handle.wait()
tensor_travelled_back_and_forth = p2p.recv_tensors(1, from_rank=1)[0]
torch.testing.assert_close(tensor_to_send, tensor_travelled_back_and_forth, atol=0, rtol=0)
elif dist.get_rank(p2p.pg) == 1:
# Rank 0 fails to send in this case, so we return early instead of hanging on the receive
tensors, handles = p2p.irecv_tensors(1, from_rank=0)
if fail_at_first_send is True:
return
for handle in handles:
handle.wait()
tensor_to_recv = tensors[0]
p2p.send_tensors([tensor_to_recv], to_rank=0)
else:
raise ValueError()
if full is False and send_contiguous is True:
# We can actually check that we haven't sent the entire storage, since storage not accessed by the tensor is not sent
if dist.get_rank(p2p.pg) == 0:
# Check that the first elements in the storages don't match (because they are not supposed to be communicated when the tensor is not full).
print(tensor_to_send.untyped_storage()[:4], tensor_travelled_back_and_forth.untyped_storage()[:4])
print(tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0))
print(tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0))
assert not torch.allclose(
tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0),
tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0),
)
parallel_context.destroy()
import torch
from helpers.exception import assert_fail_with
from nanotron.models.base import DTypeInvariantTensor, init_on_device_and_dtype
from nanotron.parallel.parameters import NanotronParameter
from torch import nn
def test_nanotron_parameter_does_not_override_some_parameter_variable():
param = nn.Parameter(torch.empty(3))
assert not hasattr(param, NanotronParameter.NANOTRON_PARAMETER_METADATA_ATTRIBUTE_NAME)
def test_uncastable_tensor():
# Test that we can create a DTypeInvariantTensor
x = DTypeInvariantTensor(torch.randn(3, 3))
assert isinstance(x, torch.Tensor)
assert isinstance(x, DTypeInvariantTensor)
# Test that we cannot modify the type of a DTypeInvariantTensor
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to float"):
x = x.float()
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to half"):
x = x.half()
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(torch.float32)
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(dtype=torch.float32)
# Test that we can modify the value of a DTypeInvariantTensor
x[0, 0] = 1
assert x[0, 0] == 1
# Test that we can modify the device of a DTypeInvariantTensor
x = x.to("cuda")
assert x.device.type == "cuda"
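# NOTE: `init_on_device_and_dtype` is expected to move a DTypeInvariantTensor buffer to the new device
# while leaving its dtype untouched, which is what the next test checks.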
def test_register_buffer_does_not_update_uncastable_tensor():
old_device = torch.device("cuda")
old_dtype = torch.float32
new_device = torch.device("cpu")
new_dtype = torch.bfloat16
with init_on_device_and_dtype(device=new_device, dtype=new_dtype):
module = torch.nn.Module()
# Test that we can register a DTypeInvariantTensor as a buffer
tensor = DTypeInvariantTensor(torch.randn(3, 4, dtype=old_dtype, device=old_device))
module.register_buffer("buffer", tensor)
# Test that we can modify the buffer
module.buffer[0, 0] = 1
assert module.buffer[0, 0] == 1
# Test that device has been updated
assert module.buffer.device.type == new_device.type
# Test that dtype has not been modified
assert module.buffer.dtype is old_dtype
import copy
import nanotron.distributed as dist
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.exception import assert_fail_except_rank_with, timeout_after
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.models import init_on_device_and_dtype
from nanotron.optim import ZeroDistributedOptimizer
from nanotron.optim.gradient_accumulator import FP32GradBucketManager, FP32GradientAccumulator, get_fp32_accum_hook
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of
from nanotron.parallel.tied_parameters import (
get_tied_id_to_param,
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from nanotron.utils import ContextManagers
from torch import nn
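# NOTE: FP32GradientAccumulator keeps an fp32 master copy of every half-precision NanotronParameter.
# `accumulator.backward(loss)` accumulates gradients into those fp32 copies and clears (zeroes) the
# half-precision `.grad`, which is what the tests below verify.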
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_promoting_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high precision gradient and that the low precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_accumulated_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
ref_model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
with torch.inference_mode():
ref_model.weight.copy_(model.weight)
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
grad_accumulation_steps = 2
for _ in range(grad_accumulation_steps):
# We want large input to have large gradients.
input = (torch.randn(5, 3, dtype=half_precision, device="cuda") ** 2 + 1) * 100
# Compute backwards
accumulator.backward(model(input).sum())
ref_model(input).sum().backward()
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
torch.testing.assert_close(
accumulator.parameters["weight"]["fp32"].grad.to(half_precision),
ref_model.weight.grad,
)
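# NOTE: The optimizer built through OptimizerFromGradientAccumulator steps on the fp32 master gradients;
# after `zero_grad()` the fp32 `.grad` is set to None and the fp32 grad buffer is zeroed out.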
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_optimizer_can_step_gradient_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
original_weight = model.weight.detach().clone()
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add optimizer
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
accumulator = optimizer.gradient_accumulator
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high precision gradient and that the low precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
optimizer.step()
optimizer.zero_grad()
# Check that we don't have gradients anymore and that they're set to `None`
assert accumulator.parameters["weight"]["fp32"].grad is None
assert model.weight.grad is None
# Check that gradients have been set to zero
fp32_grad = accumulator.get_grad_buffer(name="weight")
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
# Check that the weights have been updated
assert not torch.allclose(original_weight, model.weight)
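# NOTE: Here the fp32 accumulation is wired into DDP through `get_fp32_accum_hook`: intermediate accumulation
# steps run under `no_sync()` and only fill the local fp32 buckets, while the last step lets the comm hook
# all-reduce (average) the gradients across the DP group.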
@pytest.mark.skipif(available_gpus() < 2, reason="Testing ddp_hook_allreduce requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@pytest.mark.parametrize("train_iterations", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_grad_accum_in_fp32(half_precision: torch.dtype, accumulation_steps: int, train_iterations: int):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_grad_accum_in_fp32)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
train_iterations=train_iterations,
)
def _test_ddp_with_grad_accum_in_fp32(
parallel_context: ParallelContext,
half_precision: torch.dtype,
accumulation_steps: int,
train_iterations: int,
):
hidden_size = 32
n_layers = 3
model = nn.Sequential(
nn.Linear(3, hidden_size, bias=False, dtype=half_precision, device="cuda"),
*(
nn.Linear(hidden_size, hidden_size, bias=False, dtype=half_precision, device="cuda")
for _ in range(n_layers - 1)
),
)
model_hook = copy.deepcopy(model)
# Create Nanotron Parameters
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
for module in model_hook.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
# Needed in order to obtain smaller gradient buckets when using `DistributedDataParallel`
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
) # we won't actually use DDP anywhere, it's just to have the same module names
model_ddp_accum_ref = {}
model_ddp_fp32_accum = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model_ddp_fp32_accum.named_parameters())
# Register DDP hook
state = FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name={id(param): name for name, param in model_ddp_fp32_accum.named_parameters()},
)
model_ddp_fp32_accum.register_comm_hook(
state=state,
hook=get_fp32_accum_hook(
reduce_scatter=False,
reduce_op=dist.ReduceOp.AVG,
),
)
for train_iter in range(train_iterations):
# Gradient accumulation steps
for accum_step in range(accumulation_steps - 1):
# Forward-Backward
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
assert not torch.isinf(loss).any(), "loss is inf"
loss.backward()
with ContextManagers([model_ddp_fp32_accum.no_sync(), accumulator.no_sync()]):
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp.named_parameters():
grad = param.grad
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
model_ddp_accum_ref[name] = (
grad.float() if accum_step == 0 else model_ddp_accum_ref[name] + grad.float()
)
dist.barrier()
torch.testing.assert_close(model_ddp_accum_ref[name], fp32_grad_bucket, atol=1e-6, rtol=1e-7)
dist.barrier()
# Check that we correctly copied grads from buckets to params (`copy_buckets_to_grads`)
torch.testing.assert_close(fp32_grad_bucket, grad_fp32_accum, atol=1e-6, rtol=1e-7)
# Check that the gradients are not synchronized across DP
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(fp32_grad_bucket, parallel_context.dp_pg)
# We zero out half grads for `model_ddp` because we're accumulating grads manually in `model_ddp_accum_ref`
model_ddp.zero_grad()
# Last accumulation step (Sync grads across DDP)
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
loss.backward()
# manually reduce grads across DDP
for name, param in model_ddp.named_parameters():
grad = param.grad
model_ddp_accum_ref[name] = (
model_ddp_accum_ref[name] + grad.float() if name in model_ddp_accum_ref else grad.float()
)
dist.all_reduce(model_ddp_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp_fp32_accum.named_parameters():
# Check that half grads have been set to None in the sync step, to avoid them being incorrectly used
half_grad = param.grad
assert half_grad is None, f"{half_grad} != None"
grad = model_ddp_accum_ref[name]
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
dist.barrier()
torch.testing.assert_close(grad, fp32_grad_bucket, atol=1e-6, rtol=1e-7)
# Check that grad points to the same memory as the bucket
assert grad_fp32_accum.data_ptr() == fp32_grad_bucket.data_ptr()
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
assert_tensor_synced_across_pg(grad_fp32_accum, parallel_context.dp_pg)
# Zero out gradients (Usually it's the optimizer that does this)
model_ddp.zero_grad()
model_ddp_accum_ref = {}
accumulator.zero_grad() # Sets half grads to None and zeroes out fp32 grad buckets
for name, elt in accumulator.parameters.items():
fp32_param = elt["fp32"]
fp32_param.grad = None
# Check that fp32 grad buckets are zeroed out and `param.grad` is set to None
for name, param in model_ddp_fp32_accum.named_parameters():
assert param.grad is None
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
dist.barrier()
torch.testing.assert_close(fp32_grad_bucket, torch.zeros_like(fp32_grad_bucket), atol=1e-6, rtol=1e-7)
# Check that all fp32 grad buckets are zeroed out
for _, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
# This is important as we assume grad buckets to be zeroed out at the first accumulation step
dist.barrier()
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
parallel_context.destroy()
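# NOTE: This test ties the per-stage linear weights across PP, shards optimizer state with
# ZeroDistributedOptimizer over DP, and checks that the fp32 grad buffers match a manually accumulated
# reference both before and after `sync_tied_weights_gradients`.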
@pytest.mark.skipif(
available_gpus() < 4, reason="Testing test_tied_weights_sync_with_grad_accum_in_fp32 requires at least 4 gpus"
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("reduce_scatter", [True, False])
@rerun_if_address_is_in_use()
def test_tied_weights_sync_with_grad_accum_in_fp32(pipeline_engine: PipelineEngine, reduce_scatter: bool):
init_distributed(tp=1, dp=2, pp=2)(_test_tied_weights_sync_with_grad_accum_in_fp32)(
pipeline_engine=pipeline_engine, reduce_scatter=reduce_scatter
)
def _test_tied_weights_sync_with_grad_accum_in_fp32(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine, reduce_scatter: bool
):
# We init two replicas of 2 denses. Each dense is on a device.
dtype = torch.float16
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
reference_model = DummyModel(p2p=p2p)
reference_model_accum_ref = {}
for mdl in [model, reference_model]:
# Set the ranks
with init_on_device_and_dtype(device, dtype):
assert parallel_context.pp_pg.size() == len(mdl.mlp)
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), mdl.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
mdl.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Tie all dense weights across PP
tie_parameters(
root_module=mdl,
ties=[
(
target,
(
parallel_context.get_global_rank(
ep_rank=dist.get_rank(parallel_context.expert_pg),
pp_rank=get_pp_rank_of(target, module=mdl),
dp_rank=dist.get_rank(parallel_context.dp_pg),
tp_rank=dist.get_rank(parallel_context.tp_pg),
),
),
)
for target in [
f"mlp.{pp_rank}.linear.pp_block.weight" for pp_rank in range(parallel_context.pp_pg.size())
]
],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
for name, module in mdl.named_modules():
if isinstance(module, nn.Linear):
module.bias = NanotronParameter(module.bias)
# Sync DP and tied weights: basic assumption
initial_sync(model=mdl, parallel_context=parallel_context)
# Sync params between `model` and `reference_model`
with torch.no_grad():
for name, param in model.named_parameters():
param.copy_(reference_model.get_parameter(name))
# DDP
model_ddp = torch.nn.parallel.DistributedDataParallel(model, process_group=parallel_context.dp_pg)
module_id_to_prefix = {id(module): f"{module_name}." for module_name, module in model.named_modules()}
reference_module_id_to_prefix = {
id(module): f"{module_name}." for module_name, module in reference_model.named_modules()
}
# Fix the root_model
module_id_to_prefix[id(model)] = ""
reference_module_id_to_prefix[id(reference_model)] = ""
# named parameters
named_parameters = [
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
if param.is_tied
else name,
param,
)
for name, param in model.named_parameters()
]
# Optimizer: We don't actually run the optimizer, we just use it to build the gradient accumulator
optimizer = ZeroDistributedOptimizer(
dp_pg=parallel_context.dp_pg,
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups_1: OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(
named_parameters=named_params,
grad_buckets_named_params=named_parameters,
),
named_params_or_groups=named_param_groups_1,
optimizer_builder=lambda named_param_groups_2: NamedOptimizer(
named_params_or_groups=named_param_groups_2,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
),
)
param_id_to_name = {
id(param): param.get_tied_info().get_full_name_from_module_id_to_prefix(
module_id_to_prefix=module_id_to_prefix
)
if param.is_tied
else name
for name, param in model.named_parameters()
}
# Add gradient accumulator
# We use `model_ddp.module` in order to have the parameter names without the `module.` prefix
accumulator = optimizer.optimizer.gradient_accumulator
accumulator.assign_param_offsets(
dp_rank=dist.get_rank(parallel_context.dp_pg),
param_name_to_offsets=optimizer.param_name_to_dp_rank_offsets,
)
model_ddp.register_comm_hook(
state=FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name=param_id_to_name,
),
hook=get_fp32_accum_hook(reduce_scatter=reduce_scatter, reduce_op=dist.ReduceOp.AVG),
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype) # First rank receives data
n_micro_batches_per_batch = 2
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
## Reference model iteration step
def forward_backward_reference(mdl, micro_batch):
pipeline_engine.train_batch_iter(
mdl, pg=parallel_context.pp_pg, batch=[micro_batch], nb_microbatches=1, grad_accumulator=None
)
for accum_step in range(n_micro_batches_per_batch - 1):
# Forward-Backward
forward_backward_reference(reference_model, batch[accum_step])
# Accumulate grads
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(
module_id_to_prefix=reference_module_id_to_prefix
)
reference_model_accum_ref[name] = (
grad.float() if accum_step == 0 else reference_model_accum_ref[name] + grad.float()
)
# We zero out half grads for `reference_model` because we're accumulating grads manually in `reference_model_accum_ref`
reference_model.zero_grad()
# Last accumulation step (Sync grads across DDP)
forward_backward_reference(reference_model, batch[-1])
# manually reduce grads across DDP
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=reference_module_id_to_prefix)
reference_model_accum_ref[name] = (
reference_model_accum_ref[name] + grad.float() if name in reference_model_accum_ref else grad.float()
)
dist.all_reduce(reference_model_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
## Model iteration step
pipeline_engine.train_batch_iter(
model_ddp,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=accumulator,
)
for name, param in model_ddp.module.named_parameters():
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
# Each parameter is sharded across DP.
assert (
name in accumulator.parameters
), f"`accumulator.parameters` must have all params {name} not in `accumulator.parameters`. Existing keys are: {accumulator.parameters}"
fp32_grad = accumulator.get_grad_buffer(name=name)
if not reduce_scatter:
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(fp32_grad, parallel_context.dp_pg)
fp32_grad_ref = reference_model_accum_ref[name]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
# Check that gradients are correct
torch.testing.assert_close(
fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
rtol=1e-7,
atol=1e-6,
msg=lambda msg: f"FP32 Gradients at `{name}` don't match\n - Expected: {fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch}\n - Got: {fp32_grad.view(-1)[slice_]}",
)
else:
# Check that gradients are correct
torch.testing.assert_close(fp32_grad_ref / n_micro_batches_per_batch, fp32_grad, rtol=1e-7, atol=1e-6)
# Check that tied weights grads are not synchronized yet
for (name, group_ranks), param in sorted(
get_tied_id_to_param(parameters=model_ddp.parameters(), root_module=model_ddp.module).items(),
key=lambda x: x[0],
):
if not (isinstance(param, NanotronParameter) and param.is_tied):
continue
group = parallel_context.world_ranks_to_pg[group_ranks]
fp32_grad = accumulator.get_grad_buffer(name=name)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=group):
assert_tensor_synced_across_pg(
tensor=fp32_grad,
pg=group,
msg=lambda err: f"Tied weights's grads {name} are not synchronized. {err}",
)
# Sync tied weights grads (e.g. sync dense1 and dense2 grads in DP=0, but the problem is that DP=0 has only optim states for dense1)
# - Translate tied ranks along DP axis to find the DP rank that has the tied weights
# - accumulator keeps grads for all DPs, so we can just sync the grads
with timeout_after():
sync_tied_weights_gradients(
module=model_ddp.module, parallel_context=parallel_context, grad_accumulator=accumulator
)
tied_infos_dict = {
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix),
param.get_tied_info().global_ranks,
param.get_tied_info().reduce_op,
): param
for name, param in model_ddp.module.named_parameters()
if param.is_tied
}
# Check that tied weights grads are synchronized
for (name, group_ranks, reduce_op), param in sorted(tied_infos_dict.items(), key=lambda x: x[0]):
# Make sure we don't get None for reduce_op
assert reduce_op == dist.ReduceOp.SUM
fp32_grad_buffer = accumulator.get_grad_buffer(name=name)
# Grad buffers are only attached to param.grad on ranks that are sharded depending on `param_to_dprank`
fp32_grad = accumulator.parameters[name]["fp32"].grad
# Tied weights are synced using the fp32 grad buffers. Let's make sure they still point to the same memory
# When using ZeroDistributedOptimizer, gradients are sliced across DP
dp_slice_fp_32_grad_buffer = fp32_grad_buffer.view(-1)[slice(*accumulator.param_name_to_offsets[name])]
assert (
dp_slice_fp_32_grad_buffer.data_ptr() == fp32_grad.data_ptr()
), "dp_slice_fp_32_grad_buffer and fp32_grad should point to the same memory"
group = parallel_context.world_ranks_to_pg[group_ranks]
# Check that fp32 grads for tied weights are synced (Used in optimizer step)
# Since we use `reduce_scatter = False` the entire gradient buffer is all reduced, causing it to be synced
if reduce_scatter:
assert_tensor_synced_across_pg(
tensor=dp_slice_fp_32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
else:
assert_tensor_synced_across_pg(
tensor=fp32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
# Manually sync reference model's tied weights grads
dist.all_reduce(reference_model_accum_ref[name], group=group, op=reduce_op)
# Check that accumulated grads are correct
for name, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
torch.testing.assert_close(
reference_model_accum_ref[name].view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
else:
torch.testing.assert_close(
reference_model_accum_ref[name] / n_micro_batches_per_batch,
fp32_grad,
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
parallel_context.destroy()
import math
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
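# NOTE: With spectral muP the expected per-layer std is (1 / sqrt(fan_in)) * min(1, sqrt(fan_out / fan_in));
# layer norms start constant (std 0) and token embeddings are expected to have std 1.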
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1)])
@pytest.mark.parametrize("parametrization_method", [ParametrizationMethod.SPECTRAL_MUP])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_parametrization(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_parametrization)(
init_method=init_method,
parametrization_method=parametrization_method,
)
def _test_parametrization(
parallel_context: ParallelContext,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
def spectral_std(fan_in: int, fan_out: int):
return torch.tensor((1.0 / math.sqrt(fan_in)) * min(1, math.sqrt(fan_out / fan_in)))
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
hidden_size = TINY_LLAMA_CONFIG.hidden_size
intermediate_size = TINY_LLAMA_CONFIG.intermediate_size
o_proj_infeatures = llama.model.decoder[0].pp_block.attn.o_proj.in_features * parallel_context.tensor_parallel_size
NAME_TO_EXPECTED_STD = {
"input_layernorm": torch.tensor(0.0),
"post_attention_layernorm": torch.tensor(0.0),
"final_layer_norm": torch.tensor(0.0),
"token_embedding": torch.tensor(1.0),
# "lm_head": torch.tensor(1.0),
"qkv_proj": spectral_std(fan_in=hidden_size, fan_out=intermediate_size),
"o_proj": spectral_std(fan_in=o_proj_infeatures, fan_out=hidden_size),
"gate_up_proj": spectral_std(fan_in=hidden_size, fan_out=intermediate_size),
"down_proj": spectral_std(fan_in=intermediate_size, fan_out=hidden_size),
}
def find_expected_std(param_name):
for name in NAME_TO_EXPECTED_STD:
if name in param_name:
return NAME_TO_EXPECTED_STD[name]
for name, param in llama.model.named_parameters():
if "lm_head" in name:
continue
expected_std = find_expected_std(name)
assert expected_std is not None, f"Could not find expected std for {name}"
assert torch.allclose(
param.std().float(), expected_std, atol=0.05
), f"name: {name}, expected: {expected_std}, actual: {param.std()}"
from typing import Union
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from torch import nn
from torch.nn import functional as F
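# NOTE: `build_and_set_rank` materializes a PipelineBlock only on its assigned PP rank: the `pp_block`
# attribute exists on that rank and is absent everywhere else, which is what the first test checks.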
@pytest.mark.skipif(available_gpus() < 2, reason="Testing build_and_set_rank requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_build_and_set_rank():
init_distributed(tp=1, dp=1, pp=2)(_test_build_and_set_rank)()
def _test_build_and_set_rank(parallel_context: ParallelContext):
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Check that the ranks are set correctly
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
assert model.mlp[current_pp_rank].linear.rank == current_pp_rank
assert model.mlp[current_pp_rank].activation.rank == current_pp_rank
# Check that blocks were built on the correct ranks
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
if pp_rank == current_pp_rank:
assert hasattr(non_linear.linear, "pp_block")
assert hasattr(non_linear.activation, "pp_block")
else:
assert not hasattr(non_linear.linear, "pp_block")
assert not hasattr(non_linear.activation, "pp_block")
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 1, reason="Testing test_init_on_device_and_dtype requires at least 1 gpu")
def test_init_on_device_and_dtype():
device = torch.device(type="cuda", index=0)
with init_on_device_and_dtype(device=device, dtype=torch.bfloat16):
model = nn.Linear(10, 10)
assert model.weight.dtype == torch.bfloat16, "Model weight wasn't initialised with the correct dtype"
assert model.weight.device == device, "Model weight wasn't initialised with the correct device"
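# NOTE: Each pipeline engine is compared against a reference copy of the model living entirely on rank 0.
# The reference loss is divided by the number of microbatches to match the engine's gradient averaging,
# and weights/losses/grads are shipped to rank 0 over P2P for the comparison.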
@pytest.mark.skipif(available_gpus() < 2, reason="Testing AFAB requires at least 2 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine(pipeline_engine: PipelineEngine, pp: int):
init_distributed(tp=1, dp=1, pp=pp)(_test_pipeline_engine)(pipeline_engine=pipeline_engine)
def _test_pipeline_engine(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
# Check that gradients are the same as the reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
parallel_context.destroy()
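# NOTE: Same engine comparison, but every block also forwards a tensor with `requires_grad=False` alongside
# the differentiable activations, to check that the P2P scheduling handles non-differentiable tensors.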
@pytest.mark.skipif(
available_gpus() < 2,
reason="Testing `test_pipeline_engine_with_tensor_that_does_not_require_grad` requires at least 2 gpus",
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine_with_tensor_that_does_not_require_grad(pipeline_engine: PipelineEngine, pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_engine_with_tensor_that_does_not_require_grad)(
pipeline_engine=pipeline_engine
)
def _test_pipeline_engine_with_tensor_that_does_not_require_grad(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine
):
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class LinearWithDummyInput(nn.Linear):
def __init__(self, in_features, out_features):
super().__init__(in_features=in_features, out_features=out_features)
def forward(self, x: torch.Tensor, y: torch.Tensor):
return {"output": super().forward(x), "y": y}
class DummyModelPassingNonDifferentiableTensor(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=LinearWithDummyInput,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size() + 1)
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
linear_output = non_linear.linear(x=differentiable_tensor, y=non_differentiable_tensor)
output = non_linear.activation(x=linear_output["output"], y=linear_output["y"])
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
if isinstance(differentiable_tensor, torch.Tensor):
assert differentiable_tensor.requires_grad is True
if isinstance(non_differentiable_tensor, torch.Tensor):
assert non_differentiable_tensor.requires_grad is False
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
if has_reference_model:
reference_model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size() + 1
# An additional MLP block is appended at the end
mlp_index_pp_rank = [(i, i) for i in range(parallel_context.pp_pg.size())] + [
(parallel_context.pp_pg.size(), parallel_context.pp_pg.size() - 1)
]
with init_on_device_and_dtype(device):
for (mlp_index, pp_rank), non_linear in zip(mlp_index_pp_rank, model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight, model.mlp[mlp_index].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
# Check that gradients are the same as the reference
if has_reference_model:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight.grad, model.mlp[mlp_index].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
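# NOTE: This test runs plain forward passes under `torch.inference_mode()` (no engine, no backward) and only
# checks that the gathered losses match the single-rank reference model.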
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_forward_without_engine(pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_forward_without_engine)()
def _test_pipeline_forward_without_engine(parallel_context: ParallelContext):
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class DummyModel(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size())
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
differentiable_tensor = non_linear.linear(input=differentiable_tensor)["output"]
output = non_linear.activation(x=differentiable_tensor, y=non_differentiable_tensor)
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
# Run the model
losses = []
for micro_batch in batch:
with torch.inference_mode():
loss = model(**micro_batch)
losses.append(loss)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss, torch.Tensor):
if has_reference_model:
_losses.append(loss)
else:
p2p.send_tensors([loss], to_rank=reference_rank)
else:
assert isinstance(loss, TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss.group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
parallel_context.destroy()
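# The "diamond" test below builds a 4-stage model whose dataflow forks and merges,
# one branch per PP rank, instead of a purely sequential chain:
#
#                  dense_top (nn.Bilinear, merges both branches)
#                 /                                  \
#         dense_left                             dense_right
#                 \                                  /
#                  dense_bottom (shared input branch)
#
# so the engine has to send one activation to two downstream ranks and gather two
# activations on the merging rank.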
@pytest.mark.skipif(available_gpus() < 4, reason="Testing `test_pipeline_engine_diamond` requires at least 4 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@rerun_if_address_is_in_use()
def test_pipeline_engine_diamond(pipeline_engine: PipelineEngine):
init_distributed(pp=4, dp=1, tp=1)(_test_pipeline_engine_diamond)(pipeline_engine=pipeline_engine)
def _test_pipeline_engine_diamond(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
class DiamondModel(nn.Module):
def __init__(self, p2p: P2P):
super().__init__()
self.p2p = p2p
self.dense_bottom = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_left = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_right = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_top = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Bilinear,
module_kwargs={"in1_features": 10, "in2_features": 10, "out_features": 10},
module_input_keys={"input1", "input2"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(self, x):
x = self.dense_bottom.activation(input=self.dense_bottom.linear(input=x)["output"])["output"]
y = self.dense_left.activation(input=self.dense_left.linear(input=x)["output"])["output"]
z = self.dense_right.activation(input=self.dense_right.linear(input=x)["output"])["output"]
out = self.dense_top.activation(input=self.dense_top.linear(input1=y, input2=z)["output"])["output"]
return self.loss(x=out)["output"]
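    # NOTE: each PipelineBlock above is materialized on exactly one PP rank via
    # `build_and_set_rank` below; on every other rank calling the block returns a
    # TensorPointer that the pipeline engine resolves through P2P communication.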
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DiamondModel(p2p=p2p)
if has_reference_model:
reference_model = DiamondModel(p2p=p2p)
# Set the ranks
assert parallel_context.pp_pg.size() == len(
[model.dense_bottom, model.dense_left, model.dense_right, model.dense_top]
)
assert parallel_context.pp_pg.size() == 4
pp_rank_to_dense_name = ["dense_bottom", "dense_left", "dense_right", "dense_top"]
with init_on_device_and_dtype(device):
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
non_linear = model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for module_name in pp_rank_to_dense_name:
non_linear = reference_model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.get_submodule(module_name).linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight, non_linear.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"x": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
    # Gather losses on the reference rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
    # Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
    # Check that the gradients are the same as the reference
if has_reference_model:
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
                # We already have the gradients locally
non_linear = model.get_submodule(module_name).linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight.grad, non_linear.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
import pytest
import torch
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.random import (
RandomStates,
branch_random_state,
get_current_random_state,
get_synced_random_state,
)
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_random_state_sync requires at least 2 gpus")
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1), (1, 2, 1), (1, 1, 2)])
@rerun_if_address_is_in_use()
def test_random_state_sync(tp: int, dp: int, pp: int):
# TODO @nouamane: Make a test with 4 gpus (2 in one pg, 2 in other pg)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_random_state_sync)()
def _test_random_state_sync(parallel_context: ParallelContext):
current_random_state = get_current_random_state()
reference_rank = 0
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
# Check that they are not equal across process group
if dist.get_rank(pg) == reference_rank:
random_states = [current_random_state]
else:
random_states = [None]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
assert current_random_state != random_states[0]
# Sync random state
synced_random_state = get_synced_random_state(current_random_state, pg=pg)
# Check that they are equal across process group
random_states = [synced_random_state]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
        assert synced_random_state == random_states[0]
parallel_context.destroy()
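# The two tests below exercise `branch_random_state` from both sides:
# - "global context": the stored state is left untouched by the context itself and,
#   on exit, the global RNG state is restored to its pre-context value;
# - "local context": a random op run inside the context updates the state stored
#   under `key` while leaving the global RNG state unchanged.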
def test_random_state_fork_random_operation_in_global_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
assert random_states[key] == random_state
# Random operation that updates the random state
torch.randn(1)
new_random_state = get_current_random_state()
# Check that random states changed
assert new_random_state != random_state
assert random_states[key] == random_state
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
assert random_states[key] == random_state
assert get_current_random_state() == random_states[key]
    # Check that the random state is back to the global one
assert get_current_random_state() == new_random_state
def test_random_state_fork_random_operation_in_local_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
old_random_state = get_current_random_state()
assert old_random_state == random_states[key]
# Random operation that updates the random state
torch.randn(1)
# Check that random states changed
new_random_state = get_current_random_state()
# Check that global random_state hasn't changed
assert get_current_random_state() == random_state
# Check that local random_state has changed and is equal to `new_random_state`
assert old_random_state != random_states[key]
assert new_random_state == random_states[key]
import pytest
import torch
from helpers.context import TestContext
from helpers.dummy import dummy_infinite_data_loader, init_dummy_model
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
is_dict_equal,
rerun_if_address_is_in_use,
)
from nanotron import distributed as dist
from nanotron.constants import CHECKPOINT_VERSION
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.optim.zero import ZeroDistributedOptimizer
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.sharded_parameters import SplitConfig, create_sharded_parameter_from_config
from nanotron.parallel.tied_parameters import sync_tied_weights_gradients
from nanotron.random import RandomStates, get_current_random_state, get_synced_random_state
from nanotron.serialize import (
load_optimizer,
load_random_states,
load_weights,
save_optimizer,
save_random_states,
save_weights,
)
from nanotron.serialize.metadata import TensorMetadata
from torch.nn.parallel import DistributedDataParallel
def test_save_and_load_with_changed_topology():
# TODO @thomasw21: We want to be able to support a change of topology mechanism
return
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_model(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_model)(test_context=test_context)
def _test_save_and_load_model(parallel_context: ParallelContext, test_context: TestContext):
model = init_dummy_model(parallel_context=parallel_context)
store_folder = test_context.get_auto_remove_tmp_dir()
# Save
save_weights(model=model, parallel_context=parallel_context, root_folder=store_folder)
# Load
new_model = init_dummy_model(parallel_context=parallel_context)
# Check that the newly initialised model isn't the same.
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
if len(model.state_dict()) == 0:
# Edge case where there's no parameters/buffers stored in the model.
pass
else:
assert not match, "Newly initialised model should not match."
load_weights(model=new_model, parallel_context=parallel_context, root_folder=store_folder)
# Assert the weights are exactly the same after loading
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
if pp > 1:
pytest.skip("Pipeline parallelism not supported for this test yet")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_optimizer)(test_context=test_context)
def _test_save_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Test loading optimizer states to CPU
cpu_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Load optimizer states to CPU
load_optimizer(
optimizer=cpu_optimizer, parallel_context=parallel_context, root_folder=store_folder, map_location="cpu"
)
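    # `map_location="cpu"` should put every loaded optimizer state tensor on the CPU
    # while keeping its values identical to the CUDA copy; the per-key comparison
    # below checks both (the device check is skipped for the "step" entry).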
# Get state dicts
gpu_state = optimizer.state_dict()
cpu_state = cpu_optimizer.state_dict()
# Check that states match except for device
for param_id in gpu_state["state"]:
for key, gpu_value in gpu_state["state"][param_id].items():
cpu_value = cpu_state["state"][param_id][key]
if isinstance(gpu_value, torch.Tensor):
assert torch.equal(gpu_value.cpu(), cpu_value), f"Values don't match for param {param_id}, key {key}"
if key != "step": # Skip device checks for 'step' key
assert (
cpu_value.device.type == "cpu"
), f"CPU optimizer state should be on CPU for param {param_id}, key {key}"
assert (
gpu_value.device.type == "cuda"
), f"GPU optimizer state should be on CUDA for param {param_id}, key {key}"
else:
assert gpu_value == cpu_value, f"Non-tensor values don't match for param {param_id}, key {key}"
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_optimizer)(test_context=test_context)
def _test_save_zero_optimizer_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_data_parallel_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_data_parallel_optimizer)(
test_context=test_context
)
def _test_save_zero_optimizer_and_load_data_parallel_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_data_parallel_optimizer_and_load_zero_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_data_parallel_optimizer_and_load_zero_optimizer)(
test_context=test_context
)
def _test_save_data_parallel_optimizer_and_load_zero_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_optimizer_with_additional_state_dict_keys(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_optimizer_with_additional_state_dict_keys)(
test_context=test_context
)
def _test_save_optimizer_with_additional_state_dict_keys(parallel_context: ParallelContext, test_context: TestContext):
dtype = torch.float16
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context, dtype=dtype)
if isinstance(model, DistributedDataParallel):
# Remove the annoying "module." prefix
unwrapped_model = model.module
else:
unwrapped_model = model
named_parameters = list(unwrapped_model.named_parameters())
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
grad_accumulator = optimizer.gradient_accumulator
assert len(optimizer.state_dict_additional_keys()) > 0
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model,
pg=parallel_context.pp_pg,
batch=[minibatch],
nb_microbatches=1,
grad_accumulator=grad_accumulator,
)
# Manually sync tied parameters
sync_tied_weights_gradients(
module=unwrapped_model, parallel_context=parallel_context, grad_accumulator=grad_accumulator
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
new_grad_accumulator = new_optimizer.gradient_accumulator
# Check that the newly initialised optimizer isn't the same.
if len(optimizer.state_dict()["state"]) == 0:
pass
else:
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict()["state"], new_optimizer.state_dict()["state"])
assert match, msg
# Assert the optimizer state_dict are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Assert the internal optimizer states are exactly the same after loading.
keys_to_ignore = []
match, msg = is_dict_equal(
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in grad_accumulator.parameters.items()
},
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in new_grad_accumulator.parameters.items()
},
)
assert match, msg
parallel_context.destroy()
# TODO @thomasw21: Test with a optimizer that uses `named_param_groups` instead of `param_groups`
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_save_and_load_random_states requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_save_and_load_random_states():
test_context = TestContext()
    # We use TP=2 so that there is a process group of size 2 to test against
init_distributed(tp=2, dp=1, pp=1)(_test_save_and_load_random_states)(test_context=test_context)
def _test_save_and_load_random_states(parallel_context: ParallelContext, test_context: TestContext):
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
random_states = RandomStates(
{
"my_synced_random_state": get_synced_random_state(random_state=get_current_random_state(), pg=pg),
"my_own_random_state": get_current_random_state(),
}
)
store_folder = test_context.get_auto_remove_tmp_dir()
# Check that random states are unequal between ranks (due to `my_own_random_state`)
reference_rank = 0
if dist.get_rank(pg) == reference_rank:
random_statess = [random_states]
else:
random_statess = [None]
dist.broadcast_object_list(random_statess, src=dist.get_global_rank(group_rank=reference_rank, group=pg), group=pg)
if dist.get_rank(pg) != reference_rank:
assert random_states != random_statess[0]
# save
save_random_states(random_states=random_states, parallel_context=parallel_context, root_folder=store_folder)
# load
new_random_states = load_random_states(parallel_context=parallel_context, root_folder=store_folder)
    # Each rank has restored its own random state
assert random_states == new_random_states
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_serialize_deserialize_tensormetadata():
test_context = TestContext()
init_distributed(tp=2, dp=1, pp=1)(_test_serialize_deserialize_tensormetadata)(test_context=test_context)
def _test_serialize_deserialize_tensormetadata(parallel_context: ParallelContext, test_context: TestContext):
param = torch.nn.Parameter(torch.randn(16, 64))
split_config = SplitConfig(
split_dim=0,
contiguous_chunks=(8, 8),
)
param = create_sharded_parameter_from_config(parameter=param, pg=parallel_context.tp_pg, split_config=split_config)
sharded_info = param.get_sharded_info()
metadata = TensorMetadata(
version=CHECKPOINT_VERSION,
local_global_slices_pairs=sharded_info.local_global_slices_pairs,
unsharded_shape=sharded_info.unsharded_shape,
)
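    # NOTE: `to_str_dict` flattens the metadata into plain strings so that it can be
    # stored in a checkpoint header that only accepts Dict[str, str] (e.g. the
    # safetensors metadata field); `from_str_dict` must round-trip it exactly.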
metadata_str_dict = metadata.to_str_dict()
# Assert metadata_str_dict is Dict[str, str]
assert isinstance(metadata_str_dict, dict)
assert all(isinstance(key, str) for key in metadata_str_dict.keys())
assert all(isinstance(value, str) for value in metadata_str_dict.values())
metadata_from_str_dict = TensorMetadata.from_str_dict(metadata_str_dict)
assert metadata == metadata_from_str_dict
parallel_context.destroy()
import os
import pytest
import torch
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.distributed import get_global_rank
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
TensorParallelRowLinear,
)
from torch import nn as torch_nn
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@pytest.mark.parametrize("tp_recompute_allgather", [False, True])
@rerun_if_address_is_in_use()
def test_column_linear(
tp: int,
dp: int,
pp: int,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and tp_recompute_allgather:
pytest.skip("ALL_REDUCE mode is unaffected by tp_recompute_allgather")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_column_linear)(
tp_mode=tp_mode, async_communication=async_communication, tp_recompute_allgather=tp_recompute_allgather
)
def _test_column_linear(
parallel_context: ParallelContext,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
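    # NOTE: with a single hardware work queue, kernels keep their launch order, so
    # the communication kernels issued by the async TP path are scheduled before the
    # dependent compute instead of being reordered behind it; hence
    # CUDA_DEVICE_MAX_CONNECTIONS=1.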
in_features = 2
out_features_per_tp_rank = 3
out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
# Sharded
column_linear = TensorParallelColumnLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
tp_recompute_allgather=tp_recompute_allgather,
)
# Un-sharded
reference_linear = torch_nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_gather(
tensor_list=list(reference_linear.weight.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.weight,
group=parallel_context.tp_pg,
)
dist.all_gather(
tensor_list=list(reference_linear.bias.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.bias,
group=parallel_context.tp_pg,
)
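    # NOTE: every entry of `tensor_list` above is a view into the matching slice of
    # the reference weight/bias, so the two all_gathers fill `reference_linear` with
    # the full, un-sharded parameters on every rank.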
# Generate random input
random_input: torch.Tensor
sharded_random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
sharded_random_input = random_input
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
sharded_batch_size = 5
sharded_random_input = torch.randn(sharded_batch_size, in_features, device="cuda")
if parallel_context.tp_pg.size() > 1:
random_input = torch.empty(
sharded_batch_size * parallel_context.tp_pg.size(),
*(sharded_random_input.shape[1:]),
device=sharded_random_input.device,
dtype=sharded_random_input.dtype,
)
dist.all_gather_into_tensor(random_input, sharded_random_input, group=parallel_context.tp_pg)
else:
random_input = sharded_random_input
else:
ValueError(f"Unsupported mode: {tp_mode}")
# It's important that `random_input` and `sharded_random_input` are two separate tensors with separate storage
sharded_random_input = sharded_random_input.clone()
random_input.requires_grad = True
sharded_random_input.requires_grad = True
# Test that we get the same output after forward pass
sharded_output = column_linear(sharded_random_input)
reference_output = reference_linear(random_input)
# TODO @thomasw21: Tune tolerance
try:
torch.testing.assert_close(
sharded_output,
reference_output[
:,
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank,
],
)
except BaseException as e:
print(f"Rank {dist.get_rank(parallel_context.tp_pg)}: FAIL.")
dist.barrier()
raise e
print(f"Rank {dist.get_rank(parallel_context.tp_pg)}: SUCCESS.")
dist.barrier()
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
hidden_dim_slice = slice(
dist.get_rank(parallel_context.tp_pg) * out_features_per_tp_rank,
(dist.get_rank(parallel_context.tp_pg) + 1) * out_features_per_tp_rank,
)
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[hidden_dim_slice],
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[hidden_dim_slice],
)
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
torch.testing.assert_close(
sharded_random_input.grad,
random_input.grad,
)
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_dim_slice = slice(
dist.get_rank(parallel_context.tp_pg) * sharded_batch_size,
(dist.get_rank(parallel_context.tp_pg) + 1) * sharded_batch_size,
)
torch.testing.assert_close(
sharded_random_input.grad,
random_input.grad[batch_dim_slice],
)
else:
ValueError(f"Unsupported mode: {tp_mode}")
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@pytest.mark.parametrize("tp_recompute_allgather", [False, True])
@rerun_if_address_is_in_use()
def test_row_linear(
tp: int,
dp: int,
pp: int,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and tp_recompute_allgather:
pytest.skip("ALL_REDUCE mode is not affected by tp_recompute_allgather")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_row_linear)(
tp_mode=tp_mode, async_communication=async_communication, tp_recompute_allgather=tp_recompute_allgather
)
def _test_row_linear(
parallel_context: ParallelContext,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
out_features = 3
in_features_per_rank = 2
in_features = parallel_context.tp_pg.size() * in_features_per_rank
# Sharded
row_linear = TensorParallelRowLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
)
# Un-sharded
reference_linear = torch_nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_reduce(tensor=reference_linear.weight, op=dist.ReduceOp.SUM, group=parallel_context.tp_pg)
row_linear.weight.copy_(
reference_linear.weight[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
]
)
        # broadcast the bias from rank 0; the other ranks don't have a bias
if dist.get_rank(parallel_context.tp_pg) == 0:
row_linear.bias.copy_(reference_linear.bias)
dist.broadcast(
tensor=reference_linear.bias,
src=get_global_rank(group=parallel_context.tp_pg, group_rank=0),
group=parallel_context.tp_pg,
)
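    # NOTE: in TensorParallelRowLinear only TP rank 0 holds a bias (the partial
    # outputs are reduced across TP, so the bias must be added exactly once);
    # broadcasting the reference bias here only makes the un-sharded reference
    # identical on every rank.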
# Generate random input
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = 5 * parallel_context.tp_pg.size()
else:
raise ValueError()
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
random_input.requires_grad = True
    # Row linear receives an already-sharded input
random_sharded_input = (
random_input[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
]
.detach()
.clone()
)
random_sharded_input.requires_grad = True
# Test that we get the same output after forward pass
# TODO @kunhao: We may want to have our custom error type
sharded_output = row_linear(random_sharded_input)
reference_output = reference_linear(random_input)
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
sharded_reference_output = reference_output
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
assert batch_size % parallel_context.tp_pg.size() == 0
sharded_batch_size = batch_size // parallel_context.tp_pg.size()
sharded_reference_output = reference_output[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(
sharded_output,
sharded_reference_output,
)
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
torch.testing.assert_close(
row_linear.weight.grad,
reference_linear.weight.grad[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
],
)
if dist.get_rank(parallel_context.tp_pg) == 0:
torch.testing.assert_close(
row_linear.bias.grad,
reference_linear.bias.grad,
)
else:
assert row_linear.bias is None
torch.testing.assert_close(
random_sharded_input.grad,
random_input.grad[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
],
)
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@rerun_if_address_is_in_use()
def test_tensor_parallel_embedding(tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode):
init_distributed(tp=tp, dp=dp, pp=pp)(_test_tensor_parallel_embedding)(tp_mode=tp_mode)
def _test_tensor_parallel_embedding(parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode):
num_embeddings_per_rank = 100
embedding_dim = 3
num_embeddings = parallel_context.tp_pg.size() * num_embeddings_per_rank
# Sharded
sharded_embedding = TensorParallelEmbedding(
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
)
# Un-sharded
reference_embedding = torch_nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_reduce(tensor=reference_embedding.weight, op=dist.ReduceOp.SUM, group=parallel_context.tp_pg)
sharded_embedding.weight.copy_(
reference_embedding.weight[
dist.get_rank(parallel_context.tp_pg)
* num_embeddings_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* num_embeddings_per_rank,
:,
]
)
# Generate random input
random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = 5 * parallel_context.tp_pg.size()
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
random_input = torch.randint(low=0, high=num_embeddings, size=(batch_size,), device="cuda")
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
# Test that we get the same output after forward pass
sharded_output = sharded_embedding(random_input)
reference_output = reference_embedding(random_input)
weights = torch.arange(batch_size, device="cuda")[:, None]
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
sharded_reference_output = reference_output
sharded_weights = weights
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
assert batch_size % parallel_context.tp_pg.size() == 0
sharded_batch_size = batch_size // parallel_context.tp_pg.size()
sharded_reference_output = reference_output[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
sharded_weights = weights[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(sharded_output, sharded_reference_output, atol=0, rtol=0)
# Test that we get the same gradient after backward pass
(sharded_output * sharded_weights).sum().backward()
(reference_output * weights).sum().backward()
torch.testing.assert_close(
sharded_embedding.weight.grad,
reference_embedding.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* num_embeddings_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* num_embeddings_per_rank,
:,
],
atol=0,
rtol=0,
)
parallel_context.destroy()
import torch
from helpers.distributed_tensor import assert_tensor_equal_over_group
from helpers.exception import assert_fail_with
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter
from nanotron.parallel.tied_parameters import (
get_tied_id_to_param,
sync_tied_weights_gradients,
tie_parameters,
)
from torch import nn
@rerun_if_address_is_in_use()
def test_tie_weight_in_same_device():
init_distributed(tp=1, dp=1, pp=1)(_test_tie_weight_in_same_device)()
def _test_tie_weight_in_same_device(parallel_context: ParallelContext):
model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda"), "dense1": nn.Linear(10, 10, device="cuda")})
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (0,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (0,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
weight0 = model.get_parameter("dense0.weight")
weight1 = model.get_parameter("dense1.weight")
bias0 = model.get_parameter("dense0.bias")
bias1 = model.get_parameter("dense1.bias")
# We check that we use the same parameter for both linear layers
assert id(weight0) == id(weight1)
assert id(bias0) == id(bias1)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_in_different_device():
init_distributed(tp=1, dp=1, pp=2)(_test_tie_weight_in_different_device)()
def _test_tie_weight_in_different_device(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Weights/bias are not synced yet
assert not assert_tensor_equal_over_group(weight, group=group, assert_=False)
assert not assert_tensor_equal_over_group(bias, group=group, assert_=False)
# Manually sync weights
for (_, group_ranks), param in sorted(
get_tied_id_to_param(
parameters=model.parameters(),
root_module=model,
).items(),
key=lambda x: x[0],
):
group = parallel_context.world_ranks_to_pg[group_ranks]
dist.all_reduce(param, op=dist.ReduceOp.AVG, group=group)
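    # This is essentially what `sync_tied_weights_gradients` does for gradients,
    # applied to the weights themselves: each tied group averages its replicas so
    # the parameter ends up identical on every rank of the tie.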
    # We check that the weights and biases are now synchronized across the tied group
assert_tensor_equal_over_group(weight, group=group)
assert_tensor_equal_over_group(bias, group=group)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_across_dp_is_impossible():
init_distributed(tp=1, dp=2, pp=1)(_test_tie_weight_across_dp_is_impossible)()
def _test_tie_weight_across_dp_is_impossible(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.dp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
with assert_fail_with(AssertionError):
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
with assert_fail_with(AssertionError):
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_in_different_device_have_gradients_synchronized():
init_distributed(tp=1, dp=1, pp=2)(_test_tie_weight_in_different_device_have_gradients_synchronized)()
def _test_tie_weight_in_different_device_have_gradients_synchronized(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Weights/bias are not synced yet
assert not assert_tensor_equal_over_group(weight, group=group, assert_=False)
assert not assert_tensor_equal_over_group(bias, group=group, assert_=False)
# Compute gradient
input_ = torch.randn(13, 10, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0:
out = model.dense0(input_)
else:
out = model.dense1(input_)
out.sum().backward()
# sync gradients
# TODO @thomasw21: This should be done in hooks
sync_tied_weights_gradients(model, parallel_context=parallel_context, grad_accumulator=None)
    # Check that we have gradients
assert weight.grad is not None
assert bias.grad is not None
    # We check that both gradients are synchronized
assert_tensor_equal_over_group(weight.grad, group=group)
assert_tensor_equal_over_group(bias.grad, group=group)
parallel_context.destroy()
import os
import pytest
import torch
from helpers.distributed_tensor import assert_tensor_equal_over_group
from helpers.dummy import dummy_infinite_data_loader, init_dummy_model
from helpers.exception import assert_fail_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.optim import NamedOptimizer, ZeroDistributedOptimizer
from nanotron.optim.zero import SlicedFlatTensor
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp
from nanotron.parallel.parameters import NanotronParameter
from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.tensor_parallel import nn
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tied_parameters import sync_tied_weights_gradients
from nanotron.random import RandomStates, branch_random_state, get_current_random_state, get_synced_random_state
from torch import nn as torch_nn
from torch.nn.parallel import DistributedDataParallel
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(1, i, 1) for i in range(1, min(4, available_gpus()) + 1)])
@rerun_if_address_is_in_use()
def test_zero_optimizer(tp: int, dp: int, pp: int):
init_distributed(pp=pp, dp=dp, tp=tp)(_test_zero_optimizer)()
def _test_zero_optimizer(parallel_context: ParallelContext):
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
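    # NOTE: `ZeroDistributedOptimizer` shards the optimizer states across the DP
    # group (ZeRO stage-1 style): each DP rank builds AdamW states only for its own
    # flat slice of every parameter, which the offset/data_ptr checks below verify.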
index_to_name = [name for name, _ in model.named_parameters()]
# reference model
reference_model = init_dummy_model(parallel_context=parallel_context)
reference_optimizer = torch.optim.AdamW(reference_model.parameters())
# sync weights between reference_model and model
with torch.no_grad():
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
param.copy_(ref_param)
# Get infinite dummy data iterator
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
batches = [[next(data_loader)] for _ in range(nb_optim_steps)]
pipeline_engine = AllForwardAllBackwardPipelineEngine()
# Training loop
for i, batch in enumerate(batches):
        # store the original parameters so we can later check that they were updated
old_named_params = {name: param.detach().clone() for name, param in model.named_parameters()}
# Run forward/backward
losses = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=1, grad_accumulator=None
)
ref_losses = pipeline_engine.train_batch_iter(
model=reference_model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=1, grad_accumulator=None
)
# Check loss match
losses = list(losses)
ref_losses = list(ref_losses)
assert len(losses) == len(ref_losses)
for loss, ref_loss in zip(losses, ref_losses):
assert isinstance(loss["loss"], torch.Tensor)
assert isinstance(ref_loss["loss"], torch.Tensor)
torch.testing.assert_close(
loss["loss"], ref_loss["loss"], atol=0, rtol=0, msg=lambda msg: f"At iteration {i}, {msg}"
)
# Manually sync tied parameters' gradients
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
sync_tied_weights_gradients(module=reference_model, parallel_context=parallel_context, grad_accumulator=None)
# We rely on DDP to synchronize gradients across DP. We only need to manually synchronize them if we don't use DDP.
if not isinstance(model, DistributedDataParallel):
sync_gradients_across_dp(
model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
if not isinstance(reference_model, DistributedDataParallel):
sync_gradients_across_dp(
reference_model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
# Check gradients are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param.grad, group=parallel_context.dp_pg)
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param.grad, group=parallel_context.dp_pg)
        # Check that gradients match those of reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
torch.testing.assert_close(
param.grad, ref_param.grad, atol=0, rtol=0, msg=lambda msg: f"At iteration {i}, {msg}"
)
assert len(optimizer.param_groups) == 1
assert len(list(model.named_parameters())) == len(optimizer.param_groups[0]["params"])
with torch.no_grad():
for (name, param), sliced_param in zip(model.named_parameters(), optimizer.param_groups[0]["params"]):
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
# Check that weights are the same
expected_slice = param.view(-1)[slice(*offsets)].view_as(sliced_param)
torch.testing.assert_close(
expected_slice,
sliced_param,
atol=0,
rtol=0,
msg=lambda msg: f"Weights don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param}\n - Full gradient: {param}",
)
assert (
expected_slice.data_ptr() == sliced_param.data_ptr()
), "Parameters should actually share the same data pointer"
# Check that the gradient is a view into the full gradient
expected_slice = param.grad.view(-1)[slice(*offsets)].view_as(sliced_param.grad)
assert (
expected_slice.data_ptr() == sliced_param.grad.data_ptr()
), "Parameters should actually share the same data pointer"
torch.testing.assert_close(
expected_slice,
sliced_param.grad,
atol=0,
rtol=0,
msg=lambda msg: f"Gradients don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param.grad}\n - Full gradient: {param.grad}",
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
reference_optimizer.step()
reference_optimizer.zero_grad()
# Check that params are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param, group=parallel_context.dp_pg)
assert param.grad is None
# Check that gradients are reset
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param, group=parallel_context.dp_pg)
assert ref_param.grad is None
for param_group in optimizer.param_groups:
for param in param_group["params"]:
assert param.grad is None
# Check params are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
# TODO @thomasw21: Figure out how to make this pass at `atol`/`rtol` set to 0.
torch.testing.assert_close(param, ref_param, msg=lambda msg: f"At iteration {i}, {msg}")
# Check params have been updated correctly
for (name, param) in model.named_parameters():
old_param = old_named_params[name]
assert not torch.allclose(param, old_param)
# We need to check that the optimizer states are the same
state_dict = optimizer.state_dict()
reference_state_dict = reference_optimizer.state_dict()
state = state_dict["state"]
ref_state = reference_state_dict["state"]
assert set(state) == set(ref_state)
for index, optim_state in state.items():
ref_optim_state = ref_state[index]
name = index_to_name[index]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
assert set(optim_state) == set(ref_optim_state)
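# NOTE: `exp_avg` / `exp_avg_sq` are AdamW's first and second moment estimates; under ZeRO-1 each DP
# rank is expected to hold only the slice of these states matching its parameter shard, hence the
# comparison against a slice of the reference state below.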
for key in ["exp_avg", "exp_avg_sq"]:
value = optim_state[key]
ref_value = ref_optim_state[key]
torch.testing.assert_close(
value,
ref_value.view(-1)[slice(*offsets)].view_as(value),
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, {msg}",
)
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(2, i, 1) for i in range(1, available_gpus() // 2 + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@rerun_if_address_is_in_use()
def test_zero_optimizer_with_tp(
tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode, async_communication: bool
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
init_distributed(pp=pp, dp=dp, tp=tp)(_test_zero_optimizer_with_tp)(
tp_mode=tp_mode, async_communication=async_communication
)
def _test_zero_optimizer_with_tp(
parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode, async_communication: bool
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
model = torch_nn.Sequential(
nn.TensorParallelColumnLinear(
in_features=5,
out_features=parallel_context.tp_pg.size(),
mode=tp_mode,
pg=parallel_context.tp_pg,
device="cuda",
async_communication=async_communication,
),
# We choose `sigmoid` instead of `relu` since `relu` can result in a sparse gradient, causing no update to certain parameters
torch_nn.Sigmoid(),
nn.TensorParallelRowLinear(
in_features=parallel_context.tp_pg.size(),
out_features=3,
mode=tp_mode,
pg=parallel_context.tp_pg,
device="cuda",
),
)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
optimizer_name_to_id = {v: k for k, v in optimizer.optimizer.id_to_name.items()}
assert len(optimizer_name_to_id) == len(optimizer.id_to_name)
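# NOTE: `id_to_name` is assumed to map `id(sliced_param)` to its fully qualified parameter name;
# inverting it and comparing lengths checks that the mapping is a bijection.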
# reference model
reference_model = torch_nn.Sequential(
torch_nn.Linear(in_features=5, out_features=parallel_context.tp_pg.size(), device="cuda"),
torch_nn.Sigmoid(),
torch_nn.Linear(in_features=parallel_context.tp_pg.size(), out_features=3, device="cuda"),
)
for module in reference_model.modules():
for name, param in module.named_parameters(recurse=False):
setattr(module, name, NanotronParameter(param))
reference_optimizer = torch.optim.AdamW(reference_model.parameters())
# TODO @thomasw21: This is a hack to obtain `AdamW`'s index for each parameter in its state.
name_to_index = {name: index for index, (name, _) in enumerate(reference_model.named_parameters())}
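# (torch optimizers key `state_dict()["state"]` by each param's position in `param_groups`, so
# enumerating `named_parameters()` in construction order should reproduce those indices.)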
# sync parameters
with torch.no_grad():
for ref_name, ref_param in reference_model.named_parameters():
dist.all_reduce(ref_param, op=dist.ReduceOp.AVG, group=parallel_context.world_pg)
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
assert isinstance(param, NanotronParameter)
if param.is_sharded:
sharded_info = param.get_sharded_info()
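# NOTE: the sharding metadata is assumed to describe, for each local/global slice pair, where this
# rank's shard (`local_slices`) sits inside the full unsharded parameter (`global_slices`), e.g. a
# ColumnLinear weight split over tp=2 would map rank 1's local rows [0:out/2] to global rows [out/2:out].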
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
param[local_slices].copy_(ref_param[global_slices])
else:
param.copy_(ref_param)
# Generate dummy input batches; the randomness has to be synced across TP
random_states = RandomStates(
{
"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=parallel_context.tp_pg),
}
)
batch_size = 2 * parallel_context.tp_pg.size() if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER else 7
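# NOTE: in REDUCE_SCATTER mode the batch dimension must be divisible by the TP size (each rank
# handles 1/tp of it), hence 2 * tp here; ALL_REDUCE has no such constraint, so an arbitrary size (7) is used.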
with branch_random_state(random_states=random_states, key="tp_synced", enabled=True):
nb_optim_steps = 3
batches = [
torch.randn(batch_size, 5, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0
else TensorPointer(0)
for _ in range(nb_optim_steps)
]
# Model training loop
for i, batch in enumerate(batches):
# store original reference parameter
old_named_params = {name: param.detach().clone() for name, param in model.named_parameters()}
# Run forward pass
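# NOTE: in REDUCE_SCATTER mode each TP rank is fed its own contiguous chunk of the batch (the
# column-parallel layer is expected to gather the sharded inputs internally), and the row-parallel
# output is reduce-scattered, which is why the loss below is compared against a per-rank slice of
# the reference loss. In ALL_REDUCE mode every rank sees the full batch.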
if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = batch.shape[0]
assert batch_size % parallel_context.tp_pg.size() == 0
step = batch_size // parallel_context.tp_pg.size()
loss = model(
batch[
dist.get_rank(parallel_context.tp_pg) * step : (dist.get_rank(parallel_context.tp_pg) + 1) * step
]
)
else:
loss = model(batch)
ref_loss = reference_model(batch)
# Run backward pass
loss.sum().backward()
ref_loss.sum().backward()
# Check loss is the same
loss = loss.detach()
ref_loss = ref_loss.detach()
assert isinstance(loss, torch.Tensor)
assert isinstance(ref_loss, torch.Tensor)
if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = batch.shape[0]
assert batch_size % parallel_context.tp_pg.size() == 0
step = batch_size // parallel_context.tp_pg.size()
torch.testing.assert_close(
loss,
ref_loss[
dist.get_rank(parallel_context.tp_pg) * step : (dist.get_rank(parallel_context.tp_pg) + 1) * step
],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(loss, ref_loss, msg=lambda msg: f"At iteration {i}, {msg}")
# Manually sync tied parameters' gradients
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
sync_tied_weights_gradients(module=reference_model, parallel_context=parallel_context, grad_accumulator=None)
# We rely on DDP to synchronize gradients across DP. We only need to manually synchronize them if we don't use DDP.
if not isinstance(model, DistributedDataParallel):
sync_gradients_across_dp(
model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
if not isinstance(reference_model, DistributedDataParallel):
sync_gradients_across_dp(
reference_model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
# Check gradients are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param.grad, group=parallel_context.dp_pg)
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param.grad, group=parallel_context.dp_pg)
# Check gradients are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
param.grad[local_slices],
ref_param.grad[global_slices],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(param.grad, ref_param.grad, msg=lambda msg: f"At iteration {i}, {msg}")
with torch.no_grad():
optim_param_id_to_param = {id(param): param for param in optimizer.param_groups[0]["params"]}
assert len(optim_param_id_to_param) == len(optimizer.param_groups[0]["params"])
for name, param in model.named_parameters():
if dist.get_rank(parallel_context.dp_pg) not in optimizer.param_name_to_dp_rank_offsets[name]:
assert name not in optimizer_name_to_id
continue
param_id = optimizer_name_to_id[name]
sliced_param = optim_param_id_to_param[param_id]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
# Check that weights share the same storage
expected_slice = param.view(-1)[slice(*offsets)].view_as(sliced_param)
torch.testing.assert_close(
expected_slice,
sliced_param,
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, weights don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param}\n - Full gradient: {param}",
)
assert (
expected_slice.data_ptr() == sliced_param.data_ptr()
), "Parameters should actually share the same data pointer"
# Check that gradients share the same storage
expected_slice = param.grad.view(-1)[slice(*offsets)].view_as(sliced_param.grad)
assert (
expected_slice.data_ptr() == sliced_param.grad.data_ptr()
), "Parameters should actually share the same data pointer"
torch.testing.assert_close(
expected_slice,
sliced_param.grad,
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, gradients don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param.grad}\n - Full gradient: {param.grad}",
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
reference_optimizer.step()
reference_optimizer.zero_grad()
# Check that params are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param, group=parallel_context.dp_pg)
assert param.grad is None
# Check that gradients are reset
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param, group=parallel_context.dp_pg)
assert ref_param.grad is None
for param_group in optimizer.param_groups:
for param in param_group["params"]:
assert param.grad is None
# Check params are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
param[local_slices], ref_param[global_slices], msg=lambda msg: f"At iteration {i}, {msg}"
)
else:
torch.testing.assert_close(param, ref_param, msg=lambda msg: f"At iteration {i}, {msg}")
# Check params have been updated correctly
for (name, param) in model.named_parameters():
old_param = old_named_params[name]
assert not torch.allclose(param, old_param)
# We need to check that the optimizer states are the same
state_dict = optimizer.state_dict()
reference_state_dict = reference_optimizer.state_dict()
state = state_dict["state"]
ref_state = reference_state_dict["state"]
assert "names" in state_dict
state_index_to_name = state_dict["names"]
state_name_to_index = {name: index for index, name in state_index_to_name.items()}
# Check that this is a bijection
assert len(state_index_to_name) == len(state_name_to_index)
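# NOTE: `state_dict["names"]` is assumed to record, for each local optimizer-state index, the name
# of the parameter it belongs to, which lets us match each local shard back to the corresponding
# entry in the reference optimizer's state.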
for name, param in model.named_parameters():
if name not in state_name_to_index:
# Parameter is not passed to the optimizer on this rank, mainly due to the ZeRO sharding strategy
continue
index = state_name_to_index[name]
optim_state = state[index]
ref_optim_state = ref_state[name_to_index[name]]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
assert set(optim_state) == set(ref_optim_state)
assert isinstance(param, NanotronParameter)
for key in ["exp_avg", "exp_avg_sq"]:
value = optim_state[key]
ref_value = ref_optim_state[key]
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
# TODO @thomasw21: We can't add any information about `local_slices` to `value` because it's already flattened
# For now, we assume that sharded parameters are contiguous and that `local_slices` are trivial all-None slices
value,
ref_value[global_slices].view(-1)[slice(*offsets)],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(
value,
ref_value.view(-1)[slice(*offsets)].view_as(value),
msg=lambda msg: f"At iteration {i}, {msg}",
)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_sliced_flat_tensor():
init_distributed(1, 1, 1)(_test_sliced_flat_tensor)()
def _test_sliced_flat_tensor(parallel_context: ParallelContext):
a = torch.randn(2, 3, requires_grad=True)
grad = torch.randn(2, 3)
a.grad = grad
start_offset, end_offset = 1, 5
b = SlicedFlatTensor(a, start_offset=start_offset, end_offset=end_offset)
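# NOTE: `b` is expected to behave as a flat view over elements [start_offset, end_offset) of `a`,
# with `b.grad` lazily derived from `a.grad`, so (de)allocating the gradient on either tensor is
# reflected on the other.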
torch.testing.assert_close(a.grad, grad, atol=0, rtol=0)
torch.testing.assert_close(b.grad, grad.view(-1)[start_offset:end_offset])
# Deallocate the gradient by setting it to None
a.grad = None
assert a.grad is None
assert b.grad is None
# Setting the gradient to None on the sliced tensor also resets the base tensor's gradient
a.grad = grad
assert a.grad is not None
assert b.grad is not None
b.grad = None
assert b.grad is None
assert a.grad is None
with assert_fail_with(NotImplementedError):
b.grad = torch.randn(1, 5)
with assert_fail_with(NotImplementedError):
del b.grad
c = b[:3]
# Slicing must return a plain tensor: the SlicedFlatTensor subclass should not propagate to derived tensors.
assert not isinstance(c, SlicedFlatTensor)
parallel_context.destroy()