"official/legacy/bert/run_classifier.py" did not exist on "871c4e0a393ef4385534bee55354a5df8aa1ccf4"
Commit dfcb88ff authored by chenzk's avatar chenzk
Browse files

v1.0.8

parents
import torch
from nanotron.config import (
AllForwardAllBackwardPipelineEngine,
CheckpointsArgs,
Config,
DataArgs,
DatasetStageArgs,
GeneralArgs,
LlamaConfig,
LoggingArgs,
LRSchedulerArgs,
ModelArgs,
OptimizerArgs,
ParallelismArgs,
TensorParallelLinearMode,
TokenizerArgs,
TokensArgs,
)
from nanotron.config.config import PretrainDatasetsArgs
from nanotron.models import build_model
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel.context import ParallelContext
from nanotron.trainer import mark_tied_parameters
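# A deliberately tiny Llama configuration so that test models build quickly and fit on a single GPU.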
TINY_LLAMA_CONFIG = LlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 16,
"initializer_range": 0.02,
"intermediate_size": 32,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
def get_llama_training_config(model_config: ModelArgs):
return Config(
model=model_config,
general=GeneralArgs(project="unittest", run="sanity_llama", seed=42),
checkpoints=CheckpointsArgs(
checkpoints_path="./checkpoints",
checkpoint_interval=10,
),
parallelism=ParallelismArgs(
dp=1,
pp=1,
tp=2,
expert_parallel_size=2,
pp_engine="1f1b",
tp_mode="ALL_REDUCE",
tp_linear_async_communication=False,
),
tokenizer=TokenizerArgs("gpt2"),
optimizer=OptimizerArgs(
zero_stage=0,
weight_decay=0.01,
clip_grad=1.0,
accumulate_grad_in_fp32=False,
adam_eps=1e-08,
adam_beta1=0.9,
adam_beta2=0.95,
torch_adam_is_fused=True,
learning_rate_scheduler=LRSchedulerArgs(
learning_rate=3e-4,
lr_warmup_steps=100,
lr_warmup_style="linear",
lr_decay_style="cosine",
min_decay_lr=1e-5,
),
),
logging=LoggingArgs(),
tokens=TokensArgs(sequence_length=16, train_steps=10, micro_batch_size=16, batch_accumulation_per_replica=1),
data_stages=[
DatasetStageArgs(
name="train",
start_training_step=1,
data=DataArgs(
seed=42,
num_loading_workers=1,
dataset=PretrainDatasetsArgs(
hf_dataset_or_datasets="HuggingFaceH4/testing_alpaca_small",
hf_dataset_splits="train",
text_column_name="completion",
dataset_processing_num_proc_per_process=12,
),
),
)
],
)
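# Typical usage, mirroring the tests further below:
#     model_args = ModelArgs(init_method=RandomInit(std=1.0), model_config=TINY_LLAMA_CONFIG)
#     config = get_llama_training_config(model_args)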
def create_llama_from_config(
model_config: LlamaConfig, device: torch.device, parallel_context: ParallelContext
) -> LlamaForTraining:
"""
Creates and returns a nanotron model.
    The model is built from `model_config` on the given `device`, sharded according to
    `parallel_context`, and returned with randomly allocated (uninitialized) weights;
    callers typically initialize it afterwards (e.g. via `init_model_randomly`) or load a checkpoint.
    """
parallel_config = ParallelismArgs(
dp=parallel_context.data_parallel_size,
pp=parallel_context.pipeline_parallel_size,
tp=parallel_context.tensor_parallel_size,
pp_engine=AllForwardAllBackwardPipelineEngine(),
tp_mode=TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
model = build_model(
model_builder=lambda: LlamaForTraining(
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=None,
),
parallel_context=parallel_context,
dtype=torch.bfloat16,
device=device,
)
mark_tied_parameters(model=model, parallel_context=parallel_context)
return model
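# Typical usage, taken from the pipeline-parallel tests further below:
#     model = create_llama_from_config(
#         model_config=config.model.model_config,
#         device=torch.device("cuda"),
#         parallel_context=parallel_context,
#     )
#     model.init_model_randomly(config=config)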
import contextlib
import os
import re
from inspect import signature
from typing import Any, Callable, Dict, List, Optional, Tuple
import torch.cuda
import torch.multiprocessing as mp
from nanotron.parallel import ParallelContext
from packaging import version
def available_gpus():
if not torch.cuda.is_available():
return 0
device_properties = [torch.cuda.get_device_properties(i) for i in range(torch.cuda.device_count())]
    # We filter out display-only devices that shouldn't be used for compute
blacklisted_gpu_names = {"NVIDIA DGX Display"}
device_properties = [property_ for property_ in device_properties if property_.name not in blacklisted_gpu_names]
# TODO @thomasw21: Can we do this cross node
return len(device_properties)
# from https://stackoverflow.com/a/34333710/9201239
@contextlib.contextmanager
def mock_os_environ(remove_keys: List[str] = None, update_key_values: Dict[str, Any] = None):
"""
Temporarily updates the ``os.environ`` dictionary in-place.
The ``os.environ`` dictionary is updated in-place so that the modification is sure to work in all situations.
Args:
remove_keys: Environment variables to remove.
update_key_values: Dictionary of environment variables and values to add/update.
"""
env = os.environ
update_key_values = update_key_values or {}
remove_keys = remove_keys or []
update_keys = set(update_key_values.keys())
remove_keys = set(remove_keys)
assert remove_keys.isdisjoint(update_keys)
stomped = (update_keys | remove_keys) & set(env.keys())
    reverse_change = {
        # Original values of updated environment variables, restored on exit.
        **{k: env[k] for k in update_keys & stomped},
        # Original values of removed environment variables, restored on exit.
        **{k: env[k] for k in remove_keys & stomped},
    }
try:
env.update(update_key_values)
for k in remove_keys:
env.pop(k, None)
yield
finally:
env.update(reverse_change)
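# Minimal usage sketch (illustrative only; the environment variable names are arbitrary):
#     with mock_os_environ(remove_keys=["CUDA_VISIBLE_DEVICES"], update_key_values={"MASTER_PORT": "29500"}):
#         ...  # code under test sees the patched os.environ
#     # on exit, the original values are restored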
def is_dict_equal(first: Dict, second: Dict, sub_paths: Optional[List[str]] = None) -> Tuple[bool, Optional[str]]:
"""Returns True or False if the dictionaries match, and an additional message when it's False"""
if sub_paths is None:
sub_paths = []
first_keys = set(first.keys())
second_keys = set(second.keys())
if first_keys != second_keys:
return False, f"Keys don't match in {'.'.join(sub_paths)}.\nCur: {first_keys}\nRef: {second_keys}"
for key in first_keys:
first_elt = first[key]
second_elt = second[key]
if isinstance(first_elt, dict):
if not isinstance(second_elt, dict):
return (
False,
f"Object types don't match in {'.'.join(sub_paths + [str(key)])}.\nCur: {first_elt}\nRef: {second_elt}",
)
match, msg = is_dict_equal(first_elt, second_elt, sub_paths=sub_paths + [str(key)])
if match is False:
return False, msg
elif isinstance(first_elt, torch.Tensor):
if not isinstance(second_elt, torch.Tensor):
return (
False,
f"Object types don't match in {'.'.join(sub_paths + [str(key)])}.\nCur: {first_elt}\nRef: {second_elt}",
)
try:
torch.testing.assert_close(
first_elt,
second_elt,
atol=0.0,
rtol=0.0,
msg=lambda msg: f"Tensor at {'.'.join(sub_paths + [str(key)])} don't match.\nCur: {first_elt}\nRef: {second_elt}\n{msg}",
)
except AssertionError as error:
return False, error.args[0]
else:
if first_elt != second_elt:
return (
False,
f"Objects at key {'.'.join(sub_paths + [str(key)])} don't match.\nCur: {first_elt}\nRef: {second_elt}",
)
return True, None
def get_all_3d_configurations(gpus: int) -> List[Tuple[int, int, int]]:
"""Given a number of gpus, we want all 3d configurations possible such that pp * dp * tp = gpus"""
result = []
for tp in range(1, gpus + 1):
if gpus % tp != 0:
continue
gpus_left_after_tp = gpus // tp
for dp in range(1, gpus_left_after_tp + 1):
if gpus_left_after_tp % dp != 0:
continue
gpus_left_after_dp = gpus_left_after_tp // dp
for pp in range(1, gpus_left_after_dp + 1):
if gpus_left_after_dp % pp != 0:
continue
if tp * dp * pp == gpus:
result.append((pp, dp, tp))
return result
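# For example, get_all_3d_configurations(4) returns the (pp, dp, tp) triplets
# (4, 1, 1), (2, 2, 1), (1, 4, 1), (2, 1, 2), (1, 2, 2) and (1, 1, 4).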
def rerun_if_address_is_in_use(max_try: int = 500):
"""
This function reruns a wrapped function if "address already in use" occurs
in testing spawned with torch.multiprocessing
Credits: https://github.com/hpcaitech/ColossalAI/blob/adae123df3badfb15d044bd416f0cf29f250bc86/colossalai/testing/utils.py#L157
Usage::
@rerun_if_address_is_in_use()
def test_something():
...
"""
# check version
torch_version = version.parse(torch.__version__)
assert torch_version.major >= 1
# only torch >= 1.8 has ProcessRaisedException
if torch_version >= version.parse("1.8.0"):
exception = torch.multiprocessing.ProcessRaisedException
else:
exception = Exception
func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*Address already in use.*", max_try=max_try)
return func_wrapper
def rerun_on_exception(exception_type: Exception = Exception, pattern: str = None, max_try: int = 10) -> Callable:
"""
A decorator on a function to re-run when an exception occurs.
Credits: https://github.com/hpcaitech/ColossalAI/blob/adae123df3badfb15d044bd416f0cf29f250bc86/colossalai/testing/utils.py#L71
Usage::
# rerun for all kinds of exception
@rerun_on_exception()
def test_method():
print('hey')
raise RuntimeError('Address already in use')
# rerun for RuntimeError only
@rerun_on_exception(exception_type=RuntimeError)
def test_method():
print('hey')
raise RuntimeError('Address already in use')
# rerun for maximum 10 times if Runtime error occurs
@rerun_on_exception(exception_type=RuntimeError, max_try=10)
def test_method():
print('hey')
raise RuntimeError('Address already in use')
# rerun for infinite times if Runtime error occurs
@rerun_on_exception(exception_type=RuntimeError, max_try=None)
def test_method():
print('hey')
raise RuntimeError('Address already in use')
    # rerun only if the exception message matches the pattern,
    # for an unlimited number of times if a RuntimeError occurs
@rerun_on_exception(exception_type=RuntimeError, pattern="^Address.*$")
def test_method():
print('hey')
raise RuntimeError('Address already in use')
Args:
exception_type (Exception, Optional): The type of exception to detect for rerun
pattern (str, Optional): The pattern to match the exception message.
If the pattern is not None and matches the exception message,
the exception will be detected for rerun
        max_try (int, Optional): Maximum number of reruns for this function. The default value is 10.
            If max_try is None, it will rerun indefinitely as long as the exception keeps occurring.
"""
def _match_lines(lines, pattern):
for line in lines:
if re.match(pattern, line):
return True
return False
def _wrapper(func):
def _run_until_success(*args, **kwargs):
try_count = 0
assert max_try is None or isinstance(
max_try, int
), f"Expected max_try to be None or int, but got {type(max_try)}"
while max_try is None or try_count < max_try:
try:
try_count += 1
ret = func(*args, **kwargs)
return ret
except exception_type as e:
error_lines = str(e).split("\n")
                    if (max_try is None or try_count < max_try) and (pattern is None or _match_lines(error_lines, pattern)):
                        print("Exception is caught, retrying...")
                        # when no pattern is specified, we always swallow the exception and retry;
                        # when a pattern is specified, we only retry if the pattern matches
                        continue
else:
print("Maximum number of attempts is reached or pattern is not matched, no more retrying...")
raise e
# Override signature
# otherwise pytest.mark.parameterize will raise the following error:
# function does not use argument xxx
sig = signature(func)
_run_until_success.__signature__ = sig
return _run_until_success
return _wrapper
def global_wrapper(rank, func, tp, pp, dp, port, kwargs):
def setup_dist_env(rank, world_size, port):
os.environ["WORLD_SIZE"] = str(world_size)
os.environ["RANK"] = str(rank)
        # NOTE: unit tests run on a single node, so using the global rank as the local rank is fine
os.environ["LOCAL_RANK"] = str(rank)
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(port)
world_size = tp * pp * dp
setup_dist_env(rank, world_size, port)
parallel_context = ParallelContext(data_parallel_size=dp, pipeline_parallel_size=pp, tensor_parallel_size=tp)
func(parallel_context, **kwargs)
def init_distributed(tp: int, dp: int, pp: int):
def _init_distributed(func):
def wrapper(**kwargs):
from nanotron.utils import find_free_port
world_size = tp * pp * dp
port = find_free_port()
# Note that kwargs needs to be passed as part of args in a way that can be unpacked
args = (func, tp, pp, dp, port, kwargs)
mp.spawn(global_wrapper, args=args, nprocs=world_size)
return wrapper
return _init_distributed
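# Usage pattern used throughout the tests: the decorated function receives a freshly created
# ParallelContext as its first positional argument, plus any keyword arguments, e.g.
#     init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_with_pp)(norm_type=2.0)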
import torch
from nanotron.logging import LoggerWriter
from nanotron.nn.layer_norm import TritonLayerNorm
from torch.nn import LayerNorm
def get_time_name():
import datetime
today = datetime.datetime.now()
return today.strftime("%d/%m/%Y_%H:%M:%S")
if __name__ == "__main__":
BATCH_SIZE = 1
SEQ_LEN = 2
DEVICE, DTYPE = torch.device("cuda:0"), torch.float32
HIDDEN_SIZE = 1024
NUM_STEPS = 10_000
inputs = torch.randn(BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE, device=DEVICE, dtype=DTYPE)
layer_norm = LayerNorm(normalized_shape=inputs.size(-1), device=DEVICE, dtype=DTYPE)
fused_layer_norm = TritonLayerNorm(
normalized_shape=inputs.size(-1),
device=DEVICE,
dtype=DTYPE,
)
ref_optim = torch.optim.Adam(layer_norm.parameters(), lr=0.1)
optim = torch.optim.Adam(fused_layer_norm.parameters(), lr=0.1)
logger = LoggerWriter()
def loss_function(x):
return x.sum()
for step in range(NUM_STEPS):
# NOTE: just make the output fluctuate a bit
random = torch.randn(1, device=DEVICE) * 0.01
ref_outputs = layer_norm(inputs) * random
outputs = fused_layer_norm(inputs) * random
loss = loss_function(outputs)
ref_loss = loss_function(ref_outputs)
ref_optim.zero_grad()
ref_loss.backward()
ref_optim.step()
optim.zero_grad()
loss.backward()
optim.step()
print(f"Step: {step}, outputs: {outputs.sum()}, ref_loss: {ref_outputs.sum()}")
print(f"Step: {step}, loss: {loss}, ref_loss: {ref_loss}")
# wandb.log({"loss": loss.item(), "ref_loss": ref_loss.item(), "step": step})
logger.add_scalar("loss", loss.item(), step)
logger.add_scalar("ref_loss", ref_loss.item(), step)
import pytest
import torch
from torch.nn import LayerNorm
from nanotron.nn.layer_norm import TritonLayerNorm
@pytest.mark.fa2
@pytest.mark.parametrize(
"hidden_size",
[1024, 1025], # fused layer norm supports 1024 as hidden size but not 1025
)
def test_fused_layer_norm(hidden_size):
BATCH_SIZE = 5
SEQ_LEN = 128
DEVICE, DTYPE = torch.device("cuda:0"), torch.float16
inputs = torch.rand(BATCH_SIZE, SEQ_LEN, hidden_size, device=DEVICE, dtype=DTYPE)
layer_norm = LayerNorm(normalized_shape=inputs.size(-1), device=DEVICE, dtype=DTYPE)
ref_outputs = layer_norm(inputs)
fused_layer_norm = TritonLayerNorm(
normalized_shape=inputs.size(-1),
device=DEVICE,
dtype=DTYPE,
)
outputs = fused_layer_norm(inputs)
    # NOTE: with torch.float16, FA2's own tests use an atol of 1e-2
# https://github.com/Dao-AILab/flash-attention/blob/87a1277653fc55cd615f5341255e00c69d5c00a1/tests/ops/triton/test_layer_norm.py#L63-L64
torch.testing.assert_close(outputs, ref_outputs, rtol=1e-3, atol=1e-2)
outputs.sum().backward()
ref_outputs.sum().backward()
# NOTE: same as above
torch.testing.assert_close(fused_layer_norm.weight.grad, layer_norm.weight.grad, rtol=1e-3, atol=1e-2)
torch.testing.assert_close(fused_layer_norm.bias.grad, layer_norm.bias.grad, rtol=1e-3, atol=1e-2)
import sys
from math import isclose
from pathlib import Path
from typing import List
package_path = Path(__file__).parent.parent
sys.path.append(str(package_path))
import numpy as np
import pytest
from helpers.context import TestContext
from helpers.data import (
assert_batch_dataloader,
assert_nanoset_sync_across_all_ranks,
compute_batch_hash,
create_dataset_paths,
create_dummy_json_dataset,
preprocess_dummy_dataset,
)
from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.data.nanoset import Nanoset
from nanotron.data.utils import count_dataset_indexes, normalize
from nanotron.parallel import ParallelContext
from nanotron.utils import main_rank_first
from transformers import AutoTokenizer
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("train_steps", [500, 10000])
@pytest.mark.parametrize("sequence_length", [512, 8192])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_build_nanoset_dataloader(
tp: int, dp: int, pp: int, train_steps: int, sequence_length: int, tokenizer_name_or_path: str
):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
train_steps=train_steps,
sequence_length=sequence_length,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_build_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
train_steps: int,
sequence_length: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
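    # token_size is the number of bytes used to store each token id in the tokenized dataset:
    # 2 (uint16) when the vocabulary fits in 65536 ids, otherwise 4 (uint32).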
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Assert we have the same Nanoset in all ranks
assert_nanoset_sync_across_all_ranks(train_dataset, parallel_context)
dataset_sample_count = count_dataset_indexes(train_dataset.dataset_index, len(train_dataset.dataset_folders))
for idx, ds_length in enumerate(train_dataset.dataset_lengths):
# Assert Nanoset doesn't sample indexes greater than the datasets
assert (
np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index == idx, initial=-1)
< ds_length
), f"Error building Nanoset Indexes: Tryng to access sample {np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index==idx, initial = -1)} of a {ds_length} sample dataset"
# Assert Nanoset builds up the correct blend WRT the dataset_weights
assert isclose(
normalize(dataset_sample_count).tolist()[idx], train_dataset.dataset_weights[idx], abs_tol=0.05
), f"Requested Nanoset to contain {round(train_dataset.dataset_weights[idx]*100, 2)}% of samples from {train_dataset.dataset_folders[idx]} but got {round(normalize(dataset_sample_count).tolist()[idx]*100, 2)}%"
# Create Dataloaders
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=sequence_length,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Check a batch produced by the Dataloader
batch = next(iter(dataloader))
assert_batch_dataloader(
batch=batch,
parallel_context=parallel_context,
micro_batch_size=MICRO_BATCH_SIZE,
sequence_length=sequence_length,
)
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("skipped_batches", [20, 5555])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int, tokenizer_name_or_path: str):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
skipped_batches=skipped_batches,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_recover_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
skipped_batches: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
SEQUENCE_LENGTH = 1024
TRAIN_STEPS = 10000
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Create initial Dataloader
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Recover from failures
dataloader = iter(dataloader)
for _ in range(skipped_batches + 1): # In order to compare with the first batch of the recovered DataLoader
batch = next(dataloader)
# Create recover Dataloader
recovered_dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
            # NOTE: the dataloader serves batches of micro_batch_size regardless of batch_accumulation_per_replica
consumed_train_samples=skipped_batches * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(),
)
recovered_first_batch = next(iter(recovered_dataloader))
assert compute_batch_hash(batch) == compute_batch_hash(recovered_first_batch)
parallel_context.destroy()
[pytest]
addopts=-n 35
markers =
fa2: FA2-related
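# NOTE: "-n 35" distributes the test session across 35 workers and requires the pytest-xdist plugin;
# the "fa2" marker tags FA2 (flash-attention) related tests such as the fused layer-norm test above.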
import pytest
import torch
import torch.distributed as dist
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import Config, ModelArgs, RandomInit
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from torch import nn
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 2, 2)])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_named_modules_in_pp_rank(tp: int, dp: int, pp: int):
model_args = ModelArgs(init_method=RandomInit(std=1.0), model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_named_modules_in_pp_rank)(config=config)
def _test_get_named_modules_in_pp_rank(
parallel_context: ParallelContext,
config: Config,
):
model = create_llama_from_config(
model_config=config.model.model_config,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
model.init_model_randomly(config=config)
modules_that_not_in_current_pp_rank = {}
current_pp_rank = dist.get_rank(group=parallel_context.pp_pg)
for name, module in model.named_modules():
if isinstance(module, PipelineBlock) and module.rank != current_pp_rank:
modules_that_not_in_current_pp_rank[name] = module
named_modules_in_pp_rank = model.named_modules_in_pp_rank
for name, module in named_modules_in_pp_rank.items():
# NOTE: if a module is in the current rank, we expect it to be an initialized module
# not PipelineBlock
assert isinstance(module, nn.Module)
assert name not in modules_that_not_in_current_pp_rank
from typing import Union
import torch
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.utils import checkpoint_method
from torch import nn
class CheckpointedModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense1 = nn.Linear(10, 10)
self.dense2 = nn.Linear(10, 10)
self.dropout = nn.Dropout(0.1)
self.is_checkpointed = is_checkpointed
self.fwd_counter = 0
@checkpoint_method("is_checkpointed")
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense1(x)
if self.is_checkpointed and self.fwd_counter == 0:
assert not x.requires_grad, "x should not require grad when checkpointed, because fwd runs in no_grad mode"
assert (
x.grad_fn is None
), "x should not store any activation when checkpointed, because fwd runs in no_grad mode"
x = self.dense2(x)
x = self.dropout(x)
self.fwd_counter += 1
return x
class DummyModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense0 = nn.Linear(10, 10)
self.checkpointed_model = CheckpointedModel(is_checkpointed=is_checkpointed)
self.dense3 = nn.Linear(10, 10)
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense0(x)
x = self.checkpointed_model(x)
assert x.requires_grad # inside forward, x should require grad even if calculated in no_grad mode
x = self.dense3(x)
return x
def test_activation_checkpointing():
dtype = torch.float16
device = torch.device("cuda")
test_model = DummyModel(is_checkpointed=True)
ref_model = DummyModel(is_checkpointed=False)
for model in [test_model, ref_model]:
model.to(device=device, dtype=dtype)
# copy weights
test_model.load_state_dict(ref_model.state_dict())
assert test_model.checkpointed_model.is_checkpointed is True
assert ref_model.checkpointed_model.is_checkpointed is False
# generate random input
x = torch.randn(10, 10, device=device, dtype=dtype)
# Forward pass
with torch.random.fork_rng(devices=["cuda"]):
ref_output = ref_model(x)
checkpointed_output = test_model(x)
assert test_model.checkpointed_model.fwd_counter == 1
torch.testing.assert_close(checkpointed_output, ref_output)
# Backward pass (check that fwd is called twice, and that we don't store the activations)
ref_output.sum().backward()
assert ref_model.checkpointed_model.fwd_counter == 1, "ref_model fwd should not be called twice"
# make sure grads are not synced between test_model and ref_model
assert ref_model.dense0.weight.grad is not None
assert test_model.dense0.weight.grad is None
assert test_model.checkpointed_model.fwd_counter == 1
checkpointed_output.sum().backward()
assert test_model.checkpointed_model.fwd_counter == 2, "test_model fwd should be called twice"
# compare all models grads
for ref_param, checkpointed_param in zip(ref_model.parameters(), test_model.parameters()):
torch.testing.assert_close(ref_param.grad, checkpointed_param.grad)
# TODO @nouamanetazi: test `checkpoint_method` vs `torch.utils.checkpoint.checkpoint`
# TODO @nouamanetazi: test a method with kwargs values
# TODO @nouamanetazi: test `checkpoint_method` in a distributed setting
# TODO @nouamanetazi: test BatchNorm layers with checkpointing
import math
import os
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.optim.clip_grads import clip_grad_norm
from nanotron.optim.gradient_accumulator import (
FP32GradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelColumnLinear,
)
from nanotron.parallel.tied_parameters import (
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_pp requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_pp(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_with_pp)(norm_type=norm_type)
def _test_clip_grads_with_pp(parallel_context: ParallelContext, norm_type: float):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
    # Check that gradients are the same as the reference
pp_rank = dist.get_rank(parallel_context.pp_pg)
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(non_linear.bias.grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[model.mlp[pp_rank].linear.pp_block.weight.grad, model.mlp[pp_rank].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
non_linear = model.mlp[current_pp_rank].linear.pp_block
old_weight_grad = non_linear.weight.grad.clone()
old_bias_grad = non_linear.bias.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
reference_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
torch.testing.assert_close(total_norm, reference_total_norm, atol=1e-6, rtol=1e-7)
# Check that grad changed
assert not torch.allclose(old_weight_grad, non_linear.weight.grad), "Grad should have changed"
    assert not torch.allclose(old_bias_grad, non_linear.bias.grad), "Grad should have changed"
    # Check that gradients are the same as the reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
print(parallel_context.__dir__())
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_tp requires at least 2 gpus")
@pytest.mark.parametrize(
"tp_mode,async_communication",
[
pytest.param(TensorParallelLinearMode.ALL_REDUCE, False),
pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, True),
],
)
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_tp(tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float):
init_distributed(tp=2, dp=1, pp=1)(_test_clip_grads_with_tp)(
tp_mode=tp_mode, async_communication=async_communication, norm_type=norm_type
)
def _test_clip_grads_with_tp(
parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
in_features = 4
out_features_per_tp_rank = 8
out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
# Sharded
column_linear = TensorParallelColumnLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
)
# Un-sharded
reference_linear = nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_gather(
tensor_list=list(reference_linear.weight.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.weight,
group=parallel_context.tp_pg,
)
dist.all_gather(
tensor_list=list(reference_linear.bias.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.bias,
group=parallel_context.tp_pg,
)
# Generate random input
random_input: torch.Tensor
sharded_random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
sharded_random_input = random_input
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
sharded_batch_size = 5
sharded_random_input = torch.randn(sharded_batch_size, in_features, device="cuda")
random_input = torch.empty(
sharded_batch_size * parallel_context.tp_pg.size(),
*(sharded_random_input.shape[1:]),
device=sharded_random_input.device,
dtype=sharded_random_input.dtype,
)
dist.all_gather_into_tensor(random_input, sharded_random_input, group=parallel_context.tp_pg)
else:
ValueError(f"Unsupported mode: {tp_mode}")
# Test that we get the same output after forward pass
sharded_output = column_linear(sharded_random_input)
reference_output = reference_linear(random_input)
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(
sharded_output,
reference_output[
:,
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank,
],
atol=1e-6,
rtol=1e-7,
)
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
old_grad = column_linear.weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=column_linear.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_(reference_linear.parameters(), max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, column_linear.weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(total_norm, ref_total_norm)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_tied_weights requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_tied_weights(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_tied_weights)(norm_type=norm_type)
def _test_clip_grads_tied_weights(parallel_context: ParallelContext, norm_type: float):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda")})
else:
model = nn.ModuleDict({"dense1": nn.Linear(10, 10, device="cuda")})
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Sync tied weights: basic assumption
initial_sync(model=model, parallel_context=parallel_context)
# Check that weights are now synced
assert_tensor_synced_across_pg(weight, group)
assert_tensor_synced_across_pg(bias, group)
# Compute gradient
input_ = torch.randn(13, 10, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0:
out = model.dense0(input_)
else:
out = model.dense1(input_)
out.sum().backward()
# sync gradients
sync_tied_weights_gradients(model, parallel_context=parallel_context, grad_accumulator=None)
    # We check that both gradients are synchronized
assert_tensor_synced_across_pg(weight.grad, group)
assert_tensor_synced_across_pg(bias.grad, group)
# Save grads as reference
ref_weight = weight.clone()
ref_weight.grad = weight.grad.clone()
ref_bias = bias.clone()
ref_bias.grad = bias.grad.clone()
old_grad = weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_([ref_weight, ref_bias], max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
assert torch.allclose(weight.grad, ref_weight.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(bias.grad, ref_bias.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(total_norm, ref_total_norm, rtol=0, atol=0), f"Got {total_norm} and {ref_total_norm}"
parallel_context.destroy()
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_fp32_accumulator(norm_type: float, half_precision: torch.dtype):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_fp32_accumulator)(
norm_type=norm_type, half_precision=half_precision
)
def _test_clip_grads_fp32_accumulator(
parallel_context: ParallelContext, norm_type: float, half_precision: torch.dtype
):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p).to(torch.float)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# model goes to half precision
model = model.to(half_precision)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Add gradient accumulator
grad_accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(
pp_pg=parallel_context.pp_pg, dtype=half_precision
) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=grad_accumulator,
)
    # We're going to copy the model gradients to the reference model gradients.
    # The reason we do this, instead of computing the backward pass with autograd on the reference model, is numerical precision.
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
reference_non_linear.weight.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.weight").clone()
reference_non_linear.bias.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.bias").clone()
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.grad = weight_grad
reference_non_linear.bias.grad = bias_grad
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
old_fp32_grads = {
name: grad_accumulator.get_grad_buffer(name=name).clone() for name, _ in model.named_parameters()
}
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=grad_accumulator,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
ref_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
# Check that the gradients have changed
for name, _ in model.named_parameters():
new_fp32_grad = grad_accumulator.get_grad_buffer(name=name)
assert not torch.allclose(old_fp32_grads[name], new_fp32_grad), "Gradients should have changed after clipping"
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
if has_reference_model:
torch.testing.assert_close(
total_norm.view(1),
ref_total_norm.view(1),
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Expected {total_norm} to match {ref_total_norm}.\n{msg}",
)
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
torch.testing.assert_close(
reference_non_linear.weight.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.weight"),
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.bias"),
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
reference_non_linear.weight.grad,
weight_grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
bias_grad,
atol=1e-6,
rtol=1e-7,
)
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
parallel_context.destroy()
from contextlib import nullcontext
import pytest
import torch
from helpers.exception import assert_fail_except_rank_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd
from nanotron.parallel.parameters import NanotronParameter
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
from torch.distributed import GradBucket
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("accumulation_steps", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_afab(accumulation_steps):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_afab)(accumulation_steps=accumulation_steps)
def _test_ddp_with_afab(parallel_context: ParallelContext, accumulation_steps: int):
half_precision = torch.float16
def allreduce_hook(process_group: dist.ProcessGroup, bucket: GradBucket):
# DDP groups grads in GradBuckets. This hook is called throughout the bwd pass, once each bucket is ready to overlap communication with computation.
# See https://pytorch.org/docs/stable/ddp_comm_hooks.html#what-does-a-communication-hook-operate-on for more details.
half_flat_bucket_buffer = bucket.buffer()
group_to_use = process_group if process_group is not None else parallel_context.dp_pg
return (
dist.all_reduce(half_flat_bucket_buffer, group=group_to_use, async_op=True, op=dist.ReduceOp.AVG)
.get_future()
.then(lambda fut: fut.value()[0])
)
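    # DDP copies the tensor resolved by the hook's returned Future back into the bucket's flat
    # buffer, so the hook above reproduces the default averaging all-reduce across the DP group.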
model_hook = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model_hook.weight = NanotronParameter(model_hook.weight)
model_ddp_hook = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Register DDP hook
model_ddp_hook.register_comm_hook(state=None, hook=allreduce_hook)
activations = []
# All forward
for i in range(accumulation_steps):
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
with model_ddp_hook.no_sync():
loss_hook = model_ddp_hook(input).sum()
activations.append(loss_hook)
# All backward
for i in range(accumulation_steps):
context = nullcontext()
if i == accumulation_steps - 1:
context = ddp_trigger_sync_in_bwd(model_ddp_hook) # triggers a sync for the final backward
loss_hook = activations[i]
with context:
loss_hook.backward()
grad_hook = model_ddp_hook.module.weight.grad.clone()
# Check that the gradients are synchronized across DP
if i == accumulation_steps - 1:
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
else:
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
parallel_context.destroy()
import numpy as np
import pytest
import torch.distributed as dist
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
rerun_if_address_is_in_use,
)
from nanotron.parallel import ParallelContext
from torch.distributed import ProcessGroup
def _test_init_parallel_context(parallel_context: ParallelContext):
assert dist.is_initialized() is True
assert isinstance(parallel_context.world_pg, ProcessGroup)
assert isinstance(parallel_context.tp_pg, ProcessGroup) if parallel_context.tensor_parallel_size > 1 else True
assert isinstance(parallel_context.pp_pg, ProcessGroup) if parallel_context.pipeline_parallel_size > 1 else True
assert isinstance(parallel_context.dp_pg, ProcessGroup) if parallel_context.data_parallel_size > 1 else True
world_rank = dist.get_rank(parallel_context.world_pg)
ranks3d = parallel_context.get_local_ranks(world_rank)
assert isinstance(ranks3d, tuple) and len(ranks3d)
assert isinstance(parallel_context.world_rank_matrix, np.ndarray)
assert isinstance(parallel_context.world_ranks_to_pg, dict)
local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank))
global_rank = parallel_context.get_global_rank(*local_rank)
assert isinstance(global_rank, np.int64), f"The type of global_rank is {type(global_rank)}"
assert global_rank == dist.get_rank()
parallel_context.destroy()
assert dist.is_initialized() is False
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_init_parallel_context(tp: int, dp: int, pp: int):
init_distributed(tp=tp, dp=dp, pp=pp)(_test_init_parallel_context)()
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.helpers import get_custom_lr_for_named_parameters
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 1, 1), (1, 1, 2), (2, 1, 2)])
@pytest.mark.parametrize(
"parametrization_method", [ParametrizationMethod.STANDARD, ParametrizationMethod.SPECTRAL_MUP]
)
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_custom_lr(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
LR = 1e-3
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_custom_lr)(
lr=LR,
init_method=init_method,
parametrization_method=parametrization_method,
)
def _test_get_custom_lr(
parallel_context: ParallelContext,
lr: float,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
named_parameters = list(llama.get_named_params_with_correct_tied())
if len(named_parameters) == 0:
# NOTE: some pp ranks don't have any parameters
return
named_param_groups = get_custom_lr_for_named_parameters(
parametrization_method=parametrization_method, lr=lr, named_parameters=named_parameters, model=llama
)
assert len(named_param_groups) == len(named_parameters)
assert all(isinstance(named_param_group["lr"], float) for named_param_group in named_param_groups)
assert all(isinstance(named_param_group["named_params"], list) for named_param_group in named_param_groups)
is_all_lr_the_same = parametrization_method == ParametrizationMethod.STANDARD
assert all(named_param_group["lr"] == lr for named_param_group in named_param_groups) is is_all_lr_the_same
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import OptimizerFromGradientAccumulator
from nanotron.parallel.context import ParallelContext
from nanotron.parallel.parameters import NanotronParameter
from nanotron.random import set_random_seed
class DummyModel(nn.Module):
def __init__(self, dtype=torch.float32):
super(DummyModel, self).__init__()
self.fc1 = nn.Linear(10, 20, bias=False).to(dtype=dtype)
self.fc2 = nn.Linear(20, 2, bias=False).to(dtype=dtype)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return x
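# Illustrative sketch (not used by the tests below): the hand-computed references in these tests
# assume plain SGD without momentum, where weight decay is folded into the gradient
# (g' = g + wd * w), so a single step is
#   w_new = w - lr * g' = (1 - lr * wd) * w - lr * g
# The hypothetical helper below simply restates that formula.
def _reference_sgd_step(weight: torch.Tensor, grad: torch.Tensor, lr: float, weight_decay: float = 0.0) -> torch.Tensor:
    """Single plain-SGD step matching the manual expectations computed in the tests."""
    return (1 - lr * weight_decay) * weight - lr * grad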
def test_optimizer_lr_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
# compute gradient manually
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_weight_decay_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
weight_decay = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_weight_decay_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
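# The grad-accumulation tests below exercise the same updates, but with gradients kept in an fp32
# buffer while the model runs in half precision: weights are wrapped in NanotronParameter,
# `accumulator.backward(loss)` accumulates fp32 copies of the half-precision grads, and the manual
# reference reads them back via `accumulator.get_grad_buffer(name=...)`, casting to the model
# dtype before applying the SGD update.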
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the reference weight update
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_multiple_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the reference weight update
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
weight_decay = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the reference weight update
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the reference weight update
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.skipif(available_gpus() < 2, reason="Testing requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@rerun_if_address_is_in_use()
def test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
)
def _test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
parallel_context: ParallelContext, half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
# Use larger, distinct values so that the per-group differences are more visible during the update
lr1, lr2 = 0.04, 0.05
weight_decay1, weight_decay2 = 0.5, 0.2
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the reference weight update
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="module.fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="module.fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
import contextlib
import pytest
import torch
from helpers.exception import assert_fail_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.p2p import P2P
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("send_contiguous", [True, False])
@pytest.mark.parametrize("full", [True, False])
@rerun_if_address_is_in_use()
def test_check_send_recv_tensor(send_contiguous: bool, full: bool):
init_distributed(tp=1, dp=1, pp=2)(_test_check_send_recv_tensor)(send_contiguous=send_contiguous, full=full)
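# In the helper below, rank 0 builds a tensor (optionally made non-contiguous via a transpose,
# optionally sliced), sends it to rank 1, which echoes it back, and checks the round trip is
# bit-exact. Sending a non-contiguous slice is expected to fail with the storage-size assertion,
# since the P2P layer refuses to ship more storage than the tensor actually covers.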
def _test_check_send_recv_tensor(parallel_context: ParallelContext, send_contiguous: bool, full: bool):
p2p = P2P(pg=parallel_context.pp_pg, device=torch.device("cuda"))
if dist.get_rank(p2p.pg) == 0:
tensor_to_send = torch.randn(3, 5, dtype=torch.float, device=torch.device("cuda"))
if send_contiguous is True:
assert tensor_to_send.is_contiguous()
else:
tensor_to_send = tensor_to_send.transpose(0, 1)
assert not tensor_to_send.is_contiguous()
# `full` defines whether we send the full tensor; if False we take a non-trivial slice of it
if full is False:
tensor_to_send = tensor_to_send[1:3]
if send_contiguous is False and full is False:
# This is expected to raise an AssertionError mentioning that we should have sent a smaller tensor by calling `contiguous()` first.
send_first_context = assert_fail_with(
AssertionError,
error_msg="Expect storage_size to be smaller than tensor size. It might not be true, when you use slicing for example though. We probably don't want to support it in our P2P system",
)
fail_at_first_send = True
else:
send_first_context = contextlib.nullcontext()
fail_at_first_send = False
# Send tensor back and forth through p2p protocol and check that we get the same thing.
if dist.get_rank(p2p.pg) == 0:
with send_first_context:
handles = p2p.isend_tensors([tensor_to_send], to_rank=1)
if fail_at_first_send is True:
# We early return if we caught an error
return
for handle in handles:
handle.wait()
tensor_travelled_back_and_forth = p2p.recv_tensors(1, from_rank=1)[0]
torch.testing.assert_close(tensor_to_send, tensor_travelled_back_and_forth, atol=0, rtol=0)
elif dist.get_rank(p2p.pg) == 1:
# Instead of letting the first rank hang (sending won't be possible), we return early
tensors, handles = p2p.irecv_tensors(1, from_rank=0)
if fail_at_first_send is True:
return
for handle in handles:
handle.wait()
tensor_to_recv = tensors[0]
p2p.send_tensors([tensor_to_recv], to_rank=0)
else:
raise ValueError()
if full is False and send_contiguous is True:
# We can actually check that we haven't sent the entire storage, since storage regions not accessed by the tensor are not sent
if dist.get_rank(p2p.pg) == 0:
# Check that the first elements of the storages don't match (they are not supposed to be communicated when the tensor is not full).
print(tensor_to_send.untyped_storage()[:4], tensor_travelled_back_and_forth.untyped_storage()[:4])
print(tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0))
print(tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0))
assert not torch.allclose(
tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0),
tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0),
)
parallel_context.destroy()
import torch
from helpers.exception import assert_fail_with
from nanotron.models.base import DTypeInvariantTensor, init_on_device_and_dtype
from nanotron.parallel.parameters import NanotronParameter
from torch import nn
def test_nanotron_parameter_does_not_override_some_parameter_variable():
param = nn.Parameter(torch.empty(3))
assert not hasattr(param, NanotronParameter.NANOTRON_PARAMETER_METADATA_ATTRIBUTE_NAME)
def test_uncastable_tensor():
# Test that we can create a DTypeInvariantTensor
x = DTypeInvariantTensor(torch.randn(3, 3))
assert isinstance(x, torch.Tensor)
assert isinstance(x, DTypeInvariantTensor)
# Test that we cannot modify the type of a DTypeInvariantTensor
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to float"):
x = x.float()
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to half"):
x = x.half()
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(torch.float32)
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(dtype=torch.float32)
# Test that we can modify the value of a DTypeInvariantTensor
x[0, 0] = 1
assert x[0, 0] == 1
# Test that we can modify the device of a DTypeInvariantTensor
x = x.to("cuda")
assert x.device.type == "cuda"
def test_register_buffer_does_not_update_uncastable_tensor():
old_device = torch.device("cuda")
old_dtype = torch.float32
new_device = torch.device("cpu")
new_dtype = torch.bfloat16
with init_on_device_and_dtype(device=new_device, dtype=new_dtype):
module = torch.nn.Module()
# Test that we can register a DTypeInvariantTensor as a buffer
tensor = DTypeInvariantTensor(torch.randn(3, 4, dtype=old_dtype, device=old_device))
module.register_buffer("buffer", tensor)
# Test that we can modify the buffer
module.buffer[0, 0] = 1
assert module.buffer[0, 0] == 1
# Test that device has been updated
assert module.buffer.device.type == new_device.type
# Test that dtype has not been modified
assert module.buffer.dtype is old_dtype
import copy
import nanotron.distributed as dist
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.exception import assert_fail_except_rank_with, timeout_after
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.models import init_on_device_and_dtype
from nanotron.optim import ZeroDistributedOptimizer
from nanotron.optim.gradient_accumulator import FP32GradBucketManager, FP32GradientAccumulator, get_fp32_accum_hook
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of
from nanotron.parallel.tied_parameters import (
get_tied_id_to_param,
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from nanotron.utils import ContextManagers
from torch import nn
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_promoting_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high-precision gradient and that the low-precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
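# Illustrative note (not asserted by the tests): why grads are promoted to fp32. float16 has
# 10 mantissa bits, so once an accumulator reaches 2048 adding 1.0 is rounded away, while an
# fp32 accumulator keeps it:
#   >>> (torch.tensor(2048.0, dtype=torch.float16) + 1.0).item()   # 2048.0, increment lost
#   >>> (torch.tensor(2048.0, dtype=torch.float32) + 1.0).item()   # 2049.0, increment kept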
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_accumulated_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
ref_model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
with torch.inference_mode():
ref_model.weight.copy_(model.weight)
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
grad_accumulation_steps = 2
for _ in range(grad_accumulation_steps):
# We want large input to have large gradients.
input = (torch.randn(5, 3, dtype=half_precision, device="cuda") ** 2 + 1) * 100
# Compute backwards
accumulator.backward(model(input).sum())
ref_model(input).sum().backward()
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
torch.testing.assert_close(
accumulator.parameters["weight"]["fp32"].grad.to(half_precision),
ref_model.weight.grad,
)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_optimizer_can_step_gradient_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
original_weight = model.weight.detach().clone()
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add optimizer
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
accumulator = optimizer.gradient_accumulator
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high-precision gradient and that the low-precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
optimizer.step()
optimizer.zero_grad()
# Check that we don't have gradients anymore and that they are set to `None`
assert accumulator.parameters["weight"]["fp32"].grad is None
assert model.weight.grad is None
# Check that gradients have been set to zero
fp32_grad = accumulator.get_grad_buffer(name="weight")
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
# Check that the weights have been updated
assert not torch.allclose(original_weight, model.weight)
@pytest.mark.skipif(available_gpus() < 2, reason="Testing ddp_hook_allreduce requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@pytest.mark.parametrize("train_iterations", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_grad_accum_in_fp32(half_precision: torch.dtype, accumulation_steps: int, train_iterations: int):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_grad_accum_in_fp32)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
train_iterations=train_iterations,
)
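# The helper below runs with DP=2 and compares two DDP-wrapped copies of the same model: a plain
# one whose half-precision grads are accumulated manually into `model_ddp_accum_ref`, and one
# using FP32GradientAccumulator plus the fp32-accumulation comm hook. During the first
# `accumulation_steps - 1` micro-steps (under `no_sync()`) the fp32 buckets must match the manual
# accumulation and must not be synced across DP; on the last step the hook all-reduces them and
# they must match the manually averaged reference, after which everything is zeroed out again.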
def _test_ddp_with_grad_accum_in_fp32(
parallel_context: ParallelContext,
half_precision: torch.dtype,
accumulation_steps: int,
train_iterations: int,
):
hidden_size = 32
n_layers = 3
model = nn.Sequential(
nn.Linear(3, hidden_size, bias=False, dtype=half_precision, device="cuda"),
*(
nn.Linear(hidden_size, hidden_size, bias=False, dtype=half_precision, device="cuda")
for _ in range(n_layers - 1)
),
)
model_hook = copy.deepcopy(model)
# Create Nanotron Parameters
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
for module in model_hook.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
# Needed in order to obtain smaller gradient buckets when using `DistributedDataParallel`
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
) # we won't actually use DDP anywhere; it's just to get the same module names
model_ddp_accum_ref = {}
model_ddp_fp32_accum = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model_ddp_fp32_accum.named_parameters())
# Register DDP hook
state = FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name={id(param): name for name, param in model_ddp_fp32_accum.named_parameters()},
)
model_ddp_fp32_accum.register_comm_hook(
state=state,
hook=get_fp32_accum_hook(
reduce_scatter=False,
reduce_op=dist.ReduceOp.AVG,
),
)
for train_iter in range(train_iterations):
# Gradient accumulation steps
for accum_step in range(accumulation_steps - 1):
# Forward-Backward
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
assert not torch.isinf(loss).any(), "loss is inf"
loss.backward()
with ContextManagers([model_ddp_fp32_accum.no_sync(), accumulator.no_sync()]):
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp.named_parameters():
grad = param.grad
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
model_ddp_accum_ref[name] = (
grad.float() if accum_step == 0 else model_ddp_accum_ref[name] + grad.float()
)
dist.barrier()
torch.testing.assert_close(model_ddp_accum_ref[name], fp32_grad_bucket, atol=1e-6, rtol=1e-7)
dist.barrier()
# Check that we correctly copied grads from buckets to params (`copy_buckets_to_grads`)
torch.testing.assert_close(fp32_grad_bucket, grad_fp32_accum, atol=1e-6, rtol=1e-7)
# Check that the gradients are not synchronized across DP
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(fp32_grad_bucket, parallel_context.dp_pg)
# We zero out half grads for `model_ddp` because we're accumulating grads manually in `model_ddp_accum_ref`
model_ddp.zero_grad()
# Last accumulation step (Sync grads across DDP)
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
loss.backward()
# manually reduce grads across DDP
for name, param in model_ddp.named_parameters():
grad = param.grad
model_ddp_accum_ref[name] = (
model_ddp_accum_ref[name] + grad.float() if name in model_ddp_accum_ref else grad.float()
)
dist.all_reduce(model_ddp_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp_fp32_accum.named_parameters():
# Check that half grads have been set to None in the sync step, to avoid them being incorrectly used
half_grad = param.grad
assert half_grad is None, f"{half_grad} != None"
grad = model_ddp_accum_ref[name]
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
dist.barrier()
torch.testing.assert_close(grad, fp32_grad_bucket, atol=1e-6, rtol=1e-7)
# Check that grad points to the same memory as the bucket
assert grad_fp32_accum.data_ptr() == fp32_grad_bucket.data_ptr()
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
assert_tensor_synced_across_pg(grad_fp32_accum, parallel_context.dp_pg)
# Zero out gradients (Usually it's the optimizer that does this)
model_ddp.zero_grad()
model_ddp_accum_ref = {}
accumulator.zero_grad() # Sets half grads to None and zeroes out fp32 grad buckets
for name, elt in accumulator.parameters.items():
fp32_param = elt["fp32"]
fp32_param.grad = None
# Check that fp32 grad buckets are zeroed out and `param.grad` is set to None
for name, param in model_ddp_fp32_accum.named_parameters():
assert param.grad is None
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
dist.barrier()
torch.testing.assert_close(fp32_grad_bucket, torch.zeros_like(fp32_grad_bucket), atol=1e-6, rtol=1e-7)
# Check that all fp32 grad buckets are zeroed out
for _, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
# This is important as we assume grad buckets to be zeroed out at the first accumulation step
dist.barrier()
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
parallel_context.destroy()
@pytest.mark.skipif(
available_gpus() < 4, reason="Testing test_tied_weights_sync_with_grad_accum_in_fp32 requires at least 4 gpus"
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("reduce_scatter", [True, False])
@rerun_if_address_is_in_use()
def test_tied_weights_sync_with_grad_accum_in_fp32(pipeline_engine: PipelineEngine, reduce_scatter: bool):
init_distributed(tp=1, dp=2, pp=2)(_test_tied_weights_sync_with_grad_accum_in_fp32)(
pipeline_engine=pipeline_engine, reduce_scatter=reduce_scatter
)
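# The helper below ties the linear weights of a DummyModel across PP ranks, wraps one replica in
# DDP with the fp32-accumulation comm hook and a ZeroDistributedOptimizer (used only to build and
# shard the gradient accumulator), and runs one pipeline batch. It then checks that the
# accumulated fp32 grads match a manually accumulated reference, that tied-weight grads are not
# synced before `sync_tied_weights_gradients` and are synced after, and that the final grads match
# the reference (restricted to this rank's DP slice when `reduce_scatter=True`).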
def _test_tied_weights_sync_with_grad_accum_in_fp32(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine, reduce_scatter: bool
):
# We init two replicas of two dense layers. Each dense layer sits on its own device.
dtype = torch.float16
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
reference_model = DummyModel(p2p=p2p)
reference_model_accum_ref = {}
for mdl in [model, reference_model]:
# Set the ranks
with init_on_device_and_dtype(device, dtype):
assert parallel_context.pp_pg.size() == len(mdl.mlp)
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), mdl.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
mdl.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Tie all dense weights across PP
tie_parameters(
root_module=mdl,
ties=[
(
target,
(
parallel_context.get_global_rank(
ep_rank=dist.get_rank(parallel_context.expert_pg),
pp_rank=get_pp_rank_of(target, module=mdl),
dp_rank=dist.get_rank(parallel_context.dp_pg),
tp_rank=dist.get_rank(parallel_context.tp_pg),
),
),
)
for target in [
f"mlp.{pp_rank}.linear.pp_block.weight" for pp_rank in range(parallel_context.pp_pg.size())
]
],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
for name, module in mdl.named_modules():
if isinstance(module, nn.Linear):
module.bias = NanotronParameter(module.bias)
# Sync DP and tied weights: basic assumption
initial_sync(model=mdl, parallel_context=parallel_context)
# Sync params between `model` and `reference_model`
with torch.no_grad():
for name, param in model.named_parameters():
param.copy_(reference_model.get_parameter(name))
# DDP
model_ddp = torch.nn.parallel.DistributedDataParallel(model, process_group=parallel_context.dp_pg)
module_id_to_prefix = {id(module): f"{module_name}." for module_name, module in model.named_modules()}
reference_module_id_to_prefix = {
id(module): f"{module_name}." for module_name, module in reference_model.named_modules()
}
# Fix the root_model
module_id_to_prefix[id(model)] = ""
reference_module_id_to_prefix[id(reference_model)] = ""
# named parameters
named_parameters = [
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
if param.is_tied
else name,
param,
)
for name, param in model.named_parameters()
]
# Optimizer: We don't actually run the optimizer, we just use it to build the gradient accumulator
optimizer = ZeroDistributedOptimizer(
dp_pg=parallel_context.dp_pg,
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups_1: OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(
named_parameters=named_params,
grad_buckets_named_params=named_parameters,
),
named_params_or_groups=named_param_groups_1,
optimizer_builder=lambda named_param_groups_2: NamedOptimizer(
named_params_or_groups=named_param_groups_2,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
),
)
param_id_to_name = {
id(param): param.get_tied_info().get_full_name_from_module_id_to_prefix(
module_id_to_prefix=module_id_to_prefix
)
if param.is_tied
else name
for name, param in model.named_parameters()
}
# Add gradient accumulator
# We use `model_ddp.module` in order to have the parameter names without the `module.` prefix
accumulator = optimizer.optimizer.gradient_accumulator
accumulator.assign_param_offsets(
dp_rank=dist.get_rank(parallel_context.dp_pg),
param_name_to_offsets=optimizer.param_name_to_dp_rank_offsets,
)
model_ddp.register_comm_hook(
state=FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name=param_id_to_name,
),
hook=get_fp32_accum_hook(reduce_scatter=reduce_scatter, reduce_op=dist.ReduceOp.AVG),
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype) # First rank receives data
n_micro_batches_per_batch = 2
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
## Reference model iteration step
def forward_backward_reference(mdl, micro_batch):
pipeline_engine.train_batch_iter(
mdl, pg=parallel_context.pp_pg, batch=[micro_batch], nb_microbatches=1, grad_accumulator=None
)
for accum_step in range(n_micro_batches_per_batch - 1):
# Forward-Backward
forward_backward_reference(reference_model, batch[accum_step])
# Accumulate grads
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(
module_id_to_prefix=reference_module_id_to_prefix
)
reference_model_accum_ref[name] = (
grad.float() if accum_step == 0 else reference_model_accum_ref[name] + grad.float()
)
# We zero out half grads for `reference_model` because we're accumulating grads manually in `reference_model_accum_ref`
reference_model.zero_grad()
# Last accumulation step (Sync grads across DDP)
forward_backward_reference(reference_model, batch[-1])
# manually reduce grads across DDP
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=reference_module_id_to_prefix)
reference_model_accum_ref[name] = (
reference_model_accum_ref[name] + grad.float() if name in reference_model_accum_ref else grad.float()
)
dist.all_reduce(reference_model_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
## Model iteration step
pipeline_engine.train_batch_iter(
model_ddp,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=accumulator,
)
for name, param in model_ddp.module.named_parameters():
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
# Each parameter is sharded across DP.
assert (
name in accumulator.parameters
), f"`accumulator.parameters` must have all params {name} not in `accumulator.parameters`. Existing keys are: {accumulator.parameters}"
fp32_grad = accumulator.get_grad_buffer(name=name)
if not reduce_scatter:
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(fp32_grad, parallel_context.dp_pg)
fp32_grad_ref = reference_model_accum_ref[name]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
# Check that gradients are correct
torch.testing.assert_close(
fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
rtol=1e-7,
atol=1e-6,
msg=lambda msg: f"FP32 Gradients at `{name}` don't match\n - Expected: {fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch}\n - Got: {fp32_grad.view(-1)[slice_]}",
)
else:
# Check that gradients are correct
torch.testing.assert_close(fp32_grad_ref / n_micro_batches_per_batch, fp32_grad, rtol=1e-7, atol=1e-6)
# Check that tied weights grads are not synchronized yet
for (name, group_ranks), param in sorted(
get_tied_id_to_param(parameters=model_ddp.parameters(), root_module=model_ddp.module).items(),
key=lambda x: x[0],
):
if not (isinstance(param, NanotronParameter) and param.is_tied):
continue
group = parallel_context.world_ranks_to_pg[group_ranks]
fp32_grad = accumulator.get_grad_buffer(name=name)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=group):
assert_tensor_synced_across_pg(
tensor=fp32_grad,
pg=group,
msg=lambda err: f"Tied weights's grads {name} are not synchronized. {err}",
)
# Sync tied weights grads (e.g. sync dense1 and dense2 grads in DP=0, but the problem is that DP=0 has only optim states for dense1)
# - Translate tied ranks along DP axis to find the DP rank that has the tied weights
# - accumulator keeps grads for all DPs, so we can just sync the grads
with timeout_after():
sync_tied_weights_gradients(
module=model_ddp.module, parallel_context=parallel_context, grad_accumulator=accumulator
)
tied_infos_dict = {
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix),
param.get_tied_info().global_ranks,
param.get_tied_info().reduce_op,
): param
for name, param in model_ddp.module.named_parameters()
if param.is_tied
}
# Check that tied weights grads are synchronized
for (name, group_ranks, reduce_op), param in sorted(tied_infos_dict.items(), key=lambda x: x[0]):
# Make sure we don't get None for reduce_op
assert reduce_op == dist.ReduceOp.SUM
fp32_grad_buffer = accumulator.get_grad_buffer(name=name)
# Grad buffers are only attached to `param.grad` on the DP ranks that own the corresponding shard (see `param_name_to_dp_rank_offsets`)
fp32_grad = accumulator.parameters[name]["fp32"].grad
# Tied weights are synced using the fp32 grad buffers. Let's make sure they still point to the same memory
# When using ZeroDistributedOptimizer, gradients are sliced across DP
dp_slice_fp_32_grad_buffer = fp32_grad_buffer.view(-1)[slice(*accumulator.param_name_to_offsets[name])]
assert (
dp_slice_fp_32_grad_buffer.data_ptr() == fp32_grad.data_ptr()
), "dp_slice_fp_32_grad_buffer and fp32_grad should point to the same memory"
group = parallel_context.world_ranks_to_pg[group_ranks]
# Check that fp32 grads for tied weights are synced (Used in optimizer step)
# When `reduce_scatter=False` the entire gradient buffer is all-reduced, so the full buffer is synced; with `reduce_scatter=True` only this rank's DP slice is
if reduce_scatter:
assert_tensor_synced_across_pg(
tensor=dp_slice_fp_32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
else:
assert_tensor_synced_across_pg(
tensor=fp32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
# Manually sync reference model's tied weights grads
dist.all_reduce(reference_model_accum_ref[name], group=group, op=reduce_op)
# Check that accumulated grads are correct
for name, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
torch.testing.assert_close(
reference_model_accum_ref[name].view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
else:
torch.testing.assert_close(
reference_model_accum_ref[name] / n_micro_batches_per_batch,
fp32_grad,
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
parallel_context.destroy()
import math
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1)])
@pytest.mark.parametrize("parametrization_method", [ParametrizationMethod.SPECTRAL_MUP])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_parametrization(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_parametrization)(
init_method=init_method,
parametrization_method=parametrization_method,
)
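# The helper below initializes the tiny Llama model and compares the empirical std of every
# weight against the target of the chosen parametrization: layer norms at 0, token embeddings
# at 1, and, for spectral muP, linear layers at
#   std = (1 / sqrt(fan_in)) * min(1, sqrt(fan_out / fan_in))
# as computed by the nested `spectral_std` helper.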
def _test_parametrization(
parallel_context: ParallelContext,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
def spectral_std(fan_in: int, fan_out: int):
return torch.tensor((1.0 / math.sqrt(fan_in)) * min(1, math.sqrt(fan_out / fan_in)))
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
hidden_size = TINY_LLAMA_CONFIG.hidden_size
intermediate_size = TINY_LLAMA_CONFIG.intermediate_size
o_proj_infeatures = llama.model.decoder[0].pp_block.attn.o_proj.in_features * parallel_context.tensor_parallel_size
NAME_TO_EXPECTED_STD = {
"input_layernorm": torch.tensor(0.0),
"post_attention_layernorm": torch.tensor(0.0),
"final_layer_norm": torch.tensor(0.0),
"token_embedding": torch.tensor(1.0),
# "lm_head": torch.tensor(1.0),
"qkv_proj": spectral_std(fan_in=hidden_size, fan_out=interdimte_size),
"o_proj": spectral_std(fan_in=o_proj_infeatures, fan_out=hidden_size),
"gate_up_proj": spectral_std(fan_in=hidden_size, fan_out=interdimte_size),
"down_proj": spectral_std(fan_in=interdimte_size, fan_out=hidden_size),
}
def find_expected_std(param_name):
for name in NAME_TO_EXPECTED_STD:
if name in param_name:
return NAME_TO_EXPECTED_STD[name]
for name, param in llama.model.named_parameters():
if "lm_head" in name:
continue
expected_std = find_expected_std(name)
assert expected_std is not None, f"Could not find expected std for {name}"
assert torch.allclose(
param.std().float(), expected_std, atol=0.05
), f"name: {name}, expected: {expected_std}, actual: {param.std()}"
from typing import Union
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from torch import nn
from torch.nn import functional as F
@pytest.mark.skipif(available_gpus() < 2, reason="Testing build_and_set_rank requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_build_and_set_rank():
init_distributed(tp=1, dp=1, pp=2)(_test_build_and_set_rank)()
def _test_build_and_set_rank(parallel_context: ParallelContext):
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Check that the ranks are set correctly
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
assert model.mlp[current_pp_rank].linear.rank == current_pp_rank
assert model.mlp[current_pp_rank].activation.rank == current_pp_rank
# Check that blocks were built on the correct ranks
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
if pp_rank == current_pp_rank:
assert hasattr(non_linear.linear, "pp_block")
assert hasattr(non_linear.activation, "pp_block")
else:
assert not hasattr(non_linear.linear, "pp_block")
assert not hasattr(non_linear.activation, "pp_block")
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 1, reason="Testing test_init_on_device_and_dtype requires at least 1 gpu")
def test_init_on_device_and_dtype():
device = torch.device(type="cuda", index=0)
with init_on_device_and_dtype(device=device, dtype=torch.bfloat16):
model = nn.Linear(10, 10)
assert model.weight.dtype == torch.bfloat16, "Model weight wasn't initialised with the correct dtype"
assert model.weight.device == device, "Model weight wasn't initialised with the correct device"
@pytest.mark.skipif(available_gpus() < 2, reason="Testing AFAB requires at least 2 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine(pipeline_engine: PipelineEngine, pp: int):
init_distributed(tp=1, dp=1, pp=pp)(_test_pipeline_engine)(pipeline_engine=pipeline_engine)
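# The helper below splits a DummyModel across the PP ranks and keeps a full reference copy on
# rank 0, syncing weights over P2P. It then runs `pipeline_engine.train_batch_iter` on the
# sharded model and a plain forward/backward loop on the reference, and checks that the
# per-microbatch losses and the per-layer gradients match.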
def _test_pipeline_engine(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
# Check loss are the same as reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
# Check that gradient are the same as reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
parallel_context.destroy()
@pytest.mark.skipif(
available_gpus() < 2,
reason="Testing `test_pipeline_engine_with_tensor_that_does_not_require_grad` requires at least 2 gpus",
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine_with_tensor_that_does_not_require_grad(pipeline_engine: PipelineEngine, pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_engine_with_tensor_that_does_not_require_grad)(
pipeline_engine=pipeline_engine
)
def _test_pipeline_engine_with_tensor_that_does_not_require_grad(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine
):
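    # The dummy model below threads a non-differentiable tensor (`y`) through every pipeline
    # stage next to the differentiable activations; the test checks that requires_grad flags
    # survive the P2P hops and that losses/gradients match a single-rank reference model.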
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class LinearWithDummyInput(nn.Linear):
def __init__(self, in_features, out_features):
super().__init__(in_features=in_features, out_features=out_features)
def forward(self, x: torch.Tensor, y: torch.Tensor):
return {"output": super().forward(x), "y": y}
class DummyModelPassingNonDifferentiableTensor(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=LinearWithDummyInput,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size() + 1)
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
linear_output = non_linear.linear(x=differentiable_tensor, y=non_differentiable_tensor)
output = non_linear.activation(x=linear_output["output"], y=linear_output["y"])
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
if isinstance(differentiable_tensor, torch.Tensor):
assert differentiable_tensor.requires_grad is True
if isinstance(non_differentiable_tensor, torch.Tensor):
assert non_differentiable_tensor.requires_grad is False
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
if has_reference_model:
reference_model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size() + 1
    # An additional MLP block is appended at the end (hence pp_size + 1 entries)
mlp_index_pp_rank = [(i, i) for i in range(parallel_context.pp_pg.size())] + [
(parallel_context.pp_pg.size(), parallel_context.pp_pg.size() - 1)
]
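    # e.g. with a 2-rank pp group this is [(0, 0), (1, 1), (2, 1)]: the extra MLP block is
    # co-located with the last pipeline stage.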
with init_on_device_and_dtype(device):
for (mlp_index, pp_rank), non_linear in zip(mlp_index_pp_rank, model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight, model.mlp[mlp_index].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
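    # Only `input_pp_rank` materializes real tensors; every other rank gets a TensorPointer
    # naming the producer rank, which the pipeline engine resolves via P2P.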
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
    # Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
    # Check that gradients are the same as the reference
if has_reference_model:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight.grad, model.mlp[mlp_index].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_forward_without_engine(pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_forward_without_engine)()
def _test_pipeline_forward_without_engine(parallel_context: ParallelContext):
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class DummyModel(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size())
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
differentiable_tensor = non_linear.linear(input=differentiable_tensor)["output"]
output = non_linear.activation(x=differentiable_tensor, y=non_differentiable_tensor)
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
# Run the model
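    # No engine here: calling the model directly lets each PipelineBlock run its module or
    # hand back TensorPointers, with activations moved over P2P, so this exercises the bare
    # forward path under inference_mode.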
losses = []
for micro_batch in batch:
with torch.inference_mode():
loss = model(**micro_batch)
losses.append(loss)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss, torch.Tensor):
if has_reference_model:
_losses.append(loss)
else:
p2p.send_tensors([loss], to_rank=reference_rank)
else:
assert isinstance(loss, TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss.group_rank)[0])
if has_reference_model:
losses = _losses
    # Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 4, reason="Testing `test_pipeline_engine_diamond` requires at least 4 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@rerun_if_address_is_in_use()
def test_pipeline_engine_diamond(pipeline_engine: PipelineEngine):
init_distributed(pp=4, dp=1, tp=1)(_test_pipeline_engine_diamond)(pipeline_engine=pipeline_engine)
def _test_pipeline_engine_diamond(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
class DiamondModel(nn.Module):
def __init__(self, p2p: P2P):
super().__init__()
self.p2p = p2p
self.dense_bottom = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_left = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_right = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_top = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Bilinear,
module_kwargs={"in1_features": 10, "in2_features": 10, "out_features": 10},
module_input_keys={"input1", "input2"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(self, x):
x = self.dense_bottom.activation(input=self.dense_bottom.linear(input=x)["output"])["output"]
y = self.dense_left.activation(input=self.dense_left.linear(input=x)["output"])["output"]
z = self.dense_right.activation(input=self.dense_right.linear(input=x)["output"])["output"]
out = self.dense_top.activation(input=self.dense_top.linear(input1=y, input2=z)["output"])["output"]
return self.loss(x=out)["output"]
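    # Diamond topology: dense_bottom feeds both dense_left and dense_right, whose outputs are
    # merged by the Bilinear layer in dense_top before the scalar loss; each of the four dense
    # blocks is pinned to its own pipeline rank below, and the loss lives on the last rank.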
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DiamondModel(p2p=p2p)
if has_reference_model:
reference_model = DiamondModel(p2p=p2p)
# Set the ranks
assert parallel_context.pp_pg.size() == len(
[model.dense_bottom, model.dense_left, model.dense_right, model.dense_top]
)
assert parallel_context.pp_pg.size() == 4
pp_rank_to_dense_name = ["dense_bottom", "dense_left", "dense_right", "dense_top"]
with init_on_device_and_dtype(device):
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
non_linear = model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for module_name in pp_rank_to_dense_name:
non_linear = reference_model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.get_submodule(module_name).linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight, non_linear.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"x": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
    # Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
    # Check that gradients are the same as the reference
if has_reference_model:
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.get_submodule(module_name).linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight.grad, non_linear.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
import pytest
import torch
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.random import (
RandomStates,
branch_random_state,
get_current_random_state,
get_synced_random_state,
)
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_random_state_sync requires at least 2 gpus")
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1), (1, 2, 1), (1, 1, 2)])
@rerun_if_address_is_in_use()
def test_random_state_sync(tp: int, dp: int, pp: int):
# TODO @nouamane: Make a test with 4 gpus (2 in one pg, 2 in other pg)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_random_state_sync)()
def _test_random_state_sync(parallel_context: ParallelContext):
current_random_state = get_current_random_state()
reference_rank = 0
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
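    # Each parametrization sets exactly one of tp/dp/pp to 2, so this picks the single
    # process group of size 2 to test against.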
# Check that they are not equal across process group
if dist.get_rank(pg) == reference_rank:
random_states = [current_random_state]
else:
random_states = [None]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
assert current_random_state != random_states[0]
# Sync random state
synced_random_state = get_synced_random_state(current_random_state, pg=pg)
# Check that they are equal across process group
random_states = [synced_random_state]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
        assert synced_random_state == random_states[0]
parallel_context.destroy()
def test_random_state_fork_random_operation_in_global_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
assert random_states[key] == random_state
# Random operation that updates the random state
torch.randn(1)
new_random_state = get_current_random_state()
# Check that random states changed
assert new_random_state != random_state
assert random_states[key] == random_state
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
assert random_states[key] == random_state
assert get_current_random_state() == random_states[key]
    # Check that the random state is back to the global one
assert get_current_random_state() == new_random_state
def test_random_state_fork_random_operation_in_local_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
old_random_state = get_current_random_state()
assert old_random_state == random_states[key]
# Random operation that updates the random state
torch.randn(1)
# Check that random states changed
new_random_state = get_current_random_state()
# Check that global random_state hasn't changed
assert get_current_random_state() == random_state
# Check that local random_state has changed and is equal to `new_random_state`
assert old_random_state != random_states[key]
assert new_random_state == random_states[key]
import pytest
import torch
from helpers.context import TestContext
from helpers.dummy import dummy_infinite_data_loader, init_dummy_model
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
is_dict_equal,
rerun_if_address_is_in_use,
)
from nanotron import distributed as dist
from nanotron.constants import CHECKPOINT_VERSION
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.optim.zero import ZeroDistributedOptimizer
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.sharded_parameters import SplitConfig, create_sharded_parameter_from_config
from nanotron.parallel.tied_parameters import sync_tied_weights_gradients
from nanotron.random import RandomStates, get_current_random_state, get_synced_random_state
from nanotron.serialize import (
load_optimizer,
load_random_states,
load_weights,
save_optimizer,
save_random_states,
save_weights,
)
from nanotron.serialize.metadata import TensorMetadata
from torch.nn.parallel import DistributedDataParallel
def test_save_and_load_with_changed_topolgy():
# TODO @thomasw21: We want to be able to support a change of topology mechanism
return
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_model(tp: int, dp: int, pp: int):
test_context = TestContext()
    # tp/dp/pp come from the parametrization above
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_model)(test_context=test_context)
def _test_save_and_load_model(parallel_context: ParallelContext, test_context: TestContext):
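    # Round-trip check: save the weights, build a freshly initialised model (which should
    # differ), load the checkpoint into it, and require an exact state_dict match.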
model = init_dummy_model(parallel_context=parallel_context)
store_folder = test_context.get_auto_remove_tmp_dir()
# Save
save_weights(model=model, parallel_context=parallel_context, root_folder=store_folder)
# Load
new_model = init_dummy_model(parallel_context=parallel_context)
# Check that the newly initialised model isn't the same.
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
if len(model.state_dict()) == 0:
# Edge case where there's no parameters/buffers stored in the model.
pass
else:
assert not match, "Newly initialised model should not match."
load_weights(model=new_model, parallel_context=parallel_context, root_folder=store_folder)
# Assert the weights are exactly the same after loading
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
if pp > 1:
pytest.skip("Pipeline parallelism not supported for this test yet")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_optimizer)(test_context=test_context)
def _test_save_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Test loading optimizer states to CPU
cpu_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Load optimizer states to CPU
load_optimizer(
optimizer=cpu_optimizer, parallel_context=parallel_context, root_folder=store_folder, map_location="cpu"
)
# Get state dicts
gpu_state = optimizer.state_dict()
cpu_state = cpu_optimizer.state_dict()
# Check that states match except for device
for param_id in gpu_state["state"]:
for key, gpu_value in gpu_state["state"][param_id].items():
cpu_value = cpu_state["state"][param_id][key]
if isinstance(gpu_value, torch.Tensor):
assert torch.equal(gpu_value.cpu(), cpu_value), f"Values don't match for param {param_id}, key {key}"
if key != "step": # Skip device checks for 'step' key
assert (
cpu_value.device.type == "cpu"
), f"CPU optimizer state should be on CPU for param {param_id}, key {key}"
assert (
gpu_value.device.type == "cuda"
), f"GPU optimizer state should be on CUDA for param {param_id}, key {key}"
else:
assert gpu_value == cpu_value, f"Non-tensor values don't match for param {param_id}, key {key}"
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # tp/dp/pp come from the parametrization above
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_optimizer)(test_context=test_context)
def _test_save_zero_optimizer_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
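    # ZeroDistributedOptimizer shards optimizer state over the data-parallel group (ZeRO-style);
    # the NamedOptimizer wrapper keeps parameter names, presumably so the serialized state can
    # be matched by name on reload.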
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_data_parallel_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # tp/dp/pp come from the parametrization above
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_data_parallel_optimizer)(
test_context=test_context
)
def _test_save_zero_optimizer_and_load_data_parallel_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_data_parallel_optimizer_and_load_zero_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # tp/dp/pp come from the parametrization above
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_data_parallel_optimizer_and_load_zero_optimizer)(
test_context=test_context
)
def _test_save_data_parallel_optimizer_and_load_zero_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_optimizer_with_additional_state_dict_keys(tp: int, dp: int, pp: int):
test_context = TestContext()
    # tp/dp/pp come from the parametrization above
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_optimizer_with_additional_state_dict_keys)(
test_context=test_context
)
def _test_save_optimizer_with_additional_state_dict_keys(parallel_context: ParallelContext, test_context: TestContext):
dtype = torch.float16
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context, dtype=dtype)
if isinstance(model, DistributedDataParallel):
# Remove the annoying "module." prefix
unwrapped_model = model.module
else:
unwrapped_model = model
named_parameters = list(unwrapped_model.named_parameters())
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
grad_accumulator = optimizer.gradient_accumulator
assert len(optimizer.state_dict_additional_keys()) > 0
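    # The additional state_dict keys come from the gradient accumulator: FP32GradientAccumulator
    # keeps fp32 copies of the fp16 parameters/gradients, and the checks below verify that this
    # extra state also survives save_optimizer/load_optimizer.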
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model,
pg=parallel_context.pp_pg,
batch=[minibatch],
nb_microbatches=1,
grad_accumulator=grad_accumulator,
)
# Manually sync tied parameters
sync_tied_weights_gradients(
module=unwrapped_model, parallel_context=parallel_context, grad_accumulator=grad_accumulator
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
new_grad_accumulator = new_optimizer.gradient_accumulator
# Check that the newly initialised optimizer isn't the same.
if len(optimizer.state_dict()["state"]) == 0:
pass
else:
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict()["state"], new_optimizer.state_dict()["state"])
assert match, msg
# Assert the optimizer state_dict are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Assert the internal optimizer states are exactly the same after loading.
keys_to_ignore = []
match, msg = is_dict_equal(
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in grad_accumulator.parameters.items()
},
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in new_grad_accumulator.parameters.items()
},
)
assert match, msg
parallel_context.destroy()
# TODO @thomasw21: Test with an optimizer that uses `named_param_groups` instead of `param_groups`
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_save_and_load_random_states requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_save_and_load_random_states():
test_context = TestContext()
    # We use TP=2 so that one process group has size 2
init_distributed(tp=2, dp=1, pp=1)(_test_save_and_load_random_states)(test_context=test_context)
def _test_save_and_load_random_states(parallel_context: ParallelContext, test_context: TestContext):
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
random_states = RandomStates(
{
"my_synced_random_state": get_synced_random_state(random_state=get_current_random_state(), pg=pg),
"my_own_random_state": get_current_random_state(),
}
)
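    # One entry is synced across the 2-rank group, the other is rank-local, so the serialized
    # random states differ per rank and must be saved and restored per rank.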
store_folder = test_context.get_auto_remove_tmp_dir()
# Check that random states are unequal between ranks (due to `my_own_random_state`)
reference_rank = 0
if dist.get_rank(pg) == reference_rank:
random_statess = [random_states]
else:
random_statess = [None]
dist.broadcast_object_list(random_statess, src=dist.get_global_rank(group_rank=reference_rank, group=pg), group=pg)
if dist.get_rank(pg) != reference_rank:
assert random_states != random_statess[0]
# save
save_random_states(random_states=random_states, parallel_context=parallel_context, root_folder=store_folder)
# load
new_random_states = load_random_states(parallel_context=parallel_context, root_folder=store_folder)
    # Each rank has restored its own random state
assert random_states == new_random_states
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_serialize_deserialize_tensormetadata():
test_context = TestContext()
init_distributed(tp=2, dp=1, pp=1)(_test_serialize_deserialize_tensormetadata)(test_context=test_context)
def _test_serialize_deserialize_tensormetadata(parallel_context: ParallelContext, test_context: TestContext):
param = torch.nn.Parameter(torch.randn(16, 64))
split_config = SplitConfig(
split_dim=0,
contiguous_chunks=(8, 8),
)
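    # The 16x64 parameter is sharded along dim 0 across the tp group, declared as two
    # contiguous chunks of 8 rows.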
param = create_sharded_parameter_from_config(parameter=param, pg=parallel_context.tp_pg, split_config=split_config)
sharded_info = param.get_sharded_info()
metadata = TensorMetadata(
version=CHECKPOINT_VERSION,
local_global_slices_pairs=sharded_info.local_global_slices_pairs,
unsharded_shape=sharded_info.unsharded_shape,
)
metadata_str_dict = metadata.to_str_dict()
# Assert metadata_str_dict is Dict[str, str]
assert isinstance(metadata_str_dict, dict)
assert all(isinstance(key, str) for key in metadata_str_dict.keys())
assert all(isinstance(value, str) for value in metadata_str_dict.values())
metadata_from_str_dict = TensorMetadata.from_str_dict(metadata_str_dict)
assert metadata == metadata_from_str_dict
parallel_context.destroy()