Commit 71e79847 authored by chenzk

v1.0.3
import pytest
import torch
from torch.nn import LayerNorm
from nanotron.nn.layer_norm import TritonLayerNorm
@pytest.mark.fa2
@pytest.mark.parametrize(
"hidden_size",
[1024, 1025], # fused layer norm supports 1024 as hidden size but not 1025
)
def test_fused_layer_norm(hidden_size):
BATCH_SIZE = 5
SEQ_LEN = 128
DEVICE, DTYPE = torch.device("cuda:0"), torch.float16
inputs = torch.rand(BATCH_SIZE, SEQ_LEN, hidden_size, device=DEVICE, dtype=DTYPE)
layer_norm = LayerNorm(normalized_shape=inputs.size(-1), device=DEVICE, dtype=DTYPE)
ref_outputs = layer_norm(inputs)
fused_layer_norm = TritonLayerNorm(
normalized_shape=inputs.size(-1),
device=DEVICE,
dtype=DTYPE,
)
outputs = fused_layer_norm(inputs)
# NOTE: with torch.float16, FA2 uses an atol of 1e-2
# https://github.com/Dao-AILab/flash-attention/blob/87a1277653fc55cd615f5341255e00c69d5c00a1/tests/ops/triton/test_layer_norm.py#L63-L64
torch.testing.assert_close(outputs, ref_outputs, rtol=1e-3, atol=1e-2)
outputs.sum().backward()
ref_outputs.sum().backward()
# NOTE: same as above
torch.testing.assert_close(fused_layer_norm.weight.grad, layer_norm.weight.grad, rtol=1e-3, atol=1e-2)
torch.testing.assert_close(fused_layer_norm.bias.grad, layer_norm.bias.grad, rtol=1e-3, atol=1e-2)
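# NOTE (illustrative sketch, not part of the test above): the atol of 1e-2 reflects
# float16's limited precision rather than a defect in the fused kernel. A minimal way
# to see the error scale, assuming a CUDA device like the test does:
def _illustrate_fp16_layer_norm_error(hidden_size: int = 1024) -> float:
    x = torch.rand(4, hidden_size, device="cuda")
    ref = torch.nn.functional.layer_norm(x, (hidden_size,))          # float32 reference
    half = torch.nn.functional.layer_norm(x.half(), (hidden_size,))  # float16 compute
    return (ref - half.float()).abs().max().item()                   # typically on the order of 1e-3 to 1e-2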
import sys
from math import isclose
from pathlib import Path
from typing import List
package_path = Path(__file__).parent.parent
sys.path.append(str(package_path))
import numpy as np
import pytest
from helpers.context import TestContext
from helpers.data import (
assert_batch_dataloader,
assert_nanoset_sync_across_all_ranks,
compute_batch_hash,
create_dataset_paths,
create_dummy_json_dataset,
preprocess_dummy_dataset,
)
from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use
from nanotron.data.dataloader_builder import build_nanoset_dataloader
from nanotron.data.nanoset import Nanoset
from nanotron.data.utils import count_dataset_indexes, normalize
from nanotron.parallel import ParallelContext
from nanotron.utils import main_rank_first
from transformers import AutoTokenizer
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("train_steps", [500, 10000])
@pytest.mark.parametrize("sequence_length", [512, 8192])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_build_nanoset_dataloader(
tp: int, dp: int, pp: int, train_steps: int, sequence_length: int, tokenizer_name_or_path: str
):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
train_steps=train_steps,
sequence_length=sequence_length,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_build_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
train_steps: int,
sequence_length: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
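    # e.g. GPT-2's vocab (50257 tokens) fits in uint16 -> 2 bytes per token,
    # while Llama-3's (~128k tokens) does not -> 4 bytes per token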
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": sequence_length,
"token_size": token_size,
"train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Assert we have the same Nanoset in all ranks
assert_nanoset_sync_across_all_ranks(train_dataset, parallel_context)
dataset_sample_count = count_dataset_indexes(train_dataset.dataset_index, len(train_dataset.dataset_folders))
for idx, ds_length in enumerate(train_dataset.dataset_lengths):
# Assert Nanoset doesn't sample indexes greater than the datasets
assert (
np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index == idx, initial=-1)
< ds_length
), f"Error building Nanoset Indexes: Tryng to access sample {np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index==idx, initial = -1)} of a {ds_length} sample dataset"
# Assert Nanoset builds up the correct blend WRT the dataset_weights
assert isclose(
normalize(dataset_sample_count).tolist()[idx], train_dataset.dataset_weights[idx], abs_tol=0.05
), f"Requested Nanoset to contain {round(train_dataset.dataset_weights[idx]*100, 2)}% of samples from {train_dataset.dataset_folders[idx]} but got {round(normalize(dataset_sample_count).tolist()[idx]*100, 2)}%"
# Create Dataloaders
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=sequence_length,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Check a batch produced by the Dataloader
batch = next(iter(dataloader))
assert_batch_dataloader(
batch=batch,
parallel_context=parallel_context,
micro_batch_size=MICRO_BATCH_SIZE,
sequence_length=sequence_length,
)
parallel_context.destroy()
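# NOTE (worked example for the blend check above): with dataset_weights=[8, 2],
# normalize() yields [0.8, 0.2], so with abs_tol=0.05 the first dataset must provide
# between 75% and 85% of the sampled indexes and the second between 15% and 25%.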
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@pytest.mark.parametrize("skipped_batches", [20, 5555])
@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"])
@rerun_if_address_is_in_use()
def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int, tokenizer_name_or_path: str):
test_context = TestContext()
# Create dataset folders
json_paths, datatrove_tokenized_dataset_folders = create_dataset_paths(
tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2
)
# Create dummy json datasets
for idx, json_path in enumerate(json_paths):
create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000)
# Preprocess json dataset with datatrove
for json_path, datatrove_tokenized_dataset_folder in zip(json_paths, datatrove_tokenized_dataset_folders):
preprocess_dummy_dataset(json_path, datatrove_tokenized_dataset_folder, tokenizer_name_or_path)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)(
datatrove_tokenized_dataset_folders=datatrove_tokenized_dataset_folders,
skipped_batches=skipped_batches,
tokenizer_name_or_path=tokenizer_name_or_path,
)
def _test_recover_nanoset_dataloader(
parallel_context: ParallelContext,
datatrove_tokenized_dataset_folders: List[str],
skipped_batches: int,
tokenizer_name_or_path: str,
):
SEED = 1234
MICRO_BATCH_SIZE = 4
N_MICRO_BATCHES_PER_BATCH = 8
GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size()
SEQUENCE_LENGTH = 1024
TRAIN_STEPS = 10000
input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1)
# Get tokenizer cardinality
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
token_size = 4 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else 2
del tokenizer
# Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights
nanoset_config = {
"dataset_folders": [datatrove_tokenized_dataset_folders[0]],
"dataset_weights": [1],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": None,
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
blended_weighted_nanoset_config = {
"dataset_folders": datatrove_tokenized_dataset_folders,
"dataset_weights": [8, 2],
"sequence_length": SEQUENCE_LENGTH,
"token_size": token_size,
"train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE,
"random_seed": SEED,
}
configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config]
for config in configs:
# Create Nanoset
with main_rank_first(parallel_context.world_pg):
train_dataset = Nanoset(**config)
# Create initial Dataloader
dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
)
# Recover from failures
dataloader = iter(dataloader)
for _ in range(skipped_batches + 1): # In order to compare with the first batch of the recovered DataLoader
batch = next(dataloader)
# Create recover Dataloader
recovered_dataloader = build_nanoset_dataloader(
train_dataset,
sequence_length=SEQUENCE_LENGTH,
parallel_context=parallel_context,
input_pp_rank=input_pp_rank,
output_pp_rank=output_pp_rank,
micro_batch_size=MICRO_BATCH_SIZE,
dataloader_num_workers=0,
dataloader_drop_last=True,
# NOTE: The dataloader serves batches of micro_batch_size regardless of batch_accumulation_per_replica
consumed_train_samples=skipped_batches * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(),
)
recovered_first_batch = next(iter(recovered_dataloader))
assert compute_batch_hash(batch) == compute_batch_hash(recovered_first_batch)
parallel_context.destroy()
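# NOTE (worked example for consumed_train_samples above): each dataloader batch holds
# MICRO_BATCH_SIZE samples per DP replica, so skipping e.g. 20 batches with
# MICRO_BATCH_SIZE=4 and dp_size=2 means 20 * 4 * 2 = 160 samples were already consumed,
# and the recovered dataloader's first batch lines up with batch 21 of the original one.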
[pytest]
addopts=-n 35
markers =
fa2: FA2-related
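# NOTE: "-n 35" comes from pytest-xdist and distributes the suite across 35 workers;
# the "fa2" marker gates the FlashAttention-2-dependent tests above, so they can be
# selected with `pytest -m fa2` or excluded with `pytest -m "not fa2"`.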
import pytest
import torch
import torch.distributed as dist
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import Config, ModelArgs, RandomInit
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from torch import nn
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 2, 2)])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_named_modules_in_pp_rank(tp: int, dp: int, pp: int):
model_args = ModelArgs(init_method=RandomInit(std=1.0), model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_named_modules_in_pp_rank)(config=config)
def _test_get_named_modules_in_pp_rank(
parallel_context: ParallelContext,
config: Config,
):
model = create_llama_from_config(
model_config=config.model.model_config,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
model.init_model_randomly(config=config)
modules_that_not_in_current_pp_rank = {}
current_pp_rank = dist.get_rank(group=parallel_context.pp_pg)
for name, module in model.named_modules():
if isinstance(module, PipelineBlock) and module.rank != current_pp_rank:
modules_that_not_in_current_pp_rank[name] = module
named_modules_in_pp_rank = model.named_modules_in_pp_rank
for name, module in named_modules_in_pp_rank.items():
# NOTE: if a module is in the current rank, we expect it to be an initialized module
# not PipelineBlock
assert isinstance(module, nn.Module)
assert name not in modules_that_not_in_current_pp_rank
from typing import Union
import torch
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.utils import checkpoint_method
from torch import nn
class CheckpointedModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense1 = nn.Linear(10, 10)
self.dense2 = nn.Linear(10, 10)
self.dropout = nn.Dropout(0.1)
self.is_checkpointed = is_checkpointed
self.fwd_counter = 0
@checkpoint_method("is_checkpointed")
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense1(x)
if self.is_checkpointed and self.fwd_counter == 0:
assert not x.requires_grad, "x should not require grad when checkpointed, because fwd runs in no_grad mode"
assert (
x.grad_fn is None
), "x should not store any activation when checkpointed, because fwd runs in no_grad mode"
x = self.dense2(x)
x = self.dropout(x)
self.fwd_counter += 1
return x
class DummyModel(nn.Module):
def __init__(self, is_checkpointed: bool = False):
super().__init__()
self.dense0 = nn.Linear(10, 10)
self.checkpointed_model = CheckpointedModel(is_checkpointed=is_checkpointed)
self.dense3 = nn.Linear(10, 10)
def forward(self, x: Union[torch.Tensor, TensorPointer]):
x = self.dense0(x)
x = self.checkpointed_model(x)
assert x.requires_grad # inside forward, x should require grad even if calculated in no_grad mode
x = self.dense3(x)
return x
def test_activation_checkpointing():
dtype = torch.float16
device = torch.device("cuda")
test_model = DummyModel(is_checkpointed=True)
ref_model = DummyModel(is_checkpointed=False)
for model in [test_model, ref_model]:
model.to(device=device, dtype=dtype)
# copy weights
test_model.load_state_dict(ref_model.state_dict())
assert test_model.checkpointed_model.is_checkpointed is True
assert ref_model.checkpointed_model.is_checkpointed is False
# generate random input
x = torch.randn(10, 10, device=device, dtype=dtype)
# Forward pass
with torch.random.fork_rng(devices=["cuda"]):
ref_output = ref_model(x)
checkpointed_output = test_model(x)
assert test_model.checkpointed_model.fwd_counter == 1
torch.testing.assert_close(checkpointed_output, ref_output)
# Backward pass (check that fwd is called twice, and that we don't store the activations)
ref_output.sum().backward()
assert ref_model.checkpointed_model.fwd_counter == 1, "ref_model fwd should not be called twice"
# make sure grads are not synced between test_model and ref_model
assert ref_model.dense0.weight.grad is not None
assert test_model.dense0.weight.grad is None
assert test_model.checkpointed_model.fwd_counter == 1
checkpointed_output.sum().backward()
assert test_model.checkpointed_model.fwd_counter == 2, "test_model fwd should be called twice"
# compare all models grads
for ref_param, checkpointed_param in zip(ref_model.parameters(), test_model.parameters()):
torch.testing.assert_close(ref_param.grad, checkpointed_param.grad)
# TODO @nouamanetazi: test `checkpoint_method` vs `torch.utils.checkpoint.checkpoint`
# TODO @nouamanetazi: test a method with kwargs values
# TODO @nouamanetazi: test `checkpoint_method` in a distributed setting
# TODO @nouamanetazi: test BatchNorm layers with checkpointing
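# NOTE (illustrative sketch, related to the first TODO above; the helper name is made up):
# torch's built-in checkpointing shows the same "forward runs twice" behaviour that
# test_activation_checkpointing asserts for `checkpoint_method`.
def _illustrate_torch_checkpoint_recompute():
    import torch.utils.checkpoint as checkpoint
    counter = {"fwd": 0}
    linear = nn.Linear(10, 10)
    def fwd(x):
        counter["fwd"] += 1
        return linear(x).relu()
    x = torch.randn(2, 10, requires_grad=True)
    out = checkpoint.checkpoint(fwd, x, use_reentrant=False)
    out.sum().backward()
    assert counter["fwd"] == 2  # forward is re-run during backward to rebuild activations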
import math
import os
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.optim.clip_grads import clip_grad_norm
from nanotron.optim.gradient_accumulator import (
FP32GradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelColumnLinear,
)
from nanotron.parallel.tied_parameters import (
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_pp requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_pp(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_with_pp)(norm_type=norm_type)
def _test_clip_grads_with_pp(parallel_context: ParallelContext, norm_type: float):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
# Check that gradient are the same as reference
pp_rank = dist.get_rank(parallel_context.pp_pg)
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(non_linear.bias.grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[model.mlp[pp_rank].linear.pp_block.weight.grad, model.mlp[pp_rank].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
non_linear = model.mlp[current_pp_rank].linear.pp_block
old_weight_grad = non_linear.weight.grad.clone()
old_bias_grad = non_linear.bias.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
reference_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
torch.testing.assert_close(total_norm, reference_total_norm, atol=1e-6, rtol=1e-7)
# Check that grad changed
assert not torch.allclose(old_weight_grad, non_linear.weight.grad), "Grad should have changed"
assert not torch.allclose(old_bias_grad, non_linear.bias.grad), "Grad should have changed"
# Check that gradient are the same as reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the gradients locally
non_linear = model.mlp[pp_rank].linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_tp requires at least 2 gpus")
@pytest.mark.parametrize(
"tp_mode,async_communication",
[
pytest.param(TensorParallelLinearMode.ALL_REDUCE, False),
pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, True),
],
)
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_with_tp(tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float):
init_distributed(tp=2, dp=1, pp=1)(_test_clip_grads_with_tp)(
tp_mode=tp_mode, async_communication=async_communication, norm_type=norm_type
)
def _test_clip_grads_with_tp(
parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
in_features = 4
out_features_per_tp_rank = 8
out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
# Sharded
column_linear = TensorParallelColumnLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
)
# Un-sharded
reference_linear = nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_gather(
tensor_list=list(reference_linear.weight.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.weight,
group=parallel_context.tp_pg,
)
dist.all_gather(
tensor_list=list(reference_linear.bias.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.bias,
group=parallel_context.tp_pg,
)
# Generate random input
random_input: torch.Tensor
sharded_random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
sharded_random_input = random_input
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
sharded_batch_size = 5
sharded_random_input = torch.randn(sharded_batch_size, in_features, device="cuda")
random_input = torch.empty(
sharded_batch_size * parallel_context.tp_pg.size(),
*(sharded_random_input.shape[1:]),
device=sharded_random_input.device,
dtype=sharded_random_input.dtype,
)
dist.all_gather_into_tensor(random_input, sharded_random_input, group=parallel_context.tp_pg)
else:
ValueError(f"Unsupported mode: {tp_mode}")
# Test that we get the same output after forward pass
sharded_output = column_linear(sharded_random_input)
reference_output = reference_linear(random_input)
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(
sharded_output,
reference_output[
:,
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank,
],
atol=1e-6,
rtol=1e-7,
)
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
atol=1e-6,
rtol=1e-7,
)
old_grad = column_linear.weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=column_linear.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_(reference_linear.parameters(), max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, column_linear.weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank
],
)
torch.testing.assert_close(total_norm, ref_total_norm)
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_tied_weights requires at least 2 gpus")
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_tied_weights(norm_type: float):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_tied_weights)(norm_type=norm_type)
def _test_clip_grads_tied_weights(parallel_context: ParallelContext, norm_type: float):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda")})
else:
model = nn.ModuleDict({"dense1": nn.Linear(10, 10, device="cuda")})
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Sync tied weights: basic assumption
initial_sync(model=model, parallel_context=parallel_context)
# Check that weights are now synced
assert_tensor_synced_across_pg(weight, group)
assert_tensor_synced_across_pg(bias, group)
# Compute gradient
input_ = torch.randn(13, 10, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0:
out = model.dense0(input_)
else:
out = model.dense1(input_)
out.sum().backward()
# sync gradients
sync_tied_weights_gradients(model, parallel_context=parallel_context, grad_accumulator=None)
# We check that both gradients are synchronized
assert_tensor_synced_across_pg(weight.grad, group)
assert_tensor_synced_across_pg(bias.grad, group)
# Save grads as reference
ref_weight = weight.clone()
ref_weight.grad = weight.grad.clone()
ref_bias = bias.clone()
ref_bias.grad = bias.grad.clone()
old_grad = weight.grad.clone()
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=None,
max_norm=1.0,
norm_type=norm_type,
)
ref_total_norm = torch.nn.utils.clip_grad_norm_([ref_weight, ref_bias], max_norm=1.0, norm_type=norm_type)
# Check that the gradients have changed
assert not torch.allclose(old_grad, weight.grad), "Gradients should have changed after clipping"
# Test that we get the same gradient after clipping
assert torch.allclose(weight.grad, ref_weight.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(bias.grad, ref_bias.grad, rtol=1e-7, atol=1e-6)
assert torch.allclose(total_norm, ref_total_norm, rtol=0, atol=0), f"Got {total_norm} and {ref_total_norm}"
parallel_context.destroy()
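# NOTE (illustrative sketch of the reference semantics used above, for norm_type=2):
# torch.nn.utils.clip_grad_norm_ computes the global norm over all grads and rescales
# them by min(1, max_norm / (total_norm + 1e-6)); a minimal reimplementation:
def _manual_clip_grad_norm_(parameters, max_norm: float) -> torch.Tensor:
    grads = [p.grad for p in parameters if p.grad is not None]
    total_norm = torch.norm(torch.stack([g.norm(2) for g in grads]), 2)
    clip_coef = min(1.0, max_norm / (total_norm.item() + 1e-6))
    for g in grads:
        g.mul_(clip_coef)
    return total_norm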
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
@rerun_if_address_is_in_use()
def test_clip_grads_fp32_accumulator(norm_type: float, half_precision: torch.dtype):
init_distributed(tp=1, dp=1, pp=2)(_test_clip_grads_fp32_accumulator)(
norm_type=norm_type, half_precision=half_precision
)
def _test_clip_grads_fp32_accumulator(
parallel_context: ParallelContext, norm_type: float, half_precision: torch.dtype
):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
pipeline_engine = AllForwardAllBackwardPipelineEngine()
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p).to(torch.float)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
setattr(module, "bias", NanotronParameter(module.bias))
# model goes to half precision
model = model.to(half_precision)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.mlp[pp_rank].linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Add gradient accumulator
grad_accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(
pp_pg=parallel_context.pp_pg, dtype=half_precision
) # First rank receives data
n_micro_batches_per_batch = 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
pipeline_engine.train_batch_iter(
model,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=grad_accumulator,
)
# We copy the model gradients to the reference model gradients.
# We do this instead of computing the backward pass with autograd because of numerical precision.
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
reference_non_linear.weight.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.weight").clone()
reference_non_linear.bias.grad = grad_accumulator.get_grad_buffer(f"{prefix_name}.bias").clone()
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.grad = weight_grad
reference_non_linear.bias.grad = bias_grad
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
old_fp32_grads = {
name: grad_accumulator.get_grad_buffer(name=name).clone() for name, _ in model.named_parameters()
}
# Clip grads
total_norm = clip_grad_norm(
mp_pg=parallel_context.mp_pg,
named_parameters=model.named_parameters(),
grad_accumulator=grad_accumulator,
max_norm=1.0,
norm_type=norm_type,
)
if has_reference_model:
ref_total_norm = torch.nn.utils.clip_grad_norm_(
reference_model.parameters(), max_norm=1.0, norm_type=norm_type
)
# Check that the gradients have changed
for name, _ in model.named_parameters():
new_fp32_grad = grad_accumulator.get_grad_buffer(name=name)
assert not torch.allclose(old_fp32_grads[name], new_fp32_grad), "Gradients should have changed after clipping"
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
if has_reference_model:
torch.testing.assert_close(
total_norm.view(1),
ref_total_norm.view(1),
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Expected {total_norm} to match {ref_total_norm}.\n{msg}",
)
for pp_rank in range(parallel_context.pp_pg.size()):
reference_non_linear = reference_model.mlp[pp_rank].linear.pp_block
prefix_name = f"mlp.{pp_rank}.linear.pp_block"
if pp_rank == current_pp_rank:
# We already have the gradients locally
torch.testing.assert_close(
reference_non_linear.weight.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.weight"),
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
grad_accumulator.get_grad_buffer(f"{prefix_name}.bias"),
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
reference_non_linear.weight.grad,
weight_grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
reference_non_linear.bias.grad,
bias_grad,
atol=1e-6,
rtol=1e-7,
)
else:
p2p.send_tensors(
[
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.weight"),
grad_accumulator.get_grad_buffer(f"mlp.{current_pp_rank}.linear.pp_block.bias"),
],
to_rank=reference_rank,
)
parallel_context.destroy()
from contextlib import nullcontext
import pytest
import torch
from helpers.exception import assert_fail_except_rank_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import ddp_trigger_sync_in_bwd
from nanotron.parallel.parameters import NanotronParameter
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch import nn
from torch.distributed import GradBucket
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("accumulation_steps", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_afab(accumulation_steps):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_afab)(accumulation_steps=accumulation_steps)
def _test_ddp_with_afab(parallel_context: ParallelContext, accumulation_steps: int):
half_precision = torch.float16
def allreduce_hook(process_group: dist.ProcessGroup, bucket: GradBucket):
# DDP groups grads in GradBuckets. This hook is called throughout the bwd pass, once each bucket is ready to overlap communication with computation.
# See https://pytorch.org/docs/stable/ddp_comm_hooks.html#what-does-a-communication-hook-operate-on for more details.
half_flat_bucket_buffer = bucket.buffer()
group_to_use = process_group if process_group is not None else parallel_context.dp_pg
return (
dist.all_reduce(half_flat_bucket_buffer, group=group_to_use, async_op=True, op=dist.ReduceOp.AVG)
.get_future()
.then(lambda fut: fut.value()[0])
)
model_hook = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model_hook.weight = NanotronParameter(model_hook.weight)
model_ddp_hook = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Register DDP hook
model_ddp_hook.register_comm_hook(state=None, hook=allreduce_hook)
activations = []
# All forward
for i in range(accumulation_steps):
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
with model_ddp_hook.no_sync():
loss_hook = model_ddp_hook(input).sum()
activations.append(loss_hook)
# All backward
for i in range(accumulation_steps):
context = nullcontext()
if i == accumulation_steps - 1:
context = ddp_trigger_sync_in_bwd(model_ddp_hook) # triggers a sync for the final backward
loss_hook = activations[i]
with context:
loss_hook.backward()
grad_hook = model_ddp_hook.module.weight.grad.clone()
# Check that the gradients are synchronized across DP
if i == accumulation_steps - 1:
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
else:
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad_hook, parallel_context.dp_pg)
parallel_context.destroy()
import numpy as np
import pytest
import torch.distributed as dist
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
rerun_if_address_is_in_use,
)
from nanotron.parallel import ParallelContext
from torch.distributed import ProcessGroup
def _test_init_parallel_context(parallel_context: ParallelContext):
assert dist.is_initialized() is True
assert isinstance(parallel_context.world_pg, ProcessGroup)
assert isinstance(parallel_context.tp_pg, ProcessGroup) if parallel_context.tensor_parallel_size > 1 else True
assert isinstance(parallel_context.pp_pg, ProcessGroup) if parallel_context.pipeline_parallel_size > 1 else True
assert isinstance(parallel_context.dp_pg, ProcessGroup) if parallel_context.data_parallel_size > 1 else True
world_rank = dist.get_rank(parallel_context.world_pg)
ranks3d = parallel_context.get_local_ranks(world_rank)
assert isinstance(ranks3d, tuple) and len(ranks3d)
assert isinstance(parallel_context.world_rank_matrix, np.ndarray)
assert isinstance(parallel_context.world_ranks_to_pg, dict)
local_rank = tuple(i.item() for i in np.where(parallel_context.world_rank_matrix == world_rank))
global_rank = parallel_context.get_global_rank(*local_rank)
assert isinstance(global_rank, np.int64), f"The type of global_rank is {type(global_rank)}"
assert global_rank == dist.get_rank()
parallel_context.destroy()
assert dist.is_initialized() is False
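# NOTE (generic illustration, not nanotron's actual rank layout): a world-rank matrix
# simply enumerates global ranks over the 3D parallel grid, so the local<->global
# mapping checked above is an index lookup; the axis order here is arbitrary.
def _toy_world_rank_matrix(pp_size: int, dp_size: int, tp_size: int) -> np.ndarray:
    return np.arange(pp_size * dp_size * tp_size).reshape(pp_size, dp_size, tp_size)
# e.g. _toy_world_rank_matrix(2, 2, 2)[1, 0, 1] == 5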
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_init_parallel_context(tp: int, dp: int, pp: int):
init_distributed(tp=tp, dp=dp, pp=pp)(_test_init_parallel_context)()
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.helpers import get_custom_lr_for_named_parameters
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
@pytest.mark.parametrize("tp,dp,pp", [(1, 1, 1), (2, 1, 1), (1, 1, 2), (2, 1, 2)])
@pytest.mark.parametrize(
"parametrization_method", [ParametrizationMethod.STANDARD, ParametrizationMethod.SPECTRAL_MUP]
)
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_get_custom_lr(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
LR = 1e-3
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_get_custom_lr)(
lr=LR,
init_method=init_method,
parametrization_method=parametrization_method,
)
def _test_get_custom_lr(
parallel_context: ParallelContext,
lr: float,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
named_parameters = list(llama.get_named_params_with_correct_tied())
if len(named_parameters) == 0:
# NOTE: some pp ranks don't have any parameters
return
named_param_groups = get_custom_lr_for_named_parameters(
parametrization_method=parametrization_method, lr=lr, named_parameters=named_parameters, model=llama
)
assert len(named_param_groups) == len(named_parameters)
assert all(isinstance(named_param_group["lr"], float) for named_param_group in named_param_groups)
assert all(isinstance(named_param_group["named_params"], list) for named_param_group in named_param_groups)
is_all_lr_the_same = parametrization_method == ParametrizationMethod.STANDARD
assert all(named_param_group["lr"] == lr for named_param_group in named_param_groups) is is_all_lr_the_same
import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import OptimizerFromGradientAccumulator
from nanotron.parallel.context import ParallelContext
from nanotron.parallel.parameters import NanotronParameter
from nanotron.random import set_random_seed
class DummyModel(nn.Module):
def __init__(self, dtype=torch.float32):
super(DummyModel, self).__init__()
self.fc1 = nn.Linear(10, 20, bias=False).to(dtype=dtype)
self.fc2 = nn.Linear(20, 2, bias=False).to(dtype=dtype)
def forward(self, x):
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
return x
def test_optimizer_lr_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
# compute gradient manually
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
fc1_grad = model.fc1.weight.grad.clone()
fc2_grad = model.fc2.weight.grad.clone()
with torch.no_grad():
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
def test_optimizer_lr_weight_decay_one_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1 = 0.1
weight_decay = 0.1
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
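# NOTE (worked equation for the manual reference above): SGD with weight decay updates
# w <- w - lr * (grad + weight_decay * w), which expands to
# (1 - lr * weight_decay) * w - lr * grad, the exact form used for expected_fc1_weight
# and expected_fc2_weight.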
def test_optimizer_lr_weight_decay_multiple_group():
set_random_seed(42)
model = DummyModel().to("cuda")
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
optimizer = NamedOptimizer(
named_params_or_groups=named_params_or_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
input = torch.randn(10, 10).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for _ in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output, target)
loss.backward()
# Compute gradient manually and apply weight decay
with torch.no_grad():
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * model.fc1.weight.grad
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * model.fc2.weight.grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
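# NOTE (illustrative sketch, independent of FP32GradientAccumulator's internals):
# accumulating half-precision grads into a float32 buffer avoids the rounding error
# of summing many small fp16/bf16 values directly.
def _accumulate_grad_fp32(fp32_buffer: torch.Tensor, half_grad: torch.Tensor) -> None:
    fp32_buffer.add_(half_grad.float())  # upcast each micro-batch grad before summing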
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_multiple_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1},
{"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_one_group(half_precision: torch.dtype, accumulation_steps: int):
set_random_seed(42)
dtype = half_precision
lr1 = 0.1
weight_decay = 0.1
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = []
for name, param in model.named_parameters():
named_params_or_groups.append((name, param))
named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manual update weights for ref
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
def test_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
lr1, lr2 = 0.1, 0.001
weight_decay1, weight_decay2 = 0.1, 0.001
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the expected weight update for reference
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
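# NOTE: DDP variant of the test above. Wrapping the model in DistributedDataParallel prefixes parameter
# names with "module.", so the grad buffers are looked up as "module.fc1.weight" / "module.fc2.weight".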
@pytest.mark.skipif(available_gpus() < 2, reason="Testing requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@rerun_if_address_is_in_use()
def test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
half_precision: torch.dtype, accumulation_steps: int
):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
)
def _test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group(
parallel_context: ParallelContext, half_precision: torch.dtype, accumulation_steps: int
):
set_random_seed(42)
dtype = half_precision
# Making it bigger so that the difference is more visible during update
lr1, lr2 = 0.04, 0.05
weight_decay1, weight_decay2 = 0.5, 0.2
model = DummyModel(dtype=dtype).to("cuda")
# Need to convert the weights to NanotronParameter for the gradient accumulation to work
model.fc1.weight = NanotronParameter(model.fc1.weight)
model.fc2.weight = NanotronParameter(model.fc2.weight)
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
)
named_params_or_groups = [
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc1" in name],
"lr": lr1,
"weight_decay": weight_decay1,
},
{
"named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc2" in name],
"lr": lr2,
"weight_decay": weight_decay2,
},
]
# Optimizer
def optimizer_builder(inp_param_groups):
return NamedOptimizer(
named_params_or_groups=inp_param_groups,
optimizer_builder=lambda param_groups: optim.SGD(
param_groups,
lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups
weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups
),
)
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_params_or_groups,
optimizer_builder=optimizer_builder,
)
accumulator = optimizer.gradient_accumulator
input = torch.randn(10, 10, dtype=dtype).to(device="cuda")
target = torch.randint(0, 2, (10,)).to(device="cuda")
for batch_idx in range(100):
optimizer.zero_grad()
output = model(input)
loss = F.cross_entropy(output.float(), target)
accumulator.backward(loss)
if (batch_idx + 1) % accumulation_steps == 0:
# Manually compute the expected weight update for reference
with torch.no_grad():
fc1_grad = accumulator.get_grad_buffer(name="module.fc1.weight").to(dtype)
expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad
fc2_grad = accumulator.get_grad_buffer(name="module.fc2.weight").to(dtype)
expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad
optimizer.step()
updated_fc1_weight = model.fc1.weight
updated_fc2_weight = model.fc2.weight
torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight)
torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight)
import contextlib
import pytest
import torch
from helpers.exception import assert_fail_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.p2p import P2P
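# NOTE: This test covers the four combinations of (contiguous vs non-contiguous) x (full vs sliced) tensors.
# Sending a non-contiguous slice is expected to fail early with an assertion; the other cases must round-trip
# the tensor between the two PP ranks unchanged.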
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_ddp_with_afab requires at least 2 gpus")
@pytest.mark.parametrize("send_contiguous", [True, False])
@pytest.mark.parametrize("full", [True, False])
@rerun_if_address_is_in_use()
def test_check_send_recv_tensor(send_contiguous: bool, full: bool):
init_distributed(tp=1, dp=1, pp=2)(_test_check_send_recv_tensor)(send_contiguous=send_contiguous, full=full)
def _test_check_send_recv_tensor(parallel_context: ParallelContext, send_contiguous: bool, full: bool):
p2p = P2P(pg=parallel_context.pp_pg, device=torch.device("cuda"))
if dist.get_rank(p2p.pg) == 0:
tensor_to_send = torch.randn(3, 5, dtype=torch.float, device=torch.device("cuda"))
if send_contiguous is True:
assert tensor_to_send.is_contiguous()
else:
tensor_to_send = tensor_to_send.transpose(0, 1)
assert not tensor_to_send.is_contiguous()
# `full` defines whether we take a non-trivial slice of the tensor
if full is False:
tensor_to_send = tensor_to_send[1:3]
if send_contiguous is False and full is False:
# This is supposed to raise an AssertionError telling you that you should have sent a smaller storage by calling `contiguous` beforehand.
send_first_context = assert_fail_with(
AssertionError,
error_msg="Expect storage_size to be smaller than tensor size. It might not be true, when you use slicing for example though. We probably don't want to support it in our P2P system",
)
fail_at_first_send = True
else:
send_first_context = contextlib.nullcontext()
fail_at_first_send = False
# Send tensor back and forth through p2p protocol and check that we get the same thing.
if dist.get_rank(p2p.pg) == 0:
with send_first_context:
handles = p2p.isend_tensors([tensor_to_send], to_rank=1)
if fail_at_first_send is True:
# We early return if we caught an error
return
for handle in handles:
handle.wait()
tensor_travelled_back_and_forth = p2p.recv_tensors(1, from_rank=1)[0]
torch.testing.assert_close(tensor_to_send, tensor_travelled_back_and_forth, atol=0, rtol=0)
elif dist.get_rank(p2p.pg) == 1:
# Rank 0 fails to send in this case, so we return early instead of hanging on the receive
tensors, handles = p2p.irecv_tensors(1, from_rank=0)
if fail_at_first_send is True:
return
for handle in handles:
handle.wait()
tensor_to_recv = tensors[0]
p2p.send_tensors([tensor_to_recv], to_rank=0)
else:
raise ValueError()
if full is False and send_contiguous is True:
# We can actually check that we haven't sent the entire storage, since storage not accessed by the tensor is not sent
if dist.get_rank(p2p.pg) == 0:
# Check that the first elements in the storages don't match (because they are not supposed to be communicated when the tensor is not full).
print(tensor_to_send.untyped_storage()[:4], tensor_travelled_back_and_forth.untyped_storage()[:4])
print(tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0))
print(tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0))
assert not torch.allclose(
tensor_to_send.as_strided(size=(1,), stride=(1,), storage_offset=0),
tensor_travelled_back_and_forth.as_strided(size=(1,), stride=(1,), storage_offset=0),
)
parallel_context.destroy()
import torch
from helpers.exception import assert_fail_with
from nanotron.models.base import DTypeInvariantTensor, init_on_device_and_dtype
from nanotron.parallel.parameters import NanotronParameter
from torch import nn
def test_nanotron_parameter_does_not_override_some_parameter_variable():
param = nn.Parameter(torch.empty(3))
assert not hasattr(param, NanotronParameter.NANOTRON_PARAMETER_METADATA_ATTRIBUTE_NAME)
def test_uncastable_tensor():
# Test that we can create a DTypeInvariantTensor
x = DTypeInvariantTensor(torch.randn(3, 3))
assert isinstance(x, torch.Tensor)
assert isinstance(x, DTypeInvariantTensor)
# Test that we cannot modify the type of a DTypeInvariantTensor
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to float"):
x = x.float()
with assert_fail_with(RuntimeError, error_msg="Cannot convert the type of an DTypeInvariantTensor to half"):
x = x.half()
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(torch.float32)
with assert_fail_with(RuntimeError, error_msg="Cannot change the type of an DTypeInvariantTensor"):
x = x.to(dtype=torch.float32)
# Test that we can modify the value of a DTypeInvariantTensor
x[0, 0] = 1
assert x[0, 0] == 1
# Test that we can modify the device of a DTypeInvariantTensor
x = x.to("cuda")
assert x.device.type == "cuda"
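# NOTE: `init_on_device_and_dtype` is expected to move a DTypeInvariantTensor buffer to the new device
# while leaving its dtype untouched, which is what the next test checks.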
def test_register_buffer_does_not_update_uncastable_tensor():
old_device = torch.device("cuda")
old_dtype = torch.float32
new_device = torch.device("cpu")
new_dtype = torch.bfloat16
with init_on_device_and_dtype(device=new_device, dtype=new_dtype):
module = torch.nn.Module()
# Test that we can register a DTypeInvariantTensor as a buffer
tensor = DTypeInvariantTensor(torch.randn(3, 4, dtype=old_dtype, device=old_device))
module.register_buffer("buffer", tensor)
# Test that we can modify the buffer
module.buffer[0, 0] = 1
assert module.buffer[0, 0] == 1
# Test that device has been updated
assert module.buffer.device.type == new_device.type
# Test that dtype has not been modified
assert module.buffer.dtype is old_dtype
import copy
import nanotron.distributed as dist
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.exception import assert_fail_except_rank_with, timeout_after
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron.models import init_on_device_and_dtype
from nanotron.optim import ZeroDistributedOptimizer
from nanotron.optim.gradient_accumulator import FP32GradBucketManager, FP32GradientAccumulator, get_fp32_accum_hook
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of
from nanotron.parallel.tied_parameters import (
get_tied_id_to_param,
sync_tied_weights_gradients,
tie_parameters,
)
from nanotron.parallel.utils import initial_sync
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from nanotron.utils import ContextManagers
from torch import nn
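# NOTE: FP32GradientAccumulator keeps an fp32 master copy of every half-precision NanotronParameter.
# `accumulator.backward(loss)` accumulates gradients into those fp32 copies and clears (zeroes) the
# half-precision `.grad`, which is what the tests below verify.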
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_promoting_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high precision gradient and that the low precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_gradient_accumulated_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
ref_model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
with torch.inference_mode():
ref_model.weight.copy_(model.weight)
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model.named_parameters())
# Check that our model is a valid model
sanity_check(model)
# Compute backward
grad_accumulation_steps = 2
for _ in range(grad_accumulation_steps):
# We want large input to have large gradients.
input = (torch.randn(5, 3, dtype=half_precision, device="cuda") ** 2 + 1) * 100
# Compute backwards
accumulator.backward(model(input).sum())
ref_model(input).sum().backward()
# We check that we get the same gradient accumulation. In theory we do get more precision by promoting gradients to fp32.
torch.testing.assert_close(
accumulator.parameters["weight"]["fp32"].grad.to(half_precision),
ref_model.weight.grad,
)
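# NOTE: The optimizer built through OptimizerFromGradientAccumulator steps on the fp32 master gradients;
# after `zero_grad()` the fp32 `.grad` is set to None and the fp32 grad buffer is zeroed out.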
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
def test_optimizer_can_step_gradient_in_fp32(half_precision: torch.dtype):
model = nn.Linear(3, 2, bias=False, dtype=half_precision, device="cuda")
original_weight = model.weight.detach().clone()
# Create Nanotron Parameter
model.weight = NanotronParameter(model.weight)
# Add optimizer
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
accumulator = optimizer.gradient_accumulator
# Check that our model is a valid model
sanity_check(model)
# Compute backward
input = torch.randn(5, 3, dtype=half_precision, device="cuda")
accumulator.backward(model(input).sum())
# Check that we have a high precision gradient and that the low precision one is cleared
assert accumulator.parameters["weight"]["fp32"].grad.dtype == torch.float
if model.weight.grad is not None:
# We check that it's zero
torch.testing.assert_close(model.weight.grad, torch.zeros_like(model.weight.grad), atol=1e-6, rtol=1e-7)
optimizer.step()
optimizer.zero_grad()
# Check that we don't have gradients anymore and that they're set to `None`
assert accumulator.parameters["weight"]["fp32"].grad is None
assert model.weight.grad is None
# Check that gradients have been set to zero
fp32_grad = accumulator.get_grad_buffer(name="weight")
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
# Check that the weights have been updated
assert not torch.allclose(original_weight, model.weight)
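# NOTE: Here the fp32 accumulation is wired into DDP through `get_fp32_accum_hook`: intermediate accumulation
# steps run under `no_sync()` and only fill the local fp32 buckets, while the last step lets the comm hook
# all-reduce (average) the gradients across the DP group.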
@pytest.mark.skipif(available_gpus() < 2, reason="Testing ddp_hook_allreduce requires at least 2 gpus")
@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])
@pytest.mark.parametrize("accumulation_steps", [1, 10])
@pytest.mark.parametrize("train_iterations", [1, 3])
@rerun_if_address_is_in_use()
def test_ddp_with_grad_accum_in_fp32(half_precision: torch.dtype, accumulation_steps: int, train_iterations: int):
init_distributed(tp=1, dp=2, pp=1)(_test_ddp_with_grad_accum_in_fp32)(
half_precision=half_precision,
accumulation_steps=accumulation_steps,
train_iterations=train_iterations,
)
def _test_ddp_with_grad_accum_in_fp32(
parallel_context: ParallelContext,
half_precision: torch.dtype,
accumulation_steps: int,
train_iterations: int,
):
hidden_size = 32
n_layers = 3
model = nn.Sequential(
nn.Linear(3, hidden_size, bias=False, dtype=half_precision, device="cuda"),
*(
nn.Linear(hidden_size, hidden_size, bias=False, dtype=half_precision, device="cuda")
for _ in range(n_layers - 1)
),
)
model_hook = copy.deepcopy(model)
# Create Nanotron Parameters
for module in model.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
for module in model_hook.modules():
if isinstance(module, nn.Linear):
setattr(module, "weight", NanotronParameter(module.weight))
# Needed in order to obtain smaller gradient buckets when using `DistributedDataParallel`
model_ddp = torch.nn.parallel.DistributedDataParallel(
model,
process_group=parallel_context.dp_pg,
) # we won't actually use DDP anywhere, it's just to have the same module names
model_ddp_accum_ref = {}
model_ddp_fp32_accum = torch.nn.parallel.DistributedDataParallel(
model_hook,
process_group=parallel_context.dp_pg,
)
# Add gradient accumulator
accumulator = FP32GradientAccumulator(model_ddp_fp32_accum.named_parameters())
# Register DDP hook
state = FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name={id(param): name for name, param in model_ddp_fp32_accum.named_parameters()},
)
model_ddp_fp32_accum.register_comm_hook(
state=state,
hook=get_fp32_accum_hook(
reduce_scatter=False,
reduce_op=dist.ReduceOp.AVG,
),
)
for train_iter in range(train_iterations):
# Gradient accumulation steps
for accum_step in range(accumulation_steps - 1):
# Forward-Backward
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
assert not torch.isinf(loss).any(), "loss is inf"
loss.backward()
with ContextManagers([model_ddp_fp32_accum.no_sync(), accumulator.no_sync()]):
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp.named_parameters():
grad = param.grad
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
model_ddp_accum_ref[name] = (
grad.float() if accum_step == 0 else model_ddp_accum_ref[name] + grad.float()
)
dist.barrier()
torch.testing.assert_close(model_ddp_accum_ref[name], fp32_grad_bucket, atol=1e-6, rtol=1e-7)
dist.barrier()
# Check that we correctly copied grads from buckets to params (`copy_buckets_to_grads`)
torch.testing.assert_close(fp32_grad_bucket, grad_fp32_accum, atol=1e-6, rtol=1e-7)
# Check that the gradients are not synchronized across DP
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=parallel_context.dp_pg):
assert_tensor_synced_across_pg(fp32_grad_bucket, parallel_context.dp_pg)
# We zero out half grads for `model_ddp` because we're accumulating grads manually in `model_ddp_accum_ref`
model_ddp.zero_grad()
# Last accumulation step (Sync grads across DDP)
input = torch.randn(10, 3, dtype=half_precision, device="cuda")
loss = model_ddp.module(input).sum()
loss.backward()
# manually reduce grads across DDP
for name, param in model_ddp.named_parameters():
grad = param.grad
model_ddp_accum_ref[name] = (
model_ddp_accum_ref[name] + grad.float() if name in model_ddp_accum_ref else grad.float()
)
dist.all_reduce(model_ddp_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
loss_fp32_accum = model_ddp_fp32_accum(input).sum()
accumulator.backward(loss_fp32_accum)
for name, param in model_ddp_fp32_accum.named_parameters():
# Check that half grads have been set to None in the sync step, to avoid them being incorrectly used
half_grad = param.grad
assert half_grad is None, f"{half_grad} != None"
grad = model_ddp_accum_ref[name]
grad_fp32_accum = accumulator.parameters[name]["fp32"].grad
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
# Check that FP32GradAccum+DDP+hook gives close gradients to DDP
dist.barrier()
torch.testing.assert_close(grad, fp32_grad_bucket, atol=1e-6, rtol=1e-7)
# Check that grad points to the same memory as the bucket
assert grad_fp32_accum.data_ptr() == fp32_grad_bucket.data_ptr()
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(grad, parallel_context.dp_pg)
assert_tensor_synced_across_pg(grad_fp32_accum, parallel_context.dp_pg)
# Zero out gradients (Usually it's the optimizer that does this)
model_ddp.zero_grad()
model_ddp_accum_ref = {}
accumulator.zero_grad() # Sets half grads to None and zeroes out fp32 grad buckets
for name, elt in accumulator.parameters.items():
fp32_param = elt["fp32"]
fp32_param.grad = None
# Check that fp32 grad buckets are zeroed out and `param.grad` is set to None
for name, param in model_ddp_fp32_accum.named_parameters():
assert param.grad is None
fp32_grad_bucket = accumulator.get_grad_buffer(name=name)
dist.barrier()
torch.testing.assert_close(fp32_grad_bucket, torch.zeros_like(fp32_grad_bucket), atol=1e-6, rtol=1e-7)
# Check that all fp32 grad buckets are zeroed out
for _, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
# This is important as we assume grad buckets to be zeroed out at the first accumulation step
dist.barrier()
torch.testing.assert_close(fp32_grad, torch.zeros_like(fp32_grad), atol=1e-6, rtol=1e-7)
parallel_context.destroy()
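# NOTE: This test ties the per-stage linear weights across PP, shards optimizer state with
# ZeroDistributedOptimizer over DP, and checks that the fp32 grad buffers match a manually accumulated
# reference both before and after `sync_tied_weights_gradients`.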
@pytest.mark.skipif(
available_gpus() < 4, reason="Testing test_tied_weights_sync_with_grad_accum_in_fp32 requires at least 4 gpus"
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("reduce_scatter", [True, False])
@rerun_if_address_is_in_use()
def test_tied_weights_sync_with_grad_accum_in_fp32(pipeline_engine: PipelineEngine, reduce_scatter: bool):
init_distributed(tp=1, dp=2, pp=2)(_test_tied_weights_sync_with_grad_accum_in_fp32)(
pipeline_engine=pipeline_engine, reduce_scatter=reduce_scatter
)
def _test_tied_weights_sync_with_grad_accum_in_fp32(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine, reduce_scatter: bool
):
# We init two replicas of 2 denses. Each dense is on a device.
dtype = torch.float16
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
reference_model = DummyModel(p2p=p2p)
reference_model_accum_ref = {}
for mdl in [model, reference_model]:
# Set the ranks
with init_on_device_and_dtype(device, dtype):
assert parallel_context.pp_pg.size() == len(mdl.mlp)
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), mdl.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
mdl.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Tie all dense weights across PP
tie_parameters(
root_module=mdl,
ties=[
(
target,
(
parallel_context.get_global_rank(
ep_rank=dist.get_rank(parallel_context.expert_pg),
pp_rank=get_pp_rank_of(target, module=mdl),
dp_rank=dist.get_rank(parallel_context.dp_pg),
tp_rank=dist.get_rank(parallel_context.tp_pg),
),
),
)
for target in [
f"mlp.{pp_rank}.linear.pp_block.weight" for pp_rank in range(parallel_context.pp_pg.size())
]
],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
for name, module in mdl.named_modules():
if isinstance(module, nn.Linear):
module.bias = NanotronParameter(module.bias)
# Sync DP and tied weights: basic assumption
initial_sync(model=mdl, parallel_context=parallel_context)
# Sync params between `model` and `reference_model`
with torch.no_grad():
for name, param in model.named_parameters():
param.copy_(reference_model.get_parameter(name))
# DDP
model_ddp = torch.nn.parallel.DistributedDataParallel(model, process_group=parallel_context.dp_pg)
module_id_to_prefix = {id(module): f"{module_name}." for module_name, module in model.named_modules()}
reference_module_id_to_prefix = {
id(module): f"{module_name}." for module_name, module in reference_model.named_modules()
}
# Fix the root_model
module_id_to_prefix[id(model)] = ""
reference_module_id_to_prefix[id(reference_model)] = ""
# named parameters
named_parameters = [
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
if param.is_tied
else name,
param,
)
for name, param in model.named_parameters()
]
# Optimizer: We don't actually run the optimizer, we just use it to build the gradient accumulator
optimizer = ZeroDistributedOptimizer(
dp_pg=parallel_context.dp_pg,
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups_1: OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(
named_parameters=named_params,
grad_buckets_named_params=named_parameters,
),
named_params_or_groups=named_param_groups_1,
optimizer_builder=lambda named_param_groups_2: NamedOptimizer(
named_params_or_groups=named_param_groups_2,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
),
)
param_id_to_name = {
id(param): param.get_tied_info().get_full_name_from_module_id_to_prefix(
module_id_to_prefix=module_id_to_prefix
)
if param.is_tied
else name
for name, param in model.named_parameters()
}
# Add gradient accumulator
# We use `model_ddp.module` in order to have the parameter names without the `module.` prefix
accumulator = optimizer.optimizer.gradient_accumulator
accumulator.assign_param_offsets(
dp_rank=dist.get_rank(parallel_context.dp_pg),
param_name_to_offsets=optimizer.param_name_to_dp_rank_offsets,
)
model_ddp.register_comm_hook(
state=FP32GradBucketManager(
dp_pg=parallel_context.dp_pg,
accumulator=accumulator,
param_id_to_name=param_id_to_name,
),
hook=get_fp32_accum_hook(reduce_scatter=reduce_scatter, reduce_op=dist.ReduceOp.AVG),
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype) # First rank receives data
n_micro_batches_per_batch = 2
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
## Reference model iteration step
def forward_backward_reference(mdl, micro_batch):
pipeline_engine.train_batch_iter(
mdl, pg=parallel_context.pp_pg, batch=[micro_batch], nb_microbatches=1, grad_accumulator=None
)
for accum_step in range(n_micro_batches_per_batch - 1):
# Forward-Backward
forward_backward_reference(reference_model, batch[accum_step])
# Accumulate grads
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(
module_id_to_prefix=reference_module_id_to_prefix
)
reference_model_accum_ref[name] = (
grad.float() if accum_step == 0 else reference_model_accum_ref[name] + grad.float()
)
# We zero out half grads for `reference_model` because we're accumulating grads manually in `reference_model_accum_ref`
reference_model.zero_grad()
# Last accumulation step (Sync grads across DDP)
forward_backward_reference(reference_model, batch[-1])
# manually reduce grads across DDP
for name, param in reference_model.named_parameters():
grad = param.grad
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=reference_module_id_to_prefix)
reference_model_accum_ref[name] = (
reference_model_accum_ref[name] + grad.float() if name in reference_model_accum_ref else grad.float()
)
dist.all_reduce(reference_model_accum_ref[name], group=parallel_context.dp_pg, op=dist.ReduceOp.AVG)
## Model iteration step
pipeline_engine.train_batch_iter(
model_ddp,
pg=parallel_context.pp_pg,
batch=batch,
nb_microbatches=n_micro_batches_per_batch,
grad_accumulator=accumulator,
)
for name, param in model_ddp.module.named_parameters():
if param.is_tied:
tied_info = param.get_tied_info()
name = tied_info.get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix)
# Each parameter is sharded across DP.
assert (
name in accumulator.parameters
), f"`accumulator.parameters` must have all params {name} not in `accumulator.parameters`. Existing keys are: {accumulator.parameters}"
fp32_grad = accumulator.get_grad_buffer(name=name)
if not reduce_scatter:
# Check that the gradients are synchronized across DP
assert_tensor_synced_across_pg(fp32_grad, parallel_context.dp_pg)
fp32_grad_ref = reference_model_accum_ref[name]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
# Check that gradients are correct
torch.testing.assert_close(
fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
rtol=1e-7,
atol=1e-6,
msg=lambda msg: f"FP32 Gradients at `{name}` don't match\n - Expected: {fp32_grad_ref.view(-1)[slice_] / n_micro_batches_per_batch}\n - Got: {fp32_grad.view(-1)[slice_]}",
)
else:
# Check that gradients are correct
torch.testing.assert_close(fp32_grad_ref / n_micro_batches_per_batch, fp32_grad, rtol=1e-7, atol=1e-6)
# Check that tied weights grads are not synchronized yet
for (name, group_ranks), param in sorted(
get_tied_id_to_param(parameters=model_ddp.parameters(), root_module=model_ddp.module).items(),
key=lambda x: x[0],
):
if not (isinstance(param, NanotronParameter) and param.is_tied):
continue
group = parallel_context.world_ranks_to_pg[group_ranks]
fp32_grad = accumulator.get_grad_buffer(name=name)
with assert_fail_except_rank_with(AssertionError, rank_exception=0, pg=group):
assert_tensor_synced_across_pg(
tensor=fp32_grad,
pg=group,
msg=lambda err: f"Tied weights's grads {name} are not synchronized. {err}",
)
# Sync tied weights grads (e.g. sync dense1 and dense2 grads in DP=0, but the problem is that DP=0 has only optim states for dense1)
# - Translate tied ranks along DP axis to find the DP rank that has the tied weights
# - accumulator keeps grads for all DPs, so we can just sync the grads
with timeout_after():
sync_tied_weights_gradients(
module=model_ddp.module, parallel_context=parallel_context, grad_accumulator=accumulator
)
tied_infos_dict = {
(
param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix),
param.get_tied_info().global_ranks,
param.get_tied_info().reduce_op,
): param
for name, param in model_ddp.module.named_parameters()
if param.is_tied
}
# Check that tied weights grads are synchronized
for (name, group_ranks, reduce_op), param in sorted(tied_infos_dict.items(), key=lambda x: x[0]):
# Make sure we don't get None for reduce_op
assert reduce_op == dist.ReduceOp.SUM
fp32_grad_buffer = accumulator.get_grad_buffer(name=name)
# Grad buffers are only attached to param.grad on ranks that are sharded depending on `param_to_dprank`
fp32_grad = accumulator.parameters[name]["fp32"].grad
# Tied weights are synced using the fp32 grad buffers. Let's make sure they still point to the same memory
# When using ZeroDistributedOptimizer, gradients are sliced across DP
dp_slice_fp_32_grad_buffer = fp32_grad_buffer.view(-1)[slice(*accumulator.param_name_to_offsets[name])]
assert (
dp_slice_fp_32_grad_buffer.data_ptr() == fp32_grad.data_ptr()
), "dp_slice_fp_32_grad_buffer and fp32_grad should point to the same memory"
group = parallel_context.world_ranks_to_pg[group_ranks]
# Check that fp32 grads for tied weights are synced (Used in optimizer step)
# Since we use `reduce_scatter = False` the entire gradient buffer is all reduced, causing it to be synced
if reduce_scatter:
assert_tensor_synced_across_pg(
tensor=dp_slice_fp_32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
else:
assert_tensor_synced_across_pg(
tensor=fp32_grad_buffer,
pg=group,
msg=lambda err: f"Tied weights's fp32 grads {name} are not synchronized. {err}",
)
# Manually sync reference model's tied weights grads
dist.all_reduce(reference_model_accum_ref[name], group=group, op=reduce_op)
# Check that accumulated grads are correct
for name, elt in accumulator.fp32_grad_buffers.items():
fp32_grad = elt["fp32_grad"]
dist.barrier()
if reduce_scatter:
slice_ = slice(*accumulator.param_name_to_offsets[name])
torch.testing.assert_close(
reference_model_accum_ref[name].view(-1)[slice_] / n_micro_batches_per_batch,
fp32_grad.view(-1)[slice_],
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
else:
torch.testing.assert_close(
reference_model_accum_ref[name] / n_micro_batches_per_batch,
fp32_grad,
atol=1e-6,
rtol=1e-7,
msg=lambda msg: f"Grad for {name} is not correct.\n{msg}",
)
parallel_context.destroy()
import math
from typing import Union
import pytest
import torch
from helpers.llama import TINY_LLAMA_CONFIG, create_llama_from_config, get_llama_training_config
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron.config import ModelArgs, RandomInit, SpectralMupInit
from nanotron.parallel import ParallelContext
from nanotron.scaling.parametrization import ParametrizationMethod
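# NOTE: With spectral muP the expected per-layer std is (1 / sqrt(fan_in)) * min(1, sqrt(fan_out / fan_in));
# layer norms start constant (std 0) and token embeddings are expected to have std 1.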
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1)])
@pytest.mark.parametrize("parametrization_method", [ParametrizationMethod.SPECTRAL_MUP])
@pytest.mark.skip
@rerun_if_address_is_in_use()
def test_parametrization(tp: int, dp: int, pp: int, parametrization_method: ParametrizationMethod):
if parametrization_method == ParametrizationMethod.STANDARD:
init_method = RandomInit(std=1.0)
elif parametrization_method == ParametrizationMethod.SPECTRAL_MUP:
init_method = SpectralMupInit(use_mup=True)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_parametrization)(
init_method=init_method,
parametrization_method=parametrization_method,
)
def _test_parametrization(
parallel_context: ParallelContext,
init_method: Union[RandomInit, SpectralMupInit],
parametrization_method: ParametrizationMethod,
):
def spectral_std(fan_in: int, fan_out: int):
return torch.tensor((1.0 / math.sqrt(fan_in)) * min(1, math.sqrt(fan_out / fan_in)))
model_args = ModelArgs(init_method=init_method, model_config=TINY_LLAMA_CONFIG)
config = get_llama_training_config(model_args)
llama = create_llama_from_config(
model_config=TINY_LLAMA_CONFIG,
device=torch.device("cuda"),
parallel_context=parallel_context,
)
llama.init_model_randomly(config=config, init_method=parametrization_method)
hidden_size = TINY_LLAMA_CONFIG.hidden_size
intermediate_size = TINY_LLAMA_CONFIG.intermediate_size
o_proj_infeatures = llama.model.decoder[0].pp_block.attn.o_proj.in_features * parallel_context.tensor_parallel_size
NAME_TO_EXPECTED_STD = {
"input_layernorm": torch.tensor(0.0),
"post_attention_layernorm": torch.tensor(0.0),
"final_layer_norm": torch.tensor(0.0),
"token_embedding": torch.tensor(1.0),
# "lm_head": torch.tensor(1.0),
"qkv_proj": spectral_std(fan_in=hidden_size, fan_out=intermediate_size),
"o_proj": spectral_std(fan_in=o_proj_infeatures, fan_out=hidden_size),
"gate_up_proj": spectral_std(fan_in=hidden_size, fan_out=intermediate_size),
"down_proj": spectral_std(fan_in=intermediate_size, fan_out=hidden_size),
}
def find_expected_std(param_name):
for name in NAME_TO_EXPECTED_STD:
if name in param_name:
return NAME_TO_EXPECTED_STD[name]
for name, param in llama.model.named_parameters():
if "lm_head" in name:
continue
expected_std = find_expected_std(name)
assert expected_std is not None, f"Could not find expected std for {name}"
assert torch.allclose(
param.std().float(), expected_std, atol=0.05
), f"name: {name}, expected: {expected_std}, actual: {param.std()}"
from typing import Union
import pytest
import torch
from helpers.dummy import DummyModel, dummy_infinite_data_loader
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.models import init_on_device_and_dtype
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.block import PipelineBlock
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
OneForwardOneBackwardPipelineEngine,
PipelineEngine,
)
from nanotron.parallel.pipeline_parallel.p2p import P2P
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from torch import nn
from torch.nn import functional as F
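# NOTE: `build_and_set_rank` materializes a PipelineBlock only on its assigned PP rank: the `pp_block`
# attribute exists on that rank and is absent everywhere else, which is what the first test checks.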
@pytest.mark.skipif(available_gpus() < 2, reason="Testing build_and_set_rank requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_build_and_set_rank():
init_distributed(tp=1, dp=1, pp=2)(_test_build_and_set_rank)()
def _test_build_and_set_rank(parallel_context: ParallelContext):
device = torch.device("cuda")
p2p = P2P(pg=parallel_context.pp_pg, device=device)
model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# Check that the ranks are set correctly
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
assert model.mlp[current_pp_rank].linear.rank == current_pp_rank
assert model.mlp[current_pp_rank].activation.rank == current_pp_rank
# Check that blocks were built on the correct ranks
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
if pp_rank == current_pp_rank:
assert hasattr(non_linear.linear, "pp_block")
assert hasattr(non_linear.activation, "pp_block")
else:
assert not hasattr(non_linear.linear, "pp_block")
assert not hasattr(non_linear.activation, "pp_block")
parallel_context.destroy()
@pytest.mark.skipif(available_gpus() < 1, reason="Testing test_init_on_device_and_dtype requires at least 1 gpu")
def test_init_on_device_and_dtype():
device = torch.device(type="cuda", index=0)
with init_on_device_and_dtype(device=device, dtype=torch.bfloat16):
model = nn.Linear(10, 10)
assert model.weight.dtype == torch.bfloat16, "Model weight wasn't initialised with the correct dtype"
assert model.weight.device == device, "Model weight wasn't initialised with the correct device"
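# NOTE: Each pipeline engine is compared against a reference copy of the model living entirely on rank 0.
# The reference loss is divided by the number of microbatches to match the engine's gradient averaging,
# and weights/losses/grads are shipped to rank 0 over P2P for the comparison.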
@pytest.mark.skipif(available_gpus() < 2, reason="Testing AFAB requires at least 2 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine(pipeline_engine: PipelineEngine, pp: int):
init_distributed(tp=1, dp=1, pp=pp)(_test_pipeline_engine)(pipeline_engine=pipeline_engine)
def _test_pipeline_engine(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
has_reference_model = dist.get_rank(parallel_context.pp_pg) == reference_rank
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
data_iterator = dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
# Check that gradients are the same as the reference
if has_reference_model:
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
p2p.send_tensors(
[
model.mlp[current_pp_rank].linear.pp_block.weight.grad,
model.mlp[current_pp_rank].linear.pp_block.bias.grad,
],
to_rank=reference_rank,
)
parallel_context.destroy()
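# NOTE: Same engine comparison, but every block also forwards a tensor with `requires_grad=False` alongside
# the differentiable activations, to check that the P2P scheduling handles non-differentiable tensors.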
@pytest.mark.skipif(
available_gpus() < 2,
reason="Testing `test_pipeline_engine_with_tensor_that_does_not_require_grad` requires at least 2 gpus",
)
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_engine_with_tensor_that_does_not_require_grad(pipeline_engine: PipelineEngine, pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_engine_with_tensor_that_does_not_require_grad)(
pipeline_engine=pipeline_engine
)
def _test_pipeline_engine_with_tensor_that_does_not_require_grad(
parallel_context: ParallelContext, pipeline_engine: PipelineEngine
):
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class LinearWithDummyInput(nn.Linear):
def __init__(self, in_features, out_features):
super().__init__(in_features=in_features, out_features=out_features)
def forward(self, x: torch.Tensor, y: torch.Tensor):
return {"output": super().forward(x), "y": y}
class DummyModelPassingNonDifferentiableTensor(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=LinearWithDummyInput,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size() + 1)
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
linear_output = non_linear.linear(x=differentiable_tensor, y=non_differentiable_tensor)
output = non_linear.activation(x=linear_output["output"], y=linear_output["y"])
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
if isinstance(differentiable_tensor, torch.Tensor):
assert differentiable_tensor.requires_grad is True
if isinstance(non_differentiable_tensor, torch.Tensor):
assert non_differentiable_tensor.requires_grad is False
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
if has_reference_model:
reference_model = DummyModelPassingNonDifferentiableTensor(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size() + 1
# An additional MLP block is appended at the end
mlp_index_pp_rank = [(i, i) for i in range(parallel_context.pp_pg.size())] + [
(parallel_context.pp_pg.size(), parallel_context.pp_pg.size() - 1)
]
with init_on_device_and_dtype(device):
for (mlp_index, pp_rank), non_linear in zip(mlp_index_pp_rank, model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight, model.mlp[mlp_index].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
# Check that gradients are the same as the reference
if has_reference_model:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
non_linear = model.mlp[mlp_index]
reference_non_linear = reference_model.mlp[mlp_index]
if pp_rank == current_pp_rank:
# We already have the weights locally
torch.testing.assert_close(
non_linear.linear.pp_block.weight.grad,
reference_non_linear.linear.pp_block.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.linear.pp_block.bias.grad,
reference_non_linear.linear.pp_block.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(
weight_grad, reference_non_linear.linear.pp_block.weight.grad, atol=1e-6, rtol=1e-7
)
torch.testing.assert_close(bias_grad, reference_non_linear.linear.pp_block.bias.grad, atol=1e-6, rtol=1e-7)
else:
for (mlp_index, pp_rank) in mlp_index_pp_rank:
if pp_rank == current_pp_rank:
p2p.send_tensors(
[model.mlp[mlp_index].linear.pp_block.weight.grad, model.mlp[mlp_index].linear.pp_block.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
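# NOTE: This test runs plain forward passes under `torch.inference_mode()` (no engine, no backward) and only
# checks that the gathered losses match the single-rank reference model.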
@pytest.mark.parametrize("pp", list(range(2, min(4, available_gpus()) + 1)))
@rerun_if_address_is_in_use()
def test_pipeline_forward_without_engine(pp: int):
init_distributed(pp=pp, dp=1, tp=1)(_test_pipeline_forward_without_engine)()
def _test_pipeline_forward_without_engine(parallel_context: ParallelContext):
def activation(x: torch.Tensor, y: torch.Tensor):
return {"output": F.sigmoid(x) * y, "y": y}
class DummyModel(nn.Module):
def __init__(
self,
p2p: P2P,
):
super().__init__()
self.p2p = p2p
self.mlp = nn.Sequential(
*(
nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=lambda: activation,
module_kwargs={},
module_input_keys={"x", "y"},
module_output_keys={"output", "y"},
),
}
)
for _ in range(p2p.pg.size())
)
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(
self,
differentiable_tensor: Union[torch.Tensor, TensorPointer],
non_differentiable_tensor: Union[torch.Tensor, TensorPointer],
):
for non_linear in self.mlp:
differentiable_tensor = non_linear.linear(input=differentiable_tensor)["output"]
output = non_linear.activation(x=differentiable_tensor, y=non_differentiable_tensor)
differentiable_tensor, non_differentiable_tensor = output["output"], output["y"]
differentiable_tensor = self.loss(x=differentiable_tensor)["output"]
return differentiable_tensor
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DummyModel(p2p=p2p)
if has_reference_model:
reference_model = DummyModel(p2p=p2p)
# Set the ranks
assert len(model.mlp) == parallel_context.pp_pg.size()
with init_on_device_and_dtype(device):
for pp_rank, non_linear in zip(range(parallel_context.pp_pg.size()), model.mlp):
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for non_linear in reference_model.mlp:
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank in range(parallel_context.pp_pg.size()):
non_linear = model.mlp[pp_rank]
reference_non_linear = reference_model.mlp[pp_rank]
if pp_rank == current_pp_rank:
# We already have the weights locally
reference_non_linear.linear.pp_block.weight.data.copy_(non_linear.linear.pp_block.weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(non_linear.linear.pp_block.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.linear.pp_block.weight.data.copy_(weight.data)
reference_non_linear.linear.pp_block.bias.data.copy_(bias.data)
else:
p2p.send_tensors(
[model.mlp[current_pp_rank].linear.pp_block.weight, model.mlp[current_pp_rank].linear.pp_block.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
"non_differentiable_tensor": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
# Run the model
losses = []
for micro_batch in batch:
with torch.inference_mode():
loss = model(**micro_batch)
losses.append(loss)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
reference_losses.append(loss.detach())
# Gather loss in reference_rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss, torch.Tensor):
if has_reference_model:
_losses.append(loss)
else:
p2p.send_tensors([loss], to_rank=reference_rank)
else:
assert isinstance(loss, TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss.group_rank)[0])
if has_reference_model:
losses = _losses
# Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
parallel_context.destroy()
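# The "diamond" test below builds a 4-stage model whose dataflow forks and merges,
# one branch per PP rank, instead of a purely sequential chain:
#
#                  dense_top (nn.Bilinear, merges both branches)
#                 /                                  \
#         dense_left                             dense_right
#                 \                                  /
#                  dense_bottom (shared input branch)
#
# so the engine has to send one activation to two downstream ranks and gather two
# activations on the merging rank.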
@pytest.mark.skipif(available_gpus() < 4, reason="Testing `test_pipeline_engine_diamond` requires at least 4 gpus")
@pytest.mark.parametrize(
"pipeline_engine", [AllForwardAllBackwardPipelineEngine(), OneForwardOneBackwardPipelineEngine()]
)
@rerun_if_address_is_in_use()
def test_pipeline_engine_diamond(pipeline_engine: PipelineEngine):
init_distributed(pp=4, dp=1, tp=1)(_test_pipeline_engine_diamond)(pipeline_engine=pipeline_engine)
def _test_pipeline_engine_diamond(parallel_context: ParallelContext, pipeline_engine: PipelineEngine):
class DiamondModel(nn.Module):
def __init__(self, p2p: P2P):
super().__init__()
self.p2p = p2p
self.dense_bottom = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_left = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_right = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Linear,
module_kwargs={"in_features": 10, "out_features": 10},
module_input_keys={"input"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.dense_top = nn.ModuleDict(
{
"linear": PipelineBlock(
p2p=p2p,
module_builder=nn.Bilinear,
module_kwargs={"in1_features": 10, "in2_features": 10, "out_features": 10},
module_input_keys={"input1", "input2"},
module_output_keys={"output"},
),
"activation": PipelineBlock(
p2p=p2p,
module_builder=nn.ReLU,
module_kwargs={},
module_input_keys={"input"},
module_output_keys={"output"},
),
}
)
self.loss = PipelineBlock(
p2p=p2p,
module_builder=lambda: lambda x: x.sum(),
module_kwargs={},
module_input_keys={"x"},
module_output_keys={"output"},
)
def forward(self, x):
x = self.dense_bottom.activation(input=self.dense_bottom.linear(input=x)["output"])["output"]
y = self.dense_left.activation(input=self.dense_left.linear(input=x)["output"])["output"]
z = self.dense_right.activation(input=self.dense_right.linear(input=x)["output"])["output"]
out = self.dense_top.activation(input=self.dense_top.linear(input1=y, input2=z)["output"])["output"]
return self.loss(x=out)["output"]
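    # NOTE: each PipelineBlock above is materialized on exactly one PP rank via
    # `build_and_set_rank` below; on every other rank calling the block returns a
    # TensorPointer that the pipeline engine resolves through P2P communication.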
device = torch.device("cuda")
p2p = P2P(parallel_context.pp_pg, device=device)
reference_rank = 0
current_pp_rank = dist.get_rank(parallel_context.pp_pg)
has_reference_model = current_pp_rank == reference_rank
# spawn model
model = DiamondModel(p2p=p2p)
if has_reference_model:
reference_model = DiamondModel(p2p=p2p)
# Set the ranks
assert parallel_context.pp_pg.size() == len(
[model.dense_bottom, model.dense_left, model.dense_right, model.dense_top]
)
assert parallel_context.pp_pg.size() == 4
pp_rank_to_dense_name = ["dense_bottom", "dense_left", "dense_right", "dense_top"]
with init_on_device_and_dtype(device):
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
non_linear = model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=pp_rank)
non_linear.activation.build_and_set_rank(pp_rank=pp_rank)
model.loss.build_and_set_rank(pp_rank=parallel_context.pp_pg.size() - 1)
# build reference model
if has_reference_model:
for module_name in pp_rank_to_dense_name:
non_linear = reference_model.get_submodule(module_name)
non_linear.linear.build_and_set_rank(pp_rank=reference_rank)
non_linear.activation.build_and_set_rank(pp_rank=reference_rank)
reference_model.loss.build_and_set_rank(pp_rank=reference_rank)
# synchronize weights
if has_reference_model:
with torch.inference_mode():
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
# We already have the weights locally
non_linear = model.get_submodule(module_name).linear.pp_block
reference_non_linear.weight.data.copy_(non_linear.weight.data)
reference_non_linear.bias.data.copy_(non_linear.bias.data)
continue
weight, bias = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
reference_non_linear.weight.data.copy_(weight.data)
reference_non_linear.bias.data.copy_(bias.data)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight, non_linear.bias],
to_rank=reference_rank,
)
# Get infinite dummy data iterator
def dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg: dist.ProcessGroup, dtype=torch.float, input_pp_rank=0
):
micro_batch_size = 3
# We assume the first linear is always built on the first rank.
while True:
yield {
"x": torch.randn(micro_batch_size, 10, dtype=dtype, device="cuda")
if current_pp_rank == input_pp_rank
else TensorPointer(group_rank=input_pp_rank),
}
data_iterator = dummy_infinite_data_loader_with_non_differentiable_tensor(
pp_pg=parallel_context.pp_pg
) # First rank receives data
# Have at least as many microbatches as PP size.
n_micro_batches_per_batch = parallel_context.pp_pg.size() + 5
batch = [next(data_iterator) for _ in range(n_micro_batches_per_batch)]
losses = pipeline_engine.train_batch_iter(
model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=n_micro_batches_per_batch, grad_accumulator=None
)
# Equivalent on the reference model
if has_reference_model:
reference_losses = []
for micro_batch in batch:
loss = reference_model(**micro_batch)
loss /= n_micro_batches_per_batch
loss.backward()
reference_losses.append(loss.detach())
    # Gather losses on the reference rank
if has_reference_model:
_losses = []
for loss in losses:
if isinstance(loss["loss"], torch.Tensor):
if has_reference_model:
_losses.append(loss["loss"])
else:
p2p.send_tensors([loss["loss"]], to_rank=reference_rank)
else:
assert isinstance(loss["loss"], TensorPointer)
if not has_reference_model:
continue
_losses.append(p2p.recv_tensors(num_tensors=1, from_rank=loss["loss"].group_rank)[0])
if has_reference_model:
losses = _losses
    # Check losses are the same as the reference
if has_reference_model:
for loss, ref_loss in zip(losses, reference_losses):
torch.testing.assert_close(loss, ref_loss, atol=1e-6, rtol=1e-7)
# Check that gradient flows through the entire model
for param in model.parameters():
assert param.grad is not None
    # Check that the gradients are the same as the reference
if has_reference_model:
for pp_rank, module_name in enumerate(pp_rank_to_dense_name):
reference_non_linear = reference_model.get_submodule(module_name).linear.pp_block
if pp_rank == current_pp_rank:
                # We already have the gradients locally
non_linear = model.get_submodule(module_name).linear.pp_block
torch.testing.assert_close(
non_linear.weight.grad,
reference_non_linear.weight.grad,
atol=1e-6,
rtol=1e-7,
)
torch.testing.assert_close(
non_linear.bias.grad,
reference_non_linear.bias.grad,
atol=1e-6,
rtol=1e-7,
)
continue
weight_grad, bias_grad = p2p.recv_tensors(num_tensors=2, from_rank=pp_rank)
torch.testing.assert_close(weight_grad, reference_non_linear.weight.grad, atol=1e-6, rtol=1e-7)
torch.testing.assert_close(bias_grad, reference_non_linear.bias.grad, atol=1e-6, rtol=1e-7)
else:
non_linear = model.get_submodule(pp_rank_to_dense_name[current_pp_rank]).linear.pp_block
p2p.send_tensors(
[non_linear.weight.grad, non_linear.bias.grad],
to_rank=reference_rank,
)
parallel_context.destroy()
import pytest
import torch
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.random import (
RandomStates,
branch_random_state,
get_current_random_state,
get_synced_random_state,
)
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_random_state_sync requires at least 2 gpus")
@pytest.mark.parametrize("tp,dp,pp", [(2, 1, 1), (1, 2, 1), (1, 1, 2)])
@rerun_if_address_is_in_use()
def test_random_state_sync(tp: int, dp: int, pp: int):
# TODO @nouamane: Make a test with 4 gpus (2 in one pg, 2 in other pg)
init_distributed(tp=tp, dp=dp, pp=pp)(_test_random_state_sync)()
def _test_random_state_sync(parallel_context: ParallelContext):
current_random_state = get_current_random_state()
reference_rank = 0
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
# Check that they are not equal across process group
if dist.get_rank(pg) == reference_rank:
random_states = [current_random_state]
else:
random_states = [None]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
assert current_random_state != random_states[0]
# Sync random state
synced_random_state = get_synced_random_state(current_random_state, pg=pg)
# Check that they are equal across process group
random_states = [synced_random_state]
dist.broadcast_object_list(random_states, src=reference_rank, group=pg)
if dist.get_rank(pg) != reference_rank:
        assert synced_random_state == random_states[0]
parallel_context.destroy()
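# The two tests below exercise `branch_random_state` from both sides:
# - "global context": the stored state is left untouched by the context itself and,
#   on exit, the global RNG state is restored to its pre-context value;
# - "local context": a random op run inside the context updates the state stored
#   under `key` while leaving the global RNG state unchanged.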
def test_random_state_fork_random_operation_in_global_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
assert random_states[key] == random_state
# Random operation that updates the random state
torch.randn(1)
new_random_state = get_current_random_state()
# Check that random states changed
assert new_random_state != random_state
assert random_states[key] == random_state
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
assert random_states[key] == random_state
assert get_current_random_state() == random_states[key]
    # Check that the random state is back to the global one
assert get_current_random_state() == new_random_state
def test_random_state_fork_random_operation_in_local_context():
key = "my_random_state"
random_state = get_current_random_state()
random_states = RandomStates({key: random_state})
# Check that within the context manager the random state matches the one we stored in `random_states`
with branch_random_state(random_states=random_states, key=key, enabled=True):
old_random_state = get_current_random_state()
assert old_random_state == random_states[key]
# Random operation that updates the random state
torch.randn(1)
# Check that random states changed
new_random_state = get_current_random_state()
# Check that global random_state hasn't changed
assert get_current_random_state() == random_state
# Check that local random_state has changed and is equal to `new_random_state`
assert old_random_state != random_states[key]
assert new_random_state == random_states[key]
import pytest
import torch
from helpers.context import TestContext
from helpers.dummy import dummy_infinite_data_loader, init_dummy_model
from helpers.utils import (
available_gpus,
get_all_3d_configurations,
init_distributed,
is_dict_equal,
rerun_if_address_is_in_use,
)
from nanotron import distributed as dist
from nanotron.constants import CHECKPOINT_VERSION
from nanotron.optim.gradient_accumulator import FP32GradientAccumulator
from nanotron.optim.named_optimizer import NamedOptimizer
from nanotron.optim.optimizer_from_gradient_accumulator import (
OptimizerFromGradientAccumulator,
)
from nanotron.optim.zero import ZeroDistributedOptimizer
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.engine import (
AllForwardAllBackwardPipelineEngine,
)
from nanotron.parallel.sharded_parameters import SplitConfig, create_sharded_parameter_from_config
from nanotron.parallel.tied_parameters import sync_tied_weights_gradients
from nanotron.random import RandomStates, get_current_random_state, get_synced_random_state
from nanotron.serialize import (
load_optimizer,
load_random_states,
load_weights,
save_optimizer,
save_random_states,
save_weights,
)
from nanotron.serialize.metadata import TensorMetadata
from torch.nn.parallel import DistributedDataParallel
def test_save_and_load_with_changed_topology():
# TODO @thomasw21: We want to be able to support a change of topology mechanism
return
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_model(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_model)(test_context=test_context)
def _test_save_and_load_model(parallel_context: ParallelContext, test_context: TestContext):
model = init_dummy_model(parallel_context=parallel_context)
store_folder = test_context.get_auto_remove_tmp_dir()
# Save
save_weights(model=model, parallel_context=parallel_context, root_folder=store_folder)
# Load
new_model = init_dummy_model(parallel_context=parallel_context)
# Check that the newly initialised model isn't the same.
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
if len(model.state_dict()) == 0:
# Edge case where there's no parameters/buffers stored in the model.
pass
else:
assert not match, "Newly initialised model should not match."
load_weights(model=new_model, parallel_context=parallel_context, root_folder=store_folder)
# Assert the weights are exactly the same after loading
match, msg = is_dict_equal(new_model.state_dict(), model.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
if pp > 1:
pytest.skip("Pipeline parallelism not supported for this test yet")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_and_load_optimizer)(test_context=test_context)
def _test_save_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Test loading optimizer states to CPU
cpu_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Load optimizer states to CPU
load_optimizer(
optimizer=cpu_optimizer, parallel_context=parallel_context, root_folder=store_folder, map_location="cpu"
)
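    # `map_location="cpu"` should put every loaded optimizer state tensor on the CPU
    # while keeping its values identical to the CUDA copy; the per-key comparison
    # below checks both (the device check is skipped for the "step" entry).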
# Get state dicts
gpu_state = optimizer.state_dict()
cpu_state = cpu_optimizer.state_dict()
# Check that states match except for device
for param_id in gpu_state["state"]:
for key, gpu_value in gpu_state["state"][param_id].items():
cpu_value = cpu_state["state"][param_id][key]
if isinstance(gpu_value, torch.Tensor):
assert torch.equal(gpu_value.cpu(), cpu_value), f"Values don't match for param {param_id}, key {key}"
if key != "step": # Skip device checks for 'step' key
assert (
cpu_value.device.type == "cpu"
), f"CPU optimizer state should be on CPU for param {param_id}, key {key}"
assert (
gpu_value.device.type == "cuda"
), f"GPU optimizer state should be on CUDA for param {param_id}, key {key}"
else:
assert gpu_value == cpu_value, f"Non-tensor values don't match for param {param_id}, key {key}"
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_optimizer)(test_context=test_context)
def _test_save_zero_optimizer_and_load_optimizer(parallel_context: ParallelContext, test_context: TestContext):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_zero_optimizer_and_load_data_parallel_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_zero_optimizer_and_load_data_parallel_optimizer)(
test_context=test_context
)
def _test_save_zero_optimizer_and_load_data_parallel_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
# Manually sync tied parameters
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.skip(reason="Assumption that zero and non zero optimizer have the same serialization format doesn't hold")
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_data_parallel_optimizer_and_load_zero_optimizer(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_data_parallel_optimizer_and_load_zero_optimizer)(
test_context=test_context
)
def _test_save_data_parallel_optimizer_and_load_zero_optimizer(
parallel_context: ParallelContext, test_context: TestContext
):
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context)
optimizer = NamedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda params: torch.optim.AdamW(params),
)
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=[minibatch], nb_microbatches=1, grad_accumulator=None
)
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
# Check that the newly initialised optimizer isn't the same.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
if len(optimizer.state_dict()["state"]) == 0:
# Edge case where there's no state stored in the optimizer.
pass
else:
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# TODO @thomasw21: Compare zero optimizer with non zero
parallel_context.destroy()
@pytest.mark.parametrize(
"tp,dp,pp",
[
pytest.param(*all_3d_configs)
for gpus in range(1, min(available_gpus(), 4) + 1)
for all_3d_configs in get_all_3d_configurations(gpus)
],
)
@rerun_if_address_is_in_use()
def test_save_optimizer_with_additional_state_dict_keys(tp: int, dp: int, pp: int):
test_context = TestContext()
    # Test across all available 3D (TP, DP, PP) configurations
init_distributed(tp=tp, dp=dp, pp=pp)(_test_save_optimizer_with_additional_state_dict_keys)(
test_context=test_context
)
def _test_save_optimizer_with_additional_state_dict_keys(parallel_context: ParallelContext, test_context: TestContext):
dtype = torch.float16
store_folder = test_context.get_auto_remove_tmp_dir()
model = init_dummy_model(parallel_context=parallel_context, dtype=dtype)
if isinstance(model, DistributedDataParallel):
# Remove the annoying "module." prefix
unwrapped_model = model.module
else:
unwrapped_model = model
named_parameters = list(unwrapped_model.named_parameters())
optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
grad_accumulator = optimizer.gradient_accumulator
assert len(optimizer.state_dict_additional_keys()) > 0
# Train in order to update the optimizer step a few times
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg, dtype=dtype))
nb_optim_steps = 3
pipeline_engine = AllForwardAllBackwardPipelineEngine()
for _ in range(nb_optim_steps):
minibatch = next(data_loader)
_ = pipeline_engine.train_batch_iter(
model=model,
pg=parallel_context.pp_pg,
batch=[minibatch],
nb_microbatches=1,
grad_accumulator=grad_accumulator,
)
# Manually sync tied parameters
sync_tied_weights_gradients(
module=unwrapped_model, parallel_context=parallel_context, grad_accumulator=grad_accumulator
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
# Save optimizer
save_optimizer(optimizer=optimizer, parallel_context=parallel_context, root_folder=store_folder)
dist.barrier(parallel_context.world_pg)
# Generate a new optimizer
new_optimizer = OptimizerFromGradientAccumulator(
gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params),
named_params_or_groups=named_parameters,
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
)
new_grad_accumulator = new_optimizer.gradient_accumulator
# Check that the newly initialised optimizer isn't the same.
if len(optimizer.state_dict()["state"]) == 0:
pass
else:
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert not match, "Newly initialised optimizer should not match."
load_optimizer(optimizer=new_optimizer, parallel_context=parallel_context, root_folder=store_folder)
# Assert the optimizer states are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict()["state"], new_optimizer.state_dict()["state"])
assert match, msg
# Assert the optimizer state_dict are exactly the same after loading.
match, msg = is_dict_equal(optimizer.state_dict(), new_optimizer.state_dict())
assert match, msg
# Assert the internal optimizer states are exactly the same after loading.
keys_to_ignore = []
match, msg = is_dict_equal(
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in grad_accumulator.parameters.items()
},
{
name: {key: tensor for key, tensor in elt.items() if key not in keys_to_ignore}
for name, elt in new_grad_accumulator.parameters.items()
},
)
assert match, msg
parallel_context.destroy()
# TODO @thomasw21: Test with a optimizer that uses `named_param_groups` instead of `param_groups`
@pytest.mark.skipif(available_gpus() < 2, reason="Testing test_save_and_load_random_states requires at least 2 gpus")
@rerun_if_address_is_in_use()
def test_save_and_load_random_states():
test_context = TestContext()
    # We use TP=2 so that there is a process group of size 2 to test against
init_distributed(tp=2, dp=1, pp=1)(_test_save_and_load_random_states)(test_context=test_context)
def _test_save_and_load_random_states(parallel_context: ParallelContext, test_context: TestContext):
pg = next(
(pg for pg in [parallel_context.tp_pg, parallel_context.dp_pg, parallel_context.pp_pg] if pg.size() == 2)
)
random_states = RandomStates(
{
"my_synced_random_state": get_synced_random_state(random_state=get_current_random_state(), pg=pg),
"my_own_random_state": get_current_random_state(),
}
)
store_folder = test_context.get_auto_remove_tmp_dir()
# Check that random states are unequal between ranks (due to `my_own_random_state`)
reference_rank = 0
if dist.get_rank(pg) == reference_rank:
random_statess = [random_states]
else:
random_statess = [None]
dist.broadcast_object_list(random_statess, src=dist.get_global_rank(group_rank=reference_rank, group=pg), group=pg)
if dist.get_rank(pg) != reference_rank:
assert random_states != random_statess[0]
# save
save_random_states(random_states=random_states, parallel_context=parallel_context, root_folder=store_folder)
# load
new_random_states = load_random_states(parallel_context=parallel_context, root_folder=store_folder)
    # Each rank has restored its own random state
assert random_states == new_random_states
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_serialize_deserialize_tensormetadata():
test_context = TestContext()
init_distributed(tp=2, dp=1, pp=1)(_test_serialize_deserialize_tensormetadata)(test_context=test_context)
def _test_serialize_deserialize_tensormetadata(parallel_context: ParallelContext, test_context: TestContext):
param = torch.nn.Parameter(torch.randn(16, 64))
split_config = SplitConfig(
split_dim=0,
contiguous_chunks=(8, 8),
)
param = create_sharded_parameter_from_config(parameter=param, pg=parallel_context.tp_pg, split_config=split_config)
sharded_info = param.get_sharded_info()
metadata = TensorMetadata(
version=CHECKPOINT_VERSION,
local_global_slices_pairs=sharded_info.local_global_slices_pairs,
unsharded_shape=sharded_info.unsharded_shape,
)
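    # NOTE: `to_str_dict` flattens the metadata into plain strings so that it can be
    # stored in a checkpoint header that only accepts Dict[str, str] (e.g. the
    # safetensors metadata field); `from_str_dict` must round-trip it exactly.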
metadata_str_dict = metadata.to_str_dict()
# Assert metadata_str_dict is Dict[str, str]
assert isinstance(metadata_str_dict, dict)
assert all(isinstance(key, str) for key in metadata_str_dict.keys())
assert all(isinstance(value, str) for value in metadata_str_dict.values())
metadata_from_str_dict = TensorMetadata.from_str_dict(metadata_str_dict)
assert metadata == metadata_from_str_dict
parallel_context.destroy()
import os
import pytest
import torch
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.distributed import get_global_rank
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelColumnLinear,
TensorParallelEmbedding,
TensorParallelRowLinear,
)
from torch import nn as torch_nn
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@pytest.mark.parametrize("tp_recompute_allgather", [False, True])
@rerun_if_address_is_in_use()
def test_column_linear(
tp: int,
dp: int,
pp: int,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and tp_recompute_allgather:
pytest.skip("ALL_REDUCE mode is unaffected by tp_recompute_allgather")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_column_linear)(
tp_mode=tp_mode, async_communication=async_communication, tp_recompute_allgather=tp_recompute_allgather
)
def _test_column_linear(
parallel_context: ParallelContext,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
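    # NOTE: with a single hardware work queue, kernels keep their launch order, so
    # the communication kernels issued by the async TP path are scheduled before the
    # dependent compute instead of being reordered behind it; hence
    # CUDA_DEVICE_MAX_CONNECTIONS=1.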
in_features = 2
out_features_per_tp_rank = 3
out_features = parallel_context.tp_pg.size() * out_features_per_tp_rank
# Sharded
column_linear = TensorParallelColumnLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
tp_recompute_allgather=tp_recompute_allgather,
)
# Un-sharded
reference_linear = torch_nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_gather(
tensor_list=list(reference_linear.weight.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.weight,
group=parallel_context.tp_pg,
)
dist.all_gather(
tensor_list=list(reference_linear.bias.split(out_features_per_tp_rank, dim=0)),
tensor=column_linear.bias,
group=parallel_context.tp_pg,
)
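    # NOTE: every entry of `tensor_list` above is a view into the matching slice of
    # the reference weight/bias, so the two all_gathers fill `reference_linear` with
    # the full, un-sharded parameters on every rank.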
# Generate random input
random_input: torch.Tensor
sharded_random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
sharded_random_input = random_input
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
sharded_batch_size = 5
sharded_random_input = torch.randn(sharded_batch_size, in_features, device="cuda")
if parallel_context.tp_pg.size() > 1:
random_input = torch.empty(
sharded_batch_size * parallel_context.tp_pg.size(),
*(sharded_random_input.shape[1:]),
device=sharded_random_input.device,
dtype=sharded_random_input.dtype,
)
dist.all_gather_into_tensor(random_input, sharded_random_input, group=parallel_context.tp_pg)
else:
random_input = sharded_random_input
else:
ValueError(f"Unsupported mode: {tp_mode}")
# It's important that `random_input` and `sharded_random_input` are two separate tensors with separate storage
sharded_random_input = sharded_random_input.clone()
random_input.requires_grad = True
sharded_random_input.requires_grad = True
# Test that we get the same output after forward pass
sharded_output = column_linear(sharded_random_input)
reference_output = reference_linear(random_input)
# TODO @thomasw21: Tune tolerance
try:
torch.testing.assert_close(
sharded_output,
reference_output[
:,
dist.get_rank(parallel_context.tp_pg)
* out_features_per_tp_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* out_features_per_tp_rank,
],
)
except BaseException as e:
print(f"Rank {dist.get_rank(parallel_context.tp_pg)}: FAIL.")
dist.barrier()
raise e
print(f"Rank {dist.get_rank(parallel_context.tp_pg)}: SUCCESS.")
dist.barrier()
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
hidden_dim_slice = slice(
dist.get_rank(parallel_context.tp_pg) * out_features_per_tp_rank,
(dist.get_rank(parallel_context.tp_pg) + 1) * out_features_per_tp_rank,
)
torch.testing.assert_close(
column_linear.weight.grad,
reference_linear.weight.grad[hidden_dim_slice],
)
torch.testing.assert_close(
column_linear.bias.grad,
reference_linear.bias.grad[hidden_dim_slice],
)
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
torch.testing.assert_close(
sharded_random_input.grad,
random_input.grad,
)
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_dim_slice = slice(
dist.get_rank(parallel_context.tp_pg) * sharded_batch_size,
(dist.get_rank(parallel_context.tp_pg) + 1) * sharded_batch_size,
)
torch.testing.assert_close(
sharded_random_input.grad,
random_input.grad[batch_dim_slice],
)
else:
ValueError(f"Unsupported mode: {tp_mode}")
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@pytest.mark.parametrize("tp_recompute_allgather", [False, True])
@rerun_if_address_is_in_use()
def test_row_linear(
tp: int,
dp: int,
pp: int,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and tp_recompute_allgather:
pytest.skip("ALL_REDUCE mode is not affected by tp_recompute_allgather")
init_distributed(tp=tp, dp=dp, pp=pp)(_test_row_linear)(
tp_mode=tp_mode, async_communication=async_communication, tp_recompute_allgather=tp_recompute_allgather
)
def _test_row_linear(
parallel_context: ParallelContext,
tp_mode: TensorParallelLinearMode,
async_communication: bool,
tp_recompute_allgather: bool,
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
out_features = 3
in_features_per_rank = 2
in_features = parallel_context.tp_pg.size() * in_features_per_rank
# Sharded
row_linear = TensorParallelRowLinear(
in_features=in_features,
out_features=out_features,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
async_communication=async_communication,
)
# Un-sharded
reference_linear = torch_nn.Linear(in_features=in_features, out_features=out_features, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_reduce(tensor=reference_linear.weight, op=dist.ReduceOp.SUM, group=parallel_context.tp_pg)
row_linear.weight.copy_(
reference_linear.weight[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
]
)
        # broadcast the bias from rank 0; the other ranks don't have a bias
if dist.get_rank(parallel_context.tp_pg) == 0:
row_linear.bias.copy_(reference_linear.bias)
dist.broadcast(
tensor=reference_linear.bias,
src=get_global_rank(group=parallel_context.tp_pg, group_rank=0),
group=parallel_context.tp_pg,
)
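    # NOTE: in TensorParallelRowLinear only TP rank 0 holds a bias (the partial
    # outputs are reduced across TP, so the bias must be added exactly once);
    # broadcasting the reference bias here only makes the un-sharded reference
    # identical on every rank.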
# Generate random input
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = 5 * parallel_context.tp_pg.size()
else:
raise ValueError()
random_input = torch.randn(batch_size, in_features, device="cuda")
# synchronize random_input across tp
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
random_input.requires_grad = True
    # Row linear receives an already-sharded input
random_sharded_input = (
random_input[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
]
.detach()
.clone()
)
random_sharded_input.requires_grad = True
# Test that we get the same output after forward pass
# TODO @kunhao: We may want to have our custom error type
sharded_output = row_linear(random_sharded_input)
reference_output = reference_linear(random_input)
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
sharded_reference_output = reference_output
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
assert batch_size % parallel_context.tp_pg.size() == 0
sharded_batch_size = batch_size // parallel_context.tp_pg.size()
sharded_reference_output = reference_output[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(
sharded_output,
sharded_reference_output,
)
# Test that we get the same gradient after backward pass
sharded_output.sum().backward()
reference_output.sum().backward()
torch.testing.assert_close(
row_linear.weight.grad,
reference_linear.weight.grad[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
],
)
if dist.get_rank(parallel_context.tp_pg) == 0:
torch.testing.assert_close(
row_linear.bias.grad,
reference_linear.bias.grad,
)
else:
assert row_linear.bias is None
torch.testing.assert_close(
random_sharded_input.grad,
random_input.grad[
:,
dist.get_rank(parallel_context.tp_pg)
* in_features_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* in_features_per_rank,
],
)
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@rerun_if_address_is_in_use()
def test_tensor_parallel_embedding(tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode):
init_distributed(tp=tp, dp=dp, pp=pp)(_test_tensor_parallel_embedding)(tp_mode=tp_mode)
def _test_tensor_parallel_embedding(parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode):
num_embeddings_per_rank = 100
embedding_dim = 3
num_embeddings = parallel_context.tp_pg.size() * num_embeddings_per_rank
# Sharded
sharded_embedding = TensorParallelEmbedding(
num_embeddings=num_embeddings,
embedding_dim=embedding_dim,
pg=parallel_context.tp_pg,
mode=tp_mode,
device="cuda",
)
# Un-sharded
reference_embedding = torch_nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, device="cuda")
# Copy weights/bias from sharded to un-sharded
with torch.inference_mode():
dist.all_reduce(tensor=reference_embedding.weight, op=dist.ReduceOp.SUM, group=parallel_context.tp_pg)
sharded_embedding.weight.copy_(
reference_embedding.weight[
dist.get_rank(parallel_context.tp_pg)
* num_embeddings_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* num_embeddings_per_rank,
:,
]
)
# Generate random input
random_input: torch.Tensor
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
batch_size = 5
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = 5 * parallel_context.tp_pg.size()
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
random_input = torch.randint(low=0, high=num_embeddings, size=(batch_size,), device="cuda")
dist.all_reduce(random_input, op=dist.ReduceOp.AVG, group=parallel_context.tp_pg)
# Test that we get the same output after forward pass
sharded_output = sharded_embedding(random_input)
reference_output = reference_embedding(random_input)
weights = torch.arange(batch_size, device="cuda")[:, None]
if tp_mode is TensorParallelLinearMode.ALL_REDUCE:
sharded_reference_output = reference_output
sharded_weights = weights
elif tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
assert batch_size % parallel_context.tp_pg.size() == 0
sharded_batch_size = batch_size // parallel_context.tp_pg.size()
sharded_reference_output = reference_output[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
sharded_weights = weights[
dist.get_rank(parallel_context.tp_pg)
* sharded_batch_size : (dist.get_rank(parallel_context.tp_pg) + 1)
* sharded_batch_size
]
else:
raise ValueError(f"Unsupported mode: {tp_mode}")
# TODO @thomasw21: Tune tolerance
torch.testing.assert_close(sharded_output, sharded_reference_output, atol=0, rtol=0)
# Test that we get the same gradient after backward pass
(sharded_output * sharded_weights).sum().backward()
(reference_output * weights).sum().backward()
torch.testing.assert_close(
sharded_embedding.weight.grad,
reference_embedding.weight.grad[
dist.get_rank(parallel_context.tp_pg)
* num_embeddings_per_rank : (dist.get_rank(parallel_context.tp_pg) + 1)
* num_embeddings_per_rank,
:,
],
atol=0,
rtol=0,
)
parallel_context.destroy()
import torch
from helpers.distributed_tensor import assert_tensor_equal_over_group
from helpers.exception import assert_fail_with
from helpers.utils import init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import NanotronParameter
from nanotron.parallel.tied_parameters import (
get_tied_id_to_param,
sync_tied_weights_gradients,
tie_parameters,
)
from torch import nn
@rerun_if_address_is_in_use()
def test_tie_weight_in_same_device():
init_distributed(tp=1, dp=1, pp=1)(_test_tie_weight_in_same_device)()
def _test_tie_weight_in_same_device(parallel_context: ParallelContext):
model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda"), "dense1": nn.Linear(10, 10, device="cuda")})
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (0,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (0,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
weight0 = model.get_parameter("dense0.weight")
weight1 = model.get_parameter("dense1.weight")
bias0 = model.get_parameter("dense0.bias")
bias1 = model.get_parameter("dense1.bias")
# We check that we use the same parameter for both linear layers
assert id(weight0) == id(weight1)
assert id(bias0) == id(bias1)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_in_different_device():
init_distributed(tp=1, dp=1, pp=2)(_test_tie_weight_in_different_device)()
def _test_tie_weight_in_different_device(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Weights/bias are not synced yet
assert not assert_tensor_equal_over_group(weight, group=group, assert_=False)
assert not assert_tensor_equal_over_group(bias, group=group, assert_=False)
# Manually sync weights
for (_, group_ranks), param in sorted(
get_tied_id_to_param(
parameters=model.parameters(),
root_module=model,
).items(),
key=lambda x: x[0],
):
group = parallel_context.world_ranks_to_pg[group_ranks]
dist.all_reduce(param, op=dist.ReduceOp.AVG, group=group)
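    # This is essentially what `sync_tied_weights_gradients` does for gradients,
    # applied to the weights themselves: each tied group averages its replicas so
    # the parameter ends up identical on every rank of the tie.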
    # We check that the weights and biases are now synchronized across the tied group
assert_tensor_equal_over_group(weight, group=group)
assert_tensor_equal_over_group(bias, group=group)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_across_dp_is_impossible():
init_distributed(tp=1, dp=2, pp=1)(_test_tie_weight_across_dp_is_impossible)()
def _test_tie_weight_across_dp_is_impossible(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.dp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
with assert_fail_with(AssertionError):
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
with assert_fail_with(AssertionError):
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_tie_weight_in_different_device_have_gradients_synchronized():
init_distributed(tp=1, dp=1, pp=2)(_test_tie_weight_in_different_device_have_gradients_synchronized)()
def _test_tie_weight_in_different_device_have_gradients_synchronized(parallel_context: ParallelContext):
if dist.get_rank(parallel_context.pp_pg) == 0:
model = nn.ModuleDict(
{
"dense0": nn.Linear(10, 10, device="cuda"),
}
)
else:
model = nn.ModuleDict(
{
"dense1": nn.Linear(10, 10, device="cuda"),
}
)
# Tie weights/bias
tie_parameters(
root_module=model,
ties=[("dense0.weight", (0,)), ("dense1.weight", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
tie_parameters(
root_module=model,
ties=[("dense0.bias", (0,)), ("dense1.bias", (1,))],
parallel_context=parallel_context,
reduce_op=dist.ReduceOp.SUM,
)
group = parallel_context.world_ranks_to_pg[(0, 1)]
# Check that model weights are not in fact synchronized
if dist.get_rank(parallel_context.pp_pg) == 0:
weight = model.dense0.weight
bias = model.dense0.bias
else:
weight = model.dense1.weight
bias = model.dense1.bias
# Make sure that weight/bias are NanotronParameter and that they are tied
assert isinstance(weight, NanotronParameter)
assert weight.is_tied
assert isinstance(bias, NanotronParameter)
assert bias.is_tied
# Weights/bias are not synced yet
assert not assert_tensor_equal_over_group(weight, group=group, assert_=False)
assert not assert_tensor_equal_over_group(bias, group=group, assert_=False)
# Compute gradient
input_ = torch.randn(13, 10, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0:
out = model.dense0(input_)
else:
out = model.dense1(input_)
out.sum().backward()
# sync gradients
# TODO @thomasw21: This should be done in hooks
sync_tied_weights_gradients(model, parallel_context=parallel_context, grad_accumulator=None)
    # Check that we have gradients
assert weight.grad is not None
assert bias.grad is not None
    # We check that both gradients are synchronized
assert_tensor_equal_over_group(weight.grad, group=group)
assert_tensor_equal_over_group(bias.grad, group=group)
parallel_context.destroy()
import os
import pytest
import torch
from helpers.distributed_tensor import assert_tensor_equal_over_group
from helpers.dummy import dummy_infinite_data_loader, init_dummy_model
from helpers.exception import assert_fail_with
from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use
from nanotron import distributed as dist
from nanotron.optim import NamedOptimizer, ZeroDistributedOptimizer
from nanotron.optim.zero import SlicedFlatTensor
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp
from nanotron.parallel.parameters import NanotronParameter
from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.tensor_parallel import nn
from nanotron.parallel.tensor_parallel.enum import TensorParallelLinearMode
from nanotron.parallel.tied_parameters import sync_tied_weights_gradients
from nanotron.random import RandomStates, branch_random_state, get_current_random_state, get_synced_random_state
from torch import nn as torch_nn
from torch.nn.parallel import DistributedDataParallel
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(1, i, 1) for i in range(1, min(4, available_gpus()) + 1)])
@rerun_if_address_is_in_use()
def test_zero_optimizer(tp: int, dp: int, pp: int):
init_distributed(pp=pp, dp=dp, tp=tp)(_test_zero_optimizer)()
def _test_zero_optimizer(parallel_context: ParallelContext):
model = init_dummy_model(parallel_context=parallel_context)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
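    # NOTE: `ZeroDistributedOptimizer` shards the optimizer states across the DP
    # group (ZeRO stage-1 style): each DP rank builds AdamW states only for its own
    # flat slice of every parameter, which the offset/data_ptr checks below verify.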
index_to_name = [name for name, _ in model.named_parameters()]
# reference model
reference_model = init_dummy_model(parallel_context=parallel_context)
reference_optimizer = torch.optim.AdamW(reference_model.parameters())
# sync weights between reference_model and model
with torch.no_grad():
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
param.copy_(ref_param)
# Get infinite dummy data iterator
data_loader = iter(dummy_infinite_data_loader(pp_pg=parallel_context.pp_pg))
nb_optim_steps = 3
batches = [[next(data_loader)] for _ in range(nb_optim_steps)]
pipeline_engine = AllForwardAllBackwardPipelineEngine()
# Training loop
for i, batch in enumerate(batches):
        # store the original parameters so we can later check that they were updated
old_named_params = {name: param.detach().clone() for name, param in model.named_parameters()}
# Run forward/backward
losses = pipeline_engine.train_batch_iter(
model=model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=1, grad_accumulator=None
)
ref_losses = pipeline_engine.train_batch_iter(
model=reference_model, pg=parallel_context.pp_pg, batch=batch, nb_microbatches=1, grad_accumulator=None
)
# Check loss match
losses = list(losses)
ref_losses = list(ref_losses)
assert len(losses) == len(ref_losses)
for loss, ref_loss in zip(losses, ref_losses):
assert isinstance(loss["loss"], torch.Tensor)
assert isinstance(ref_loss["loss"], torch.Tensor)
torch.testing.assert_close(
loss["loss"], ref_loss["loss"], atol=0, rtol=0, msg=lambda msg: f"At iteration {i}, {msg}"
)
# Manually sync tied parameters' gradients
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
sync_tied_weights_gradients(module=reference_model, parallel_context=parallel_context, grad_accumulator=None)
# We rely on DDP to synchronize gradients across DP. We only need to manually synchronize them if we don't use DDP.
if not isinstance(model, DistributedDataParallel):
sync_gradients_across_dp(
model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
if not isinstance(reference_model, DistributedDataParallel):
sync_gradients_across_dp(
reference_model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
# Check gradients are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param.grad, group=parallel_context.dp_pg)
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param.grad, group=parallel_context.dp_pg)
        # Check that gradients match those of reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
torch.testing.assert_close(
param.grad, ref_param.grad, atol=0, rtol=0, msg=lambda msg: f"At iteration {i}, {msg}"
)
assert len(optimizer.param_groups) == 1
assert len(list(model.named_parameters())) == len(optimizer.param_groups[0]["params"])
with torch.no_grad():
for (name, param), sliced_param in zip(model.named_parameters(), optimizer.param_groups[0]["params"]):
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
# Check that weights are the same
expected_slice = param.view(-1)[slice(*offsets)].view_as(sliced_param)
torch.testing.assert_close(
expected_slice,
sliced_param,
atol=0,
rtol=0,
msg=lambda msg: f"Weights don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param}\n - Full gradient: {param}",
)
assert (
expected_slice.data_ptr() == sliced_param.data_ptr()
), "Parameters should actually share the same data pointer"
# Check that the gradient is a view into the full gradient
expected_slice = param.grad.view(-1)[slice(*offsets)].view_as(sliced_param.grad)
assert (
expected_slice.data_ptr() == sliced_param.grad.data_ptr()
), "Parameters should actually share the same data pointer"
torch.testing.assert_close(
expected_slice,
sliced_param.grad,
atol=0,
rtol=0,
msg=lambda msg: f"Gradients don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param.grad}\n - Full gradient: {param.grad}",
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
reference_optimizer.step()
reference_optimizer.zero_grad()
# Check that params are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param, group=parallel_context.dp_pg)
assert param.grad is None
# Check that gradients are reset
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param, group=parallel_context.dp_pg)
assert ref_param.grad is None
for param_group in optimizer.param_groups:
for param in param_group["params"]:
assert param.grad is None
# Check params are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
# TODO @thomasw21: Figure out how to make this pass at `atol`/`rtol` set to 0.
torch.testing.assert_close(param, ref_param, msg=lambda msg: f"At iteration {i}, {msg}")
# Check params have been updated correctly
for (name, param) in model.named_parameters():
old_param = old_named_params[name]
assert not torch.allclose(param, old_param)
# We need to check that the optimizer states are the same
state_dict = optimizer.state_dict()
reference_state_dict = reference_optimizer.state_dict()
state = state_dict["state"]
ref_state = reference_state_dict["state"]
assert set(state) == set(ref_state)
for index, optim_state in state.items():
ref_optim_state = ref_state[index]
name = index_to_name[index]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
assert set(optim_state) == set(ref_optim_state)
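# NOTE: `exp_avg` / `exp_avg_sq` are AdamW's first and second moment estimates; under ZeRO-1 each DP
# rank is expected to hold only the slice of these states matching its parameter shard, hence the
# comparison against a slice of the reference state below.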
for key in ["exp_avg", "exp_avg_sq"]:
value = optim_state[key]
ref_value = ref_optim_state[key]
torch.testing.assert_close(
value,
ref_value.view(-1)[slice(*offsets)].view_as(value),
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, {msg}",
)
parallel_context.destroy()
@pytest.mark.parametrize("tp,dp,pp", [pytest.param(2, i, 1) for i in range(1, available_gpus() // 2 + 1)])
@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
@pytest.mark.parametrize("async_communication", [False, True])
@rerun_if_address_is_in_use()
def test_zero_optimizer_with_tp(
tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode, async_communication: bool
):
if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
pytest.skip("ALL_REDUCE mode does not support async communication")
init_distributed(pp=pp, dp=dp, tp=tp)(_test_zero_optimizer_with_tp)(
tp_mode=tp_mode, async_communication=async_communication
)
def _test_zero_optimizer_with_tp(
parallel_context: ParallelContext, tp_mode: TensorParallelLinearMode, async_communication: bool
):
if async_communication:
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
model = torch_nn.Sequential(
nn.TensorParallelColumnLinear(
in_features=5,
out_features=parallel_context.tp_pg.size(),
mode=tp_mode,
pg=parallel_context.tp_pg,
device="cuda",
async_communication=async_communication,
),
# We choose `sigmoid` instead of `relu` since `relu` can result in a sparse gradient, causing no update to certain parameters
torch_nn.Sigmoid(),
nn.TensorParallelRowLinear(
in_features=parallel_context.tp_pg.size(),
out_features=3,
mode=tp_mode,
pg=parallel_context.tp_pg,
device="cuda",
),
)
optimizer = ZeroDistributedOptimizer(
named_params_or_groups=model.named_parameters(),
optimizer_builder=lambda named_param_groups: NamedOptimizer(
named_params_or_groups=named_param_groups,
optimizer_builder=lambda param_groups: torch.optim.AdamW(param_groups),
),
dp_pg=parallel_context.dp_pg,
)
optimizer_name_to_id = {v: k for k, v in optimizer.optimizer.id_to_name.items()}
assert len(optimizer_name_to_id) == len(optimizer.id_to_name)
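# NOTE: `id_to_name` is assumed to map `id(sliced_param)` to its fully qualified parameter name;
# inverting it and comparing lengths checks that the mapping is a bijection.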
# reference model
reference_model = torch_nn.Sequential(
torch_nn.Linear(in_features=5, out_features=parallel_context.tp_pg.size(), device="cuda"),
torch_nn.Sigmoid(),
torch_nn.Linear(in_features=parallel_context.tp_pg.size(), out_features=3, device="cuda"),
)
for module in reference_model.modules():
for name, param in module.named_parameters(recurse=False):
setattr(module, name, NanotronParameter(param))
reference_optimizer = torch.optim.AdamW(reference_model.parameters())
# TODO @thomasw21: This is a hack to obtain `AdamW`'s index for each parameter in its state.
name_to_index = {name: index for index, (name, _) in enumerate(reference_model.named_parameters())}
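# (torch optimizers key `state_dict()["state"]` by each param's position in `param_groups`, so
# enumerating `named_parameters()` in construction order should reproduce those indices.)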
# sync parameters
with torch.no_grad():
for ref_name, ref_param in reference_model.named_parameters():
dist.all_reduce(ref_param, op=dist.ReduceOp.AVG, group=parallel_context.world_pg)
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
assert isinstance(param, NanotronParameter)
if param.is_sharded:
sharded_info = param.get_sharded_info()
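# NOTE: the sharding metadata is assumed to describe, for each local/global slice pair, where this
# rank's shard (`local_slices`) sits inside the full unsharded parameter (`global_slices`), e.g. a
# ColumnLinear weight split over tp=2 would map rank 1's local rows [0:out/2] to global rows [out/2:out].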
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
param[local_slices].copy_(ref_param[global_slices])
else:
param.copy_(ref_param)
# Generate dummy input batches; the randomness has to be synced across TP
random_states = RandomStates(
{
"tp_synced": get_synced_random_state(random_state=get_current_random_state(), pg=parallel_context.tp_pg),
}
)
batch_size = 2 * parallel_context.tp_pg.size() if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER else 7
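# NOTE: in REDUCE_SCATTER mode the batch dimension must be divisible by the TP size (each rank
# handles 1/tp of it), hence 2 * tp here; ALL_REDUCE has no such constraint, so an arbitrary size (7) is used.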
with branch_random_state(random_states=random_states, key="tp_synced", enabled=True):
nb_optim_steps = 3
batches = [
torch.randn(batch_size, 5, device="cuda")
if dist.get_rank(parallel_context.pp_pg) == 0
else TensorPointer(0)
for _ in range(nb_optim_steps)
]
# Model training loop
for i, batch in enumerate(batches):
# store original reference parameter
old_named_params = {name: param.detach().clone() for name, param in model.named_parameters()}
# Run forward pass
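# NOTE: in REDUCE_SCATTER mode each TP rank is fed its own contiguous chunk of the batch (the
# column-parallel layer is expected to gather the sharded inputs internally), and the row-parallel
# output is reduce-scattered, which is why the loss below is compared against a per-rank slice of
# the reference loss. In ALL_REDUCE mode every rank sees the full batch.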
if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = batch.shape[0]
assert batch_size % parallel_context.tp_pg.size() == 0
step = batch_size // parallel_context.tp_pg.size()
loss = model(
batch[
dist.get_rank(parallel_context.tp_pg) * step : (dist.get_rank(parallel_context.tp_pg) + 1) * step
]
)
else:
loss = model(batch)
ref_loss = reference_model(batch)
# Run backward pass
loss.sum().backward()
ref_loss.sum().backward()
# Check loss is the same
loss = loss.detach()
ref_loss = ref_loss.detach()
assert isinstance(loss, torch.Tensor)
assert isinstance(ref_loss, torch.Tensor)
if tp_mode is TensorParallelLinearMode.REDUCE_SCATTER:
batch_size = batch.shape[0]
assert batch_size % parallel_context.tp_pg.size() == 0
step = batch_size // parallel_context.tp_pg.size()
torch.testing.assert_close(
loss,
ref_loss[
dist.get_rank(parallel_context.tp_pg) * step : (dist.get_rank(parallel_context.tp_pg) + 1) * step
],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(loss, ref_loss, msg=lambda msg: f"At iteration {i}, {msg}")
# Manually sync tied parameters' gradients
sync_tied_weights_gradients(module=model, parallel_context=parallel_context, grad_accumulator=None)
sync_tied_weights_gradients(module=reference_model, parallel_context=parallel_context, grad_accumulator=None)
# We rely on DDP to synchronize gradients across DP. We only need to manually synchronize them if we don't use DDP.
if not isinstance(model, DistributedDataParallel):
sync_gradients_across_dp(
model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
if not isinstance(reference_model, DistributedDataParallel):
sync_gradients_across_dp(
reference_model, dp_pg=parallel_context.dp_pg, reduce_op=dist.ReduceOp.AVG, grad_accumulator=None
)
# Check gradients are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param.grad, group=parallel_context.dp_pg)
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param.grad, group=parallel_context.dp_pg)
# Check gradients are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
param.grad[local_slices],
ref_param.grad[global_slices],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(param.grad, ref_param.grad, msg=lambda msg: f"At iteration {i}, {msg}")
with torch.no_grad():
optim_param_id_to_param = {id(param): param for param in optimizer.param_groups[0]["params"]}
assert len(optim_param_id_to_param) == len(optimizer.param_groups[0]["params"])
for name, param in model.named_parameters():
if dist.get_rank(parallel_context.dp_pg) not in optimizer.param_name_to_dp_rank_offsets[name]:
assert name not in optimizer_name_to_id
continue
param_id = optimizer_name_to_id[name]
sliced_param = optim_param_id_to_param[param_id]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
# Check that weights share the same storage
expected_slice = param.view(-1)[slice(*offsets)].view_as(sliced_param)
torch.testing.assert_close(
expected_slice,
sliced_param,
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, weights don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param}\n - Full gradient: {param}",
)
assert (
expected_slice.data_ptr() == sliced_param.data_ptr()
), "Parameters should actually share the same data pointer"
# Check that gradients share the same storage
expected_slice = param.grad.view(-1)[slice(*offsets)].view_as(sliced_param.grad)
assert (
expected_slice.data_ptr() == sliced_param.grad.data_ptr()
), "Parameters should actually share the same data pointer"
torch.testing.assert_close(
expected_slice,
sliced_param.grad,
atol=0,
rtol=0,
msg=lambda msg: f"At iteration {i}, gradients don't match: {msg}\n - Expected slice: {expected_slice}\n - Got: {sliced_param.grad}\n - Full gradient: {param.grad}",
)
# Optimizer steps
optimizer.step()
optimizer.zero_grad()
reference_optimizer.step()
reference_optimizer.zero_grad()
# Check that params are synced across DP
for name, param in model.named_parameters():
assert_tensor_equal_over_group(param, group=parallel_context.dp_pg)
assert param.grad is None
# Check that gradients are reset
for ref_name, ref_param in reference_model.named_parameters():
assert_tensor_equal_over_group(ref_param, group=parallel_context.dp_pg)
assert ref_param.grad is None
for param_group in optimizer.param_groups:
for param in param_group["params"]:
assert param.grad is None
# Check params are the same with reference_model
for (name, param), (ref_name, ref_param) in zip(model.named_parameters(), reference_model.named_parameters()):
assert name == ref_name
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
local_slices = local_global_slices_pair.local_slices
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
param[local_slices], ref_param[global_slices], msg=lambda msg: f"At iteration {i}, {msg}"
)
else:
torch.testing.assert_close(param, ref_param, msg=lambda msg: f"At iteration {i}, {msg}")
# Check params have been updated correctly
for (name, param) in model.named_parameters():
old_param = old_named_params[name]
assert not torch.allclose(param, old_param)
# We need to check that the optimizer states are the same
state_dict = optimizer.state_dict()
reference_state_dict = reference_optimizer.state_dict()
state = state_dict["state"]
ref_state = reference_state_dict["state"]
assert "names" in state_dict
state_index_to_name = state_dict["names"]
state_name_to_index = {name: index for index, name in state_index_to_name.items()}
# Check that this is a bijection
assert len(state_index_to_name) == len(state_name_to_index)
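# NOTE: `state_dict["names"]` is assumed to record, for each local optimizer-state index, the name
# of the parameter it belongs to, which lets us match each local shard back to the corresponding
# entry in the reference optimizer's state.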
for name, param in model.named_parameters():
if name not in state_name_to_index:
# Parameter is not passed to the optimizer on this rank, mainly due to the ZeRO sharding strategy
continue
index = state_name_to_index[name]
optim_state = state[index]
ref_optim_state = ref_state[name_to_index[name]]
offsets = optimizer.param_name_to_dp_rank_offsets[name][dist.get_rank(parallel_context.dp_pg)]
assert set(optim_state) == set(ref_optim_state)
assert isinstance(param, NanotronParameter)
for key in ["exp_avg", "exp_avg_sq"]:
value = optim_state[key]
ref_value = ref_optim_state[key]
if param.is_sharded:
sharded_info = param.get_sharded_info()
for local_global_slices_pair in sharded_info.local_global_slices_pairs:
global_slices = local_global_slices_pair.global_slices
torch.testing.assert_close(
# TODO @thomasw21: We can't add any information about `local_slices` to `value` because it's already flattened
# For now, we assume that sharded parameters are contiguous and that `local_slices` are trivial all-None slices
value,
ref_value[global_slices].view(-1)[slice(*offsets)],
msg=lambda msg: f"At iteration {i}, {msg}",
)
else:
torch.testing.assert_close(
value,
ref_value.view(-1)[slice(*offsets)].view_as(value),
msg=lambda msg: f"At iteration {i}, {msg}",
)
parallel_context.destroy()
@rerun_if_address_is_in_use()
def test_sliced_flat_tensor():
init_distributed(1, 1, 1)(_test_sliced_flat_tensor)()
def _test_sliced_flat_tensor(parallel_context: ParallelContext):
a = torch.randn(2, 3, requires_grad=True)
grad = torch.randn(2, 3)
a.grad = grad
start_offset, end_offset = 1, 5
b = SlicedFlatTensor(a, start_offset=start_offset, end_offset=end_offset)
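# NOTE: `b` is expected to behave as a flat view over elements [start_offset, end_offset) of `a`,
# with `b.grad` lazily derived from `a.grad`, so (de)allocating the gradient on either tensor is
# reflected on the other.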
torch.testing.assert_close(a.grad, grad, atol=0, rtol=0)
torch.testing.assert_close(b.grad, grad.view(-1)[start_offset:end_offset])
# Deallocate the gradient by setting it to None
a.grad = None
assert a.grad is None
assert b.grad is None
# Setting the gradient to None on the sliced tensor also resets the base tensor's gradient
a.grad = grad
assert a.grad is not None
assert b.grad is not None
b.grad = None
assert b.grad is None
assert a.grad is None
with assert_fail_with(NotImplementedError):
b.grad = torch.randn(1, 5)
with assert_fail_with(NotImplementedError):
del b.grad
c = b[:3]
# Slicing must return a plain tensor: the SlicedFlatTensor subclass should not propagate to derived tensors.
assert not isinstance(c, SlicedFlatTensor)
parallel_context.destroy()