import pytest
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.dataloader import CombinedDataset
@pytest.fixture
def dataset1():
return create_dummy_dataset(4000)
@pytest.fixture
def dataset2():
return create_dummy_dataset(6000)
def test_combined_dataset_length(dataset1, dataset2):
combined_dataset = CombinedDataset([dataset1, dataset2])
assert len(combined_dataset) == len(dataset1) + len(dataset2)
@pytest.mark.parametrize("idx_type", ["idxs", "batch_of_idxs"])
def test_get_item_from_combined_dataset(dataset1, dataset2, idx_type):
def count_elements(lst):
return sum(count_elements(i) if isinstance(i, list) else 1 for i in lst)
if idx_type == "batch_of_idxs":
total_samples = len(dataset1) + len(dataset2)
idxs = [[0, 1], [total_samples - 2, total_samples - 1]]
else:
idxs = [0, 1]
combined_dataset = CombinedDataset([dataset1, dataset2])
outputs = combined_dataset[idxs]
# NOTE: obtain the first key in a dict
first_key = next(iter(outputs))
assert isinstance(outputs, dict)
assert outputs.keys() == dataset1[0].keys()
assert len(outputs[first_key]) == count_elements(idxs)
assert outputs[first_key][0] == dataset1[0][first_key]
assert outputs[first_key][1] == dataset1[1][first_key]
if idx_type == "batch_of_idxs":
assert outputs[first_key][2] == dataset2[len(dataset2) - 2][first_key]
assert outputs[first_key][3] == dataset2[len(dataset2) - 1][first_key]
import pytest
import torch
import torch.distributed as dist
import torch.nn.functional as F
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from utils import set_system_path
set_system_path()
from examples.doremi.doremi.doremi_context import DoReMiContext
from examples.doremi.doremi.loss import (
CrossEntropyWithPerDomainLoss,
DomainLossForProxyTraining,
DoReMiLossForProxyTraining,
compute_domain_loss_per_replicas,
compute_per_domain_loss,
)
from tests.helpers.utils import init_distributed
@pytest.fixture
def doremi_context():
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
return doremi_context
def get_partition_logit(logits, parallel_context):
tp_size = dist.get_world_size(parallel_context.tp_pg)
tp_rank = dist.get_rank(parallel_context.tp_pg)
VOCAB_SIZE = logits.shape[-1]
per_partition = VOCAB_SIZE // tp_size
chunks = torch.split(logits, per_partition, dim=-1)
return chunks[tp_rank]
@pytest.mark.parametrize("tp", [1, 2])
def test_computing_per_token_loss(tp: int):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
targets = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), targets.view(-1), reduction="none")
init_distributed(tp=tp, dp=1, pp=1)(_test_computing_per_token_loss)(
logits=logits, targets=targets, ref_losses=ref_losses
)
def _test_computing_per_token_loss(parallel_context: ParallelContext, logits, targets, ref_losses):
logits = logits.to("cuda")
targets = targets.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss = sharded_cross_entropy(parallel_logits, targets, parallel_context.tp_pg)
assert torch.allclose(loss.cpu().view(-1), ref_losses)
@pytest.mark.parametrize("dp", [1, 2])
def test_domain_loss_for_proxy_training(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_domain_loss_for_proxy_training)(
global_batch_size=GLOBAL_BATCH_SIZE,
batch_size=BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_domain_loss_for_proxy_training(
parallel_context: ParallelContext, global_batch_size, batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
ref_losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
loss_func = DomainLossForProxyTraining(doremi_context, parallel_context)
outputs = loss_func(losses, ref_losses, domain_idxs)
assert outputs.keys() == {"dro_loss", "domain_losses", "domain_weights", "samples_per_domain"}
assert (outputs["domain_losses"] > 0.0).all()
assert outputs["domain_losses"].shape == (N_DOMAINS,)
assert (outputs["domain_weights"] > 0.0).all()
assert outputs["domain_weights"].shape == (N_DOMAINS,)
@pytest.mark.parametrize("dp", [1, 2])
def test_computing_per_domain_loss(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_computing_per_domain_loss)(
batch_size=BATCH_SIZE,
global_batch_size=GLOBAL_BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_computing_per_domain_loss(
parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
losses_dp, per_domain_loss, samples_per_domain = compute_per_domain_loss(
losses, domain_idxs, doremi_context, parallel_context
)
assert per_domain_loss.shape == (N_DOMAINS,)
assert_tensor_synced_across_pg(
per_domain_loss, parallel_context.dp_pg, msg=lambda err: f"Per domain loss are not synced across ranks {err}"
)
assert samples_per_domain.shape == (N_DOMAINS,)
assert sum(samples_per_domain) == global_batch_size
assert_tensor_synced_across_pg(
samples_per_domain,
parallel_context.dp_pg,
msg=lambda err: f"Samples per domain are not synced across ranks {err}",
)
@pytest.mark.parametrize("dp", [1, 2])
def test_computing_domain_loss_per_replicas(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_computing_domain_loss_per_replicas)(
batch_size=BATCH_SIZE,
global_batch_size=GLOBAL_BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_computing_domain_loss_per_replicas(
parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
per_domain_loss, samples_per_domain = compute_domain_loss_per_replicas(losses, domain_idxs, doremi_context)
assert per_domain_loss.shape == (N_DOMAINS,)
assert samples_per_domain.shape == (N_DOMAINS,)
@pytest.mark.skip
@pytest.mark.parametrize("tp", [1, 2])
def test_cross_entropy_with_per_domain_loss(tp: int, doremi_context):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
N_DOMAINS = doremi_context.num_domains
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))
init_distributed(tp=tp, dp=1, pp=1)(_test_cross_entropy_with_per_domain_loss)(
logits=logits,
label_ids=label_ids,
label_mask=label_mask,
domain_idxs=domain_idxs,
ref_losses=ref_losses,
batch_size=BATCH_SIZE,
doremi_context=doremi_context,
)
def _test_cross_entropy_with_per_domain_loss(
parallel_context: ParallelContext,
logits,
label_ids,
label_mask,
domain_idxs,
ref_losses,
batch_size,
doremi_context,
):
logits = logits.to("cuda")
label_ids = label_ids.to("cuda")
label_mask = label_mask.to("cuda")
domain_idxs = domain_idxs.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss_func = CrossEntropyWithPerDomainLoss(doremi_context, parallel_context)
outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs)
assert torch.allclose(outputs["loss"].cpu().view(-1), ref_losses)
assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
assert outputs["samples_per_domain"].shape == (doremi_context.num_domains,)
assert sum(outputs["samples_per_domain"]) == batch_size
@pytest.mark.parametrize("tp", [1, 2])
def test_doremi_loss_for_proxy_training(tp: int, doremi_context):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
N_DOMAINS = doremi_context.num_domains
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
ref_losses = torch.randn(BATCH_SIZE, SEQ_LEN)
ref_ce_loss = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))
init_distributed(tp=tp, dp=1, pp=1)(_test_doremi_loss_for_proxy_training)(
logits=logits,
label_ids=label_ids,
label_mask=label_mask,
domain_idxs=domain_idxs,
ref_losses=ref_losses,
ref_ce_loss=ref_ce_loss,
batch_size=BATCH_SIZE,
n_domains=N_DOMAINS,
doremi_context=doremi_context,
)
def _test_doremi_loss_for_proxy_training(
parallel_context: ParallelContext,
logits,
label_ids,
label_mask,
domain_idxs,
ref_losses,
ref_ce_loss,
batch_size,
n_domains,
doremi_context,
):
logits = logits.to("cuda")
label_ids = label_ids.to("cuda")
label_mask = label_mask.to("cuda")
domain_idxs = domain_idxs.to("cuda")
ref_losses = ref_losses.to("cuda")
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss_func = DoReMiLossForProxyTraining(doremi_context, parallel_context)
outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs, ref_losses)
assert outputs["loss"].ndim == 0
assert outputs["loss"] > 0.0
assert torch.allclose(outputs["ce_loss"].cpu().view(-1), ref_ce_loss)
assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
assert (outputs["domain_losses"] > 0).all()
assert outputs["domain_weights"].shape == (doremi_context.num_domains,)
assert torch.allclose(sum(outputs["domain_weights"].cpu()), torch.tensor(1.0))
samples_per_domain = outputs["samples_per_domain"]
assert samples_per_domain.shape == (n_domains,)
assert sum(samples_per_domain) == batch_size
import pytest
import torch
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch.utils.data import DataLoader
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.dataloader import (
CombinedDataset,
DistributedSamplerForDoReMi,
)
from examples.doremi.doremi.doremi_context import DoReMiContext
from tests.helpers.utils import init_distributed
@pytest.fixture
def dataset1():
return create_dummy_dataset(7000)
@pytest.fixture
def dataset2():
return create_dummy_dataset(3000)
@pytest.fixture
def datasets(dataset1, dataset2):
return [dataset1, dataset2]
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_sync_across_tp(num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 16
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=2, dp=1, pp=1)(_test_dist_doremi_sampler_sync_across_tp)(
batch_size=BATCH_SIZE,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_sync_across_tp(
parallel_context: ParallelContext, batch_size: int, num_microbatches: int, datasets, doremi_context: DoReMiContext
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
for idxs in sampler:
idxs = torch.tensor(idxs, device="cuda")
assert_tensor_synced_across_pg(idxs, parallel_context.tp_pg)
@pytest.mark.parametrize("dp_size", [2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=2, pp=1)(_test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training)(
batch_size=batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
for idxs in sampler:
idxs = torch.tensor(idxs, device="cuda").view(-1)
# NOTE: I tried to use assert_fail_except_rank_with, but it marks the test as failed
# even when the test raises an exception as expected
gathered_idxs = [torch.empty_like(idxs, device="cuda") for _ in range(dp_size)]
dist.all_gather(gathered_idxs, idxs)
# NOTE: whether it is proxy or reference training,
# the idxs should not overlap across dp ranks
assert not torch.any(torch.isin(*gathered_idxs))
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_deterministic_doremi_sampler(num_microbatches, dataset1, is_proxy):
BATCH_SIZE = 100
NUM_DOMAINS = 2
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
n_epochs = 3
init_distributed(tp=1, dp=1, pp=1)(_test_deterministic_doremi_sampler)(
batch_size=BATCH_SIZE,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
n_epochs=n_epochs,
)
def _test_deterministic_doremi_sampler(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
n_epochs: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
idxs_per_epoch = []
for _ in range(n_epochs):
all_idxs = []
for idxs in sampler:
all_idxs.append(idxs)
idxs_per_epoch.append(all_idxs)
sampler.reset()
# NOTE: check that the sequences of idxs are identical across epochs
assert all(
all(arr1[i] == arr2[i] for i in range(len(arr1))) for arr1, arr2 in zip(idxs_per_epoch, idxs_per_epoch[1:])
)
@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_sampling_from_dist_doremi_sampler_with_global_batch_size(
dp_size,
num_microbatches,
# domain_weights: torch.Tensor,
dataset1,
is_proxy,
):
NUM_DOMAINS = 8
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_sampling_from_dist_doremi_sampler_with_global_batch_size)(
batch_size=batch_size,
num_microbatches=num_microbatches,
global_batch_size=GLOBAL_BATCH_SIZE,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_sampling_from_dist_doremi_sampler_with_global_batch_size(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
global_batch_size: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
domain_weights = doremi_context.domain_weights
global_batch_size_per_domain = [round(global_batch_size * weight.item()) for weight in domain_weights]
microbatch_idx = 0
num_samples_per_domain = [0 for _ in range(len(domain_weights))]
for idxs in sampler:
assert batch_size == len(idxs)
# NOTE: make sure the indices in a batch
# are proportional to the domain weights
start_indices = [sum([len(ds) for ds in datasets[:i]]) for i in range(len(datasets))]
end_indices = [sum([len(ds) for ds in datasets[: i + 1]]) for i in range(len(datasets))]
for domain_idx in range(len(domain_weights)):
num_samples = sum(1 for idx in idxs if idx >= start_indices[domain_idx] and idx < end_indices[domain_idx])
num_samples_per_domain[domain_idx] += num_samples
if microbatch_idx == num_microbatches - 1:
# NOTE: if this is the last microbatch, we have iterated through all the microbatches,
# so we check that the overall number of samples in each domain is correct
# across all the microbatches
num_samples_per_domain = torch.tensor(num_samples_per_domain, dtype=torch.int, device="cuda")
# NOTE: the domain weights are chosen so that we expect
# no domain to have zero samples in the global batch
dist.all_reduce(num_samples_per_domain, op=dist.ReduceOp.SUM)
assert (num_samples_per_domain == 0).sum().item() == 0
for expected_bs, bs in zip(global_batch_size_per_domain, num_samples_per_domain):
assert bs > 0
# NOTE: take into account rounding errors
# across all the dp ranks
assert abs(expected_bs - bs) <= dp_size, f"abs(expected_bs - bs): {abs(expected_bs - bs)}"
microbatch_idx = 0
num_samples_per_domain = [0 for _ in range(len(domain_weights))]
else:
microbatch_idx += 1
@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_repeating_samples(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_dist_doremi_sampler_not_repeating_samples)(
batch_size=batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_not_repeating_samples(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
local_yielded_idxs = []
yielded_idxs = []
epoch = 0
for idxs in sampler:
# NOTE: check that the indices are not repeated
assert not set(idxs).intersection(
local_yielded_idxs
), f"set(idxs): {set(idxs)}, local_yielded_idxs: {local_yielded_idxs}"
assert not set(idxs).intersection(
yielded_idxs
), f"set(idxs): {set(idxs)}, yielded_idxs: {yielded_idxs} \
epoch: {epoch}"
local_yielded_idxs.extend(idxs)
# NOTE: gather all the indices from all the dp ranks
idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
all_idxs = [torch.zeros_like(idxs) for _ in range(dp_size)]
dist.all_gather(all_idxs, idxs)
all_idxs = torch.cat(all_idxs, dim=0).view(-1).cpu().tolist()
yielded_idxs.extend(all_idxs)
epoch += 1
assert len(set(yielded_idxs)) == len(yielded_idxs)
@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 100
global_batch_size = BATCH_SIZE * num_microbatches * dp_size
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding)(
batch_size=BATCH_SIZE,
global_batch_size=global_batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_yielding(
parallel_context: ParallelContext,
batch_size: int,
global_batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
step = 0
num_yielded_microbatches = 0
expected_domain_weights = torch.tensor([0.5, 0.5])
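# NOTE: both domains reuse the same dataset and, assuming the default uniform
# domain weights, each domain should receive half of the global batch; the
# domain counters are checked against that expectation below.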
for idxs in sampler:
idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
idxs_dp = [torch.empty_like(idxs) for _ in range(dp_size)]
dist.all_gather(idxs_dp, idxs)
idxs_dp = torch.cat(idxs_dp, dim=0)
assert idxs_dp.numel() == batch_size * dp_size
# NOTE: once it has looped through all the microbatches,
# check that the number of samples in each domain matches the expected domain weights
if (step + 1) % num_microbatches == 0:
num_yielded_microbatches += 1
for i, weight in enumerate(expected_domain_weights):
assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)
step += 1
@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding_with_dataloader(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 100
global_batch_size = BATCH_SIZE * num_microbatches * dp_size
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding_with_dataloader)(
batch_size=BATCH_SIZE,
global_batch_size=global_batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_yielding_with_dataloader(
parallel_context: ParallelContext,
batch_size: int,
global_batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
combined_dataset = CombinedDataset(datasets)
dataloader = DataLoader(combined_dataset, batch_sampler=sampler)
step = 1
num_yielded_microbatches = 0
expected_domain_weights = torch.tensor([0.5, 0.5])
for idxs in dataloader:
num_idxs = torch.tensor(len(idxs["text"]), dtype=torch.int, device="cuda")
assert num_idxs.item() == batch_size
dist.all_reduce(num_idxs, op=dist.ReduceOp.SUM, group=parallel_context.dp_pg)
assert num_idxs == batch_size * dp_size
if step % num_microbatches == 0:
num_yielded_microbatches += 1
for i, weight in enumerate(expected_domain_weights):
assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)
step += 1
assert step > 1
import torch
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.utils import compute_domain_weights_based_on_token_count
def test_compute_domain_weights_based_on_token_count():
datasets = [
create_dummy_dataset(10),
create_dummy_dataset(20),
create_dummy_dataset(70),
]
domain_weights = compute_domain_weights_based_on_token_count(datasets)
assert torch.equal(domain_weights, torch.tensor([0.1, 0.2, 0.7]))
assert torch.allclose(domain_weights.sum(), torch.tensor(1.0))
import importlib
import sys
from pathlib import Path
from datasets import Dataset
def set_system_path():
package = importlib.import_module("nanotron")
# NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
# we want .../nanotron
package_path = Path(package.__file__).parent.parent.parent
sys.path.append(str(package_path))
def create_dummy_dataset(num_items: int):
data = {"text": list(range(num_items))}
return Dataset.from_dict(data)
"""
DoReMi training script.
Usage:
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama_proxy.yaml
"""
import argparse
from nanotron.config import get_config_from_file
from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import DoReMiTrainer
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
config: DoReMiConfig = get_config_from_file(config_file, config_class=DoReMiConfig)
dataset_paths = [
f"{config.data_stages[0].data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names
]
datasets = get_datasets(dataset_paths)
trainer = DoReMiTrainer(config_file, config_class=DoReMiConfig)
dataloader = get_dataloader(trainer, datasets)
trainer.train(dataloader)
"""
DoReMi training script.
Usage:
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama.yaml
"""
import argparse
import torch
from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import ReferenceTrainer
from doremi.utils import compute_domain_weights_based_on_token_count
from nanotron.config import get_config_from_file
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
config = get_config_from_file(config_file, config_class=DoReMiConfig)
dataset_paths = [f"{config.data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names]
datasets = get_datasets(dataset_paths)
# TODO(xrsrke): support retrieving domain weights from the config
# or calculating them in the trainer
if config.doremi.domain_weights is None:
initial_domain_weights = compute_domain_weights_based_on_token_count(datasets)
else:
initial_domain_weights = torch.tensor(config.doremi.domain_weights)
assert torch.allclose(initial_domain_weights.sum(), torch.tensor(1.0), rtol=1e-3)
domain_names = config.doremi.domain_names
trainer = ReferenceTrainer(initial_domain_weights, domain_names, config_file, config_class=DoReMiConfig)
dataloader = get_dataloader(trainer, datasets)
trainer.train(dataloader)
from typing import List
def print_array_for_human(arr: List[float], precision: int = 5) -> str:
formatted_elements = [f"{x:.{precision}f}" for x in arr]
return "[" + ", ".join(formatted_elements) + "]"
## Debugging the tests with VS Code
To debug the tests with VS Code, add the following JSON to your `launch.json` file.
```
{
"name": "Test conversion",
"type": "python",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal",
"args": [
"examples/llama/tests"
],
"justMyCode": false
}
```
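Equivalently, the tests can be run from a terminal with `pytest examples/llama/tests` (a CUDA device is assumed, since the conversion tests build models on GPU).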
"""
Converts a HF model to nanotron format
Command:
torchrun --nproc_per_node=1 convert_hf_to_nanotron.py --checkpoint_path=hf_weights --save_path=nanotron_weights
"""
import dataclasses
import json
from argparse import ArgumentParser
from pathlib import Path
import nanotron
import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from transformers import LlamaConfig as HFLlamaConfig
from transformers import LlamaForCausalLM
def _handle_attention_block(
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
# Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
# Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
# and odd dimensions, GPT-J style, while the huggingface implementation expects
# the whole 1st half and then the whole 2nd half, GPT-NeoX style (for more information
# see flash_attn.layers.rotary.RotaryEmbedding).
# This function handles the concatenation of the q, k, v weights and the proper
# permutation to ensure a correct transformation.
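# A minimal illustration (hypothetical d_qk=6): a single head stored NeoX-style
# as rows [r0, r1, r2, r3, r4, r5] (first half, then second half) is reordered
# by `interleave` below to [r0, r3, r1, r4, r2, r5], i.e. GPT-J-style
# (even, odd) pairs, before q, k and v are concatenated.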
def interleave(w: torch.Tensor):
w_new = []
for head_w in w.split(d_qk):
head_w = head_w.view(2, d_qk // 2, -1).transpose(0, 1).reshape(d_qk, -1)
w_new.append(head_w)
return torch.cat(w_new)
q = interleave(q)
k = interleave(k)
return torch.cat([q, k, v])
def convert_hf_to_nt(model_hf: LlamaForCausalLM, model_nt: LlamaForTraining, config: NanotronLlamaConfig):
"""Converts the weights from the model_hf to model_nt, making modifications
in-place."""
hf_sd = model_hf.state_dict()
nt_to_hf = get_weight_mapping(config, nt_to_hf=True)
for module_name_nt, module_nt in model_nt.named_modules():
for param_name_nt, param_nt in module_nt.named_parameters(recurse=False):
# In the case of qkv_proj, nt_to_hf has exactly three keys, corresponding
# to q, k, v.
if "qkv_proj" in module_name_nt:
key_k, key_q, key_v = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
q = hf_sd[key_q]
k = hf_sd[key_k]
v = hf_sd[key_v]
param = _handle_attention_block(
q,
k,
v,
config.num_attention_heads,
config.num_key_value_heads,
config.hidden_size // config.num_attention_heads,
)
# In the case of gate_up_proj, nt_to_hf has exactly two keys, corresponding to gate and up.
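# For example, "model.decoder.0.pp_block.mlp.gate_up_proj.weight" maps to
# ["model.layers.0.mlp.gate_proj.weight", "model.layers.0.mlp.up_proj.weight"].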
elif "gate_up_proj" in module_name_nt:
key_gate, key_up = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
gate = hf_sd[key_gate]
up = hf_sd[key_up]
param = torch.cat([gate, up])
# All other cases are simple 1-to-1 correspondence.
else:
hf_key = nt_to_hf[f"{module_name_nt}.{param_name_nt}"]
param = hf_sd[hf_key]
with torch.no_grad():
param_nt.copy_(param)
def get_nanotron_config(config: HFLlamaConfig) -> NanotronLlamaConfig:
"""Converts a huggingface configuration to nanotron configuration."""
attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=True).items()}
return NanotronLlamaConfig(**attrs)
def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path):
"""Loads the huggingface checkpoint in `checkpoint_path`, creates
a new nanotron instance, copies the weights from the huggingface checkpoint
and saves the transformed nanotron to `save_path`."""
# Load huggingface.
hf_model = LlamaForCausalLM.from_pretrained(checkpoint_path)
# Init nanotron model.
model_config = get_nanotron_config(hf_model.config)
nanotron_model = load_nanotron_model(model_config=model_config)
# Copy weights and save model.
parallel_context = nanotron.parallel.ParallelContext(
data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=1
)
convert_hf_to_nt(hf_model, nanotron_model, model_config)
nanotron.serialize.save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=save_path)
with open(save_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(model_config), f)
print(f"Model saved to {save_path}")
if __name__ == "__main__":
parser = ArgumentParser(description="Convert HF weights to nanotron format")
parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the nanotron model")
args = parser.parse_args()
# Convert HF model to nanotron format.
convert_checkpoint_and_save(checkpoint_path=args.checkpoint_path, save_path=args.save_path)
"""
Converts a nanotron model to HF format
Command:
torchrun --nproc_per_node=1 convert_nanotron_to_hf.py --checkpoint_path=nanotron-path --save_path=hf-path
"""
import json
from argparse import ArgumentParser
from pathlib import Path
from typing import Literal, Optional
import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import LlamaConfig as HFLlamaConfig
TEST_PROMPT = "What is the meaning of the word chutzpah?\nThe word chutzpah means"
def _handle_attention_block(
qkv: torch.Tensor, part: Literal["q", "k", "v"], n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
# Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
# Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
# and odd dimensions, GPT-J style, while the huggingface implementation expects
# the whole 1st half and then the whole 2nd half, GPT-NeoX style (for more information
# see flash_attn.layers.rotary.RotaryEmbedding).
# This function selects the proper chunk of the bundled qkv tensor and applies the
# permutation needed to ensure a correct transformation to huggingface.
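# A minimal illustration (hypothetical d_qk=6): a single head stored GPT-J-style
# as interleaved rows [r0, r1, r2, r3, r4, r5] (pairs (r0, r1), (r2, r3), (r4, r5))
# is reordered by `interleave` below to [r0, r2, r4, r1, r3, r5], i.e. NeoX-style
# first half then second half.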
def interleave(w: torch.Tensor):
w_new = []
for head_w in w.split(d_qk):
head_w = head_w.view(d_qk // 2, 2, -1).transpose(0, 1).reshape(d_qk, -1)
w_new.append(head_w)
return torch.cat(w_new)
assert part in ["q", "k", "v"], "part must be one of [q, k, v]"
index_end_q = n_q_heads * d_qk
index_end_k = index_end_q + n_kv_heads * d_qk
if part == "q":
return interleave(qkv[:index_end_q])
if part == "k":
return interleave(qkv[index_end_q:index_end_k])
return qkv[index_end_k:]
def _handle_gate_up_proj(gate_up_proj: torch.Tensor, gate: bool) -> torch.Tensor:
# The gate and up projections are bundled in nanotron.
# This function selects the proper chunk in the bundled weights to return
# either the gate or the up projection only.
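# For example, with a bundled weight of shape (2 * intermediate_size, hidden_size),
# rows [:intermediate_size] hold the gate projection and rows [intermediate_size:]
# hold the up projection, matching the torch.cat([gate, up]) layout used when
# converting from huggingface to nanotron.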
weight_size = gate_up_proj.shape[0] // 2
if gate:
return gate_up_proj[:weight_size]
else:
return gate_up_proj[weight_size:]
def convert_nt_to_hf(nanotron_model: LlamaForTraining, hf_model: LlamaForCausalLM, model_config: NanotronLlamaConfig):
"""Converts the weights from the nanotron_model to hf_model, making modifications
in-place."""
nanotron_model_state_dict = nanotron_model.state_dict()
hf_to_nt = get_weight_mapping(model_config, nt_to_hf=False)
for module_name_hf, module_hf in hf_model.named_modules():
for param_name_hf, param_hf in module_hf.named_parameters(recurse=False):
# Get the Nanotron parameter
nanotron_key = hf_to_nt[f"{module_name_hf}.{param_name_hf}"]
param = nanotron_model_state_dict[nanotron_key]
if "qkv_proj" in nanotron_key:
proj_name = module_name_hf.split(".")[4][0]
param = _handle_attention_block(
param,
proj_name,
model_config.num_attention_heads,
model_config.num_key_value_heads,
model_config.hidden_size // model_config.num_attention_heads,
)
elif "gate_up_proj" in nanotron_key:
gate = "gate" in module_name_hf
param = _handle_gate_up_proj(param, gate)
with torch.no_grad():
param_hf.copy_(param)
def get_hf_config(config: NanotronLlamaConfig) -> HFLlamaConfig:
"""Converts a nanotron configuration to huggingface configuration."""
attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=False).items()}
return HFLlamaConfig(**attrs)
def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str] = None):
"""Loads the nanotron checkpoint in `checkpoint_path`, creates
a new huggingface instance, copies the weights from the nanotron checkpoint
and saves the transformed huggingface to `save_path`."""
# Init nanotron model.
with open(checkpoint_path / "model_config.json", "r") as f:
attrs = json.load(f)
model_config = NanotronLlamaConfig(**attrs)
nanotron_model = load_nanotron_model(
model_config=model_config,
checkpoint_path=checkpoint_path,
)
# Init huggingface model.
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_config_hf = get_hf_config(model_config)
hf_model = LlamaForCausalLM._from_config(model_config_hf)
# Copy weights, initialize tokenizer and save model.
if tokenizer_name is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.save_pretrained(save_path)
convert_nt_to_hf(nanotron_model, hf_model, model_config)
hf_model.save_pretrained(save_path)
print(f"Model saved to {save_path}")
def check_converted_model_generation(save_path: Path):
"""Loads a huggingface model and tokenizer from `save_path` and
performs a dummy text generation."""
tokenizer = AutoTokenizer.from_pretrained(save_path)
input_ids = tokenizer(TEST_PROMPT, return_tensors="pt")["input_ids"].cuda()
print("Inputs:", tokenizer.batch_decode(input_ids))
model = LlamaForCausalLM.from_pretrained(save_path).cuda().bfloat16()
out = model.generate(input_ids, max_new_tokens=100)
print("Generation (converted): ", tokenizer.batch_decode(out))
if __name__ == "__main__":
parser = ArgumentParser(description="Convert Nanotron weights to HF format")
parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the HF model")
parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Llama-2-7b-chat-hf")
args = parser.parse_args()
# Convert Nanotron model to HF format.
convert_checkpoint_and_save(
checkpoint_path=args.checkpoint_path, save_path=args.save_path, tokenizer_name=args.tokenizer_name
)
# Check if the conversion was successful by generating some text.
if args.tokenizer_name is not None:
check_converted_model_generation(save_path=args.save_path)
import json
from pathlib import Path
from typing import Optional
import nanotron
import torch
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from nanotron.trainer import mark_tied_parameters
def get_weight_mapping(config: NanotronLlamaConfig, nt_to_hf: bool = True) -> dict[str, str]:
"""Returns the nanotron to huggingface parameter mapping if `nt_to_hf`, otherwise the
huggingface to nanotron mapping."""
hf_to_nt_map = {}
hf_to_nt_map["lm_head.weight"] = "model.lm_head.pp_block.weight"
hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
hf_to_nt_map["model.norm.weight"] = "model.final_layer_norm.pp_block.weight"
hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
for i in range(config.num_hidden_layers):
hf_prefix = f"model.layers.{i}"
nt_prefix = f"model.decoder.{i}.pp_block"
hf_to_nt_map[f"{hf_prefix}.self_attn.q_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.k_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.v_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.o_proj.weight"] = f"{nt_prefix}.attn.o_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.weight"] = f"{nt_prefix}.mlp.down_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.bias"] = f"{nt_prefix}.mlp.down_proj.bias"
hf_to_nt_map[f"{hf_prefix}.input_layernorm.weight"] = f"{nt_prefix}.input_layernorm.weight"
hf_to_nt_map[f"{hf_prefix}.post_attention_layernorm.weight"] = f"{nt_prefix}.post_attention_layernorm.weight"
if nt_to_hf:
nt_to_hf_map = {}
for hf, nt in hf_to_nt_map.items():
# Because the qkv and gate_up projections are separated in the
# huggingface format, when we return nanotron to huggingface
# we will need to return a list of parameters instead (e.g.
# the `qkv_proj` will point to a list `[q_proj, k_proj, v_proj]`).
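# For example, "model.decoder.0.pp_block.attn.qkv_proj.weight" maps to
# ["model.layers.0.self_attn.q_proj.weight",
# "model.layers.0.self_attn.k_proj.weight",
# "model.layers.0.self_attn.v_proj.weight"].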
if nt in nt_to_hf_map and isinstance(nt_to_hf_map[nt], list):
nt_to_hf_map[nt].append(hf)
elif nt in nt_to_hf_map:
nt_to_hf_map[nt] = [nt_to_hf_map[nt], hf]
else:
nt_to_hf_map[nt] = hf
return nt_to_hf_map
return hf_to_nt_map
def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
"""Returns either the nanotron to huggingface (if `nt_to_hf`)
configuration mapping, or the huggingface to nanotron."""
hf_to_nt_map = {
"bos_token_id": "bos_token_id",
"eos_token_id": "eos_token_id",
"hidden_act": "hidden_act",
"hidden_size": "hidden_size",
"initializer_range": "initializer_range",
"intermediate_size": "intermediate_size",
"max_position_embeddings": "max_position_embeddings",
"num_attention_heads": "num_attention_heads",
"num_hidden_layers": "num_hidden_layers",
"num_key_value_heads": "num_key_value_heads",
"pad_token_id": "pad_token_id",
"pretraining_tp": "pretraining_tp",
"rms_norm_eps": "rms_norm_eps",
"rope_scaling": "rope_scaling",
"rope_theta": "rope_theta",
"tie_word_embeddings": "tie_word_embeddings",
"use_cache": "use_cache",
"vocab_size": "vocab_size",
}
if nt_to_hf:
return {nt: hf for hf, nt in hf_to_nt_map.items()}
return hf_to_nt_map
def make_parallel_config(
dp: int = 1,
pp: int = 1,
tp: int = 1,
):
parallel_config = nanotron.config.ParallelismArgs(
dp=dp,
pp=pp,
tp=tp,
pp_engine=nanotron.config.AllForwardAllBackwardPipelineEngine(),
tp_mode=nanotron.config.TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
return parallel_config
def load_nanotron_model(
model_config: Optional[NanotronLlamaConfig] = None,
device: torch.device = torch.device("cuda"),
dtype: torch.dtype = torch.bfloat16,
checkpoint_path: Optional[Path] = None,
) -> LlamaForTraining:
"""
Creates and returns a nanotron model.
If `model_config` is None, then `checkpoint_path` must be set, in which case
the configuration will be loaded from such path.
If `checkpoint_path` is None, then `model_config` must be set, in which case
the model created will have random weights.
"""
if model_config is None:
assert checkpoint_path is not None
with open(checkpoint_path / "model_config.json") as f:
model_config = NanotronLlamaConfig(**json.load(f))
parallel_config = make_parallel_config()
parallel_context = nanotron.parallel.ParallelContext(
data_parallel_size=parallel_config.dp,
pipeline_parallel_size=parallel_config.pp,
tensor_parallel_size=parallel_config.tp,
)
nanotron_model = nanotron.models.build_model(
model_builder=lambda: LlamaForTraining(
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=None,
),
parallel_context=parallel_context,
dtype=dtype,
device=device,
)
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
# Load checkpoint directly in memory and then only keep the state dictionary
if checkpoint_path is not None:
nanotron.serialize.load_weights(
model=nanotron_model, parallel_context=parallel_context, root_folder=checkpoint_path
)
return nanotron_model
# ruff: noqa: E402
import dataclasses
import json
from pathlib import Path
import pytest
import torch
from transformers import LlamaForCausalLM
from utils import set_system_path
set_system_path()
import nanotron
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.base import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel import ParallelContext
from nanotron.trainer import mark_tied_parameters
from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model, make_parallel_config
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed
CONFIG = NanotronLlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
BATCH_SIZE = 3
SEQUENCE_LENGTH = 5
ATOL = 0.03
def create_nanotron_model(parallel_context: ParallelContext) -> LlamaForTraining:
parallel_config = make_parallel_config(
tp=parallel_context.tensor_parallel_size,
dp=parallel_context.data_parallel_size,
pp=parallel_context.pipeline_parallel_size,
)
nanotron_model = nanotron.models.build_model(
model_builder=lambda: LlamaForTraining(
config=CONFIG,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=None,
),
parallel_context=parallel_context,
dtype=torch.bfloat16,
device=torch.device("cuda"),
)
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
return nanotron_model
def create_huggingface_model() -> LlamaForCausalLM:
config_hf = get_hf_config(CONFIG)
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_hf = LlamaForCausalLM._from_config(config_hf)
return model_hf
@pytest.fixture(autouse=True, scope="module")
def fix_seed():
torch.manual_seed(0)
yield
@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model(parallel_context)
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids)
def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save nanotron model.
model_nt = create_nanotron_model(parallel_context)
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(CONFIG), f)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
del model_nt
# Perform conversion.
convert_nt_to_hf_and_save(nt_path, hf_path)
# Load huggingface and get logits.
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
torch.testing.assert_allclose(logits_nt, logits_hf, atol=ATOL)
def test_nt_to_hf_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model(parallel_context)
model_hf = create_huggingface_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
torch.testing.assert_allclose(logits_hf, logits_nt, atol=ATOL)
def test_hf_to_nt(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)
def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save hf model.
model_hf = create_huggingface_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
model_hf.save_pretrained(hf_path)
logits_hf = model_hf(input_ids).logits
del model_hf
# Perform conversion.
convert_hf_to_nt_and_save(hf_path, nt_path)
# Load nanotron and get logits.
input_mask = torch.ones_like(input_ids)
model_nt = load_nanotron_model(checkpoint_path=nt_path)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL)
def test_hf_to_nt_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_composed_conversion(parallel_context: ParallelContext):
# Get HF statedict.
model_hf = create_huggingface_model()
hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}
# Convert once to nanotron, save its statedict.
model_nt = create_nanotron_model(parallel_context)
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}
# Convert back to HF, compare statedicts.
del model_hf
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
hf_sd_new = model_hf.state_dict()
assert set(hf_sd_new) == set(hf_sd)
assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)
# Convert to nanotron one more time, compare statedicts.
del model_nt
model_nt = create_nanotron_model(parallel_context)
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd_new = model_nt.state_dict()
assert set(nt_sd_new) == set(nt_sd)
assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)
def test_composed_conversion():
init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()
def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
# Create and save a parallel model.
model_nt = create_nanotron_model(parallel_context)
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(CONFIG), f)
# Get parallel predictions.
input_ids = input_ids.cuda() # Move them to the current device index.
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
if torch.distributed.get_rank() == 0:
torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt")
# Convert nanotron to hf, load it and compare logits.
# hf_path = root/"hf"
# convert_nt_to_hf_and_save(nt_path, hf_path)
# model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
# logits_hf = model_hf(input_ids).logits
# assert logits_nt.size() == logits_hf.size()
# assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
# Convert parallel nanotron to hf, get and save huggingface predictions.
convert_nt_to_hf_and_save(nt_path, hf_path)
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt")
def test_tensor_parallel_conversion(input_ids: torch.Tensor):
# Set up test.
test_context = TestContext()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
# Launch both parts.
init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
assert (nt_path / "logits.pt").exists()
init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
assert (hf_path / "logits.pt").exists()
# Load logits and verify they match.
logits_nt = torch.load(nt_path / "logits.pt")
logits_hf = torch.load(hf_path / "logits.pt")
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
# ruff: noqa: E402
import json
from pathlib import Path
import pytest
import torch
from transformers import LlamaForCausalLM
from utils import set_system_path
set_system_path()
import nanotron
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.base import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel import ParallelContext
from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model, make_parallel_config
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed, rerun_if_address_is_in_use
CONFIG = NanotronLlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
BATCH_SIZE = 3
SEQUENCE_LENGTH = 5
ATOL = 0.02
def create_nanotron_model(pp: int = 1, tp: int = 1, dp: int = 1) -> LlamaForTraining:
parallel_config = make_parallel_config(dp, pp, tp)
return load_nanotron_model(parallel_config, CONFIG, torch.device("cuda"), torch.bfloat16)
def create_huggingface_model() -> LlamaForCausalLM:
config_hf = get_hf_config(CONFIG)
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_hf = LlamaForCausalLM._from_config(config_hf)
return model_hf
@pytest.fixture(autouse=True, scope="module")
def fix_seed():
torch.manual_seed(0)
yield
@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids)
def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save nanotron model.
model_nt = create_nanotron_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(vars(CONFIG), f)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
del model_nt
# Perform conversion.
convert_nt_to_hf_and_save(nt_path, hf_path)
# Load huggingface and get logits.
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_hf_to_nt(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)
def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save hf model.
model_hf = create_huggingface_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
model_hf.save_pretrained(hf_path)
logits_hf = model_hf(input_ids).logits
del model_hf
# Perform conversion.
convert_hf_to_nt_and_save(hf_path, nt_path)
# Load nanotron and get logits.
input_mask = torch.ones_like(input_ids)
model_nt = load_nanotron_model(checkpoint_path=nt_path)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL)
def test_hf_to_nt_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_composed_conversion(parallel_context: ParallelContext):
# Get HF statedict.
model_hf = create_huggingface_model()
hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}
# Convert once to nanotron, save its statedict.
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}
# Convert back to HF, compare statedicts.
del model_hf
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
hf_sd_new = model_hf.state_dict()
assert set(hf_sd_new) == set(hf_sd)
assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)
# Convert to nanotron one more time, compare statedicts.
del model_nt
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd_new = model_nt.state_dict()
assert set(nt_sd_new) == set(nt_sd)
assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)
def test_composed_conversion():
init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()
def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
    # Create and save a tensor-parallel model.
    model_nt = create_nanotron_model(
        tp=parallel_context.tensor_parallel_size, pp=parallel_context.pipeline_parallel_size
    )
    nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
    with open(nt_path / "model_config.json", "w+") as f:
        json.dump(vars(CONFIG), f)
    # Get parallel predictions.
    input_ids = input_ids.cuda()  # Move them to the current device index.
    input_mask = torch.ones_like(input_ids)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    # Only rank 0 saves the logits; they are compared later in the main process.
    if torch.distributed.get_rank() == 0:
        torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt")
def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
    # Convert the parallel nanotron checkpoint to hf, then get and save the huggingface predictions.
    convert_nt_to_hf_and_save(nt_path, hf_path)
    model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
    logits_hf = model_hf(input_ids).logits
    torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt")
def test_tensor_parallel_conversion(input_ids: torch.Tensor):
    # Set up test.
    test_context = TestContext()
    root = test_context.get_auto_remove_tmp_dir()
    nt_path = root / "nanotron"
    hf_path = root / "hf"
    # Launch both parts: save with tp=2, then convert and reload with tp=1.
    init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
    assert (nt_path / "logits.pt").exists()
    init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
    assert (hf_path / "logits.pt").exists()
    # Load logits and verify they match.
    logits_nt = torch.load(nt_path / "logits.pt")
    logits_hf = torch.load(hf_path / "logits.pt")
    assert logits_nt.size() == logits_hf.size()
    assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
import importlib
import sys
from pathlib import Path
def set_system_path():
package = importlib.import_module("nanotron")
# NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
# we want .../nanotron
package_path = Path(package.__file__).parent.parent.parent
sys.path.insert(0, str(package_path))
# we also want ../llama
llama_path = Path(__file__).parent.parent
sys.path.insert(0, str(llama_path))
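# Usage sketch (mirrors the test modules above): call set_system_path() before
# importing anything from `examples.*`, so that both the nanotron repository root
# and this example directory are importable:
#
#   from utils import set_system_path
#   set_system_path()
#   from examples.llama.convert_weights import load_nanotron_model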
---
library_name: nanotron
---
# Mamba
Modeling code for Mamba to use with [Nanotron](https://github.com/huggingface/nanotron/)
## 🚀 Quickstart
```bash
pip install -r requirements.txt
# Run training
./examples/mamba/train_mamba.sh
```
![mamba](./assets/loss_mamba.png)
> https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
## Bug related to nanotron
Encountered the following issue when running `train_mamba.sh`:
```
causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
```
Solved it by reinstalling the extensions (an undefined-symbol error like this typically means the compiled extension was built against a different PyTorch version):
```bash
pip uninstall mamba-ssm
pip install causal_conv1d==1.1.1
pip install mamba-ssm --no-cache-dir
```
https://github.com/state-spaces/mamba/issues/169
## Credits
Credits to the following repositories from which the code was adapted:
- https://github.com/state-spaces/mamba
from dataclasses import dataclass
from typing import Optional, Union
import torch
from nanotron.config import Config, ExistingCheckpointInit, NanotronConfigs
from nanotron.config.utils_config import cast_str_to_torch_dtype
@dataclass
class MambaInit:
initializer_range: float = 0.02
rescale_prenorm_residual: bool = True
n_residuals_per_layer: int = 1 # Change to 2 if we have MLP
@dataclass
class ModelArgs:
"""Arguments related to model architecture"""
model_config: NanotronConfigs
init_method: Union[MambaInit, ExistingCheckpointInit]
dtype: Optional[torch.dtype] = None
make_vocab_size_divisible_by: int = 1
ddp_bucket_cap_mb: int = 25
def __post_init__(self):
if self.dtype is None:
self.dtype = torch.bfloat16
if isinstance(self.dtype, str):
self.dtype = cast_str_to_torch_dtype(self.dtype)
# if self.model_config.max_position_embeddings is None:
# self.model_config.max_position_embeddings = 0
@dataclass(kw_only=True) # pylint: disable=unexpected-keyword-arg
class MambaConfig(Config):
"""Main configuration class"""
model: ModelArgs
@dataclass
class MambaModelConfig:
"""Configuration for a Mamba model
    Be careful to keep the typing coherent, as we use it to reconstruct the model from YAML.
"""
    is_mamba_config: bool = True  # We use this to help differentiate models during YAML/Python conversion
d_model: int = 2560
num_hidden_layers: int = 64
vocab_size: int = 50277
ssm_cfg: Optional[dict] = None
rms_norm: bool = True
fused_add_norm: bool = True
residual_in_fp32: bool = True
pad_vocab_size_multiple: int = 8
# ==== Custom ======
dtype: str = "float32"
rms_norm_eps: float = 1e-5
pad_token_id: Optional[int] = None
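# Example sketch (illustrative values only, not tuned hyperparameters): building a
# small MambaModelConfig by hand, e.g. for a quick local experiment.
#
#   tiny_mamba = MambaModelConfig(
#       d_model=512,
#       num_hidden_layers=8,
#       vocab_size=50277,
#       dtype="bfloat16",
#   )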