import pytest
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.dataloader import CombinedDataset
@pytest.fixture
def dataset1():
return create_dummy_dataset(4000)
@pytest.fixture
def dataset2():
return create_dummy_dataset(6000)
def test_combined_dataset_length(dataset1, dataset2):
combined_dataset = CombinedDataset([dataset1, dataset2])
assert len(combined_dataset) == len(dataset1) + len(dataset2)
@pytest.mark.parametrize("idx_type", ["idxs", "batch_of_idxs"])
def test_get_item_from_combined_dataset(dataset1, dataset2, idx_type):
def count_elements(lst):
return sum(count_elements(i) if isinstance(i, list) else 1 for i in lst)
if idx_type == "batch_of_idxs":
total_samples = len(dataset1) + len(dataset2)
idxs = [[0, 1], [total_samples - 2, total_samples - 1]]
else:
idxs = [0, 1]
combined_dataset = CombinedDataset([dataset1, dataset2])
outputs = combined_dataset[idxs]
# NOTE: obtain the first key in a dict
first_key = next(iter(outputs))
assert isinstance(outputs, dict)
assert outputs.keys() == dataset1[0].keys()
assert len(outputs[first_key]) == count_elements(idxs)
assert outputs[first_key][0] == dataset1[0][first_key]
assert outputs[first_key][1] == dataset1[1][first_key]
if idx_type == "batch_of_idxs":
assert outputs[first_key][2] == dataset2[len(dataset2) - 2][first_key]
assert outputs[first_key][3] == dataset2[len(dataset2) - 1][first_key]
import pytest
import torch
import torch.distributed as dist
import torch.nn.functional as F
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from utils import set_system_path
set_system_path()
from examples.doremi.doremi.doremi_context import DoReMiContext
from examples.doremi.doremi.loss import (
CrossEntropyWithPerDomainLoss,
DomainLossForProxyTraining,
DoReMiLossForProxyTraining,
compute_domain_loss_per_replicas,
compute_per_domain_loss,
)
from tests.helpers.utils import init_distributed
@pytest.fixture
def doremi_context():
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
return doremi_context
def get_partition_logit(logits, parallel_context):
tp_size = dist.get_world_size(parallel_context.tp_pg)
tp_rank = dist.get_rank(parallel_context.tp_pg)
VOCAB_SIZE = logits.shape[-1]
per_partition = VOCAB_SIZE // tp_size
chunks = torch.split(logits, per_partition, dim=-1)
return chunks[tp_rank]
@pytest.mark.parametrize("tp", [1, 2])
def test_computing_per_token_loss(tp: int):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
targets = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), targets.view(-1), reduction="none")
init_distributed(tp=tp, dp=1, pp=1)(_test_computing_per_token_loss)(
logits=logits, targets=targets, ref_losses=ref_losses
)
def _test_computing_per_token_loss(parallel_context: ParallelContext, logits, targets, ref_losses):
logits = logits.to("cuda")
targets = targets.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss = sharded_cross_entropy(parallel_logits, targets, parallel_context.tp_pg)
assert torch.allclose(loss.cpu().view(-1), ref_losses)
@pytest.mark.parametrize("dp", [1, 2])
def test_domain_loss_for_proxy_training(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_domain_loss_for_proxy_training)(
global_batch_size=GLOBAL_BATCH_SIZE,
batch_size=BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_domain_loss_for_proxy_training(
parallel_context: ParallelContext, global_batch_size, batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
ref_losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
loss_func = DomainLossForProxyTraining(doremi_context, parallel_context)
outputs = loss_func(losses, ref_losses, domain_idxs)
assert outputs.keys() == {"dro_loss", "domain_losses", "domain_weights", "samples_per_domain"}
assert (outputs["domain_losses"] > 0.0).all()
assert outputs["domain_losses"].shape == (N_DOMAINS,)
assert (outputs["domain_weights"] > 0.0).all()
assert outputs["domain_weights"].shape == (N_DOMAINS,)
@pytest.mark.parametrize("dp", [1, 2])
def test_computing_per_domain_loss(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_computing_per_domain_loss)(
batch_size=BATCH_SIZE,
global_batch_size=GLOBAL_BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_computing_per_domain_loss(
parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
losses_dp, per_domain_loss, samples_per_domain = compute_per_domain_loss(
losses, domain_idxs, doremi_context, parallel_context
)
assert per_domain_loss.shape == (N_DOMAINS,)
assert_tensor_synced_across_pg(
per_domain_loss, parallel_context.dp_pg, msg=lambda err: f"Per domain loss are not synced across ranks {err}"
)
assert samples_per_domain.shape == (N_DOMAINS,)
assert sum(samples_per_domain) == global_batch_size
assert_tensor_synced_across_pg(
samples_per_domain,
parallel_context.dp_pg,
msg=lambda err: f"Samples per domain are not synced across ranks {err}",
)
@pytest.mark.parametrize("dp", [1, 2])
def test_computing_domain_loss_per_replicas(dp: int):
GLOBAL_BATCH_SIZE = 512
BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
SEQ_LEN = 128
N_DOMAINS = 5
domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
init_distributed(tp=1, dp=dp, pp=1)(_test_computing_domain_loss_per_replicas)(
batch_size=BATCH_SIZE,
global_batch_size=GLOBAL_BATCH_SIZE,
seq_len=SEQ_LEN,
domain_keys=domain_keys,
)
def _test_computing_domain_loss_per_replicas(
parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
N_DOMAINS = len(domain_keys)
losses = torch.randn(batch_size, seq_len, device="cuda")
domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
doremi_context = DoReMiContext(domain_keys, is_proxy=False)
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
per_domain_loss, samples_per_domain = compute_domain_loss_per_replicas(losses, domain_idxs, doremi_context)
assert per_domain_loss.shape == (N_DOMAINS,)
assert samples_per_domain.shape == (N_DOMAINS,)
@pytest.mark.skip
@pytest.mark.parametrize("tp", [1, 2])
def test_cross_entropy_with_per_domain_loss(tp: int, doremi_context):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
N_DOMAINS = doremi_context.num_domains
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))
init_distributed(tp=tp, dp=1, pp=1)(_test_cross_entropy_with_per_domain_loss)(
logits=logits,
label_ids=label_ids,
label_mask=label_mask,
domain_idxs=domain_idxs,
ref_losses=ref_losses,
batch_size=BATCH_SIZE,
doremi_context=doremi_context,
)
def _test_cross_entropy_with_per_domain_loss(
parallel_context: ParallelContext,
logits,
label_ids,
label_mask,
domain_idxs,
ref_losses,
batch_size,
doremi_context,
):
logits = logits.to("cuda")
label_ids = label_ids.to("cuda")
label_mask = label_mask.to("cuda")
domain_idxs = domain_idxs.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss_func = CrossEntropyWithPerDomainLoss(doremi_context, parallel_context)
outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs)
assert torch.allclose(outputs["loss"].cpu().view(-1), ref_losses)
assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
assert outputs["samples_per_domain"].shape == (doremi_context.num_domains,)
assert sum(outputs["samples_per_domain"]) == batch_size
@pytest.mark.parametrize("tp", [1, 2])
def test_doremi_loss_for_proxy_training(tp: int, doremi_context):
BATCH_SIZE = 512
SEQ_LEN = 128
VOCAB_SIZE = 4
N_DOMAINS = doremi_context.num_domains
torch.manual_seed(69)
logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
ref_losses = torch.randn(BATCH_SIZE, SEQ_LEN)
ref_ce_loss = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))
init_distributed(tp=tp, dp=1, pp=1)(_test_doremi_loss_for_proxy_training)(
logits=logits,
label_ids=label_ids,
label_mask=label_mask,
domain_idxs=domain_idxs,
ref_losses=ref_losses,
ref_ce_loss=ref_ce_loss,
batch_size=BATCH_SIZE,
n_domains=N_DOMAINS,
doremi_context=doremi_context,
)
def _test_doremi_loss_for_proxy_training(
parallel_context: ParallelContext,
logits,
label_ids,
label_mask,
domain_idxs,
ref_losses,
ref_ce_loss,
batch_size,
n_domains,
doremi_context,
):
logits = logits.to("cuda")
label_ids = label_ids.to("cuda")
label_mask = label_mask.to("cuda")
domain_idxs = domain_idxs.to("cuda")
ref_losses = ref_losses.to("cuda")
doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")
parallel_logits = get_partition_logit(logits, parallel_context)
loss_func = DoReMiLossForProxyTraining(doremi_context, parallel_context)
outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs, ref_losses)
assert outputs["loss"].ndim == 0
assert outputs["loss"] > 0.0
assert torch.allclose(outputs["ce_loss"].cpu().view(-1), ref_ce_loss)
assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
assert (outputs["domain_losses"] > 0).all()
assert outputs["domain_weights"].shape == (doremi_context.num_domains,)
assert torch.allclose(sum(outputs["domain_weights"].cpu()), torch.tensor(1.0))
samples_per_domain = outputs["samples_per_domain"]
assert samples_per_domain.shape == (n_domains,)
assert sum(samples_per_domain) == batch_size
import pytest
import torch
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch.utils.data import DataLoader
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.dataloader import (
CombinedDataset,
DistributedSamplerForDoReMi,
)
from examples.doremi.doremi.doremi_context import DoReMiContext
from tests.helpers.utils import init_distributed
@pytest.fixture
def dataset1():
return create_dummy_dataset(7000)
@pytest.fixture
def dataset2():
return create_dummy_dataset(3000)
@pytest.fixture
def datasets(dataset1, dataset2):
return [dataset1, dataset2]
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_sync_across_tp(num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 16
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=2, dp=1, pp=1)(_test_dist_doremi_sampler_sync_across_tp)(
batch_size=BATCH_SIZE,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_sync_across_tp(
parallel_context: ParallelContext, batch_size: int, num_microbatches: int, datasets, doremi_context: DoReMiContext
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
for idxs in sampler:
idxs = torch.tensor(idxs, device="cuda")
assert_tensor_synced_across_pg(idxs, parallel_context.tp_pg)
@pytest.mark.parametrize("dp_size", [2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=2, pp=1)(_test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training)(
batch_size=batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_not_overlap_across_dp_for_proxy_training(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
for idxs in sampler:
idxs = torch.tensor(idxs, device="cuda").view(-1)
# NOTE: I tried to use assert_fail_except_rank_with, but it marks the test as failed
# even when the test raises an exception as expected
gathered_idxs = [torch.empty_like(idxs, device="cuda") for _ in range(dp_size)]
dist.all_gather(gathered_idxs, idxs)
# NOTE: whether it is proxy or reference training,
# the idxs should not overlap across dp ranks
assert not torch.any(torch.isin(*gathered_idxs))
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_deterministic_doremi_sampler(num_microbatches, dataset1, is_proxy):
BATCH_SIZE = 100
NUM_DOMAINS = 2
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
n_epochs = 3
init_distributed(tp=1, dp=1, pp=1)(_test_deterministic_doremi_sampler)(
batch_size=BATCH_SIZE,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
n_epochs=n_epochs,
)
def _test_deterministic_doremi_sampler(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
n_epochs: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
idxs_per_epoch = []
for _ in range(n_epochs):
all_idxs = []
for idxs in sampler:
all_idxs.append(idxs)
idxs_per_epoch.append(all_idxs)
sampler.reset()
# NOTE: check that the sequences of idxs are identical across epochs
assert all(
all(arr1[i] == arr2[i] for i in range(len(arr1))) for arr1, arr2 in zip(idxs_per_epoch, idxs_per_epoch[1:])
)
@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_sampling_from_dist_doremi_sampler_with_global_batch_size(
dp_size,
num_microbatches,
# domain_weights: torch.Tensor,
dataset1,
is_proxy,
):
NUM_DOMAINS = 8
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_sampling_from_dist_doremi_sampler_with_global_batch_size)(
batch_size=batch_size,
num_microbatches=num_microbatches,
global_batch_size=GLOBAL_BATCH_SIZE,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_sampling_from_dist_doremi_sampler_with_global_batch_size(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
global_batch_size: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
domain_weights = doremi_context.domain_weights
global_batch_size_per_domain = [round(global_batch_size * weight.item()) for weight in domain_weights]
microbatch_idx = 0
num_samples_per_domain = [0 for _ in range(len(domain_weights))]
for idxs in sampler:
assert batch_size == len(idxs)
# NOTE: make sure the indices in a batch
# are proportional to the domain weights
start_indices = [sum([len(ds) for ds in datasets[:i]]) for i in range(len(datasets))]
end_indices = [sum([len(ds) for ds in datasets[: i + 1]]) for i in range(len(datasets))]
for domain_idx in range(len(domain_weights)):
num_samples = sum(1 for idx in idxs if idx >= start_indices[domain_idx] and idx < end_indices[domain_idx])
num_samples_per_domain[domain_idx] += num_samples
if microbatch_idx == num_microbatches - 1:
# NOTE: if this is the last microbatch, we have iterated through all the microbatches,
# so we check that the overall number of samples in each domain is correct
# across all the microbatches
num_samples_per_domain = torch.tensor(num_samples_per_domain, dtype=torch.int, device="cuda")
# NOTE: the domain weights are chosen so that we expect
# no domain to have zero samples in the global batch
dist.all_reduce(num_samples_per_domain, op=dist.ReduceOp.SUM)
assert (num_samples_per_domain == 0).sum().item() == 0
for expected_bs, bs in zip(global_batch_size_per_domain, num_samples_per_domain):
assert bs > 0
# NOTE: take into account rounding errors
# across all the dp ranks
assert abs(expected_bs - bs) <= dp_size, f"abs(expected_bs - bs): {abs(expected_bs - bs)}"
microbatch_idx = 0
num_samples_per_domain = [0 for _ in range(len(domain_weights))]
else:
microbatch_idx += 1
@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_repeating_samples(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
GLOBAL_BATCH_SIZE = 512
batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_dist_doremi_sampler_not_repeating_samples)(
batch_size=batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_dist_doremi_sampler_not_repeating_samples(
parallel_context: ParallelContext,
batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
local_yielded_idxs = []
yielded_idxs = []
epoch = 0
for idxs in sampler:
# NOTE: check that the indices are not repeated
assert not set(idxs).intersection(
local_yielded_idxs
), f"set(idxs): {set(idxs)}, local_yielded_idxs: {local_yielded_idxs}"
assert not set(idxs).intersection(
yielded_idxs
), f"set(idxs): {set(idxs)}, yielded_idxs: {yielded_idxs} \
epoch: {epoch}"
local_yielded_idxs.extend(idxs)
# NOTE: gather all the indices from all the dp ranks
idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
all_idxs = [torch.zeros_like(idxs) for _ in range(dp_size)]
dist.all_gather(all_idxs, idxs)
all_idxs = torch.cat(all_idxs, dim=0).view(-1).cpu().tolist()
yielded_idxs.extend(all_idxs)
epoch += 1
assert len(set(yielded_idxs)) == len(yielded_idxs)
@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 100
global_batch_size = BATCH_SIZE * num_microbatches * dp_size
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding)(
batch_size=BATCH_SIZE,
global_batch_size=global_batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_yielding(
parallel_context: ParallelContext,
batch_size: int,
global_batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
step = 0
num_yielded_microbatches = 0
expected_domain_weights = torch.tensor([0.5, 0.5])
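# NOTE: both domains reuse the same dataset and, assuming the default uniform
# domain weights, each domain should receive half of the global batch; the
# domain counters are checked against that expectation below.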
for idxs in sampler:
idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
idxs_dp = [torch.empty_like(idxs) for _ in range(dp_size)]
dist.all_gather(idxs_dp, idxs)
idxs_dp = torch.cat(idxs_dp, dim=0)
assert idxs_dp.numel() == batch_size * dp_size
# NOTE: once it has looped through all the microbatches,
# check that the number of samples in each domain matches the expected domain weights
if (step + 1) % num_microbatches == 0:
num_yielded_microbatches += 1
for i, weight in enumerate(expected_domain_weights):
assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)
step += 1
@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding_with_dataloader(dp_size, num_microbatches, dataset1, is_proxy):
NUM_DOMAINS = 2
BATCH_SIZE = 100
global_batch_size = BATCH_SIZE * num_microbatches * dp_size
datasets = [dataset1 for _ in range(NUM_DOMAINS)]
domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding_with_dataloader)(
batch_size=BATCH_SIZE,
global_batch_size=global_batch_size,
num_microbatches=num_microbatches,
datasets=datasets,
doremi_context=doremi_context,
)
def _test_yielding_with_dataloader(
parallel_context: ParallelContext,
batch_size: int,
global_batch_size: int,
num_microbatches: int,
datasets,
doremi_context: DoReMiContext,
):
dp_size = dist.get_world_size(parallel_context.dp_pg)
dp_rank = dist.get_rank(parallel_context.dp_pg)
sampler = DistributedSamplerForDoReMi(
datasets,
batch_size=batch_size,
num_microbatches=num_microbatches,
num_replicas=dp_size,
rank=dp_rank,
doremi_context=doremi_context,
parallel_context=parallel_context,
)
combined_dataset = CombinedDataset(datasets)
dataloader = DataLoader(combined_dataset, batch_sampler=sampler)
step = 1
num_yielded_microbatches = 0
expected_domain_weights = torch.tensor([0.5, 0.5])
for idxs in dataloader:
num_idxs = torch.tensor(len(idxs["text"]), dtype=torch.int, device="cuda")
assert num_idxs.item() == batch_size
dist.all_reduce(num_idxs, op=dist.ReduceOp.SUM, group=parallel_context.dp_pg)
assert num_idxs == batch_size * dp_size
if step % num_microbatches == 0:
num_yielded_microbatches += 1
for i, weight in enumerate(expected_domain_weights):
assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)
step += 1
assert step > 1
import torch
from utils import create_dummy_dataset, set_system_path
set_system_path()
from examples.doremi.doremi.utils import compute_domain_weights_based_on_token_count
def test_compute_domain_weights_based_on_token_count():
datasets = [
create_dummy_dataset(10),
create_dummy_dataset(20),
create_dummy_dataset(70),
]
domain_weights = compute_domain_weights_based_on_token_count(datasets)
assert torch.equal(domain_weights, torch.tensor([0.1, 0.2, 0.7]))
assert torch.allclose(domain_weights.sum(), torch.tensor(1.0))
import importlib
import sys
from pathlib import Path
from datasets import Dataset
def set_system_path():
package = importlib.import_module("nanotron")
# NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
# we want .../nanotron
package_path = Path(package.__file__).parent.parent.parent
sys.path.append(str(package_path))
def create_dummy_dataset(num_items: int):
data = {"text": list(range(num_items))}
return Dataset.from_dict(data)
"""
DoReMi training script.
Usage:
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama_proxy.yaml
"""
import argparse
from nanotron.config import get_config_from_file
from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import DoReMiTrainer
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
config: DoReMiConfig = get_config_from_file(config_file, config_class=DoReMiConfig)
dataset_paths = [
f"{config.data_stages[0].data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names
]
datasets = get_datasets(dataset_paths)
trainer = DoReMiTrainer(config_file, config_class=DoReMiConfig)
dataloader = get_dataloader(trainer, datasets)
trainer.train(dataloader)
"""
DoReMi training script.
Usage:
export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama.yaml
"""
import argparse
import torch
from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import ReferenceTrainer
from doremi.utils import compute_domain_weights_based_on_token_count
from nanotron.config import get_config_from_file
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
config_file = args.config_file
config = get_config_from_file(config_file, config_class=DoReMiConfig)
dataset_paths = [f"{config.data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names]
datasets = get_datasets(dataset_paths)
# TODO(xrsrke): support retrieving domain weights from the config
# or calculating them in the trainer
if config.doremi.domain_weights is None:
initial_domain_weights = compute_domain_weights_based_on_token_count(datasets)
else:
initial_domain_weights = torch.tensor(config.doremi.domain_weights)
assert torch.allclose(initial_domain_weights.sum(), torch.tensor(1.0), rtol=1e-3)
domain_names = config.doremi.domain_names
trainer = ReferenceTrainer(initial_domain_weights, domain_names, config_file, config_class=DoReMiConfig)
dataloader = get_dataloader(trainer, datasets)
trainer.train(dataloader)
from typing import List
def print_array_for_human(arr: List[float], precision: int = 5) -> str:
formatted_elements = [f"{x:.{precision}f}" for x in arr]
return "[" + ", ".join(formatted_elements) + "]"
## Debugging the tests with VS Code
To debug the tests with VS Code, add the following JSON to your `launch.json` file.
```
{
"name": "Test conversion",
"type": "python",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal",
"args": [
"examples/llama/tests"
],
"justMyCode": false
}
```
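Equivalently, the tests can be run from a terminal with `pytest examples/llama/tests` (a CUDA device is assumed, since the conversion tests build models on GPU).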
"""
Converts a HF model to nanotron format
Command:
torchrun --nproc_per_node=1 convert_hf_to_nanotron.py --checkpoint_path=hf_weights --save_path=nanotron_weights
"""
import dataclasses
import json
from argparse import ArgumentParser
from pathlib import Path
import nanotron
import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from transformers import LlamaConfig as HFLlamaConfig
from transformers import LlamaForCausalLM
def _handle_attention_block(
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
# Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
# Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
# and odd dimensions, GPT-J style, while the huggingface implementation expects
# the whole 1st half and then the whole 2nd half, GPT-NeoX style (for more information
# see flash_attn.layers.rotary.RotaryEmbedding).
# This function handles the concatenation of the q, k, v weights and the proper
# permutation to ensure a correct transformation.
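# A minimal illustration (hypothetical d_qk=6): a single head stored NeoX-style
# as rows [r0, r1, r2, r3, r4, r5] (first half, then second half) is reordered
# by `interleave` below to [r0, r3, r1, r4, r2, r5], i.e. GPT-J-style
# (even, odd) pairs, before q, k and v are concatenated.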
def interleave(w: torch.Tensor):
w_new = []
for head_w in w.split(d_qk):
head_w = head_w.view(2, d_qk // 2, -1).transpose(0, 1).reshape(d_qk, -1)
w_new.append(head_w)
return torch.cat(w_new)
q = interleave(q)
k = interleave(k)
return torch.cat([q, k, v])
def convert_hf_to_nt(model_hf: LlamaForCausalLM, model_nt: LlamaForTraining, config: NanotronLlamaConfig):
"""Converts the weights from the model_hf to model_nt, making modifications
in-place."""
hf_sd = model_hf.state_dict()
nt_to_hf = get_weight_mapping(config, nt_to_hf=True)
for module_name_nt, module_nt in model_nt.named_modules():
for param_name_nt, param_nt in module_nt.named_parameters(recurse=False):
# In the case of qkv_proj, nt_to_hf has exactly three keys, corresponding
# to q, k, v.
if "qkv_proj" in module_name_nt:
key_k, key_q, key_v = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
q = hf_sd[key_q]
k = hf_sd[key_k]
v = hf_sd[key_v]
param = _handle_attention_block(
q,
k,
v,
config.num_attention_heads,
config.num_key_value_heads,
config.hidden_size // config.num_attention_heads,
)
# In the case of gate_up_proj, nt_to_hf has exactly two keys, corresponding to gate and up.
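# For example, "model.decoder.0.pp_block.mlp.gate_up_proj.weight" maps to
# ["model.layers.0.mlp.gate_proj.weight", "model.layers.0.mlp.up_proj.weight"].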
elif "gate_up_proj" in module_name_nt:
key_gate, key_up = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
gate = hf_sd[key_gate]
up = hf_sd[key_up]
param = torch.cat([gate, up])
# All other cases are simple 1-to-1 correspondence.
else:
hf_key = nt_to_hf[f"{module_name_nt}.{param_name_nt}"]
param = hf_sd[hf_key]
with torch.no_grad():
param_nt.copy_(param)
def get_nanotron_config(config: HFLlamaConfig) -> NanotronLlamaConfig:
"""Converts a huggingface configuration to nanotron configuration."""
attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=True).items()}
return NanotronLlamaConfig(**attrs)
def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path):
"""Loads the huggingface checkpoint in `checkpoint_path`, creates
a new nanotron instance, copies the weights from the huggingface checkpoint
and saves the transformed nanotron to `save_path`."""
# Load huggingface.
hf_model = LlamaForCausalLM.from_pretrained(checkpoint_path)
# Init nanotron model.
model_config = get_nanotron_config(hf_model.config)
nanotron_model = load_nanotron_model(model_config=model_config)
# Copy weights and save model.
parallel_context = nanotron.parallel.ParallelContext(
data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=1
)
convert_hf_to_nt(hf_model, nanotron_model, model_config)
nanotron.serialize.save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=save_path)
with open(save_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(model_config), f)
print(f"Model saved to {save_path}")
if __name__ == "__main__":
parser = ArgumentParser(description="Convert HF weights to nanotron format")
parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the nanotron model")
args = parser.parse_args()
# Convert HF model to nanotron format.
convert_checkpoint_and_save(checkpoint_path=args.checkpoint_path, save_path=args.save_path)
"""
Converts a nanotron model to HF format
Command:
torchrun --nproc_per_node=1 convert_nanotron_to_hf.py --checkpoint_path=nanotron-path --save_path=hf-path
"""
import json
from argparse import ArgumentParser
from pathlib import Path
from typing import Literal, Optional
import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import LlamaConfig as HFLlamaConfig
TEST_PROMPT = "What is the meaning of the word chutzpah?\nThe word chutzpah means"
def _handle_attention_block(
qkv: torch.Tensor, part: Literal["q", "k", "v"], n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
# Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
# Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
# and odd dimensions, GPT-J style, while the huggingface implementation expects
# the whole 1st half and then the whole 2nd half, GPT-NeoX style (for more information
# see flash_attn.layers.rotary.RotaryEmbedding).
# This function selects the proper chunk of the bundled qkv tensor and applies the
# permutation needed to ensure a correct transformation to huggingface.
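# A minimal illustration (hypothetical d_qk=6): a single head stored GPT-J-style
# as interleaved rows [r0, r1, r2, r3, r4, r5] (pairs (r0, r1), (r2, r3), (r4, r5))
# is reordered by `interleave` below to [r0, r2, r4, r1, r3, r5], i.e. NeoX-style
# first half then second half.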
def interleave(w: torch.Tensor):
w_new = []
for head_w in w.split(d_qk):
head_w = head_w.view(d_qk // 2, 2, -1).transpose(0, 1).reshape(d_qk, -1)
w_new.append(head_w)
return torch.cat(w_new)
assert part in ["q", "k", "v"], "part must be one of [q, k, v]"
index_end_q = n_q_heads * d_qk
index_end_k = index_end_q + n_kv_heads * d_qk
if part == "q":
return interleave(qkv[:index_end_q])
if part == "k":
return interleave(qkv[index_end_q:index_end_k])
return qkv[index_end_k:]
def _handle_gate_up_proj(gate_up_proj: torch.Tensor, gate: bool) -> torch.Tensor:
# The gate and up projections are bundled in nanotron.
# This function selects the proper chunk in the bundled weights to return
# either the gate or the up projection only.
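# For example, with a bundled weight of shape (2 * intermediate_size, hidden_size),
# rows [:intermediate_size] hold the gate projection and rows [intermediate_size:]
# hold the up projection, matching the torch.cat([gate, up]) layout used when
# converting from huggingface to nanotron.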
weight_size = gate_up_proj.shape[0] // 2
if gate:
return gate_up_proj[:weight_size]
else:
return gate_up_proj[weight_size:]
def convert_nt_to_hf(nanotron_model: LlamaForTraining, hf_model: LlamaForCausalLM, model_config: NanotronLlamaConfig):
"""Converts the weights from the nanotron_model to hf_model, making modifications
in-place."""
nanotron_model_state_dict = nanotron_model.state_dict()
hf_to_nt = get_weight_mapping(model_config, nt_to_hf=False)
for module_name_hf, module_hf in hf_model.named_modules():
for param_name_hf, param_hf in module_hf.named_parameters(recurse=False):
# Get the Nanotron parameter
nanotron_key = hf_to_nt[f"{module_name_hf}.{param_name_hf}"]
param = nanotron_model_state_dict[nanotron_key]
if "qkv_proj" in nanotron_key:
proj_name = module_name_hf.split(".")[4][0]
param = _handle_attention_block(
param,
proj_name,
model_config.num_attention_heads,
model_config.num_key_value_heads,
model_config.hidden_size // model_config.num_attention_heads,
)
elif "gate_up_proj" in nanotron_key:
gate = "gate" in module_name_hf
param = _handle_gate_up_proj(param, gate)
with torch.no_grad():
param_hf.copy_(param)
def get_hf_config(config: NanotronLlamaConfig) -> HFLlamaConfig:
"""Converts a nanotron configuration to huggingface configuration."""
attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=False).items()}
return HFLlamaConfig(**attrs)
def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str] = None):
"""Loads the nanotron checkpoint in `checkpoint_path`, creates
a new huggingface instance, copies the weights from the nanotron checkpoint
and saves the transformed huggingface to `save_path`."""
# Init nanotron model.
with open(checkpoint_path / "model_config.json", "r") as f:
attrs = json.load(f)
model_config = NanotronLlamaConfig(**attrs)
nanotron_model = load_nanotron_model(
model_config=model_config,
checkpoint_path=checkpoint_path,
)
# Init huggingface model.
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_config_hf = get_hf_config(model_config)
hf_model = LlamaForCausalLM._from_config(model_config_hf)
# Copy weights, initialize tokenizer and save model.
if tokenizer_name is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.save_pretrained(save_path)
convert_nt_to_hf(nanotron_model, hf_model, model_config)
hf_model.save_pretrained(save_path)
print(f"Model saved to {save_path}")
def check_converted_model_generation(save_path: Path):
"""Loads a huggingface model and tokenizer from `save_path` and
performs a dummy text generation."""
tokenizer = AutoTokenizer.from_pretrained(save_path)
input_ids = tokenizer(TEST_PROMPT, return_tensors="pt")["input_ids"].cuda()
print("Inputs:", tokenizer.batch_decode(input_ids))
model = LlamaForCausalLM.from_pretrained(save_path).cuda().bfloat16()
out = model.generate(input_ids, max_new_tokens=100)
print("Generation (converted): ", tokenizer.batch_decode(out))
if __name__ == "__main__":
parser = ArgumentParser(description="Convert Nanotron weights to HF format")
parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the HF model")
parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Llama-2-7b-chat-hf")
args = parser.parse_args()
# Convert Nanotron model to HF format.
convert_checkpoint_and_save(
checkpoint_path=args.checkpoint_path, save_path=args.save_path, tokenizer_name=args.tokenizer_name
)
# Check if the conversion was successful by generating some text.
if args.tokenizer_name is not None:
check_converted_model_generation(save_path=args.save_path)
import json
from pathlib import Path
from typing import Optional
import nanotron
import torch
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from nanotron.trainer import mark_tied_parameters
def get_weight_mapping(config: NanotronLlamaConfig, nt_to_hf: bool = True) -> dict[str, str]:
"""Returns the nanotron to huggingface parameter mapping if `nt_to_hf`, otherwise the
huggingface to nanotron mapping."""
hf_to_nt_map = {}
hf_to_nt_map["lm_head.weight"] = "model.lm_head.pp_block.weight"
hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
hf_to_nt_map["model.norm.weight"] = "model.final_layer_norm.pp_block.weight"
hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
for i in range(config.num_hidden_layers):
hf_prefix = f"model.layers.{i}"
nt_prefix = f"model.decoder.{i}.pp_block"
hf_to_nt_map[f"{hf_prefix}.self_attn.q_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.k_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.v_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
hf_to_nt_map[f"{hf_prefix}.self_attn.o_proj.weight"] = f"{nt_prefix}.attn.o_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.weight"] = f"{nt_prefix}.mlp.down_proj.weight"
hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.bias"] = f"{nt_prefix}.mlp.down_proj.bias"
hf_to_nt_map[f"{hf_prefix}.input_layernorm.weight"] = f"{nt_prefix}.input_layernorm.weight"
hf_to_nt_map[f"{hf_prefix}.post_attention_layernorm.weight"] = f"{nt_prefix}.post_attention_layernorm.weight"
if nt_to_hf:
nt_to_hf_map = {}
for hf, nt in hf_to_nt_map.items():
# Because the qkv and gate_up projections are separated in the
# huggingface format, when we return nanotron to huggingface
# we will need to return a list of parameters instead (e.g.
# the `qkv_proj` will point to a list `[q_proj, k_proj, v_proj]`).
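# For example, "model.decoder.0.pp_block.attn.qkv_proj.weight" maps to
# ["model.layers.0.self_attn.q_proj.weight",
# "model.layers.0.self_attn.k_proj.weight",
# "model.layers.0.self_attn.v_proj.weight"].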
if nt in nt_to_hf_map and isinstance(nt_to_hf_map[nt], list):
nt_to_hf_map[nt].append(hf)
elif nt in nt_to_hf_map:
nt_to_hf_map[nt] = [nt_to_hf_map[nt], hf]
else:
nt_to_hf_map[nt] = hf
return nt_to_hf_map
return hf_to_nt_map
def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
"""Returns either the nanotron to huggingface (if `nt_to_hf`)
configuration mapping, or the huggingface to nanotron."""
hf_to_nt_map = {
"bos_token_id": "bos_token_id",
"eos_token_id": "eos_token_id",
"hidden_act": "hidden_act",
"hidden_size": "hidden_size",
"initializer_range": "initializer_range",
"intermediate_size": "intermediate_size",
"max_position_embeddings": "max_position_embeddings",
"num_attention_heads": "num_attention_heads",
"num_hidden_layers": "num_hidden_layers",
"num_key_value_heads": "num_key_value_heads",
"pad_token_id": "pad_token_id",
"pretraining_tp": "pretraining_tp",
"rms_norm_eps": "rms_norm_eps",
"rope_scaling": "rope_scaling",
"rope_theta": "rope_theta",
"tie_word_embeddings": "tie_word_embeddings",
"use_cache": "use_cache",
"vocab_size": "vocab_size",
}
if nt_to_hf:
return {nt: hf for hf, nt in hf_to_nt_map.items()}
return hf_to_nt_map
def make_parallel_config(
dp: int = 1,
pp: int = 1,
tp: int = 1,
):
parallel_config = nanotron.config.ParallelismArgs(
dp=dp,
pp=pp,
tp=tp,
pp_engine=nanotron.config.AllForwardAllBackwardPipelineEngine(),
tp_mode=nanotron.config.TensorParallelLinearMode.ALL_REDUCE,
tp_linear_async_communication=False,
)
return parallel_config
def load_nanotron_model(
model_config: Optional[NanotronLlamaConfig] = None,
device: torch.device = torch.device("cuda"),
dtype: torch.dtype = torch.bfloat16,
checkpoint_path: Optional[Path] = None,
) -> LlamaForTraining:
"""
Creates and returns a nanotron model.
If `model_config` is None, then `checkpoint_path` must be set, in which case
the configuration will be loaded from such path.
If `checkpoint_path` is None, then `model_config` must be set, in which case
the model created will have random weights.
"""
if model_config is None:
assert checkpoint_path is not None
with open(checkpoint_path / "model_config.json") as f:
model_config = NanotronLlamaConfig(**json.load(f))
parallel_config = make_parallel_config()
parallel_context = nanotron.parallel.ParallelContext(
data_parallel_size=parallel_config.dp,
pipeline_parallel_size=parallel_config.pp,
tensor_parallel_size=parallel_config.tp,
)
nanotron_model = nanotron.models.build_model(
model_builder=lambda: LlamaForTraining(
config=model_config,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=None,
),
parallel_context=parallel_context,
dtype=dtype,
device=device,
)
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
# Load checkpoint directly in memory and then only keep the state dictionary
if checkpoint_path is not None:
nanotron.serialize.load_weights(
model=nanotron_model, parallel_context=parallel_context, root_folder=checkpoint_path
)
return nanotron_model
# ruff: noqa: E402
import dataclasses
import json
from pathlib import Path
import pytest
import torch
from transformers import LlamaForCausalLM
from utils import set_system_path
set_system_path()
import nanotron
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.base import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel import ParallelContext
from nanotron.trainer import mark_tied_parameters
from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model, make_parallel_config
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed
CONFIG = NanotronLlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
BATCH_SIZE = 3
SEQUENCE_LENGTH = 5
ATOL = 0.03
def create_nanotron_model(parallel_context: ParallelContext) -> LlamaForTraining:
parallel_config = make_parallel_config(
tp=parallel_context.tensor_parallel_size,
dp=parallel_context.data_parallel_size,
pp=parallel_context.pipeline_parallel_size,
)
nanotron_model = nanotron.models.build_model(
model_builder=lambda: LlamaForTraining(
config=CONFIG,
parallel_context=parallel_context,
parallel_config=parallel_config,
random_states=None,
),
parallel_context=parallel_context,
dtype=torch.bfloat16,
device=torch.device("cuda"),
)
mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
return nanotron_model
def create_huggingface_model() -> LlamaForCausalLM:
config_hf = get_hf_config(CONFIG)
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_hf = LlamaForCausalLM._from_config(config_hf)
return model_hf
@pytest.fixture(autouse=True, scope="module")
def fix_seed():
torch.manual_seed(0)
yield
@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model(parallel_context)
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids)
def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save nanotron model.
model_nt = create_nanotron_model(parallel_context)
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(CONFIG), f)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
del model_nt
# Perform conversion.
convert_nt_to_hf_and_save(nt_path, hf_path)
# Load huggingface and get logits.
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
torch.testing.assert_allclose(logits_nt, logits_hf, atol=ATOL)
def test_nt_to_hf_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model(parallel_context)
model_hf = create_huggingface_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
torch.testing.assert_allclose(logits_hf, logits_nt, atol=ATOL)
def test_hf_to_nt(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)
def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save hf model.
model_hf = create_huggingface_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
model_hf.save_pretrained(hf_path)
logits_hf = model_hf(input_ids).logits
del model_hf
# Perform conversion.
convert_hf_to_nt_and_save(hf_path, nt_path)
# Load nanotron and get logits.
input_mask = torch.ones_like(input_ids)
model_nt = load_nanotron_model(checkpoint_path=nt_path)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL)
def test_hf_to_nt_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_composed_conversion(parallel_context: ParallelContext):
# Get HF statedict.
model_hf = create_huggingface_model()
hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}
# Convert once to nanotron, save its statedict.
model_nt = create_nanotron_model(parallel_context)
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}
# Convert back to HF, compare statedicts.
del model_hf
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
hf_sd_new = model_hf.state_dict()
assert set(hf_sd_new) == set(hf_sd)
assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)
# Convert to nanotron one more time, compare statedicts.
del model_nt
model_nt = create_nanotron_model(parallel_context)
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd_new = model_nt.state_dict()
assert set(nt_sd_new) == set(nt_sd)
assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)
def test_composed_conversion():
init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()
def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
# Create and save a parallel model.
model_nt = create_nanotron_model(parallel_context)
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(dataclasses.asdict(CONFIG), f)
# Get parallel predictions.
input_ids = input_ids.cuda() # Move them to the current device index.
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
if torch.distributed.get_rank() == 0:
torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt")
# Convert nanotron to hf, load it and compare logits.
# hf_path = root/"hf"
# convert_nt_to_hf_and_save(nt_path, hf_path)
# model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
# logits_hf = model_hf(input_ids).logits
# assert logits_nt.size() == logits_hf.size()
# assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
# Convert parallel nanotron to hf, get and save huggingface predictions.
convert_nt_to_hf_and_save(nt_path, hf_path)
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt")
def test_tensor_parallel_conversion(input_ids: torch.Tensor):
# Set up test.
test_context = TestContext()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
# Launch both parts.
init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
assert (nt_path / "logits.pt").exists()
init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
assert (hf_path / "logits.pt").exists()
# Load logits and verify they match.
logits_nt = torch.load(nt_path / "logits.pt")
logits_hf = torch.load(hf_path / "logits.pt")
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
# ruff: noqa: E402
import json
from pathlib import Path
import pytest
import torch
from transformers import LlamaForCausalLM
from utils import set_system_path
set_system_path()
import nanotron
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.base import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel import ParallelContext
from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model, make_parallel_config
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed, rerun_if_address_is_in_use
CONFIG = NanotronLlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
BATCH_SIZE = 3
SEQUENCE_LENGTH = 5
ATOL = 0.02
def create_nanotron_model(pp: int = 1, tp: int = 1, dp: int = 1) -> LlamaForTraining:
parallel_config = make_parallel_config(dp, pp, tp)
return load_nanotron_model(parallel_config, CONFIG, torch.device("cuda"), torch.bfloat16)
def create_huggingface_model() -> LlamaForCausalLM:
config_hf = get_hf_config(CONFIG)
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_hf = LlamaForCausalLM._from_config(config_hf)
return model_hf
@pytest.fixture(autouse=True, scope="module")
def fix_seed():
torch.manual_seed(0)
yield
@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids)
def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save nanotron model.
model_nt = create_nanotron_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(vars(CONFIG), f)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
del model_nt
# Perform conversion.
convert_nt_to_hf_and_save(nt_path, hf_path)
# Load huggingface and get logits.
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_hf_to_nt(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)
def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save hf model.
model_hf = create_huggingface_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
model_hf.save_pretrained(hf_path)
logits_hf = model_hf(input_ids).logits
del model_hf
# Perform conversion.
convert_hf_to_nt_and_save(hf_path, nt_path)
# Load nanotron and get logits.
input_mask = torch.ones_like(input_ids)
model_nt = load_nanotron_model(checkpoint_path=nt_path)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL)
def test_hf_to_nt_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_composed_conversion(parallel_context: ParallelContext):
# Get HF statedict.
model_hf = create_huggingface_model()
hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}
# Convert once to nanotron, save its statedict.
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}
# Convert back to HF, compare statedicts.
del model_hf
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
hf_sd_new = model_hf.state_dict()
assert set(hf_sd_new) == set(hf_sd)
assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)
# Convert to nanotron one more time, compare statedicts.
del model_nt
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd_new = model_nt.state_dict()
assert set(nt_sd_new) == set(nt_sd)
assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)
def test_composed_conversion():
init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()
def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
    # Create and save a tensor-parallel model.
    model_nt = create_nanotron_model(
        tp=parallel_context.tensor_parallel_size, pp=parallel_context.pipeline_parallel_size
    )
    nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
    with open(nt_path / "model_config.json", "w+") as f:
        json.dump(vars(CONFIG), f)
    # Get parallel predictions.
    input_ids = input_ids.cuda()  # Move them to the current device index.
    input_mask = torch.ones_like(input_ids)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    # Only rank 0 saves the logits; they are compared later in the main process.
    if torch.distributed.get_rank() == 0:
        torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt")
def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
    # Convert the parallel nanotron checkpoint to hf, then get and save the huggingface predictions.
    convert_nt_to_hf_and_save(nt_path, hf_path)
    model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
    logits_hf = model_hf(input_ids).logits
    torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt")
def test_tensor_parallel_conversion(input_ids: torch.Tensor):
    # Set up test.
    test_context = TestContext()
    root = test_context.get_auto_remove_tmp_dir()
    nt_path = root / "nanotron"
    hf_path = root / "hf"
    # Launch both parts: save with tp=2, then convert and reload with tp=1.
    init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
    assert (nt_path / "logits.pt").exists()
    init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
    assert (hf_path / "logits.pt").exists()
    # Load logits and verify they match.
    logits_nt = torch.load(nt_path / "logits.pt")
    logits_hf = torch.load(hf_path / "logits.pt")
    assert logits_nt.size() == logits_hf.size()
    assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
import importlib
import sys
from pathlib import Path
def set_system_path():
package = importlib.import_module("nanotron")
# NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
# we want .../nanotron
package_path = Path(package.__file__).parent.parent.parent
sys.path.insert(0, str(package_path))
# we also want ../llama
llama_path = Path(__file__).parent.parent
sys.path.insert(0, str(llama_path))
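# Usage sketch (mirrors the test modules above): call set_system_path() before
# importing anything from `examples.*`, so that both the nanotron repository root
# and this example directory are importable:
#
#   from utils import set_system_path
#   set_system_path()
#   from examples.llama.convert_weights import load_nanotron_model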
---
library_name: nanotron
---
# Mamba
Modeling code for Mamba to use with [Nanotron](https://github.com/huggingface/nanotron/)
## 🚀 Quickstart
```bash
pip install -r requirements.txt
# Run training
./examples/mamba/train_mamba.sh
```
![mamba](./assets/loss_mamba.png)
> https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
## Bug related to nanotron
Encountered the following issue when running `train_mamba.sh`:
```
causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
```
Solved it by reinstalling the extensions (an undefined-symbol error like this typically means the compiled extension was built against a different PyTorch version):
```bash
pip uninstall mamba-ssm
pip install causal_conv1d==1.1.1
pip install mamba-ssm --no-cache-dir
```
https://github.com/state-spaces/mamba/issues/169
## Credits
Credits to the following repositories from which the code was adapted:
- https://github.com/state-spaces/mamba
from dataclasses import dataclass
from typing import Optional, Union
import torch
from nanotron.config import Config, ExistingCheckpointInit, NanotronConfigs
from nanotron.config.utils_config import cast_str_to_torch_dtype
@dataclass
class MambaInit:
initializer_range: float = 0.02
rescale_prenorm_residual: bool = True
n_residuals_per_layer: int = 1 # Change to 2 if we have MLP
@dataclass
class ModelArgs:
"""Arguments related to model architecture"""
model_config: NanotronConfigs
init_method: Union[MambaInit, ExistingCheckpointInit]
dtype: Optional[torch.dtype] = None
make_vocab_size_divisible_by: int = 1
ddp_bucket_cap_mb: int = 25
def __post_init__(self):
if self.dtype is None:
self.dtype = torch.bfloat16
if isinstance(self.dtype, str):
self.dtype = cast_str_to_torch_dtype(self.dtype)
# if self.model_config.max_position_embeddings is None:
# self.model_config.max_position_embeddings = 0
@dataclass(kw_only=True) # pylint: disable=unexpected-keyword-arg
class MambaConfig(Config):
"""Main configuration class"""
model: ModelArgs
@dataclass
class MambaModelConfig:
"""Configuration for a Mamba model
    Be careful to keep the typing coherent, as we use it to reconstruct the model from YAML.
"""
    is_mamba_config: bool = True  # We use this to help differentiate models during YAML/Python conversion
d_model: int = 2560
num_hidden_layers: int = 64
vocab_size: int = 50277
ssm_cfg: Optional[dict] = None
rms_norm: bool = True
fused_add_norm: bool = True
residual_in_fp32: bool = True
pad_vocab_size_multiple: int = 8
# ==== Custom ======
dtype: str = "float32"
rms_norm_eps: float = 1e-5
pad_token_id: Optional[int] = None
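# Example sketch (illustrative values only, not tuned hyperparameters): building a
# small MambaModelConfig by hand, e.g. for a quick local experiment.
#
#   tiny_mamba = MambaModelConfig(
#       d_model=512,
#       num_hidden_layers=8,
#       vocab_size=50277,
#       dtype="bfloat16",
#   )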