ModelZoo / nanotron_pytorch · Commits

Commit d99506f3 ("v1.0.1"), authored Dec 03, 2024 by chenzk
Parent: 61e92904
Pipeline #2033 canceled with stages
Changes: 331 · Pipelines: 1

Showing 20 changed files with 2056 additions and 0 deletions (+2056, -0)
examples/doremi/tests/test_doremi_loss.py      +305  -0
examples/doremi/tests/test_doremi_sampler.py   +461  -0
examples/doremi/tests/test_doremi_utils.py     +19   -0
examples/doremi/tests/utils.py                 +18   -0
examples/doremi/train_doremi.py                +36   -0
examples/doremi/train_reference.py             +47   -0
examples/doremi/utils.py                       +6    -0
examples/llama/README.md                       +17   -0
examples/llama/__init__.py                     +0    -0
examples/llama/convert_hf_to_nanotron.py       +119  -0
examples/llama/convert_nanotron_to_hf.py       +154  -0
examples/llama/convert_weights.py              +141  -0
examples/llama/requirements.txt                +1    -0
examples/llama/tests/test_conversion.py        +251  -0
examples/llama/tests/test_conversion.py.orig   +264  -0
examples/llama/tests/utils.py                  +15   -0
examples/mamba/README.md                       +35   -0
examples/mamba/assets/loss_mamba.png           +0    -0
examples/mamba/config.py                       +62   -0
examples/mamba/config_mamba.yaml               +105  -0
examples/doremi/tests/test_doremi_loss.py (new file, mode 100644)

```python
import pytest
import torch
import torch.distributed as dist
import torch.nn.functional as F
from nanotron.parallel import ParallelContext
from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from utils import set_system_path

set_system_path()

from examples.doremi.doremi.doremi_context import DoReMiContext
from examples.doremi.doremi.loss import (
    CrossEntropyWithPerDomainLoss,
    DomainLossForProxyTraining,
    DoReMiLossForProxyTraining,
    compute_domain_loss_per_replicas,
    compute_per_domain_loss,
)
from tests.helpers.utils import init_distributed


@pytest.fixture
def doremi_context():
    N_DOMAINS = 5
    domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=False)
    return doremi_context


def get_partition_logit(logits, parallel_context):
    tp_size = dist.get_world_size(parallel_context.tp_pg)
    tp_rank = dist.get_rank(parallel_context.tp_pg)
    VOCAB_SIZE = logits.shape[-1]
    per_partition = VOCAB_SIZE // tp_size
    chunks = torch.split(logits, per_partition, dim=-1)
    return chunks[tp_rank]


@pytest.mark.parametrize("tp", [1, 2])
def test_computing_per_token_loss(tp: int):
    BATCH_SIZE = 512
    SEQ_LEN = 128
    VOCAB_SIZE = 4

    torch.manual_seed(69)

    logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
    targets = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))

    ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), targets.view(-1), reduction="none")

    init_distributed(tp=tp, dp=1, pp=1)(_test_computing_per_token_loss)(
        logits=logits, targets=targets, ref_losses=ref_losses
    )


def _test_computing_per_token_loss(parallel_context: ParallelContext, logits, targets, ref_losses):
    logits = logits.to("cuda")
    targets = targets.to("cuda")
    parallel_logits = get_partition_logit(logits, parallel_context)

    loss = sharded_cross_entropy(parallel_logits, targets, parallel_context.tp_pg)

    assert torch.allclose(loss.cpu().view(-1), ref_losses)


@pytest.mark.parametrize("dp", [1, 2])
def test_domain_loss_for_proxy_training(dp: int):
    GLOBAL_BATCH_SIZE = 512
    BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
    SEQ_LEN = 128
    N_DOMAINS = 5
    domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]

    init_distributed(tp=1, dp=dp, pp=1)(_test_domain_loss_for_proxy_training)(
        global_batch_size=GLOBAL_BATCH_SIZE,
        batch_size=BATCH_SIZE,
        seq_len=SEQ_LEN,
        domain_keys=domain_keys,
    )


def _test_domain_loss_for_proxy_training(
    parallel_context: ParallelContext, global_batch_size, batch_size, seq_len, domain_keys
):
    N_DOMAINS = len(domain_keys)
    losses = torch.randn(batch_size, seq_len, device="cuda")
    ref_losses = torch.randn(batch_size, seq_len, device="cuda")
    domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")

    doremi_context = DoReMiContext(domain_keys, is_proxy=False)
    doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")

    loss_func = DomainLossForProxyTraining(doremi_context, parallel_context)
    outputs = loss_func(losses, ref_losses, domain_idxs)

    assert outputs.keys() == {"dro_loss", "domain_losses", "domain_weights", "samples_per_domain"}

    assert (outputs["domain_losses"] > 0.0).all()
    assert outputs["domain_losses"].shape == (N_DOMAINS,)

    assert (outputs["domain_weights"] > 0.0).all()
    assert outputs["domain_weights"].shape == (N_DOMAINS,)


@pytest.mark.parametrize("dp", [1, 2])
def test_computing_per_domain_loss(dp: int):
    GLOBAL_BATCH_SIZE = 512
    BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
    SEQ_LEN = 128
    N_DOMAINS = 5
    domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]

    init_distributed(tp=1, dp=dp, pp=1)(_test_computing_per_domain_loss)(
        batch_size=BATCH_SIZE,
        global_batch_size=GLOBAL_BATCH_SIZE,
        seq_len=SEQ_LEN,
        domain_keys=domain_keys,
    )


def _test_computing_per_domain_loss(
    parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
    N_DOMAINS = len(domain_keys)
    losses = torch.randn(batch_size, seq_len, device="cuda")
    domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
    doremi_context = DoReMiContext(domain_keys, is_proxy=False)
    doremi_context.domain_weights.to("cuda")

    losses_dp, per_domain_loss, samples_per_domain = compute_per_domain_loss(
        losses, domain_idxs, doremi_context, parallel_context
    )

    assert per_domain_loss.shape == (N_DOMAINS,)
    assert_tensor_synced_across_pg(
        per_domain_loss, parallel_context.dp_pg, msg=lambda err: f"Per domain loss are not synced across ranks {err}"
    )

    assert samples_per_domain.shape == (N_DOMAINS,)
    assert sum(samples_per_domain) == global_batch_size
    assert_tensor_synced_across_pg(
        samples_per_domain,
        parallel_context.dp_pg,
        msg=lambda err: f"Samples per domain are not synced across ranks {err}",
    )


@pytest.mark.parametrize("dp", [1, 2])
def test_computing_domain_loss_per_replicas(dp: int):
    GLOBAL_BATCH_SIZE = 512
    BATCH_SIZE = GLOBAL_BATCH_SIZE // dp
    SEQ_LEN = 128
    N_DOMAINS = 5
    domain_keys = [f"domain {i}" for i in range(N_DOMAINS)]

    init_distributed(tp=1, dp=dp, pp=1)(_test_computing_domain_loss_per_replicas)(
        batch_size=BATCH_SIZE,
        global_batch_size=GLOBAL_BATCH_SIZE,
        seq_len=SEQ_LEN,
        domain_keys=domain_keys,
    )


def _test_computing_domain_loss_per_replicas(
    parallel_context: ParallelContext, batch_size, global_batch_size, seq_len, domain_keys
):
    N_DOMAINS = len(domain_keys)
    losses = torch.randn(batch_size, seq_len, device="cuda")
    domain_idxs = torch.randint(0, N_DOMAINS, (batch_size,), device="cuda")
    doremi_context = DoReMiContext(domain_keys, is_proxy=False)
    doremi_context.domain_weights.to("cuda")

    per_domain_loss, samples_per_domain = compute_domain_loss_per_replicas(losses, domain_idxs, doremi_context)

    assert per_domain_loss.shape == (N_DOMAINS,)
    assert samples_per_domain.shape == (N_DOMAINS,)


@pytest.mark.skip
@pytest.mark.parametrize("tp", [1, 2])
def test_cross_entropy_with_per_domain_loss(tp: int, doremi_context):
    BATCH_SIZE = 512
    SEQ_LEN = 128
    VOCAB_SIZE = 4
    N_DOMAINS = doremi_context.num_domains

    torch.manual_seed(69)

    logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
    label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
    label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
    domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
    ref_losses = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))

    init_distributed(tp=tp, dp=1, pp=1)(_test_cross_entropy_with_per_domain_loss)(
        logits=logits,
        label_ids=label_ids,
        label_mask=label_mask,
        domain_idxs=domain_idxs,
        ref_losses=ref_losses,
        batch_size=BATCH_SIZE,
        doremi_context=doremi_context,
    )


def _test_cross_entropy_with_per_domain_loss(
    parallel_context: ParallelContext,
    logits,
    label_ids,
    label_mask,
    domain_idxs,
    ref_losses,
    batch_size,
    doremi_context,
):
    logits = logits.to("cuda")
    label_ids = label_ids.to("cuda")
    label_mask = label_mask.to("cuda")
    domain_idxs = domain_idxs.to("cuda")

    parallel_logits = get_partition_logit(logits, parallel_context)

    loss_func = CrossEntropyWithPerDomainLoss(doremi_context, parallel_context)
    outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs)

    assert torch.allclose(outputs["loss"].cpu().view(-1), ref_losses)
    assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
    assert outputs["samples_per_domain"].shape == (doremi_context.num_domains,)
    assert sum(outputs["samples_per_domain"]) == batch_size


@pytest.mark.parametrize("tp", [1, 2])
def test_doremi_loss_for_proxy_training(tp: int, doremi_context):
    BATCH_SIZE = 512
    SEQ_LEN = 128
    VOCAB_SIZE = 4
    N_DOMAINS = doremi_context.num_domains

    torch.manual_seed(69)

    logits = torch.randn(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
    label_ids = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, SEQ_LEN))
    label_mask = torch.ones((BATCH_SIZE, SEQ_LEN), dtype=torch.bool)
    domain_idxs = torch.randint(0, N_DOMAINS, (BATCH_SIZE,))
    ref_losses = torch.randn(BATCH_SIZE, SEQ_LEN)
    ref_ce_loss = F.cross_entropy(logits.view(-1, logits.size(2)), label_ids.view(-1))

    init_distributed(tp=tp, dp=1, pp=1)(_test_doremi_loss_for_proxy_training)(
        logits=logits,
        label_ids=label_ids,
        label_mask=label_mask,
        domain_idxs=domain_idxs,
        ref_losses=ref_losses,
        ref_ce_loss=ref_ce_loss,
        batch_size=BATCH_SIZE,
        n_domains=N_DOMAINS,
        doremi_context=doremi_context,
    )


def _test_doremi_loss_for_proxy_training(
    parallel_context: ParallelContext,
    logits,
    label_ids,
    label_mask,
    domain_idxs,
    ref_losses,
    ref_ce_loss,
    batch_size,
    n_domains,
    doremi_context,
):
    logits = logits.to("cuda")
    label_ids = label_ids.to("cuda")
    label_mask = label_mask.to("cuda")
    domain_idxs = domain_idxs.to("cuda")
    ref_losses = ref_losses.to("cuda")
    doremi_context.domain_weights = doremi_context.domain_weights.to("cuda")

    parallel_logits = get_partition_logit(logits, parallel_context)

    loss_func = DoReMiLossForProxyTraining(doremi_context, parallel_context)
    outputs = loss_func(parallel_logits, label_ids, label_mask, domain_idxs, ref_losses)

    assert outputs["loss"].ndim == 0
    assert outputs["loss"] > 0.0

    assert torch.allclose(outputs["ce_loss"].cpu().view(-1), ref_ce_loss)

    assert outputs["domain_losses"].shape == (doremi_context.num_domains,)
    assert (outputs["domain_losses"] > 0).all()

    assert outputs["domain_weights"].shape == (doremi_context.num_domains,)
    assert torch.allclose(sum(outputs["domain_weights"].cpu()), torch.tensor(1.0))

    samples_per_domain = outputs["samples_per_domain"]
    assert samples_per_domain.shape == (n_domains,)
    assert sum(samples_per_domain) == batch_size
```
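For readers unfamiliar with how these tests feed `sharded_cross_entropy`, the sketch below illustrates the same vocab-dimension split that `get_partition_logit` performs, on plain tensors and without any distributed setup. The shapes (`VOCAB_SIZE = 4`, two tensor-parallel ranks) mirror the test above; the helper name is purely illustrative, not part of the library.

```python
import torch

# Minimal, single-process illustration of the vocab split used by get_partition_logit above.
# With VOCAB_SIZE = 4 and tp_size = 2, each tensor-parallel rank sees a (B, S, 2) slice of the logits.
def partition_logits_for_rank(logits: torch.Tensor, tp_rank: int, tp_size: int) -> torch.Tensor:
    vocab_size = logits.shape[-1]
    per_partition = vocab_size // tp_size  # 4 // 2 = 2 vocab columns per rank
    chunks = torch.split(logits, per_partition, dim=-1)
    return chunks[tp_rank]


logits = torch.randn(512, 128, 4)  # (BATCH_SIZE, SEQ_LEN, VOCAB_SIZE) as in the test
rank0 = partition_logits_for_rank(logits, tp_rank=0, tp_size=2)
rank1 = partition_logits_for_rank(logits, tp_rank=1, tp_size=2)
assert rank0.shape == (512, 128, 2) and rank1.shape == (512, 128, 2)

# Concatenating the per-rank slices recovers the full logits, which is why the sharded loss
# can match F.cross_entropy computed on the unsplit tensor.
assert torch.equal(torch.cat([rank0, rank1], dim=-1), logits)
```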
examples/doremi/tests/test_doremi_sampler.py (new file, mode 100644)

```python
import pytest
import torch
from nanotron import distributed as dist
from nanotron.parallel import ParallelContext
from nanotron.sanity_checks import assert_tensor_synced_across_pg
from torch.utils.data import DataLoader
from utils import create_dummy_dataset, set_system_path

set_system_path()

from examples.doremi.doremi.dataloader import (
    CombinedDataset,
    DistributedSamplerForDoReMi,
)
from examples.doremi.doremi.doremi_context import DoReMiContext
from tests.helpers.utils import init_distributed


@pytest.fixture
def dataset1():
    return create_dummy_dataset(7000)


@pytest.fixture
def dataset2():
    return create_dummy_dataset(3000)


@pytest.fixture
def datasets(dataset1, dataset2):
    return [dataset1, dataset2]


@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_sync_across_tp(num_microbatches, dataset1, is_proxy):
    NUM_DOMAINS = 2
    BATCH_SIZE = 16
    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=2, dp=1, pp=1)(_test_dist_doremi_sampler_sync_across_tp)(
        batch_size=BATCH_SIZE,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_dist_doremi_sampler_sync_across_tp(
    parallel_context: ParallelContext, batch_size: int, num_microbatches: int, datasets, doremi_context: DoReMiContext
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    for idxs in sampler:
        idxs = torch.tensor(idxs, device="cuda")
        assert_tensor_synced_across_pg(idxs, parallel_context.tp_pg)


@pytest.mark.parametrize("dp_size", [2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_overlapse_across_dp_for_proxy_training(dp_size, num_microbatches, dataset1, is_proxy):
    NUM_DOMAINS = 2
    GLOBAL_BATCH_SIZE = 512
    batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)

    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=1, dp=2, pp=1)(_test_dist_doremi_sampler_not_overlapse_across_dp_for_proxy_training)(
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_dist_doremi_sampler_not_overlapse_across_dp_for_proxy_training(
    parallel_context: ParallelContext,
    batch_size: int,
    num_microbatches: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    for idxs in sampler:
        idxs = torch.tensor(idxs, device="cuda").view(-1)

        # NOTE: I tried to use assert_fail_except_rank_with, but it marks the test as failed
        # even when the test raises an exception as expected
        gathered_idxs = [torch.empty_like(idxs, device="cuda") for _ in range(dp_size)]
        dist.all_gather(gathered_idxs, idxs)

        # NOTE: whether proxy or reference training,
        # the idxs should not overlap across dp ranks
        assert not torch.any(torch.isin(*gathered_idxs))


@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_determistic_doremi_sampler(num_microbatches, dataset1, is_proxy):
    BATCH_SIZE = 100
    NUM_DOMAINS = 2
    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)
    n_epochs = 3

    init_distributed(tp=1, dp=1, pp=1)(_test_determistic_doremi_sampler)(
        batch_size=BATCH_SIZE,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
        n_epochs=n_epochs,
    )


def _test_determistic_doremi_sampler(
    parallel_context: ParallelContext,
    batch_size: int,
    num_microbatches: int,
    n_epochs: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    idxs_per_epoch = []
    for _ in range(n_epochs):
        all_idxs = []
        for idxs in sampler:
            all_idxs.append(idxs)

        idxs_per_epoch.append(all_idxs)
        sampler.reset()

    # NOTE: check that the sequences of idxs are the same across epochs
    assert all(
        all(arr1[i] == arr2[i] for i in range(len(arr1))) for arr1, arr2 in zip(idxs_per_epoch, idxs_per_epoch[1:])
    )


@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_sampling_from_dist_doremi_sampler_with_global_batch_size(
    dp_size,
    num_microbatches,
    # domain_weights: torch.Tensor,
    dataset1,
    is_proxy,
):
    NUM_DOMAINS = 8
    GLOBAL_BATCH_SIZE = 512
    batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)

    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=1, dp=dp_size, pp=1)(_test_sampling_from_dist_doremi_sampler_with_global_batch_size)(
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        global_batch_size=GLOBAL_BATCH_SIZE,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_sampling_from_dist_doremi_sampler_with_global_batch_size(
    parallel_context: ParallelContext,
    batch_size: int,
    num_microbatches: int,
    global_batch_size: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    domain_weights = doremi_context.domain_weights
    global_batch_size_per_domain = [round(global_batch_size * weight.item()) for weight in domain_weights]

    microbatch_idx = 0
    num_samples_per_domain = [0 for _ in range(len(domain_weights))]
    for idxs in sampler:
        assert batch_size == len(idxs)

        # NOTE: make sure the indices from a batch
        # are proportional to the domain weights
        start_indices = [sum([len(ds) for ds in datasets[:i]]) for i in range(len(datasets))]
        end_indices = [sum([len(ds) for ds in datasets[: i + 1]]) for i in range(len(datasets))]
        for domain_idx in range(len(domain_weights)):
            num_samples = sum(
                1 for idx in idxs if idx >= start_indices[domain_idx] and idx < end_indices[domain_idx]
            )
            num_samples_per_domain[domain_idx] += num_samples

        if microbatch_idx == num_microbatches - 1:
            # NOTE: if this is the last microbatch => we have iterated through all the microbatches,
            # so now we check that the overall number of samples in each domain is correct across
            # all the microbatches
            num_samples_per_domain = torch.tensor(num_samples_per_domain, dtype=torch.int, device="cuda")

            # NOTE: the domain weights are chosen so that we expect
            # no domain to have zero samples in the global batch
            dist.all_reduce(num_samples_per_domain, op=dist.ReduceOp.SUM)
            assert (num_samples_per_domain == 0).sum().item() == 0

            for expected_bs, bs in zip(global_batch_size_per_domain, num_samples_per_domain):
                assert bs > 0
                # NOTE: take into account rounding errors
                # across all the dp ranks
                assert abs(expected_bs - bs) <= dp_size, f"abs(expected_bs - bs): {abs(expected_bs - bs)}"

            microbatch_idx = 0
            num_samples_per_domain = [0 for _ in range(len(domain_weights))]
        else:
            microbatch_idx += 1


@pytest.mark.parametrize("dp_size", [1, 2, 4])
@pytest.mark.parametrize("num_microbatches", [1, 32])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_dist_doremi_sampler_not_repeating_samples(dp_size, num_microbatches, dataset1, is_proxy):
    NUM_DOMAINS = 2
    GLOBAL_BATCH_SIZE = 512
    batch_size = GLOBAL_BATCH_SIZE // (num_microbatches * dp_size)

    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=1, dp=dp_size, pp=1)(_test_dist_doremi_sampler_not_repeating_samples)(
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_dist_doremi_sampler_not_repeating_samples(
    parallel_context: ParallelContext,
    batch_size: int,
    num_microbatches: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    local_yieled_idxs = []
    yielded_idxs = []
    epoch = 0
    for idxs in sampler:
        # NOTE: check that the indices are not repeated
        assert not set(idxs).intersection(
            local_yieled_idxs
        ), f"set(idxs): {set(idxs)}, local_yieled_idxs: {local_yieled_idxs}"
        assert not set(idxs).intersection(
            yielded_idxs
        ), f"set(idxs): {set(idxs)}, yielded_idxs: {yielded_idxs} \
epoch: {epoch}"

        local_yieled_idxs.extend(idxs)

        # NOTE: gather all the indices from all the dp ranks
        idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
        all_idxs = [torch.zeros_like(idxs) for _ in range(dp_size)]
        dist.all_gather(all_idxs, idxs)
        all_idxs = torch.cat(all_idxs, dim=0).view(-1).cpu().tolist()
        yielded_idxs.extend(all_idxs)
        epoch += 1

    assert len(set(yielded_idxs)) == len(yielded_idxs)


@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding(dp_size, num_microbatches, dataset1, is_proxy):
    NUM_DOMAINS = 2
    BATCH_SIZE = 100
    global_batch_size = BATCH_SIZE * num_microbatches * dp_size

    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding)(
        batch_size=BATCH_SIZE,
        global_batch_size=global_batch_size,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_yielding(
    parallel_context: ParallelContext,
    batch_size: int,
    global_batch_size: int,
    num_microbatches: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )

    step = 0
    num_yielded_microbatches = 0
    expected_domain_weights = torch.tensor([0.5, 0.5])

    for idxs in sampler:
        idxs = torch.tensor(idxs, dtype=torch.int, device="cuda")
        idxs_dp = [torch.empty_like(idxs) for _ in range(dp_size)]
        dist.all_gather(idxs_dp, idxs)
        idxs_dp = torch.cat(idxs_dp, dim=0)

        assert idxs_dp.numel() == batch_size * dp_size

        # NOTE: once it has looped through all the microbatches,
        # we check the number of samples in each domain
        if (step + 1) % num_microbatches == 0:
            num_yielded_microbatches += 1
            for i, weight in enumerate(expected_domain_weights):
                assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)

        step += 1


@pytest.mark.parametrize("dp_size", [2, 4, 8])
@pytest.mark.parametrize("num_microbatches", [1, 5])
@pytest.mark.parametrize("is_proxy", [True, False])
def test_yielding_with_dataloader(dp_size, num_microbatches, dataset1, is_proxy):
    NUM_DOMAINS = 2
    BATCH_SIZE = 100
    global_batch_size = BATCH_SIZE * num_microbatches * dp_size

    datasets = [dataset1 for _ in range(NUM_DOMAINS)]
    domain_keys = [f"domain {i}" for i in range(NUM_DOMAINS)]
    doremi_context = DoReMiContext(domain_keys, is_proxy=is_proxy)

    init_distributed(tp=1, dp=dp_size, pp=1)(_test_yielding_with_dataloader)(
        batch_size=BATCH_SIZE,
        global_batch_size=global_batch_size,
        num_microbatches=num_microbatches,
        datasets=datasets,
        doremi_context=doremi_context,
    )


def _test_yielding_with_dataloader(
    parallel_context: ParallelContext,
    batch_size: int,
    global_batch_size: int,
    num_microbatches: int,
    datasets,
    doremi_context: DoReMiContext,
):
    dp_size = dist.get_world_size(parallel_context.dp_pg)
    dp_rank = dist.get_rank(parallel_context.dp_pg)

    sampler = DistributedSamplerForDoReMi(
        datasets,
        batch_size=batch_size,
        num_microbatches=num_microbatches,
        num_replicas=dp_size,
        rank=dp_rank,
        doremi_context=doremi_context,
        parallel_context=parallel_context,
    )
    comebined_dataset = CombinedDataset(datasets)
    dataloader = DataLoader(comebined_dataset, batch_sampler=sampler)

    step = 1
    num_yielded_microbatches = 0
    expected_domain_weights = torch.tensor([0.5, 0.5])

    for idxs in dataloader:
        num_idxs = torch.tensor(len(idxs["text"]), dtype=torch.int, device="cuda")
        assert num_idxs.item() == batch_size

        dist.all_reduce(num_idxs, op=dist.ReduceOp.SUM, group=parallel_context.dp_pg)
        assert num_idxs == batch_size * dp_size

        if step % num_microbatches == 0:
            num_yielded_microbatches += 1
            for i, weight in enumerate(expected_domain_weights):
                assert sampler.domain_counters[i] == int(num_yielded_microbatches * global_batch_size * weight)

        step += 1

    assert step > 1
```
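The sampler tests repeatedly derive the per-rank micro-batch size from the global batch size and then assert per-domain counts against the domain weights. A quick, standalone sanity check of that arithmetic (values taken from the tests above; the helper below is only illustrative):

```python
# Illustrative check of the batch-size bookkeeping used in the sampler tests above.
def per_rank_batch_size(global_batch_size: int, num_microbatches: int, dp_size: int) -> int:
    # Each of the dp_size replicas consumes num_microbatches micro-batches per global step.
    return global_batch_size // (num_microbatches * dp_size)


assert per_rank_batch_size(512, num_microbatches=1, dp_size=2) == 256
assert per_rank_batch_size(512, num_microbatches=32, dp_size=4) == 4

# With two equally weighted domains (weights 0.5/0.5, as asserted via sampler.domain_counters),
# each global batch is expected to contribute global_batch_size * 0.5 samples per domain.
global_batch_size = 100 * 5 * 2  # BATCH_SIZE * num_microbatches * dp_size from test_yielding
assert int(global_batch_size * 0.5) == 500
```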
examples/doremi/tests/test_doremi_utils.py (new file, mode 100644)

```python
import torch
from utils import create_dummy_dataset, set_system_path

set_system_path()

from examples.doremi.doremi.utils import compute_domain_weights_based_on_token_count


def test_compute_domain_weights_based_on_token_count():
    datasets = [
        create_dummy_dataset(10),
        create_dummy_dataset(20),
        create_dummy_dataset(70),
    ]

    domain_weights = compute_domain_weights_based_on_token_count(datasets)

    assert torch.equal(domain_weights, torch.tensor([0.1, 0.2, 0.7]))
    assert torch.allclose(domain_weights.sum(), torch.tensor(1.0))
```
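The expected weights in this test follow directly from normalizing the dataset sizes. A standalone sketch of that normalization (not the library implementation):

```python
import torch

# Normalize dataset sizes into domain weights: 10, 20, 70 items -> 0.1, 0.2, 0.7.
sizes = torch.tensor([10.0, 20.0, 70.0])
weights = sizes / sizes.sum()
assert torch.allclose(weights, torch.tensor([0.1, 0.2, 0.7]))
assert torch.allclose(weights.sum(), torch.tensor(1.0))
```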
examples/doremi/tests/utils.py (new file, mode 100644)

```python
import importlib
import sys
from pathlib import Path

from datasets import Dataset


def set_system_path():
    package = importlib.import_module("nanotron")
    # NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
    # we want .../nanotron
    package_path = Path(package.__file__).parent.parent.parent
    sys.path.append(str(package_path))


def create_dummy_dataset(num_items: int):
    data = {"text": list(range(num_items))}
    return Dataset.from_dict(data)
```
examples/doremi/train_doremi.py (new file, mode 100644)

```python
"""
DoReMi training script.

Usage:

export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama_proxy.yaml
"""
import argparse

from nanotron.config import get_config_from_file

from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import DoReMiTrainer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    config_file = args.config_file
    config: DoReMiConfig = get_config_from_file(config_file, config_class=DoReMiConfig)

    dataset_paths = [
        f"{config.data_stages[0].data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names
    ]
    datasets = get_datasets(dataset_paths)

    trainer = DoReMiTrainer(config_file, config_class=DoReMiConfig)
    dataloader = get_dataloader(trainer, datasets)
    trainer.train(dataloader)
```
examples/doremi/train_reference.py (new file, mode 100644)

```python
"""
DoReMi training script.

Usage:

export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations
torchrun --nproc_per_node=4 examples/doremi/train_doremi.py --config-file examples/doremi/configs/config_280m_llama.yaml
"""
import argparse

import torch
from doremi.config import DoReMiConfig
from doremi.dataloader import get_dataloader, get_datasets
from doremi.trainer import ReferenceTrainer
from doremi.utils import compute_domain_weights_based_on_token_count
from nanotron.config import get_config_from_file


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file")
    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()
    config_file = args.config_file
    config = get_config_from_file(config_file, config_class=DoReMiConfig)

    dataset_paths = [f"{config.data.dataset.hf_dataset_or_datasets}/{name}" for name in config.doremi.domain_names]
    datasets = get_datasets(dataset_paths)

    # TODO(xrsrke): add retrieving domain weights from config
    # or calculate it in the trainer
    if config.doremi.domain_weights is None:
        initial_domain_weights = compute_domain_weights_based_on_token_count(datasets)
    else:
        initial_domain_weights = torch.tensor(config.doremi.domain_weights)

    assert torch.allclose(initial_domain_weights.sum(), torch.tensor(1.0), rtol=1e-3)

    domain_names = config.doremi.domain_names
    trainer = ReferenceTrainer(initial_domain_weights, domain_names, config_file, config_class=DoReMiConfig)
    dataloader = get_dataloader(trainer, datasets)
    trainer.train(dataloader)
```
examples/doremi/utils.py (new file, mode 100644)

```python
from typing import List


def print_array_for_human(arr: List[float], precision: int = 5) -> str:
    formatted_elements = [f"{x:.{precision}f}" for x in arr]
    return "[" + ", ".join(formatted_elements) + "]"
```
examples/llama/README.md (new file, mode 100644)

## Debugging the tests with vscode

To debug the tests with vscode, add the following json to your `launch.json` file.
```
{
"name": "Test conversion",
"type": "python",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal",
"args": [
"examples/llama/tests"
],
"justMyCode": false
}
```
parallel/tensor_parallel/__init__.py → examples/llama/__init__.py (file moved)
examples/llama/convert_hf_to_nanotron.py (new file, mode 100644)

```python
"""
Converts a HF model to nanotron format
Command:
    torchrun --nproc_per_node=1 convert_hf_to_nanotron.py --checkpoint_path=hf_weights --save_path=nanotron_weights
"""

import dataclasses
import json
from argparse import ArgumentParser
from pathlib import Path

import nanotron
import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from transformers import LlamaConfig as HFLlamaConfig
from transformers import LlamaForCausalLM


def _handle_attention_block(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
    # Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
    # Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
    # and odd dimensions GPT-J style, while the huggingface implementation expects
    # the whole 1st half and then the whole 2nd half GPT-NeoX style (for more information
    # see flash_attn.layers.rotary.RotaryEmbedding).
    # This function handles the concatenation of the q, k, v weights and the proper permutation
    # to ensure a correct transformation.

    def interleave(w: torch.Tensor):
        w_new = []
        for head_w in w.split(d_qk):
            head_w = head_w.view(2, d_qk // 2, -1).transpose(0, 1).reshape(d_qk, -1)
            w_new.append(head_w)
        return torch.cat(w_new)

    q = interleave(q)
    k = interleave(k)
    return torch.cat([q, k, v])


def convert_hf_to_nt(model_hf: LlamaForCausalLM, model_nt: LlamaForTraining, config: NanotronLlamaConfig):
    """Converts the weights from the model_hf to model_nt, making modifications
    in-place."""

    hf_sd = model_hf.state_dict()
    nt_to_hf = get_weight_mapping(config, nt_to_hf=True)

    for module_name_nt, module_nt in model_nt.named_modules():
        for param_name_nt, param_nt in module_nt.named_parameters(recurse=False):
            # In the case of qkv_proj, the nt_to_hf mapping has exactly three keys, corresponding
            # to q, k, v.
            if "qkv_proj" in module_name_nt:
                key_k, key_q, key_v = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
                q = hf_sd[key_q]
                k = hf_sd[key_k]
                v = hf_sd[key_v]
                param = _handle_attention_block(
                    q,
                    k,
                    v,
                    config.num_attention_heads,
                    config.num_key_value_heads,
                    config.hidden_size // config.num_attention_heads,
                )
            # In the case of gate_up_proj, nt_to_hf_map has two keys.
            elif "gate_up_proj" in module_name_nt:
                key_gate, key_up = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"])
                gate = hf_sd[key_gate]
                up = hf_sd[key_up]
                param = torch.cat([gate, up])
            # All other cases are a simple 1-to-1 correspondence.
            else:
                hf_key = nt_to_hf[f"{module_name_nt}.{param_name_nt}"]
                param = hf_sd[hf_key]

            with torch.no_grad():
                param_nt.copy_(param)


def get_nanotron_config(config: HFLlamaConfig) -> NanotronLlamaConfig:
    """Converts a huggingface configuration to nanotron configuration."""
    attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=True).items()}
    return NanotronLlamaConfig(**attrs)


def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path):
    """Loads the huggingface checkpoint in `checkpoint_path`, creates
    a new nanotron instance, copies the weights from the huggingface checkpoint
    and saves the transformed nanotron to `save_path`."""

    # Load huggingface.
    hf_model = LlamaForCausalLM.from_pretrained(checkpoint_path)

    # Init nanotron model.
    model_config = get_nanotron_config(hf_model.config)
    nanotron_model = load_nanotron_model(model_config=model_config)

    # Copy weights and save model.
    parallel_context = nanotron.parallel.ParallelContext(
        data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=1
    )
    convert_hf_to_nt(hf_model, nanotron_model, model_config)
    nanotron.serialize.save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=save_path)
    with open(save_path / "model_config.json", "w+") as f:
        json.dump(dataclasses.asdict(model_config), f)
    print(f"Model saved to {save_path}")


if __name__ == "__main__":
    parser = ArgumentParser(description="Convert HF weights to nanotron format")
    parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
    parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the nanotron model")
    args = parser.parse_args()

    # Convert HF model to nanotron format.
    convert_checkpoint_and_save(checkpoint_path=args.checkpoint_path, save_path=args.save_path)
```
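The `interleave` helper above is the crux of the HF→nanotron attention conversion: it reorders each head's rows from the GPT-NeoX layout (first half, then second half) into the GPT-J interleaved layout. A small standalone illustration with a single head of size `d_qk = 4` (toy values, not the real weights):

```python
import torch

# Toy illustration of the interleave() permutation in _handle_attention_block above.
# One attention head with d_qk = 4 rows; each row is identified by its index.
d_qk = 4
head_w = torch.arange(d_qk).unsqueeze(1).float()  # rows [0, 1, 2, 3], hidden size 1

interleaved = head_w.view(2, d_qk // 2, -1).transpose(0, 1).reshape(d_qk, -1)

# GPT-NeoX order [0, 1 | 2, 3] (first half, second half) becomes
# GPT-J order [0, 2, 1, 3] (even/odd dimension pairs).
assert interleaved.squeeze(1).tolist() == [0.0, 2.0, 1.0, 3.0]
```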
examples/llama/convert_nanotron_to_hf.py (new file, mode 100644)

```python
"""
Converts a nanotron model to HF format
Command:
    torchrun --nproc_per_node=1 convert_nanotron_to_hf.py --checkpoint_path=nanotron-path --save_path=hf-path
"""

import json
from argparse import ArgumentParser
from pathlib import Path
from typing import Literal, Optional

import torch
from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from transformers import AutoTokenizer, LlamaForCausalLM
from transformers import LlamaConfig as HFLlamaConfig

TEST_PROMPT = "What is the meaning of the word chutzpah?\nThe word chutzpah means"


def _handle_attention_block(
    qkv: torch.Tensor, part: Literal["q", "k", "v"], n_q_heads: int, n_kv_heads: int, d_qk: int
) -> torch.Tensor:
    # Huggingface Llama separates the q, k, v weights (as opposed to nanotron).
    # Furthermore, the rotary embeddings in nanotron expect interleaved pairs of even
    # and odd dimensions GPT-J style, while the huggingface implementation expects
    # the whole 1st half and then the whole 2nd half GPT-NeoX style (for more information
    # see flash_attn.layers.rotary.RotaryEmbedding).
    # This function selects the proper chunk of the bundled qkv tensor and the permutation
    # to ensure a correct transformation to huggingface.

    def interleave(w: torch.Tensor):
        w_new = []
        for head_w in w.split(d_qk):
            head_w = head_w.view(d_qk // 2, 2, -1).transpose(0, 1).reshape(d_qk, -1)
            w_new.append(head_w)
        return torch.cat(w_new)

    assert part in ["q", "k", "v"], "part must be one of [q, k, v]"

    index_end_q = n_q_heads * d_qk
    index_end_k = index_end_q + n_kv_heads * d_qk
    if part == "q":
        return interleave(qkv[:index_end_q])
    if part == "k":
        return interleave(qkv[index_end_q:index_end_k])
    return qkv[index_end_k:]


def _handle_gate_up_proj(gate_up_proj: torch.Tensor, gate: bool) -> torch.Tensor:
    # The gate and up projection are bundled in nanotron.
    # This function selects the proper chunk in the bundled weights to return
    # either the gate or the up projection only.
    weight_size = gate_up_proj.shape[0] // 2
    if gate:
        return gate_up_proj[:weight_size]
    else:
        return gate_up_proj[weight_size:]


def convert_nt_to_hf(nanotron_model: LlamaForTraining, hf_model: LlamaForCausalLM, model_config: NanotronLlamaConfig):
    """Converts the weights from the nanotron_model to hf_model, making modifications
    in-place."""

    nanotron_model_state_dict = nanotron_model.state_dict()

    hf_to_nt = get_weight_mapping(model_config, nt_to_hf=False)
    for module_name_hf, module_hf in hf_model.named_modules():
        for param_name_hf, param_hf in module_hf.named_parameters(recurse=False):
            # Get the Nanotron parameter
            nanotron_key = hf_to_nt[f"{module_name_hf}.{param_name_hf}"]
            param = nanotron_model_state_dict[nanotron_key]

            if "qkv_proj" in nanotron_key:
                proj_name = module_name_hf.split(".")[4][0]
                param = _handle_attention_block(
                    param,
                    proj_name,
                    model_config.num_attention_heads,
                    model_config.num_key_value_heads,
                    model_config.hidden_size // model_config.num_attention_heads,
                )

            elif "gate_up_proj" in nanotron_key:
                gate = "gate" in module_name_hf
                param = _handle_gate_up_proj(param, gate)

            with torch.no_grad():
                param_hf.copy_(param)


def get_hf_config(config: NanotronLlamaConfig) -> HFLlamaConfig:
    """Converts a nanotron configuration to huggingface configuration."""
    attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=False).items()}
    return HFLlamaConfig(**attrs)


def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str] = None):
    """Loads the nanotron checkpoint in `checkpoint_path`, creates
    a new huggingface instance, copies the weights from the nanotron checkpoint
    and saves the transformed huggingface to `save_path`."""

    # Init nanotron model.
    with open(checkpoint_path / "model_config.json", "r") as f:
        attrs = json.load(f)
        model_config = NanotronLlamaConfig(**attrs)
    nanotron_model = load_nanotron_model(
        model_config=model_config,
        checkpoint_path=checkpoint_path,
    )

    # Init huggingface model.
    with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
        model_config_hf = get_hf_config(model_config)
        hf_model = LlamaForCausalLM._from_config(model_config_hf)

    # Copy weights, initialize tokenizer and save model.
    if tokenizer_name is not None:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        tokenizer.save_pretrained(save_path)
    convert_nt_to_hf(nanotron_model, hf_model, model_config)
    hf_model.save_pretrained(save_path)
    print(f"Model saved to {save_path}")


def check_converted_model_generation(save_path: Path):
    """Loads a huggingface model and tokenizer from `save_path` and
    performs a dummy text generation."""

    tokenizer = AutoTokenizer.from_pretrained(save_path)
    input_ids = tokenizer(TEST_PROMPT, return_tensors="pt")["input_ids"].cuda()
    print("Inputs:", tokenizer.batch_decode(input_ids))

    model = LlamaForCausalLM.from_pretrained(save_path).cuda().bfloat16()
    out = model.generate(input_ids, max_new_tokens=100)
    print("Generation (converted): ", tokenizer.batch_decode(out))


if __name__ == "__main__":
    parser = ArgumentParser(description="Convert Nanotron weights to HF format")
    parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint")
    parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the HF model")
    parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Llama-2-7b-chat-hf")
    args = parser.parse_args()

    # Convert Nanotron model to HF format.
    convert_checkpoint_and_save(
        checkpoint_path=args.checkpoint_path, save_path=args.save_path, tokenizer_name=args.tokenizer_name
    )

    # Check if the conversion was successful by generating some text.
    if args.tokenizer_name is not None:
        check_converted_model_generation(save_path=args.save_path)
```
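Going the other way, `_handle_attention_block` and `_handle_gate_up_proj` above slice fixed-size chunks out of the fused nanotron tensors. A toy sketch of that slicing arithmetic, with shapes chosen only for illustration (the interleave step is omitted for brevity):

```python
import torch

# Fused qkv slicing as in _handle_attention_block above (interleaving omitted).
n_q_heads, n_kv_heads, d_qk, hidden = 8, 4, 64, 512
qkv = torch.randn((n_q_heads + 2 * n_kv_heads) * d_qk, hidden)

index_end_q = n_q_heads * d_qk                  # first 512 rows are the query weights
index_end_k = index_end_q + n_kv_heads * d_qk   # next 256 rows are the key weights
q, k, v = qkv[:index_end_q], qkv[index_end_q:index_end_k], qkv[index_end_k:]
assert q.shape[0] == 512 and k.shape[0] == 256 and v.shape[0] == 256

# Fused gate/up slicing as in _handle_gate_up_proj: the first half is the gate projection,
# the second half is the up projection.
gate_up = torch.randn(2 * 1024, hidden)
gate, up = gate_up[:1024], gate_up[1024:]
assert gate.shape == up.shape
```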
examples/llama/convert_weights.py (new file, mode 100644)

```python
import json
from pathlib import Path
from typing import Optional

import nanotron
import torch
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.llama import LlamaForTraining
from nanotron.trainer import mark_tied_parameters


def get_weight_mapping(config: NanotronLlamaConfig, nt_to_hf: bool = True) -> dict[str, str]:
    """Returns the nanotron to huggingface parameter mapping if `nt_to_hf`, otherwise the
    huggingface to nanotron mapping."""

    hf_to_nt_map = {}
    hf_to_nt_map["lm_head.weight"] = "model.lm_head.pp_block.weight"
    hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"
    hf_to_nt_map["model.norm.weight"] = "model.final_layer_norm.pp_block.weight"
    hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight"

    for i in range(config.num_hidden_layers):
        hf_prefix = f"model.layers.{i}"
        nt_prefix = f"model.decoder.{i}.pp_block"
        hf_to_nt_map[f"{hf_prefix}.self_attn.q_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.self_attn.k_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.self_attn.v_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.self_attn.o_proj.weight"] = f"{nt_prefix}.attn.o_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
        hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias"
        hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.weight"] = f"{nt_prefix}.mlp.down_proj.weight"
        hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.bias"] = f"{nt_prefix}.mlp.down_proj.bias"
        hf_to_nt_map[f"{hf_prefix}.input_layernorm.weight"] = f"{nt_prefix}.input_layernorm.weight"
        hf_to_nt_map[f"{hf_prefix}.post_attention_layernorm.weight"] = f"{nt_prefix}.post_attention_layernorm.weight"

    if nt_to_hf:
        nt_to_hf_map = {}
        for hf, nt in hf_to_nt_map.items():
            # Because the qkv and gate_up projections are separated in the
            # huggingface format, when we return nanotron to huggingface
            # we will need to return a list of parameters instead (e.g.
            # the `qkv_proj` will point to a list `[q_proj, k_proj, v_proj]`).
            if nt in nt_to_hf_map and isinstance(nt_to_hf_map[nt], list):
                nt_to_hf_map[nt].append(hf)
            elif nt in nt_to_hf_map:
                nt_to_hf_map[nt] = [nt_to_hf_map[nt], hf]
            else:
                nt_to_hf_map[nt] = hf
        return nt_to_hf_map
    return hf_to_nt_map


def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
    """Returns either the nanotron to huggingface (if `nt_to_hf`)
    configuration mapping, or the huggingface to nanotron."""

    hf_to_nt_map = {
        "bos_token_id": "bos_token_id",
        "eos_token_id": "eos_token_id",
        "hidden_act": "hidden_act",
        "hidden_size": "hidden_size",
        "initializer_range": "initializer_range",
        "intermediate_size": "intermediate_size",
        "max_position_embeddings": "max_position_embeddings",
        "num_attention_heads": "num_attention_heads",
        "num_hidden_layers": "num_hidden_layers",
        "num_key_value_heads": "num_key_value_heads",
        "pad_token_id": "pad_token_id",
        "pretraining_tp": "pretraining_tp",
        "rms_norm_eps": "rms_norm_eps",
        "rope_scaling": "rope_scaling",
        "rope_theta": "rope_theta",
        "tie_word_embeddings": "tie_word_embeddings",
        "use_cache": "use_cache",
        "vocab_size": "vocab_size",
    }
    if nt_to_hf:
        return {nt: hf for hf, nt in hf_to_nt_map.items()}
    return hf_to_nt_map


def make_parallel_config(
    dp: int = 1,
    pp: int = 1,
    tp: int = 1,
):
    parallel_config = nanotron.config.ParallelismArgs(
        dp=dp,
        pp=pp,
        tp=tp,
        pp_engine=nanotron.config.AllForwardAllBackwardPipelineEngine(),
        tp_mode=nanotron.config.TensorParallelLinearMode.ALL_REDUCE,
        tp_linear_async_communication=False,
    )
    return parallel_config


def load_nanotron_model(
    model_config: Optional[NanotronLlamaConfig] = None,
    device: torch.device = torch.device("cuda"),
    dtype: torch.dtype = torch.bfloat16,
    checkpoint_path: Optional[Path] = None,
) -> LlamaForTraining:
    """
    Creates and returns a nanotron model.
    If `model_config` is None, then `checkpoint_path` must be set, in which case
    the configuration will be loaded from such path.
    If `checkpoint_path` is None, then `model_config` must be set, in which case
    the model created will have random weights.
    """

    if model_config is None:
        assert checkpoint_path is not None
        with open(checkpoint_path / "model_config.json") as f:
            model_config = NanotronLlamaConfig(**json.load(f))
    parallel_config = make_parallel_config()
    parallel_context = nanotron.parallel.ParallelContext(
        data_parallel_size=parallel_config.dp,
        pipeline_parallel_size=parallel_config.pp,
        tensor_parallel_size=parallel_config.tp,
    )
    nanotron_model = nanotron.models.build_model(
        model_builder=lambda: LlamaForTraining(
            config=model_config,
            parallel_context=parallel_context,
            parallel_config=parallel_config,
            random_states=None,
        ),
        parallel_context=parallel_context,
        dtype=dtype,
        device=device,
    )
    mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)
    # Load checkpoint directly in memory and then only keep the state dictionary
    if checkpoint_path is not None:
        nanotron.serialize.load_weights(
            model=nanotron_model, parallel_context=parallel_context, root_folder=checkpoint_path
        )
    return nanotron_model
```
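The mapping inversion in `get_weight_mapping` is worth spelling out: because several HF keys map to the same nanotron key, the nanotron→HF direction ends up with list values for the fused parameters. A self-contained sketch of that inversion, using shortened, made-up key names but the same logic:

```python
# Standalone illustration of the mapping inversion in get_weight_mapping above,
# using shortened, made-up key names.
hf_to_nt = {
    "q_proj.weight": "qkv_proj.weight",
    "k_proj.weight": "qkv_proj.weight",
    "v_proj.weight": "qkv_proj.weight",
    "o_proj.weight": "o_proj.weight",
}

nt_to_hf = {}
for hf, nt in hf_to_nt.items():
    if nt in nt_to_hf and isinstance(nt_to_hf[nt], list):
        nt_to_hf[nt].append(hf)
    elif nt in nt_to_hf:
        nt_to_hf[nt] = [nt_to_hf[nt], hf]
    else:
        nt_to_hf[nt] = hf

# The fused qkv entry ends up pointing at a list of the three HF keys.
assert nt_to_hf["qkv_proj.weight"] == ["q_proj.weight", "k_proj.weight", "v_proj.weight"]
assert nt_to_hf["o_proj.weight"] == "o_proj.weight"
```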
examples/llama/requirements.txt (new file, mode 100644)

```
transformers==4.39.3
```
examples/llama/tests/test_conversion.py
0 → 100644
View file @
d99506f3
# ruff: noqa: E402
import
dataclasses
import
json
from
pathlib
import
Path
import
pytest
import
torch
from
transformers
import
LlamaForCausalLM
from
utils
import
set_system_path
set_system_path
()
import
nanotron
from
nanotron.config
import
LlamaConfig
as
NanotronLlamaConfig
from
nanotron.models.base
import
init_on_device_and_dtype
from
nanotron.models.llama
import
LlamaForTraining
from
nanotron.parallel
import
ParallelContext
from
nanotron.trainer
import
mark_tied_parameters
from
examples.llama.convert_hf_to_nanotron
import
convert_checkpoint_and_save
as
convert_hf_to_nt_and_save
from
examples.llama.convert_hf_to_nanotron
import
convert_hf_to_nt
from
examples.llama.convert_nanotron_to_hf
import
convert_checkpoint_and_save
as
convert_nt_to_hf_and_save
from
examples.llama.convert_nanotron_to_hf
import
convert_nt_to_hf
,
get_hf_config
from
examples.llama.convert_weights
import
load_nanotron_model
,
make_parallel_config
from
tests.helpers.context
import
TestContext
from
tests.helpers.utils
import
init_distributed
CONFIG
=
NanotronLlamaConfig
(
**
{
"bos_token_id"
:
1
,
"eos_token_id"
:
2
,
"hidden_act"
:
"silu"
,
"hidden_size"
:
512
,
"initializer_range"
:
0.02
,
"intermediate_size"
:
1024
,
"is_llama_config"
:
True
,
"max_position_embeddings"
:
128
,
"num_attention_heads"
:
8
,
"num_hidden_layers"
:
4
,
"num_key_value_heads"
:
4
,
"pad_token_id"
:
None
,
"pretraining_tp"
:
1
,
"rms_norm_eps"
:
1e-06
,
"rope_scaling"
:
None
,
"tie_word_embeddings"
:
False
,
"use_cache"
:
True
,
"vocab_size"
:
4096
,
}
)
BATCH_SIZE
=
3
SEQUENCE_LENGTH
=
5
ATOL
=
0.03
def
create_nanotron_model
(
parallel_context
:
ParallelContext
)
->
LlamaForTraining
:
parallel_config
=
make_parallel_config
(
tp
=
parallel_context
.
tensor_parallel_size
,
dp
=
parallel_context
.
data_parallel_size
,
pp
=
parallel_context
.
pipeline_parallel_size
,
)
nanotron_model
=
nanotron
.
models
.
build_model
(
model_builder
=
lambda
:
LlamaForTraining
(
config
=
CONFIG
,
parallel_context
=
parallel_context
,
parallel_config
=
parallel_config
,
random_states
=
None
,
),
parallel_context
=
parallel_context
,
dtype
=
torch
.
bfloat16
,
device
=
torch
.
device
(
"cuda"
),
)
mark_tied_parameters
(
model
=
nanotron_model
,
parallel_context
=
parallel_context
)
return
nanotron_model
def
create_huggingface_model
()
->
LlamaForCausalLM
:
config_hf
=
get_hf_config
(
CONFIG
)
with
init_on_device_and_dtype
(
torch
.
device
(
"cuda"
),
torch
.
bfloat16
):
model_hf
=
LlamaForCausalLM
.
_from_config
(
config_hf
)
return
model_hf
@
pytest
.
fixture
(
autouse
=
True
,
scope
=
"module"
)
def
fix_seed
():
torch
.
manual_seed
(
0
)
yield
@
pytest
.
fixture
def
input_ids
()
->
torch
.
Tensor
:
return
torch
.
randint
(
0
,
CONFIG
.
vocab_size
,
size
=
(
BATCH_SIZE
,
SEQUENCE_LENGTH
),
device
=
"cuda"
)
def
_test_nt_to_hf
(
parallel_context
:
ParallelContext
,
input_ids
:
torch
.
Tensor
):
model_nt
=
create_nanotron_model
(
parallel_context
)
model_hf
=
create_huggingface_model
()
convert_nt_to_hf
(
model_nt
,
model_hf
,
CONFIG
)
input_mask
=
torch
.
ones_like
(
input_ids
)
logits_nt
=
model_nt
.
model
(
input_ids
,
input_mask
).
permute
(
1
,
0
,
2
)
logits_hf
=
model_hf
(
input_ids
).
logits
assert
logits_nt
.
size
()
==
logits_hf
.
size
()
assert
torch
.
allclose
(
logits_nt
,
logits_hf
,
atol
=
ATOL
),
torch
.
mean
(
torch
.
abs
(
logits_nt
-
logits_hf
))
def
test_nt_to_hf
(
input_ids
:
torch
.
Tensor
):
init_distributed
(
tp
=
1
,
dp
=
1
,
pp
=
1
)(
_test_nt_to_hf
)(
input_ids
=
input_ids
)
def
_test_nt_to_hf_with_files
(
parallel_context
:
ParallelContext
,
input_ids
:
torch
.
Tensor
,
test_context
:
TestContext):
    # Create and save nanotron model.
    model_nt = create_nanotron_model(parallel_context)
    root = test_context.get_auto_remove_tmp_dir()
    nt_path = root / "nanotron"
    hf_path = root / "hf"
    nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
    with open(nt_path / "model_config.json", "w+") as f:
        json.dump(dataclasses.asdict(CONFIG), f)
    input_mask = torch.ones_like(input_ids)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    del model_nt

    # Perform conversion.
    convert_nt_to_hf_and_save(nt_path, hf_path)

    # Load huggingface and get logits.
    model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
    logits_hf = model_hf(input_ids).logits
    assert logits_nt.size() == logits_hf.size()
    torch.testing.assert_allclose(logits_nt, logits_hf, atol=ATOL)


def test_nt_to_hf_with_files(input_ids: torch.Tensor):
    init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())


def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
    model_nt = create_nanotron_model(parallel_context)
    model_hf = create_huggingface_model()
    convert_hf_to_nt(model_hf, model_nt, CONFIG)
    input_mask = torch.ones_like(input_ids)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    logits_hf = model_hf(input_ids).logits
    assert logits_nt.size() == logits_hf.size()
    torch.testing.assert_allclose(logits_hf, logits_nt, atol=ATOL)


def test_hf_to_nt(input_ids: torch.Tensor):
    init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)


def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
    # Create and save hf model.
    model_hf = create_huggingface_model()
    root = test_context.get_auto_remove_tmp_dir()
    nt_path = root / "nanotron"
    hf_path = root / "hf"
    model_hf.save_pretrained(hf_path)
    logits_hf = model_hf(input_ids).logits
    del model_hf

    # Perform conversion.
    convert_hf_to_nt_and_save(hf_path, nt_path)

    # Load nanotron and get logits.
    input_mask = torch.ones_like(input_ids)
    model_nt = load_nanotron_model(checkpoint_path=nt_path)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    assert logits_nt.size() == logits_hf.size()
    assert torch.allclose(logits_nt, logits_hf, atol=ATOL)


def test_hf_to_nt_with_files(input_ids: torch.Tensor):
    init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())


def _test_composed_conversion(parallel_context: ParallelContext):
    # Get HF statedict.
    model_hf = create_huggingface_model()
    hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}

    # Convert once to nanotron, save its statedict.
    model_nt = create_nanotron_model(parallel_context)
    convert_hf_to_nt(model_hf, model_nt, CONFIG)
    nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}

    # Convert back to HF, compare statedicts.
    del model_hf
    model_hf = create_huggingface_model()
    convert_nt_to_hf(model_nt, model_hf, CONFIG)
    hf_sd_new = model_hf.state_dict()
    assert set(hf_sd_new) == set(hf_sd)
    assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)

    # Convert to nanotron one more time, compare statedicts.
    del model_nt
    model_nt = create_nanotron_model(parallel_context)
    convert_hf_to_nt(model_hf, model_nt, CONFIG)
    nt_sd_new = model_nt.state_dict()
    assert set(nt_sd_new) == set(nt_sd)
    assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)


def test_composed_conversion():
    init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()


def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
    # Create and save a parallel model.
    model_nt = create_nanotron_model(parallel_context)
    nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
    with open(nt_path / "model_config.json", "w+") as f:
        json.dump(dataclasses.asdict(CONFIG), f)

    # Get parallel predictions.
    input_ids = input_ids.cuda()  # Move them to the current device index.
    input_mask = torch.ones_like(input_ids)
    logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
    if torch.distributed.get_rank() == 0:
        torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt")

    # Convert nanotron to hf, load it and compare logits.
    # hf_path = root / "hf"
    # convert_nt_to_hf_and_save(nt_path, hf_path)
    # model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
    # logits_hf = model_hf(input_ids).logits
    # assert logits_nt.size() == logits_hf.size()
    # assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))


def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
    # Convert parallel nanotron to hf, get and save huggingface predictions.
    convert_nt_to_hf_and_save(nt_path, hf_path)
    model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
    logits_hf = model_hf(input_ids).logits
    torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt")


def test_tensor_parallel_conversion(input_ids: torch.Tensor):
    # Set up test.
    test_context = TestContext()
    root = test_context.get_auto_remove_tmp_dir()
    nt_path = root / "nanotron"
    hf_path = root / "hf"
    # Launch both parts.
    init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
    assert (nt_path / "logits.pt").exists()
    init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
    assert (hf_path / "logits.pt").exists()

    # Load logits and verify they match.
    logits_nt = torch.load(nt_path / "logits.pt")
    logits_hf = torch.load(hf_path / "logits.pt")
    assert logits_nt.size() == logits_hf.size()
    assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
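Every test above reduces to the same check: run the same `input_ids` through both models and require the logits to agree within `ATOL`. The sketch below isolates that comparison pattern in plain PyTorch; the helper name `logits_match` and the tensor shapes are illustrative only, not part of the test suite.

```python
import torch

ATOL = 0.02  # same absolute tolerance the tests above use for bfloat16 logits


def logits_match(logits_a: torch.Tensor, logits_b: torch.Tensor, atol: float = ATOL) -> bool:
    """Shape check followed by an element-wise absolute-tolerance check,
    mirroring the assertion pattern of the conversion tests."""
    if logits_a.size() != logits_b.size():
        return False
    return torch.allclose(logits_a, logits_b, atol=atol, rtol=0.0)


# Two slightly perturbed copies of the same tensor pass; a reshaped copy does not.
reference = torch.randn(3, 5, 4096)
perturbed = reference + 1e-3 * torch.randn_like(reference)
assert logits_match(reference, perturbed)
assert not logits_match(reference, reference.view(5, 3, 4096))
```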
examples/llama/tests/test_conversion.py.orig
0 → 100644
View file @
d99506f3
# ruff: noqa: E402
import json
<<<<<<< HEAD
from pathlib import Path
=======
>>>>>>> main
import pytest
import torch
from transformers import LlamaForCausalLM
from utils import set_system_path
set_system_path()
import nanotron
from nanotron.config import LlamaConfig as NanotronLlamaConfig
from nanotron.models.base import init_on_device_and_dtype
from nanotron.models.llama import LlamaForTraining
from nanotron.parallel import ParallelContext
from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save
<<<<<<< HEAD
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed
=======
from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt
from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save
from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config
from examples.llama.convert_weights import load_nanotron_model, make_parallel_config
from tests.helpers.context import TestContext
from tests.helpers.utils import init_distributed, rerun_if_address_is_in_use
>>>>>>> main
CONFIG = NanotronLlamaConfig(
**{
"bos_token_id": 1,
"eos_token_id": 2,
"hidden_act": "silu",
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 1024,
"is_llama_config": True,
"max_position_embeddings": 128,
"num_attention_heads": 8,
"num_hidden_layers": 4,
"num_key_value_heads": 4,
"pad_token_id": None,
"pretraining_tp": 1,
"rms_norm_eps": 1e-06,
"rope_scaling": None,
"tie_word_embeddings": False,
"use_cache": True,
"vocab_size": 4096,
}
)
BATCH_SIZE = 3
SEQUENCE_LENGTH = 5
ATOL = 0.02
def create_nanotron_model(pp: int = 1, tp: int = 1, dp: int = 1) -> LlamaForTraining:
parallel_config = make_parallel_config(dp, pp, tp)
return load_nanotron_model(parallel_config, CONFIG, torch.device("cuda"), torch.bfloat16)
def create_huggingface_model() -> LlamaForCausalLM:
config_hf = get_hf_config(CONFIG)
with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
model_hf = LlamaForCausalLM._from_config(config_hf)
return model_hf
@pytest.fixture(autouse=True, scope="module")
def fix_seed():
torch.manual_seed(0)
yield
@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids)
def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save nanotron model.
model_nt = create_nanotron_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path / "model_config.json", "w+") as f:
json.dump(vars(CONFIG), f)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
del model_nt
# Perform conversion.
convert_nt_to_hf_and_save(nt_path, hf_path)
# Load huggingface and get logits.
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_nt_to_hf_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor):
model_nt = create_nanotron_model()
model_hf = create_huggingface_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def test_hf_to_nt(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids)
def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext):
# Create and save hf model.
model_hf = create_huggingface_model()
root = test_context.get_auto_remove_tmp_dir()
nt_path = root / "nanotron"
hf_path = root / "hf"
model_hf.save_pretrained(hf_path)
logits_hf = model_hf(input_ids).logits
del model_hf
# Perform conversion.
convert_hf_to_nt_and_save(hf_path, nt_path)
# Load nanotron and get logits.
input_mask = torch.ones_like(input_ids)
model_nt = load_nanotron_model(checkpoint_path=nt_path)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL)
def test_hf_to_nt_with_files(input_ids: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext())
def _test_composed_conversion(parallel_context: ParallelContext):
# Get HF statedict.
model_hf = create_huggingface_model()
hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()}
# Convert once to nanotron, save its statedict.
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()}
# Convert back to HF, compare statedicts.
del model_hf
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
hf_sd_new = model_hf.state_dict()
assert set(hf_sd_new) == set(hf_sd)
assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new)
# Convert to nanotron one more time, compare statedicts.
del model_nt
model_nt = create_nanotron_model()
convert_hf_to_nt(model_hf, model_nt, CONFIG)
nt_sd_new = model_nt.state_dict()
assert set(nt_sd_new) == set(nt_sd)
assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new)
def test_composed_conversion():
init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)()
<<<<<<< HEAD
def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path):
# Create and save a parallel model.
model_nt = create_nanotron_model(tp=parallel_context.tensor_parallel_size, pp=parallel_context.pipeline_parallel_size)
# print(torch.distributed.get_rank(), "model_nt", set(p.device for p in model_nt.parameters()))
nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path)
with open(nt_path/"model_config.json", "w+") as f:
json.dump(vars(CONFIG), f)
# Get parallel predictions.
input_ids = input_ids.cuda() # Move them to the current device index.
input_mask = torch.ones_like(input_ids)
# print(torch.distributed.get_rank(), "input_ids", input_ids.device)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
if torch.distributed.get_rank() == 0:
torch.save(logits_nt.detach().cpu(), nt_path/"logits.pt")
# print(torch.distributed.get_rank(), logits_nt.shape)
# Convert nanotron to hf, load it and compare logits.
# hf_path = root/"hf"
# convert_nt_to_hf_and_save(nt_path, hf_path)
# model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
# logits_hf = model_hf(input_ids).logits
# assert logits_nt.size() == logits_hf.size()
# assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path):
# Convert parallel nanotron to hf, get and save huggingface predictions.
convert_nt_to_hf_and_save(nt_path, hf_path)
model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda()
logits_hf = model_hf(input_ids).logits
torch.save(logits_hf.detach().cpu(), hf_path/"logits.pt")
def test_tensor_parallel_conversion(input_ids: torch.Tensor):
# Set up test.
test_context = TestContext()
root = test_context.get_auto_remove_tmp_dir()
nt_path =root/"nanotron"
hf_path =root/"nanotron"
# Launch both parts.
init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path)
assert (nt_path/"logits.pt").exists()
init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path)
assert (hf_path/"logits.pt").exists()
# Load logits and verify they match.
logits_nt = torch.load(nt_path/"logits.pt")
logits_hf = torch.load(hf_path/"logits.pt")
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
=======
def _test_tensor_parallel_conversion(parallel_context: ParallelContext):
model_nt = create_nanotron_model(tp=2)
model_hf = create_huggingface_model()
convert_nt_to_hf(model_nt, model_hf, CONFIG)
input_mask = torch.ones_like(input_ids)
logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2)
logits_hf = model_hf(input_ids).logits
assert logits_nt.size() == logits_hf.size()
assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf))
@rerun_if_address_is_in_use()
def test_tensor_parallel_conversion():
init_distributed(tp=2, dp=1, pp=1)(_test_tensor_parallel_conversion)()
>>>>>>> main
examples/llama/tests/utils.py
0 → 100644
View file @
d99506f3
import importlib
import sys
from pathlib import Path


def set_system_path():
    package = importlib.import_module("nanotron")
    # NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron
    # we want .../nanotron
    package_path = Path(package.__file__).parent.parent.parent
    sys.path.insert(0, str(package_path))
    # we also want ../llama
    llama_path = Path(__file__).parent.parent
    sys.path.insert(0, str(llama_path))
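Since `set_system_path` only prepends entries to `sys.path`, it must run before any `examples.*` import is attempted. Below is a minimal sketch of the intended call order, assuming it is used from a test module that sits next to this file; the imported converter function is the one the conversion tests above rely on.

```python
# Adjust sys.path first, then import modules that live at the repository root.
from utils import set_system_path

set_system_path()

# Only after the path fix can absolute imports rooted at the repo resolve.
from examples.llama.convert_weights import load_nanotron_model  # noqa: E402

print(load_nanotron_model.__module__)
```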
examples/mamba/README.md
0 → 100644
View file @
d99506f3
---
library_name: nanotron
---

# Mamba

Modeling code for Mamba to use with [Nanotron](https://github.com/huggingface/nanotron/).

## 🚀 Quickstart

```bash
pip install -r requirements.txt

# Run training
./examples/mamba/train_mamba.sh
```

> https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5
## Bug related to nanotron

Encountered the following issue when running `train_mamba.sh`:

```
causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv
```

Solved this by doing:

```bash
pip uninstall mamba-ssm
pip install causal_conv1d==1.1.1
pip install mamba-ssm --no-cache-dir
```

See https://github.com/state-spaces/mamba/issues/169 for details.
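After reinstalling as above, a quick smoke test is to import the two compiled packages directly; if the ABI mismatch is still present, the import itself raises the undefined-symbol error. This snippet is only an illustrative check, not part of the example:

```python
# Import smoke test for the causal_conv1d / mamba-ssm fix described above.
# An incompatible build fails here with the undefined-symbol ImportError.
import causal_conv1d  # noqa: F401
import mamba_ssm  # noqa: F401

print("causal_conv1d and mamba_ssm imported cleanly")
```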
## Credits

Credits to the following repositories from which the code was adapted:
- https://github.com/state-spaces/mamba
examples/mamba/assets/loss_mamba.png
0 → 100644
View file @
d99506f3
39.1 KB
examples/mamba/config.py
0 → 100644
View file @
d99506f3
from dataclasses import dataclass
from typing import Optional, Union

import torch
from nanotron.config import Config, ExistingCheckpointInit, NanotronConfigs
from nanotron.config.utils_config import cast_str_to_torch_dtype


@dataclass
class MambaInit:
    initializer_range: float = 0.02
    rescale_prenorm_residual: bool = True
    n_residuals_per_layer: int = 1  # Change to 2 if we have MLP


@dataclass
class ModelArgs:
    """Arguments related to model architecture"""

    model_config: NanotronConfigs
    init_method: Union[MambaInit, ExistingCheckpointInit]
    dtype: Optional[torch.dtype] = None
    make_vocab_size_divisible_by: int = 1
    ddp_bucket_cap_mb: int = 25

    def __post_init__(self):
        if self.dtype is None:
            self.dtype = torch.bfloat16
        if isinstance(self.dtype, str):
            self.dtype = cast_str_to_torch_dtype(self.dtype)

        # if self.model_config.max_position_embeddings is None:
        #     self.model_config.max_position_embeddings = 0


@dataclass(kw_only=True)  # pylint: disable=unexpected-keyword-arg
class MambaConfig(Config):
    """Main configuration class"""

    model: ModelArgs


@dataclass
class MambaModelConfig:
    """Configuration for a Mamba model

    Be careful on having a coherent typing as we use it to reconstruct the model from yaml
    """

    is_mamba_config: bool = True  # We use this help differentiate models in yaml/python conversion
    d_model: int = 2560
    num_hidden_layers: int = 64
    vocab_size: int = 50277
    ssm_cfg: Optional[dict] = None
    rms_norm: bool = True
    fused_add_norm: bool = True
    residual_in_fp32: bool = True
    pad_vocab_size_multiple: int = 8
    # ==== Custom ======
    dtype: str = "float32"
    rms_norm_eps: float = 1e-5
    pad_token_id: Optional[int] = None
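These are plain dataclasses, so a model section can also be assembled directly in Python instead of YAML. Below is a minimal sketch, assuming it is run from `examples/mamba/` so that this file is importable as `config`; the field values are illustrative only.

```python
import torch

from config import MambaInit, MambaModelConfig, ModelArgs

# Build the model section only; a full MambaConfig also needs the usual
# nanotron sections (tokens, optimizer, parallelism, ...), omitted here.
model_config = MambaModelConfig(d_model=1536, num_hidden_layers=48, vocab_size=50277, dtype="bfloat16")
model_args = ModelArgs(model_config=model_config, init_method=MambaInit(), dtype="bfloat16")

# __post_init__ casts the string dtype to a torch dtype.
assert model_args.dtype == torch.bfloat16
```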
examples/mamba/config_mamba.yaml
0 → 100644
View file @
d99506f3
checkpoints:
  checkpoint_interval: 10
  checkpoints_path: /fsx/ferdinandmom/ferdinand-hf/brrr/nanotron/examples/checkpoints
  checkpoints_path_is_shared_file_system: false
  resume_checkpoint_path: null
  save_initial_state: false
data_stages:
- name: General purpose training
  start_training_step: 1
  data:
    dataset:
      dataset_overwrite_cache: false
      dataset_processing_num_proc_per_process: 24
      hf_dataset_config_name: null
      hf_dataset_or_datasets:
        roneneldan/TinyStories: 1.0
      hf_dataset_splits: train
      text_column_name: text
    num_loading_workers: 1
    seed: 42
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: true
  project: test
  run: mamba
  seed: 42
  step: null
lighteval: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    initializer_range: 0.02
    n_residuals_per_layer: 1
    rescale_prenorm_residual: true
  make_vocab_size_divisible_by: 1
  model_config:
    d_model: 1536
    dtype: bfloat16
    fused_add_norm: true
    is_mamba_config: true
    num_hidden_layers: 48
    pad_token_id: null
    pad_vocab_size_multiple: 8
    residual_in_fp32: true
    rms_norm: true
    rms_norm_eps: 1.0e-05
    ssm_cfg:
      bias: false
      conv_bias: true
      d_conv: 4
      d_state: 16
      dt_init: random
      dt_init_floor: 0.0001
      dt_max: 0.1
      dt_min: 0.001
      dt_rank: auto
      dt_scale: 1.0
      expand: 2
      use_fast_path: true
    vocab_size: 50277
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_starting_step: null
    lr_decay_steps: 90
    lr_decay_style: cosine
    lr_warmup_steps: 10
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 2
  expert_parallel_size: 1
  pp: 2
  pp_engine: 1f1b
  tp: 2
  tp_linear_async_communication: false
  tp_mode: ALL_REDUCE
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 2
  sequence_length: 2048
  train_steps: 100
  val_check_interval: -1
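The YAML above mirrors the dataclasses in `config.py`, and the training script is expected to parse it into a `MambaConfig`. For a quick look at individual fields it can also be read with plain PyYAML, as in this small sketch; the file path is the example config shown above.

```python
import yaml

# Inspect a few fields of the example config as a plain dictionary;
# the actual trainer parses the same file into the MambaConfig dataclasses instead.
with open("examples/mamba/config_mamba.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["model_config"]["d_model"])  # 1536
print(cfg["parallelism"]["tp"], cfg["parallelism"]["pp"], cfg["parallelism"]["dp"])  # 2 2 2
print(cfg["tokens"]["sequence_length"])  # 2048
```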