evt_fugx1 / dcu_megatron / Commits / 9eb8683b

Commit 9eb8683b, authored Apr 10, 2025 by dongcl
Merge branch 'main' into megatron_v0.11.0
Parents: 6f016785, be9a69d7

Showing 7 changed files with 1020 additions and 1038 deletions (+1020 / -1038)
dcu_megatron/adaptor/megatron_adaptor.py               +20    -6
dcu_megatron/core/__init__.py                           +0    -1
dcu_megatron/core/extensions/transformer_engine.py    +126  -124
dcu_megatron/core/tensor_parallel/__init__.py           +7    -0
dcu_megatron/core/tensor_parallel/layers.py           +847    -2
dcu_megatron/core/transformer/transformer_config.py    +11  -905
dcu_megatron/training/arguments.py                      +9    -0
dcu_megatron/adaptor/megatron_adaptor.py

@@ -123,7 +123,7 @@ class CoreAdaptation(MegatronAdaptationABC):
     def patch_core_transformers(self):
         from ..core import transformer_block_init_wrapper, transformer_block_forward
-        from ..core.transformer.transformer_config import TransformerConfig, MLATransformerConfig
+        from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch

         # Transformer block
         MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
@@ -133,9 +133,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         # Transformer config
         MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
-                                    TransformerConfig)
+                                    TransformerConfigPatch)
         MegatronAdaptation.register('megatron.core.transformer.transformer_config.MLATransformerConfig',
-                                    MLATransformerConfig)
+                                    MLATransformerConfigPatch)

         # Moe
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
@@ -154,18 +154,19 @@ class CoreAdaptation(MegatronAdaptationABC):
     def patch_core_extentions(self):
         import transformer_engine as te
-        from ..core.extensions.transformer_engine import te_dot_product_attention_init
+        from ..core.extensions.transformer_engine import TEDotProductAttentionPatch
         from megatron.core.extensions.transformer_engine import TEGroupedLinear

         MegatronAdaptation.register('megatron.core.extensions.transformer_engine.TEDotProductAttention.__init__',
-                                    te_dot_product_attention_init)
+                                    TEDotProductAttentionPatch.__init__)
         if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')):
             TEGroupedLinear.__bases__ = (te.pytorch.BatchLinear,)

     def patch_tensor_parallel(self):
-        from ..core import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
         from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
+        from ..core.tensor_parallel import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
+        from ..core.tensor_parallel import ColumnParallelLinearPatch, RowParallelLinearPatch, parallel_linear_init_wrapper

         # VocabParallelEmbedding
         MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
@@ -186,6 +187,19 @@ class CoreAdaptation(MegatronAdaptationABC):
                                     staticmethod, apply_wrapper=True)

+        # flux
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__",
+                                    parallel_linear_init_wrapper, apply_wrapper=True)
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward",
+                                    ColumnParallelLinearPatch.forward)
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
+                                    parallel_linear_init_wrapper, apply_wrapper=True)
+        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
+                                    RowParallelLinearPatch.forward)
+
     def patch_training(self):
         from ..training.tokenizer import build_tokenizer
         from ..training.initialize import _initialize_distributed
 ...
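The adaptor registers replacements against fully qualified import paths. The MegatronAdaptation class itself is not part of this diff, so the following is only a rough sketch of the pattern those register() calls rely on (resolve a dotted path, then setattr, with apply_wrapper=True treating the replacement as a wrapper around the original attribute). PatchRegistry and _resolve_owner are hypothetical names, not the repo's API.

import importlib


def _resolve_owner(dotted_path):
    """Split 'pkg.mod.Class.attr' into (owner object, final attribute name).

    Imports the longest importable module prefix, then walks the remaining
    names with getattr.
    """
    parts = dotted_path.split('.')
    for i in range(len(parts) - 1, 0, -1):
        try:
            obj = importlib.import_module('.'.join(parts[:i]))
        except ModuleNotFoundError:
            continue
        for name in parts[i:-1]:
            obj = getattr(obj, name)
        return obj, parts[-1]
    raise ImportError(f"cannot resolve {dotted_path}")


class PatchRegistry:
    """Hypothetical stand-in for MegatronAdaptation: collect patches, then apply."""

    def __init__(self):
        self._patches = []

    def register(self, dotted_path, replacement, apply_wrapper=False):
        # apply_wrapper=True: `replacement` is a wrapper factory that receives the
        # original attribute and returns the patched one (cf. parallel_linear_init_wrapper).
        self._patches.append((dotted_path, replacement, apply_wrapper))

    def apply(self):
        for dotted_path, replacement, apply_wrapper in self._patches:
            owner, attr = _resolve_owner(dotted_path)
            original = getattr(owner, attr)
            setattr(owner, attr, replacement(original) if apply_wrapper else replacement)

# Usage would mirror the calls above, e.g.:
# registry = PatchRegistry()
# registry.register('megatron.core.transformer.transformer_config.TransformerConfig',
#                   TransformerConfigPatch)
# registry.apply()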
dcu_megatron/core/__init__.py

-from .tensor_parallel.layers import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
 from .transformer.transformer_block import transformer_block_init_wrapper, transformer_block_forward
dcu_megatron/core/extensions/transformer_engine.py

 import os
 import dataclasses
+import transformer_engine as te

 from typing import Any, Optional
 from packaging.version import Version as PkgVersion
 ...
@@ -19,135 +20,136 @@ from megatron.core.parallel_state import (
 )

-def te_dot_product_attention_init(
-    self,
-    config: TransformerConfig,
-    layer_number: int,
-    attn_mask_type: AttnMaskType,
-    attention_type: str,
-    attention_dropout: Optional[float] = None,
-    softmax_scale: Optional[float] = None,
-    k_channels: Optional[int] = None,
-    v_channels: Optional[int] = None,
-    cp_comm_type: str = "p2p",
-):
-    ...  # removed function body is identical to the __init__ body below, at one less indent level
+class TEDotProductAttentionPatch(te.pytorch.DotProductAttention):
+    def __init__(
+        self,
+        config: TransformerConfig,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        k_channels: Optional[int] = None,
+        v_channels: Optional[int] = None,
+        cp_comm_type: str = "p2p",
+    ):
+        self.config = config
+        self.te_forward_mask_type = False
+        self.qkv_format: str = 'sbhd'
+
+        if self.config.apply_query_key_layer_scaling != bool(
+            int(os.getenv('NVTE_APPLY_QK_LAYER_SCALING', '0'))
+        ):
+            raise ValueError(
+                f"apply_query_key_layer_scaling is {self.config.apply_query_key_layer_scaling} "
+                f"but environment variable NVTE_APPLY_QK_LAYER_SCALING is "
+                f"{os.getenv('NVTE_APPLY_QK_LAYER_SCALING')}. Transformer Engine does not support "
+                f"setting query key layer scaling via argument, so these two must match."
+            )
+
+        extra_kwargs: dict[str, Any] = {}
+        if is_te_min_version("0.11.0"):
+            extra_kwargs["num_gqa_groups"] = self.config.num_query_groups
+        elif self.config.num_query_groups != self.config.num_attention_heads:
+            raise ValueError(
+                f"Transformer Engine v{get_te_version()} does not support Grouped Query Attention, "
+                f"use a newer version of Transformer Engine. "
+                f"(num_query_groups ({self.config.num_query_groups}) != "
+                f"num_attention_heads ({self.config.num_attention_heads}))"
+            )
+
+        if is_te_min_version("0.10.0"):
+            extra_kwargs["attention_type"] = attention_type
+            # older version don't need attention_type
+
+        if is_te_min_version("0.12.0", check_equality=False):
+            self.te_forward_mask_type = True
+
+        # This check is important as CP config can be disabled while having a valid CP group
+        # Example - Disabling CP for encoder while a valid CP group exists for decoder
+        if self.config.context_parallel_size > 1:
+            assert is_te_min_version(
+                "1.0.0"
+            ), "Only Transformer-Engine version >= 1.0.0 supports context parallelism!"
+            if getattr(TEDotProductAttention, "cp_stream") is None:
+                TEDotProductAttention.cp_stream = torch.cuda.Stream()
+            extra_kwargs["cp_group"] = get_context_parallel_group(check_initialized=False)
+            extra_kwargs["cp_global_ranks"] = get_context_parallel_global_ranks(
+                check_initialized=False
+            )
+            extra_kwargs["cp_stream"] = TEDotProductAttention.cp_stream
+            if is_te_min_version("1.10.0"):
+                if cp_comm_type is None:
+                    extra_kwargs["cp_comm_type"] = "p2p"
+                elif cp_comm_type == "a2a+p2p":
+                    assert is_te_min_version("1.12.0"), (
+                        f"Transformer-Engine v{get_te_version()} must be >= 1.12.0 to support"
+                        "hierarchical cp commucation."
+                    )
+                    extra_kwargs["cp_comm_type"] = "a2a+p2p"
+                    extra_kwargs["cp_group"] = get_hierarchical_context_parallel_groups(
+                        check_initialized=False
+                    )
+                else:
+                    extra_kwargs["cp_comm_type"] = cp_comm_type
+
+        if self.config.deterministic_mode:
+            if int(os.getenv("NVTE_ALLOW_NONDETERMINISTIC_ALGO", "1")) != 0:
+                raise RuntimeError(
+                    "deterministic_mode is on and we are using DotProductAttention from "
+                    "Transformer Engine, but NVTE_ALLOW_NONDETERMINISTIC_ALGO is not 0. "
+                    f"Currently set to: {os.getenv('NVTE_ALLOW_NONDETERMINISTIC_ALGO', 'not set')}."
+                )
+
+        if config.window_size is not None:
+            # Check version
+            assert is_te_min_version("1.2.0"), (
+                f"Transformer-Engine v{get_te_version()} must be >= 1.2.0 to support"
+                "sliding window attention."
+            )
+            extra_kwargs['window_size'] = config.window_size
+
+        if is_te_min_version("1.9.0"):
+            # TE 1.10.0 introduces the ability to set the different k and v channels
+            kv_channels = (
+                (k_channels, v_channels)
+                if k_channels is not None and v_channels is not None
+                else self.config.kv_channels
+            )
+            extra_kwargs['softmax_scale'] = softmax_scale
+        else:
+            kv_channels = self.config.kv_channels
+
+        self.kept_packed_seq_params = set(
+            field.name for field in dataclasses.fields(PackedSeqParams)
+        )
+        if get_te_version() < PkgVersion("1.3.0"):
+            # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H
+            # copies (#555)
+            # These two arguments did not exist prior to 1.3.0
+            self.kept_packed_seq_params.discard("max_seqlen_q")
+            self.kept_packed_seq_params.discard("max_seqlen_kv")
+
+        if get_te_version() < PkgVersion("1.10.0"):
+            # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted
+            # in each individual sequence in THD format dataset
+            # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012)
+            self.kept_packed_seq_params.discard("cu_seqlens_q_padded")
+            self.kept_packed_seq_params.discard("cu_seqlens_kv_padded")
+
+        super(TEDotProductAttention, self).__init__(
+            num_attention_heads=self.config.num_attention_heads,
+            kv_channels=kv_channels,
+            attention_dropout=(
+                self.config.attention_dropout if attention_dropout is None else attention_dropout
+            ),
+            attn_mask_type=attn_mask_type.name,
+            sequence_parallel=self.config.sequence_parallel,
+            tp_size=self.config.tensor_model_parallel_size,
+            get_rng_state_tracker=(
+                get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
+            ),
+            tp_group=get_tensor_model_parallel_group(check_initialized=False),
+            layer_number=layer_number,
+            **extra_kwargs,
+        )
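The adaptor (see megatron_adaptor.py above) registers TEDotProductAttentionPatch.__init__ in place of TEDotProductAttention.__init__, so the patched initializer ends up installed on the original class; that is presumably why the explicit two-argument super(TEDotProductAttention, self) call names TEDotProductAttention rather than the patch class. A minimal, self-contained illustration of that grafting pattern, using hypothetical classes rather than the TE/Megatron ones:

class Original:
    def __init__(self, x):
        self.x = x


class Patch:
    def __init__(self, x):
        # Patched initializer: same signature, different behaviour plus a marker.
        self.x = x * 2
        self.patched = True


# Plain functions rebind to whichever class they are installed on.
Original.__init__ = Patch.__init__
obj = Original(3)
assert obj.x == 6 and obj.patched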
dcu_megatron/core/tensor_parallel/__init__.py  (new file, 0 → 100644)

+from .layers import (
+    parallel_linear_init_wrapper,
+    ColumnParallelLinearPatch,
+    RowParallelLinearPatch,
+    vocab_parallel_embedding_forward,
+    vocab_parallel_embedding_init,
+)
\ No newline at end of file
dcu_megatron/core/tensor_parallel/layers.py
(diff collapsed)
dcu_megatron/core/transformer/transformer_config.py
(diff collapsed)
dcu_megatron/training/arguments.py

@@ -165,3 +165,12 @@ def _add_mtp_args(parser):
     group.add_argument('--share-mtp-embedding-and-output-weight', action='store_true', default=False,
                        help='Main model share embedding and output weight with mtp layer.')

     return parser
+
+def _add_flux_args(parser):
+    group = parser.add_argument_group(title='multi token prediction')
+    group.add_argument('--use-flux', action='store_true', default=False,
+                       help='If set, flux will be used in ColumnParallelLinear and RowParallelLinear')
+    group.add_argument('--flux-transpose-weight', action='store_true', default=False,
+                       help='Whether to transpose weight when using flux kernel')
+
+    return parser
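Both new options are plain store_true flags. A standalone sketch of how they parse (the parser and group title here are illustrative only; the real code hooks _add_flux_args into Megatron's argument builder):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='flux')
group.add_argument('--use-flux', action='store_true', default=False,
                   help='If set, flux will be used in ColumnParallelLinear and RowParallelLinear')
group.add_argument('--flux-transpose-weight', action='store_true', default=False,
                   help='Whether to transpose weight when using flux kernel')

args = parser.parse_args(['--use-flux'])
assert args.use_flux and not args.flux_transpose_weight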