evt_fugx1 / dcu_megatron · Commits

Commit ec7c8bc3
Authored Apr 17, 2025 by dongcl

replace te with flux when using flux

Parent: 138b70a2
Showing 4 changed files with 296 additions and 224 deletions
dcu_megatron/adaptor/megatron_adaptor.py          +7   -17
dcu_megatron/core/models/gpt/gpt_layer_specs.py   +13  -11
dcu_megatron/core/tensor_parallel/__init__.py     +2   -4
dcu_megatron/core/tensor_parallel/layers.py       +274 -192
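Note that the Flux path introduced here is opt-in: the adaptor changes below are gated on the USE_FLUX_OVERLAP environment variable. A minimal sketch of enabling it (how the variable is consumed beyond the guard visible in the first hunk is an assumption):

import os

# Assumption: the variable must be set before dcu_megatron's adaptor runs,
# since the registrations in megatron_adaptor.py happen at patch time.
os.environ["USE_FLUX_OVERLAP"] = "1"

# Caveat visible in the guard below: os.getenv("USE_FLUX_OVERLAP", 0) returns a
# string whenever the variable is set, so any non-empty value (even "0") is
# truthy; leaving the variable unset is the only way to keep the Flux path off.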
dcu_megatron/adaptor/megatron_adaptor.py

@@ -190,27 +190,17 @@ class CoreAdaptation(MegatronAdaptationABC):
         # flux
         if os.getenv("USE_FLUX_OVERLAP", 0):
             import flux
             from ..core.tensor_parallel import (
-                ColumnParallelLinearPatch,
-                RowParallelLinearPatch,
-                column_parallel_linear_init_wrapper,
-                row_parallel_linear_init_wrapper
+                FluxColumnParallelLinear,
+                FluxRowParallelLinear
             )
             from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
-            MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__",
-                                        column_parallel_linear_init_wrapper,
-                                        apply_wrapper=True)
-            MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward",
-                                        ColumnParallelLinearPatch.forward)
-            MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
-                                        row_parallel_linear_init_wrapper,
-                                        apply_wrapper=True)
-            MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
-                                        RowParallelLinearPatch.forward)
-            MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_local_spec",
+            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
+                                        FluxColumnParallelLinear)
+            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
+                                        FluxRowParallelLinear)
+            MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
                                         get_gpt_layer_with_flux_spec)
 
     def patch_training(self):
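MegatronAdaptation.register itself is not part of this diff, so its exact patching semantics are not visible here. Purely as a hypothetical illustration of the general pattern these calls rely on (rebinding a dotted attribute path to a replacement object; the helper name override_dotted_path is invented for this sketch):

import importlib

def override_dotted_path(dotted_path: str, replacement) -> None:
    # Hypothetical sketch only: split "pkg.module.Attr" into a module part and an
    # attribute name, import the module, and rebind the attribute to `replacement`.
    # The real MegatronAdaptation.register may defer application, wrap callables
    # (see apply_wrapper in the removed lines above), or patch nested attributes.
    module_path, attr_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    setattr(module, attr_name, replacement)

The net effect of the hunk is that, when USE_FLUX_OVERLAP is set, the adaptor stops patching ColumnParallelLinear/RowParallelLinear methods directly and instead overrides the Transformer Engine linear classes and get_gpt_layer_with_transformer_engine_spec, matching the commit message "replace te with flux when using flux".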
dcu_megatron/core/models/gpt/gpt_layer_specs.py

@@ -3,7 +3,6 @@ from typing import Optional
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
-from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.identity_op import IdentityOp
@@ -17,6 +16,9 @@ from megatron.core.transformer.transformer_layer import (
     TransformerLayer,
     TransformerLayerSubmodules,
 )
+from dcu_megatron.core.tensor_parallel.layers import FluxColumnParallelLinear, FluxRowParallelLinear
 from megatron.core.utils import is_te_min_version
 
 try:
@@ -79,13 +81,13 @@ def get_gpt_layer_with_flux_spec(
                 module=MLASelfAttention,
                 params={"attn_mask_type": AttnMaskType.causal},
                 submodules=MLASelfAttentionSubmodules(
-                    linear_q_proj=ColumnParallelLinear,
-                    linear_q_down_proj=ColumnParallelLinear,
-                    linear_q_up_proj=ColumnParallelLinear,
-                    linear_kv_down_proj=ColumnParallelLinear,
-                    linear_kv_up_proj=ColumnParallelLinear,
+                    linear_q_proj=FluxColumnParallelLinear,
+                    linear_q_down_proj=FluxColumnParallelLinear,
+                    linear_q_up_proj=FluxColumnParallelLinear,
+                    linear_kv_down_proj=FluxColumnParallelLinear,
+                    linear_kv_up_proj=FluxColumnParallelLinear,
                     core_attention=TEDotProductAttention,
-                    linear_proj=RowParallelLinear,
+                    linear_proj=FluxRowParallelLinear,
                     q_layernorm=TENorm if qk_layernorm else IdentityOp,
                     kv_layernorm=TENorm if qk_layernorm else IdentityOp,
                 ),
@@ -111,9 +113,9 @@ def get_gpt_layer_with_flux_spec(
                 module=SelfAttention,
                 params={"attn_mask_type": AttnMaskType.causal},
                 submodules=SelfAttentionSubmodules(
-                    linear_qkv=ColumnParallelLinear,
+                    linear_qkv=FluxColumnParallelLinear,
                     core_attention=TEDotProductAttention,
-                    linear_proj=RowParallelLinear,
+                    linear_proj=FluxRowParallelLinear,
                     q_layernorm=qk_norm if qk_layernorm else IdentityOp,
                     k_layernorm=qk_norm if qk_layernorm else IdentityOp,
                 ),
@@ -145,8 +147,8 @@ def get_mlp_module_flux_spec(
         return ModuleSpec(
             module=MLP,
             submodules=MLPSubmodules(
-                linear_fc1=ColumnParallelLinear,
-                linear_fc2=RowParallelLinear,
+                linear_fc1=FluxColumnParallelLinear,
+                linear_fc2=FluxRowParallelLinear,
             ),
         )
     else:
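The spec changes above are purely declarative: each layer spec names the class to construct for a given submodule, so swapping ColumnParallelLinear/RowParallelLinear for the Flux variants changes what gets built without touching the builder. A small self-contained sketch of that spec-driven construction pattern (simplified stand-ins, not Megatron's actual ModuleSpec/build_module machinery):

from dataclasses import dataclass, field
from typing import Any, Dict, Type

@dataclass
class LayerSpec:
    """Simplified stand-in for a Megatron-style ModuleSpec."""
    module: Type
    params: Dict[str, Any] = field(default_factory=dict)
    submodules: Dict[str, Type] = field(default_factory=dict)

class RowParallelLinear: ...          # stand-in for the stock implementation
class FluxRowParallelLinear: ...      # stand-in for the Flux-backed replacement

def build_from_spec(spec: LayerSpec):
    # The builder never hard-codes a class; it instantiates whatever the spec
    # names, which is why editing the spec file is enough to swap backends.
    submodules = {name: cls() for name, cls in spec.submodules.items()}
    return spec.module(**spec.params, **submodules)

class Attention:
    def __init__(self, attn_mask_type=None, linear_proj=None):
        self.attn_mask_type = attn_mask_type
        self.linear_proj = linear_proj

spec = LayerSpec(
    module=Attention,
    params={"attn_mask_type": "causal"},
    submodules={"linear_proj": FluxRowParallelLinear},  # the one-name swap
)
layer = build_from_spec(spec)
assert isinstance(layer.linear_proj, FluxRowParallelLinear)

The same one-name swap is what each hunk above performs inside MLASelfAttentionSubmodules, SelfAttentionSubmodules, and MLPSubmodules.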
dcu_megatron/core/tensor_parallel/__init__.py

 from .layers import (
-    column_parallel_linear_init_wrapper,
-    row_parallel_linear_init_wrapper,
-    ColumnParallelLinearPatch,
-    RowParallelLinearPatch,
+    FluxColumnParallelLinear,
+    FluxRowParallelLinear,
     vocab_parallel_embedding_forward,
     vocab_parallel_embedding_init,
 )
\ No newline at end of file
dcu_megatron/core/tensor_parallel/layers.py

(This diff is collapsed in this view; +274 -192 changes not shown.)
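The bulk of this commit (+274 -192) is in this collapsed file, so nothing definitive about FluxColumnParallelLinear or FluxRowParallelLinear can be read off this page. Purely as orientation, the skeleton below shows one plausible shape for such a drop-in replacement; every detail in it is an assumption, not the actual implementation:

import torch

class FluxColumnParallelLinearSketch(torch.nn.Module):
    # Illustrative skeleton only; the real class lives in the collapsed
    # dcu_megatron/core/tensor_parallel/layers.py diff.

    def __init__(self, input_size, output_size, *, config=None, init_method=None,
                 bias=True, gather_output=False, **kwargs):
        super().__init__()
        # Assumption: the constructor mirrors the TE/Megatron column-parallel
        # signature so the spec and adaptor registrations above can swap it in
        # without changing call sites.
        self.weight = torch.nn.Parameter(torch.empty(output_size, input_size))
        self.bias = torch.nn.Parameter(torch.zeros(output_size)) if bias else None

    def forward(self, input_):
        # Assumption: the real forward dispatches to a Flux kernel that overlaps
        # the GEMM with the tensor-parallel communication; a plain matmul stands
        # in for it here.
        output = torch.nn.functional.linear(input_, self.weight, self.bias)
        # Megatron-style parallel linears return an (output, output_bias) pair.
        return output, None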