Commit ce02cd51 in evt_fugx1/dcu_megatron, authored Apr 21, 2025 by dongcl

Megatron v0.11.0

parent aeed6d97

Showing 20 changed files with 1651 additions and 625 deletions (+1651 -625)
README.md                                                                  +4    -0
dcu_megatron/adaptor/megatron_adaptor.py                                   +26   -22
dcu_megatron/core/__init__.py                                              +1    -1
dcu_megatron/core/distributed/finalize_model_grads.py                      +0    -3
dcu_megatron/core/models/common/embeddings/language_model_embedding.py     +0    -2
dcu_megatron/core/models/gpt/gpt_layer_specs.py                            +161  -0
dcu_megatron/core/models/gpt/gpt_model.py                                  +65   -148
dcu_megatron/core/tensor_parallel/__init__.py                              +2    -3
dcu_megatron/core/tensor_parallel/layers.py                                +472  -235
dcu_megatron/core/transformer/mtp/multi_token_predictor.py                 +18   -13
dcu_megatron/core/transformer/transformer_block.py                         +3    -190
dcu_megatron/core/transformer/transformer_config.py                        +0    -3
dcu_megatron/core/utils.py                                                 +29   -0
dcu_megatron/legacy/model/rms_norm.py                                      +101  -0
dcu_megatron/legacy/model/transformer.py                                   +9    -2
dcu_megatron/legacy/model/utils.py                                         +26   -0
dcu_megatron/training/arguments.py                                         +14   -3
examples/llama/Llama2_70b.sh                                               +248  -0
examples/llama/Llama2_7b.sh                                                +234  -0
examples/llama/Llama3_405b.sh                                              +238  -0
README.md (+4 -0)

...
@@ -66,6 +66,10 @@ def unpermute(
 ):
 ```
+### The project supports the [flux kernel](http://10.6.10.68/dcutoolkit/deeplearing/flux)
+In tensor-parallel (TP) scenarios, users can opt into the flux fused compute/communication kernels for better training and inference performance. The project integrates flux by replacing the transformer engine implementations; to use it, set the environment variable USE_FLUX_OVERLAP=1 and set transformer-impl to transformer_engine.
 ### Usage
 To run, go into the examples directory, which contains the launch scripts for the supported models; download the required dataset yourself from: https://r0ddbu55vzx.feishu.cn/drive/folder/ZxHHfCoX4lg75td2hTqcmiAin3g
 ```
...
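Note: the switch described in the README is just an environment variable plus Megatron's standard --transformer-impl argument. Below is a minimal, hedged sketch of how a launcher might set it up, assuming that importing the adaptor module applies its patches (the adaptor file below ends with MegatronAdaptation.execute()); the entry-point name pretrain_gpt.py is taken from the example scripts, not from this hunk.

import os

# Must be set before the adaptor registers its patches, because the flux
# registrations are guarded by int(os.getenv("USE_FLUX_OVERLAP", "0")).
os.environ["USE_FLUX_OVERLAP"] = "1"

# Importing the adaptor applies the registered patches; the run itself must
# also pass `--transformer-impl transformer_engine` so the patched TE linear
# layers (replaced by the flux ones) are actually selected.
import dcu_megatron.adaptor.megatron_adaptor  # noqa: F401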
dcu_megatron/adaptor/megatron_adaptor.py (+26 -22)

...
@@ -99,7 +99,7 @@ class CoreAdaptation(MegatronAdaptationABC):
         )
         from ..core.models.gpt.gpt_model import (
             gpt_model_forward,
-            gpt_model_init,
+            gpt_model_init_wrapper,
             shared_embedding_or_mtp_embedding_weight
         )
         from ..training.utils import get_batch_on_this_tp_rank
...
@@ -116,20 +116,20 @@ class CoreAdaptation(MegatronAdaptationABC):
         # GPT Model
         MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.forward', gpt_model_forward)
-        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__', gpt_model_init)
+        MegatronAdaptation.register('megatron.core.models.gpt.gpt_model.GPTModel.__init__', gpt_model_init_wrapper, apply_wrapper=True)
         from megatron.core.models.gpt.gpt_model import GPTModel
         setattr(GPTModel, 'shared_embedding_or_mtp_embedding_weight', shared_embedding_or_mtp_embedding_weight)

     def patch_core_transformers(self):
-        from ..core import transformer_block_init_wrapper, transformer_block_forward
+        from ..core import transformer_block_init_wrapper
         from ..core.transformer.transformer_config import TransformerConfigPatch, MLATransformerConfigPatch

         # Transformer block
         MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.__init__',
                                     transformer_block_init_wrapper)
-        MegatronAdaptation.register('megatron.core.transformer.transformer_block.TransformerBlock.forward',
-                                    transformer_block_forward)
         # Transformer config
         MegatronAdaptation.register('megatron.core.transformer.transformer_config.TransformerConfig',
...
@@ -141,9 +141,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
                                     torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
                                     apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
-                                    torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation": True}),
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
+        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
+        #                             apply_wrapper=True)
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
                                     torch.compile(mode='max-autotune-no-cudagraphs'),
                                     apply_wrapper=True)
...
@@ -166,7 +166,6 @@ class CoreAdaptation(MegatronAdaptationABC):
     def patch_tensor_parallel(self):
         from ..core.tensor_parallel.cross_entropy import VocabParallelCrossEntropy
         from ..core.tensor_parallel import vocab_parallel_embedding_forward, vocab_parallel_embedding_init
-        from ..core.tensor_parallel import ColumnParallelLinearPatch, RowParallelLinearPatch, parallel_linear_init_wrapper

         # VocabParallelEmbedding
         MegatronAdaptation.register('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.forward',
...
@@ -188,17 +187,19 @@ class CoreAdaptation(MegatronAdaptationABC):
                                     apply_wrapper=True)

         # flux
-        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.__init__",
-                                    parallel_linear_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.ColumnParallelLinear.forward",
-                                    ColumnParallelLinearPatch.forward)
-        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.__init__",
-                                    parallel_linear_init_wrapper,
-                                    apply_wrapper=True)
-        MegatronAdaptation.register("megatron.core.tensor_parallel.layers.RowParallelLinear.forward",
-                                    RowParallelLinearPatch.forward)
+        if int(os.getenv("USE_FLUX_OVERLAP", "0")):
+            from ..core.tensor_parallel import (
+                FluxColumnParallelLinear,
+                FluxRowParallelLinear
+            )
+            from ..core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec
+            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TEColumnParallelLinear",
+                                        FluxColumnParallelLinear)
+            MegatronAdaptation.register("megatron.core.extensions.transformer_engine.TERowParallelLinear",
+                                        FluxRowParallelLinear)
+            MegatronAdaptation.register("megatron.core.models.gpt.gpt_layer_specs.get_gpt_layer_with_transformer_engine_spec",
+                                        get_gpt_layer_with_flux_spec)

     def patch_training(self):
         from ..training.tokenizer import build_tokenizer
...
@@ -232,19 +233,22 @@ class LegacyAdaptation(MegatronAdaptationABC):
         self.patch_legacy_models()

     def patch_legacy_models(self):
-        from ..legacy.model.transformer import ParallelMLP, ParallelAttention
+        from ..legacy.model.transformer import ParallelMLPPatch, ParallelAttentionPatch
+        from ..legacy.model.utils import get_norm

         # ParallecMLP
         MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelMLP.__init__',
-                                    ParallelMLP.__init__)
+                                    ParallelMLPPatch.__init__)
         MegatronAdaptation.register('megatron.legacy.model.transformer.ParallelAttention.forward',
-                                    ParallelAttention.forward)
+                                    ParallelAttentionPatch.forward)
         # rms_norm.RMSNorm
         MegatronAdaptation.register('megatron.legacy.model.rms_norm.RMSNorm.forward',
                                     torch.compile(mode="max-autotune-no-cudagraphs"),
                                     apply_wrapper=True)
+        MegatronAdaptation.register('megatron.legacy.model.utils.get_norm', get_norm)


MegatronAdaptation.execute()
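The adaptor relies on a registry that rebinds attributes on Megatron modules by dotted path. The MegatronAdaptation implementation itself is not part of this diff, so the following is only a minimal sketch of the pattern the calls above imply; the class name _AdaptationRegistry and the body are assumptions, only register/execute/apply_wrapper are taken from the diff.

import importlib


class _AdaptationRegistry:
    """Minimal sketch of a dotted-path patch registry (assumed behaviour)."""

    def __init__(self):
        self._patches = []

    def register(self, dotted_path, replacement, apply_wrapper=False):
        # Record the target and its replacement; nothing is touched yet.
        self._patches.append((dotted_path, replacement, apply_wrapper))

    def execute(self):
        # Apply every recorded patch by rebinding the attribute on its owner.
        for dotted_path, replacement, apply_wrapper in self._patches:
            owner_path, attr = dotted_path.rsplit('.', 1)
            owner = self._resolve(owner_path)
            original = getattr(owner, attr, None)
            if apply_wrapper and callable(original):
                # `replacement` is treated as a decorator around the original,
                # matching how torch.compile(...) and the *_wrapper functions
                # are registered with apply_wrapper=True above.
                setattr(owner, attr, replacement(original))
            else:
                setattr(owner, attr, replacement)

    @staticmethod
    def _resolve(path):
        # Import the longest importable module prefix, then getattr the rest.
        parts = path.split('.')
        for i in range(len(parts), 0, -1):
            try:
                obj = importlib.import_module('.'.join(parts[:i]))
            except ImportError:
                continue
            for name in parts[i:]:
                obj = getattr(obj, name)
            return obj
        raise ImportError(path)

With this reading, apply_wrapper=True means the registered object wraps the original callable rather than replacing it outright, which is consistent with both the torch.compile objects and the *_wrapper functions registered above.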
dcu_megatron/core/__init__.py (+1 -1)

-from .transformer.transformer_block import transformer_block_init_wrapper, transformer_block_forward
+from .transformer.transformer_block import transformer_block_init_wrapper
dcu_megatron/core/distributed/finalize_model_grads.py (+0 -3)

-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
 from typing import List

 import torch
 ...
dcu_megatron/core/models/common/embeddings/language_model_embedding.py (+0 -2)

-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-# Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
 from typing import Literal

 import torch
 ...
dcu_megatron/core/models/gpt/gpt_layer_specs.py (new file, mode 100644, +161 -0)

import warnings
from typing import Optional

from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.multi_latent_attention import (
    MLASelfAttention,
    MLASelfAttentionSubmodules,
)
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import (
    TransformerLayer,
    TransformerLayerSubmodules,
)
from dcu_megatron.core.tensor_parallel.layers import FluxColumnParallelLinear, FluxRowParallelLinear
from megatron.core.utils import is_te_min_version

try:
    from megatron.core.extensions.transformer_engine import (
        TEDotProductAttention,
        TENorm,
    )
except ImportError:
    warnings.warn('transformer_engine is not installed.')

try:
    import apex  # pylint: disable=unused-import

    from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
except ImportError:
    warnings.warn('Apex is not installed.')


def get_gpt_layer_with_flux_spec(
    num_experts: Optional[int] = None,
    moe_grouped_gemm: Optional[bool] = False,
    qk_layernorm: Optional[bool] = False,
    multi_latent_attention: Optional[bool] = False,
    fp8: Optional[str] = None,  # pylint: disable=unused-arguments
    moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
    """Use this spec to use flux modules (required for fp8 training).

    Args:
        num_experts (int, optional): Number of experts. Defaults to None.
        moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False.
        qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False.
        fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
        moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP.
            Defaults to False.

    Returns:
        ModuleSpec: Module specification with flux modules
    """
    if fp8 is not None:
        warnings.warn(
            'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated'
            ' and will be removed soon. Please update your code accordingly.'
        )

    mlp = get_mlp_module_flux_spec(
        use_te=False,
        num_experts=num_experts,
        moe_grouped_gemm=moe_grouped_gemm,
        moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
    )

    if multi_latent_attention:
        return ModuleSpec(
            module=TransformerLayer,
            submodules=TransformerLayerSubmodules(
                input_layernorm=TENorm,
                self_attention=ModuleSpec(
                    module=MLASelfAttention,
                    params={"attn_mask_type": AttnMaskType.causal},
                    submodules=MLASelfAttentionSubmodules(
                        linear_q_proj=FluxColumnParallelLinear,
                        linear_q_down_proj=FluxColumnParallelLinear,
                        linear_q_up_proj=FluxColumnParallelLinear,
                        linear_kv_down_proj=FluxColumnParallelLinear,
                        linear_kv_up_proj=FluxColumnParallelLinear,
                        core_attention=TEDotProductAttention,
                        linear_proj=FluxRowParallelLinear,
                        q_layernorm=TENorm if qk_layernorm else IdentityOp,
                        kv_layernorm=TENorm if qk_layernorm else IdentityOp,
                    ),
                ),
                self_attn_bda=get_bias_dropout_add,
                pre_mlp_layernorm=TENorm,
                mlp=mlp,
                mlp_bda=get_bias_dropout_add,
            ),
        )
    else:
        # TENorm significantly harms convergence when used
        # for QKLayerNorm if TE Version < 1.9;
        # we instead use the Apex implementation.
        qk_norm = TENorm if is_te_min_version("1.9.0") else FusedLayerNorm

        return ModuleSpec(
            module=TransformerLayer,
            submodules=TransformerLayerSubmodules(
                input_layernorm=TENorm,
                self_attention=ModuleSpec(
                    module=SelfAttention,
                    params={"attn_mask_type": AttnMaskType.causal},
                    submodules=SelfAttentionSubmodules(
                        linear_qkv=FluxColumnParallelLinear,
                        core_attention=TEDotProductAttention,
                        linear_proj=FluxRowParallelLinear,
                        q_layernorm=qk_norm if qk_layernorm else IdentityOp,
                        k_layernorm=qk_norm if qk_layernorm else IdentityOp,
                    ),
                ),
                self_attn_bda=get_bias_dropout_add,
                pre_mlp_layernorm=TENorm,
                mlp=mlp,
                mlp_bda=get_bias_dropout_add,
            ),
        )


def get_mlp_module_flux_spec(
    use_te: Optional[bool] = True,
    num_experts: Optional[int] = None,
    moe_grouped_gemm: Optional[bool] = False,
    fp8: Optional[str] = None,  # pylint: disable=unused-arguments
    moe_use_legacy_grouped_gemm: Optional[bool] = False,
) -> ModuleSpec:
    """Helper function to get module spec for MLP/MoE"""
    if fp8 is not None:
        warnings.warn(
            'The fp8 argument in "_get_mlp_module_spec" has been deprecated'
            ' and will be removed soon. Please update your code accordingly.'
        )

    if num_experts is None:
        # Dense MLP w/ or w/o TE modules.
        return ModuleSpec(
            module=MLP,
            submodules=MLPSubmodules(
                linear_fc1=FluxColumnParallelLinear,
                linear_fc2=FluxRowParallelLinear,
            ),
        )
    else:
        # Mixture of experts with modules in megatron core.
        return get_moe_module_spec(
            use_te=True,
            num_experts=num_experts,
            moe_grouped_gemm=moe_grouped_gemm,
            moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm,
        )
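A layer spec produced this way plugs into Megatron the same way the stock transformer-engine spec does, which is why the adaptor can register it in place of get_gpt_layer_with_transformer_engine_spec. A short, hedged usage sketch (the attribute checks are illustrative, not part of this commit):

from dcu_megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_flux_spec

# Dense (non-MoE) layer spec whose linears are the flux parallel linears.
layer_spec = get_gpt_layer_with_flux_spec(qk_layernorm=True)

# The result is a regular megatron ModuleSpec, so the flux linears can be
# inspected the same way as with the stock TE spec.
qkv_cls = layer_spec.submodules.self_attention.submodules.linear_qkv
print(qkv_cls.__name__)  # expected: FluxColumnParallelLinear

# MoE variant: the MLP is routed through get_moe_module_spec instead.
moe_spec = get_gpt_layer_with_flux_spec(num_experts=8, moe_grouped_gemm=True)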
dcu_megatron/core/models/gpt/gpt_model.py (+65 -148)

+import os
 import logging
 from typing import Literal, Optional
 from functools import wraps
 ...
@@ -16,110 +17,28 @@ from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.enums import ModelType
 from megatron.core.transformer.spec_utils import ModuleSpec
 from megatron.core.transformer.transformer_block import TransformerBlock
-from megatron.core.extensions.transformer_engine import TEColumnParallelLinear

 from dcu_megatron.core.utils import tensor_slide
 from dcu_megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor
 from dcu_megatron.core.transformer.transformer_config import TransformerConfig
+from dcu_megatron.core.tensor_parallel import FluxColumnParallelLinear


-def gpt_model_init(
-    self,
-    config: TransformerConfig,
-    transformer_layer_spec: ModuleSpec,
-    vocab_size: int,
-    max_sequence_length: int,
-    pre_process: bool = True,
-    post_process: bool = True,
-    fp16_lm_cross_entropy: bool = False,
-    parallel_output: bool = True,
-    share_embeddings_and_output_weights: bool = False,
-    position_embedding_type: Literal['learned_absolute', 'rope', 'none'] = 'learned_absolute',
-    rotary_percent: float = 1.0,
-    rotary_base: int = 10000,
-    rope_scaling: bool = False,
-    rope_scaling_factor: float = 8.0,
-    scatter_embedding_sequence_parallel: bool = True,
-    seq_len_interpolation_factor: Optional[float] = None,
-    mtp_spec: ModuleSpec = None
-) -> None:
-    super(GPTModel, self).__init__(config=config)
-
-    if has_config_logger_enabled(config):
-        log_config_to_disk(config, locals(), prefix=type(self).__name__)
-
-    self.transformer_layer_spec: ModuleSpec = transformer_layer_spec
-    self.vocab_size = vocab_size
-    self.max_sequence_length = max_sequence_length
-    self.pre_process = pre_process
-    self.post_process = post_process
-    self.fp16_lm_cross_entropy = fp16_lm_cross_entropy
-    self.parallel_output = parallel_output
-    self.share_embeddings_and_output_weights = share_embeddings_and_output_weights
-    self.position_embedding_type = position_embedding_type
-
-    # megatron core pipelining currently depends on model type
-    # TODO: remove this dependency ?
-    self.model_type = ModelType.encoder_or_decoder
-
-    # These 4 attributes are needed for TensorRT-LLM export.
-    self.max_position_embeddings = max_sequence_length
-    self.rotary_percent = rotary_percent
-    self.rotary_base = rotary_base
-    self.rotary_scaling = rope_scaling
-
-    if self.pre_process:
-        self.embedding = LanguageModelEmbedding(
-            config=self.config,
-            vocab_size=self.vocab_size,
-            max_sequence_length=self.max_sequence_length,
-            position_embedding_type=position_embedding_type,
-            scatter_to_sequence_parallel=scatter_embedding_sequence_parallel,
-        )
-
-    if self.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
-        self.rotary_pos_emb = RotaryEmbedding(
-            kv_channels=self.config.kv_channels,
-            rotary_percent=rotary_percent,
-            rotary_interleaved=self.config.rotary_interleaved,
-            seq_len_interpolation_factor=seq_len_interpolation_factor,
-            rotary_base=rotary_base,
-            rope_scaling=rope_scaling,
-            rope_scaling_factor=rope_scaling_factor,
-            use_cpu_initialization=self.config.use_cpu_initialization,
-        )
-
-    # Cache for RoPE tensors which do not change between iterations.
-    self.rotary_pos_emb_cache = {}
-
-    # Transformer.
-    self.decoder = TransformerBlock(
-        config=self.config,
-        spec=transformer_layer_spec,
-        pre_process=self.pre_process,
-        post_process=self.post_process
-    )
-
-    # Output
-    if post_process:
-        if self.config.defer_embedding_wgrad_compute:
-            # The embedding activation buffer preserves a reference to the input activations
-            # of the final embedding projection layer GEMM. It will hold the activations for
-            # all the micro-batches of a global batch for the last pipeline stage. Once we are
-            # done with all the back props for all the microbatches for the last pipeline stage,
-            # it will be in the pipeline flush stage. During this pipeline flush we use the
-            # input activations stored in embedding activation buffer and gradient outputs
-            # stored in gradient buffer to calculate the weight gradients for the embedding
-            # final linear layer.
-            self.embedding_activation_buffer = []
-            self.grad_output_buffer = []
-        else:
-            self.embedding_activation_buffer = None
-            self.grad_output_buffer = None
-
-        self.output_layer = tensor_parallel.ColumnParallelLinear(
-            config.hidden_size,
-            self.vocab_size,
-            config=config,
-            init_method=config.init_method,
-            bias=False,
-            skip_bias_add=False,
-            gather_output=not self.parallel_output,
+def gpt_model_init_wrapper(fn):
+    @wraps(fn)
+    def wrapper(self, *args, **kwargs):
+        fn(self, *args, **kwargs)
+
+        if (
+            self.post_process
+            and int(os.getenv("USE_FLUX_OVERLAP", "0"))
+        ):
+            self.output_layer = FluxColumnParallelLinear(
+                self.config.hidden_size,
+                self.vocab_size,
+                config=self.config,
+                init_method=self.config.init_method,
+                bias=False,
+                skip_bias_add=False,
+                gather_output=not self.parallel_output,
...
@@ -129,18 +48,22 @@ def gpt_model_init(
                 grad_output_buffer=self.grad_output_buffer,
             )
-    self.setup_embeddings_and_output_layer()

-    # add mtp
-    self.mtp_spec: ModuleSpec = mtp_spec
-    self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
-    self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
-    self.recompute_mtp_norm = self.config.recompute_mtp_norm
-    self.recompute_mtp_layer = self.config.recompute_mtp_layer
-    self.mtp_loss_scale = self.config.mtp_loss_scale
-    if self.post_process and self.training and self.num_nextn_predict_layers:
-        self.mtp_layers = torch.nn.ModuleList(
-            [
-                MultiTokenPredictor(
-                    config,
-                    self.mtp_spec.submodules,
-                    vocab_size=self.vocab_size,
-                    max_sequence_length=self.max_sequence_length,
+        # add mtp
+        self.num_nextn_predict_layers = self.config.num_nextn_predict_layers
+        if self.num_nextn_predict_layers:
+            assert hasattr(self.config, "mtp_spec")
+            self.mtp_spec: ModuleSpec = self.config.mtp_spec
+            self.share_mtp_embedding_and_output_weight = self.config.share_mtp_embedding_and_output_weight
+            self.recompute_mtp_norm = self.config.recompute_mtp_norm
+            self.recompute_mtp_layer = self.config.recompute_mtp_layer
+            self.mtp_loss_scale = self.config.mtp_loss_scale
+            if self.post_process and self.training:
+                self.mtp_layers = torch.nn.ModuleList(
+                    [
+                        MultiTokenPredictor(
+                            self.config,
+                            self.mtp_spec.submodules,
+                            vocab_size=self.vocab_size,
+                            max_sequence_length=self.max_sequence_length,
...
@@ -161,16 +84,10 @@ def gpt_model_init(
-                )
-            )
-
-    if self.pre_process or self.post_process:
-        self.setup_embeddings_and_output_layer()
-
-    if has_config_logger_enabled(self.config):
-        log_config_to_disk(self.config, self.state_dict(), prefix=f'{type(self).__name__}_init_ckpt')
-
-    if self.num_nextn_predict_layers and (self.pre_process or self.post_process):
-        setup_mtp_embeddings(self)
+                )
+
+        if self.num_nextn_predict_layers and (self.pre_process or self.post_process):
+            setup_mtp_embeddings(self)
+
+    return wrapper


 def shared_embedding_or_mtp_embedding_weight(self) -> Tensor:
     """Gets the embedding weight when share embedding and mtp embedding weights set to True.
 ...
@@ -424,10 +341,10 @@ def gpt_model_forward(
     if (
         self.num_nextn_predict_layers
-        and getattr(self.decoder, "final_layernorm", None) is not None
+        and getattr(self.decoder, "main_final_layernorm", None) is not None
     ):
         # move block main model final norms here
-        hidden_states = self.decoder.final_layernorm(hidden_states)
+        hidden_states = self.decoder.main_final_layernorm(hidden_states)

     logits, _ = self.output_layer(
         hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
 ...
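This file replaces a full copy of GPTModel.__init__ (gpt_model_init) with a thin post-init wrapper (gpt_model_init_wrapper) that lets the stock constructor run and then adjusts only what the DCU port changes. A minimal sketch of that wrapper style in isolation; the name init_post_hook_wrapper is made up for illustration, only the pattern is taken from the diff:

from functools import wraps


def init_post_hook_wrapper(fn):
    """Sketch of the style used by gpt_model_init_wrapper: run the original
    __init__ untouched, then patch attributes afterwards."""

    @wraps(fn)
    def wrapper(self, *args, **kwargs):
        fn(self, *args, **kwargs)  # original GPTModel.__init__
        # Post-init adjustments go here, e.g. swapping self.output_layer for a
        # flux linear when USE_FLUX_OVERLAP is set, or attaching MTP layers.
        return None  # __init__ must return None

    return wrapper


# Registered with apply_wrapper=True, the adaptor would effectively do:
#   GPTModel.__init__ = init_post_hook_wrapper(GPTModel.__init__)

Compared with the previous approach of copying the whole constructor, the wrapper only tracks the attributes it actually changes, so upstream changes to GPTModel.__init__ no longer need to be re-copied into the adaptor.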
dcu_megatron/core/tensor_parallel/__init__.py (+2 -3)

 from .layers import (
-    parallel_linear_init_wrapper,
-    ColumnParallelLinearPatch,
-    RowParallelLinearPatch,
+    FluxColumnParallelLinear,
+    FluxRowParallelLinear,
     vocab_parallel_embedding_forward,
     vocab_parallel_embedding_init,
 )
\ No newline at end of file
dcu_megatron/core/tensor_parallel/layers.py (+472 -235)

(diff collapsed in the original page; contents not shown)
dcu_megatron/core/transformer/mtp/multi_token_predictor.py (+18 -13)

 # Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+import os
 import logging
 from dataclasses import dataclass
 from typing import Union, Optional, Literal
 ...
@@ -11,6 +11,7 @@ from megatron.core.models.common.embeddings.language_model_embedding import Lang
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.transformer.module import MegatronModule
+from megatron.core.extensions.transformer_engine import TEColumnParallelLinear
 from megatron.core.fusions.fused_cross_entropy import fused_vocab_parallel_cross_entropy
 from megatron.core.transformer import ModuleSpec, TransformerConfig, build_module
 ...
@@ -136,12 +137,16 @@ class MultiTokenPredictor(MegatronModule):
             self.embedding_activation_buffer = None
             self.grad_output_buffer = None

-        self.output_layer = tensor_parallel.ColumnParallelLinear(
-            config.hidden_size,
+        if int(os.getenv("USE_FLUX_OVERLAP", "0")):
+            column_parallel_linear_impl = FluxColumnParallelLinear
+        else:
+            column_parallel_linear_impl = tensor_parallel.ColumnParallelLinear
+        self.output_layer = column_parallel_linear_impl(
+            self.config.hidden_size,
             self.vocab_size,
-            config=config,
-            init_method=config.init_method,
-            bias=self.add_output_layer_bias,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
             skip_bias_add=False,
             gather_output=not self.parallel_output,
             skip_weight_param_allocation=self.share_mtp_embedding_and_output_weight,
 ...
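Both this module and the patched GPTModel pick the output-projection class with the same run-time switch. A minimal sketch of that selection on its own; the helper name select_column_parallel_linear is made up for illustration, while FluxColumnParallelLinear and tensor_parallel.ColumnParallelLinear are the classes used above:

import os

from megatron.core import tensor_parallel
from dcu_megatron.core.tensor_parallel import FluxColumnParallelLinear


def select_column_parallel_linear():
    """Return the column-parallel linear class, honouring USE_FLUX_OVERLAP.

    The flag is read as an integer, so USE_FLUX_OVERLAP=0 (or unset) keeps the
    stock Megatron implementation, while USE_FLUX_OVERLAP=1 opts into flux.
    """
    if int(os.getenv("USE_FLUX_OVERLAP", "0")):
        return FluxColumnParallelLinear
    return tensor_parallel.ColumnParallelLinear

Keeping the switch at construction time, rather than inside forward, leaves the rest of the model code indifferent to whether flux is enabled.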
dcu_megatron/core/transformer/transformer_block.py (+3 -190)

-from contextlib import nullcontext
-from typing import Optional
 from functools import wraps
-
-import torch
-from torch import Tensor
-
-from megatron.core import InferenceParams, parallel_state, tensor_parallel
-from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.utils import make_viewless_tensor
-
-try:
-    from megatron.core.extensions.transformer_engine import TEDelayedScaling
-
-    HAVE_TE = True
-except ImportError:
-    HAVE_TE = False


 def transformer_block_init_wrapper(fn):
     @wraps(fn)
 ...
@@ -25,178 +8,8 @@ def transformer_block_init_wrapper(fn):
         # mtp require seperate layernorms for main model and mtp modules, thus move finalnorm out of block
         config = args[0] if len(args) > 1 else kwargs['config']
-        self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0
+        if getattr(config, "num_nextn_predict_layers", 0) > 0:
+            self.main_final_layernorm = self.final_layernorm
+            self.final_layernorm = None

     return wrapper
-
-
-def transformer_block_forward(
-    self,
-    hidden_states: Tensor,
-    attention_mask: Tensor,
-    context: Tensor = None,
-    context_mask: Tensor = None,
-    rotary_pos_emb: Tensor = None,
-    rotary_pos_cos: Tensor = None,
-    rotary_pos_sin: Tensor = None,
-    attention_bias: Tensor = None,
-    inference_params: InferenceParams = None,
-    packed_seq_params: PackedSeqParams = None,
-    sequence_len_offset: Tensor = None,
-):
-    """
-    Perform the forward pass through the transformer block.
-
-    This method handles the core computation of the transformer, including
-    self-attention, optional cross-attention, and feed-forward operations.
-
-    Args:
-        hidden_states (Tensor): Input tensor of shape [s, b, h] where s is the
-            sequence length, b is the batch size, and h is the hidden size.
-        attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
-            self-attention.
-        context (Tensor, optional): Context tensor for cross-attention.
-        context_mask (Tensor, optional): Mask for cross-attention context
-        rotary_pos_emb (Tensor, optional): Rotary positional embeddings.
-        attention_bias (Tensor): Bias tensor for Q * K.T of shape in shape broadcastable
-            to [b, num_head, sq, skv], e.g. [1, 1, sq, skv].
-            Used as an alternative to apply attention mask for TE cuDNN attention.
-        inference_params (InferenceParams, optional): Parameters for inference-time
-            optimizations.
-        packed_seq_params (PackedSeqParams, optional): Parameters for packed sequence
-            processing.
-
-    Returns:
-        Union[Tensor, Tuple[Tensor, Tensor]]: The output hidden states tensor of shape
-        [s, b, h], and optionally the updated context tensor if cross-attention is used.
-    """
-    if not self.pre_process:
-        # See set_input_tensor()
-        hidden_states = self.input_tensor
-
-    # Update the inference parameters with the current batch size in case it is variable
-    if inference_params and not self.training:
-        inference_params.current_batch_size = hidden_states.size(1)
-
-    # Viewless tensor.
-    # - We only need to create a viewless tensor in the case of micro batch
-    #   size (mbs) == 1, since in this case, 'hidden_states.transpose()'
-    #   above creates a view tensor, and '.contiguous()' is a pass-through.
-    #   For mbs >= 2, '.contiguous()' creates a new tensor, eliminating
-    #   the need to make it viewless.
-    #
-    #   However, we don't explicitly check mbs == 1 here because
-    #   make_viewless_tensor() has negligible overhead when its input
-    #   is already viewless.
-    #
-    # - For the 'else' case above, calling make_viewless_tensor() here is
-    #   likely redundant, since p2p_communication.py (likely originator)
-    #   already creates viewless tensors. That said, make_viewless_tensor()
-    #   is called here to be future-proof and corner-case-proof.
-    hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
-
-    if self.config.sequence_parallel:
-        rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
-    else:
-        rng_context = nullcontext()
-
-    if self.config.fp8:
-        import transformer_engine  # To keep out TE dependency when not training in fp8
-
-        if self.config.fp8 == "e4m3":
-            fp8_format = transformer_engine.common.recipe.Format.E4M3
-        elif self.config.fp8 == "hybrid":
-            fp8_format = transformer_engine.common.recipe.Format.HYBRID
-        else:
-            raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
-
-        fp8_recipe = TEDelayedScaling(
-            config=self.config,
-            fp8_format=fp8_format,
-            override_linear_precision=(False, False, not self.config.fp8_wgrad),
-        )
-        fp8_group = None
-        if parallel_state.model_parallel_is_initialized():
-            fp8_group = parallel_state.get_amax_reduction_group(
-                with_context_parallel=True, tp_only_amax_red=self.tp_only_amax_red
-            )
-        fp8_context = transformer_engine.pytorch.fp8_autocast(
-            enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
-        )
-    else:
-        fp8_context = nullcontext()
-
-    with rng_context, fp8_context:
-        # Forward pass.
-        if self.config.recompute_granularity == 'full' and self.training:
-            hidden_states = self._checkpointed_forward(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                context=context,
-                context_mask=context_mask,
-                rotary_pos_emb=rotary_pos_emb,
-                attention_bias=attention_bias,
-                packed_seq_params=packed_seq_params,
-            )
-        else:
-            for l_no, layer in enumerate(self.layers):
-                with self.offload_context:
-                    layer.use_cudagraph = True
-                    if (len(self.cuda_graphs) == 0) or (not self.training):
-                        hidden_states, context = layer(
-                            hidden_states=hidden_states,
-                            attention_mask=attention_mask,
-                            context=context,
-                            context_mask=context_mask,
-                            rotary_pos_emb=rotary_pos_emb,
-                            rotary_pos_cos=rotary_pos_cos,
-                            rotary_pos_sin=rotary_pos_sin,
-                            attention_bias=attention_bias,
-                            inference_params=inference_params,
-                            packed_seq_params=packed_seq_params,
-                            sequence_len_offset=sequence_len_offset,
-                        )
-                    else:
-                        # CUDA graph replay for layer `l_no` and microbatch
-                        # `self.current_microbatch`. TransformerEngine versions>=1.10
-                        # allow keyword arguments with CUDA graph. However, CUDA graph
-                        # acccepts only Tensor inputs and Tensor outputs. Hence,
-                        # `inference_params` and `packed_seq_params` are excluded from
-                        # input list while output is limited to `hidden_states`.
-                        cg_index = self.current_microbatch % len(self.cuda_graphs[l_no])
-                        assert not any(
-                            [inference_params, packed_seq_params]
-                        ), "CUDA graph accepts only Tensor inputs."
-                        optional_inputs = self.get_cuda_graph_optional_args(
-                            attention_mask,
-                            context,
-                            context_mask,
-                            rotary_pos_emb,
-                            attention_bias,
-                            inference_params,
-                            packed_seq_params,
-                        )
-                        hidden_states = self.cuda_graphs[l_no][cg_index](
-                            hidden_states, **optional_inputs
-                        )
-
-                if (
-                    torch.is_grad_enabled()
-                    and self.config.cpu_offloading
-                    and self.group_prefetch_offload_commit_async is not None
-                ):
-                    hidden_states = self.group_prefetch_offload_commit_async(hidden_states)
-
-    # Final layer norm.
-    if (not self.move_final_norm_out_of_block) and self.final_layernorm is not None:
-        hidden_states = self.final_layernorm(hidden_states)
-        # TENorm produces a "viewed" tensor. This will result in schedule.py's
-        # deallocate_output_tensor() throwing an error, so a viewless tensor is
-        # created to prevent this.
-        hidden_states = make_viewless_tensor(
-            inp=hidden_states, requires_grad=True, keep_graph=True
-        )
-
-    return hidden_states
dcu_megatron/core/transformer/transformer_config.py (+0 -3)

...
@@ -26,9 +26,6 @@ class ExtraTransformerConfig:
     ##################
     # flux
     ##################
-    use_flux: bool = False
-    """If set, flux will be used in ColumnParallelLinear and RowParallelLinear"""
-
     flux_transpose_weight: bool = False
 ...
dcu_megatron/core/utils.py (+29 -0)

 import torch
 from typing import List, Optional, Union
+from importlib.metadata import version
+from packaging.version import Version as PkgVersion
+
+_flux_version = None
+
+
+def get_flux_version():
+    """Get flux version from __version__; if not available use pip's. Use caching."""
+
+    def get_flux_version_str():
+        import flux
+
+        if hasattr(flux, '__version__'):
+            return str(flux.__version__)
+        else:
+            return version("flux")
+
+    global _flux_version
+    if _flux_version is None:
+        _flux_version = PkgVersion(get_flux_version_str())
+    return _flux_version
+
+
+def is_flux_min_version(version, check_equality=True):
+    """Check if minimum version of `flux` is installed."""
+    if check_equality:
+        return get_flux_version() >= PkgVersion(version)
+    return get_flux_version() > PkgVersion(version)


 def tensor_slide(
 ...
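These helpers mirror Megatron's is_te_min_version pattern for the flux package. A short usage sketch; the version string "1.0.0" is a placeholder, not a requirement stated in this commit:

from dcu_megatron.core.utils import is_flux_min_version

# Guard a flux-only code path behind a minimum version check. The result of
# get_flux_version() is cached, so repeated checks do not re-import flux.
if is_flux_min_version("1.0.0"):
    use_flux_fast_path = True
else:
    use_flux_fast_path = False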
dcu_megatron/legacy/model/rms_norm.py (new file, mode 100644, +101 -0)

# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

import torch
from torch import nn


class RMSNorm(torch.nn.Module):

    def __init__(self,
                 dim: int,
                 eps: float = 1e-6,
                 sequence_parallel: bool = False,
                 config: dict = None):
        """RMS Normaliation module

        Args:
            dim (int): The width of input, i.e. hidden size
            eps (float): epsilon to use for the norm, default to 1e-6
            sequence_parallel (bool): Set to true if sequence parallelism is being used,
              this marks the weights as needing to be allreduced.
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

        setattr(self.weight, 'sequence_parallel', sequence_parallel)

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    @torch.compile(mode="max-autotune-no-cudagraphs")
    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


import torch
from typing import Any, Callable, Dict, Optional, Tuple, Union
import lightop  # rmsnorm_forward, rmsnorm_backward
from functools import partial
from megatron.core.utils import is_torch_min_version

if is_torch_min_version("2.4.0a0"):
    custom_fwd = partial(torch.amp.custom_fwd, device_type="cuda")
    custom_bwd = partial(torch.amp.custom_bwd, device_type="cuda")
else:
    custom_fwd = torch.cuda.amp.custom_fwd
    custom_bwd = torch.cuda.amp.custom_bwd


class _LightopRMSNorm(torch.autograd.Function):
    """RMSNorm implemented with lightop."""

    @staticmethod
    # @custom_fwd
    def forward(ctx, inp: torch.Tensor, weight: torch.Tensor, ln_out: torch.Tensor, eps: float, is_grad_enabled):
        output = lightop.rmsnorm_forward(inp, weight, ln_out, eps, training=True)
        # output = (output, weight)
        rsigma = output[1]
        if is_grad_enabled:
            ctx.save_for_backward(inp, weight, rsigma)
        return output[0]

    @staticmethod
    # @custom_bwd
    def backward(ctx, grad_output):
        inp, weight, rsigma = ctx.saved_tensors
        dgrad, dgamma = lightop.rmsnorm_backward(grad_output, inp, rsigma, weight)
        return dgrad, dgamma, None, None, None


class LightopRMSNorm(torch.nn.Module):

    def __init__(self,
                 dim: int,
                 eps: float = 1e-6,):
        """RMS Normaliation module

        Args:
            dim (int): The width of input, i.e. hidden size
            eps (float): epsilon to use for the norm, default to 1e-6
        """
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    # @no_torch_dynamo()  # dynamically torch._dynamo.disable
    def forward(self, inp: torch.Tensor, is_first_microbatch: Optional[bool] = None):
        if torch.is_grad_enabled():
            fwd_fn = _LightopRMSNorm.apply
            args = []
        else:
            fwd_fn = _LightopRMSNorm.forward
            args = [None]
        ln_out = torch.empty_like(inp, dtype=inp.dtype, memory_format=torch.contiguous_format)
        args += (inp, self.weight, ln_out, self.eps, torch.is_grad_enabled())
        out = fwd_fn(*args)
        return out
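Since both normalization classes live in the same file, a quick equivalence check is a natural way to sanity-test the lightop path. A hedged sketch, assuming a DCU/CUDA device and a working lightop build are available; the tolerances are placeholders:

import torch
from dcu_megatron.legacy.model.rms_norm import RMSNorm, LightopRMSNorm

hidden = 4096
x = torch.randn(8, 2, hidden, device="cuda", dtype=torch.bfloat16)

ref = RMSNorm(dim=hidden).to(device="cuda", dtype=torch.bfloat16)
fast = LightopRMSNorm(dim=hidden).to(device="cuda", dtype=torch.bfloat16)

# Both modules start with an all-ones weight, so their outputs should agree
# up to kernel-level numerical differences.
with torch.no_grad():
    y_ref = ref(x)
    y_fast = fast(x)

torch.testing.assert_close(y_ref, y_fast, rtol=2e-2, atol=2e-2)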
dcu_megatron/legacy/model/transformer.py (+9 -2)

...
@@ -3,14 +3,21 @@ import torch.nn.functional as F
 from megatron.training import get_args
 from megatron.core import tensor_parallel
+from megatron.legacy.model.enums import AttnType
+from megatron.core.models.common.embeddings import apply_rotary_pos_emb
 from megatron.legacy.model.module import MegatronModule
+from megatron.legacy.model.transformer import ParallelMLP
 from megatron.legacy.model.utils import (
     erf_gelu,
     openai_gelu,
 )
+
+try:
+    from einops import rearrange
+except ImportError:
+    rearrange = None


-class ParallelMLP(MegatronModule):
+class ParallelMLPPatch(MegatronModule):
     """MLP.

     MLP will take the input with h hidden state, project it to 4*h
 ...
@@ -74,7 +81,7 @@ class ParallelMLP(MegatronModule):
     )


-class ParallelAttention(MegatronModule):
+class ParallelAttentionPatch(MegatronModule):
     """Parallel self-attention layer abstract class.

     Self-attention layer takes input with size [s, b, h]
 ...
dcu_megatron/legacy/model/utils.py (new file, mode 100644, +26 -0)

from megatron.training import get_args
from megatron.legacy.model import LayerNorm
from .rms_norm import RMSNorm, LightopRMSNorm


def get_norm(config):
    args = get_args()
    if args.normalization == "LayerNorm":
        return LayerNorm(
            config.hidden_size,
            eps=config.layernorm_epsilon,
            no_persist_layer_norm=not config.persist_layer_norm,
            sequence_parallel=config.sequence_parallel,
            apply_layernorm_1p=args.apply_layernorm_1p)
    elif args.normalization == "RMSNorm":
        if args.apply_layernorm_1p:
            raise NotImplementedError('RMSNorm does not currently support the layernorm_1p formulation.')

        return RMSNorm(dim=config.hidden_size,
                       eps=config.layernorm_epsilon,
                       sequence_parallel=config.sequence_parallel)
    elif args.normalization == "LightopRMSNorm":
        return LightopRMSNorm(dim=config.hidden_size,
                              eps=config.layernorm_epsilon)
    else:
        raise Exception(f"unsupported norm type '{args.normalization}'.")
dcu_megatron/training/arguments.py (+14 -3)

...
@@ -51,6 +51,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     # Standard arguments.
     parser = _add_network_size_args(parser)
+    parser = _add_extra_network_size_args(parser)
     parser = _add_regularization_args(parser)
     parser = _add_training_args(parser)
     parser = _add_extra_training_args(parser)
 ...
@@ -106,6 +107,18 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     return args


+def _add_extra_network_size_args(parser):
+    # Remove the original argument
+    remove_original_params(parser, ["normalization"])
+
+    # Redefine the argument
+    group = parser.add_argument_group(title='extra network size args')
+    group.add_argument('--normalization', default='LayerNorm',
+                       choices=['LayerNorm', 'RMSNorm', 'LightopRMSNorm'],
+                       help='Which normalization technique to use.')
+    return parser
+
+
 def _add_extra_distributed_args(parser):
     group = parser.add_argument_group(title='extra distributed args')
     group.add_argument('--rank', default=-1, type=int,
 ...
@@ -169,9 +182,7 @@ def _add_mtp_args(parser):
 def _add_flux_args(parser):
-    group = parser.add_argument_group(title='multi token prediction')
-    group.add_argument('--use-flux', action='store_true', default=False,
-                       help='If set, flux will be used in ColumnParallelLinear and RowParallelLinear')
+    group = parser.add_argument_group(title='flux args')
     group.add_argument('--flux-transpose-weight', action='store_true', default=False,
                        help='Whether to transpose weight when using flux kernel')
     return parser
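remove_original_params is called above but not defined in this diff, so its behaviour can only be inferred: it must strip an already-registered argparse option so --normalization can be re-declared with the extra LightopRMSNorm choice. A hypothetical sketch of such a helper; the name and call signature come from the call above, the body is an assumption based on standard argparse internals:

def remove_original_params(parser, dest_names):
    """Drop previously registered options so they can be re-added with new choices.

    Leans on argparse internals (_actions / _option_string_actions), which is the
    usual way adaptor layers redefine upstream Megatron arguments.
    """
    for action in list(parser._actions):
        if action.dest in dest_names:
            # Forget the option strings (e.g. '--normalization') ...
            for option_string in action.option_strings:
                parser._option_string_actions.pop(option_string, None)
            # ... and the action itself, so add_argument() will not conflict.
            parser._remove_action(action)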
examples/llama/Llama2_70b.sh (new file, mode 100755, +248 -0)

#!/bin/bash

# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1

# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10

export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # print NCCL logs at error level only

source /opt/dtk/env.sh

# pull in the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH

# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0331-release/lib:$LD_LIBRARY_PATH

# torch: fold the multi-stream communication back onto a single compute stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# add synchronization when collecting profiles to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0

# collect rocblas sizes
# export ROCBLAS_LAYER=3
# export HIPBLASLT_LOG_LEVEL=3
# collect flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1

# enlarge the compile cache
export cache_size_limit=64

# lightop kernel library
export PYTORCH_ROCM_ARCH='gfx906;gfx926;gfx936'

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/redpajama_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document

GPT_MODEL_ARGS=(
    --num-layers 80 #80 #80 #40 # 20 #
    --hidden-size 8192
    --ffn-hidden-size 22016 # 28672
    --num-attention-heads 64
    --max-position-embeddings 8192
    --group-query-attention
    --num-query-groups 8
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights # handle the embedding and output weights separately for more flexibility
)

export NVTE_FLASH_ATTN=1 # use the cutlass path
# export NVTE_FLASH_ATTN_TRITON=1 # use the triton flash attention
# --transformer-impl transformer_engine # use these two options for the core path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 512 #32 #512 #256 # 64 #240 #60 #512 #64
    --train-iters 300
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires specifying loss-scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    # --no-check-for-nan-in-loss-and-grad
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in DP communication, average gradients/params directly instead of summing (to one device) first and then averaging
    # --recompute-activations
    # --recompute-granularity full # enable recomputation to cut memory at the cost of time
    # --recompute-num-layers 1 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM, core path only
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM, core path only
    --use-flash-attn
)

# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa

MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 8
    --context-parallel-size 1
    # --num-layers-per-virtual-pipeline-stage 1
    # --microbatch-group-size-per-virtual-pipeline-stage 5
    # --no-overlap-p2p-communication
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #8192 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /public/home/gmhtest_tmp/RedPajama-Data-1T-Sample/tokenizer.model
    # --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 500
    --eval-interval 50
    --eval-iters 3
    --save $SAVE_PATH
    --load $SAVE_PATH
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    --profile-dir prof_data
)

RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u ../../pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
    # enable profiling
    # ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}

case ${LOCAL_RANK} in
[0])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[1])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=1 --membind=1 ${APP}
    ;;
[2])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    numactl --cpunodebind=2 --membind=2 ${APP}
    ;;
[3])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=3 --membind=3 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[4])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=4 --membind=4 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[5])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=5 --membind=5 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[6])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=6 --membind=6 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
[7])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    numactl --cpunodebind=7 --membind=7 ${APP}
    # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
    ;;
esac
\ No newline at end of file
examples/llama/Llama2_7b.sh
0 → 100755
View file @
ce02cd51
#!/bin/bash
# set -eux
for
para
in
$*
do
if
[[
$para
==
--profiling
*
]]
;
then
profiling
=
${
para
#*=
}
fi
done
CURRENT_DIR
=
"
$(
cd
"
$(
dirname
"
$0
"
)
"
&&
pwd
)
"
MEGATRON_PATH
=
$(
dirname
$(
dirname
${
CURRENT_DIR
}
))
#default env
#export FLASH_ATTENTION_PRINT_PARAM=1
export
HSA_FORCE_FINE_GRAIN_PCIE
=
1
export
OMP_NUM_THREADS
=
1
export
NCCL_P2P_LEVEL
=
PXB
# SYS
# export GPU_MAX_HW_QUEUES=10
#export HIP_ALLOC_INITIALIZE=0
export
CUDA_DEVICE_MAX_CONNECTIONS
=
1
# nccl env
export
NCCL_ALGO
=
Ring
export
NCCL_NCHANNELS_PER_PEER
=
16
export
NCCL_MIN_NCHANNELS
=
32
# 20
export
NCCL_MAX_NCHANNELS
=
32
# 20
export
NCCL_IB_TIMEOUT
=
22
export
NCCL_NET_GDR_LEVEL
=
7
export
NCCL_NET_GDR_READ
=
1
export
RCCL_SDMA_COPY_ENABLE
=
0
export
NCCL_IB_HCA
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
NCCL_TOPO_FILE
=
"/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export
GLOG_minloglevel
=
3
# 打印error级别的nccl日志
source
/opt/dtk/env.sh
# hipblaslt库
export
LD_LIBRARY_PATH
=
/data/blas/hipblaslt-install-dtk-25.04-0212/lib:
$LD_LIBRARY_PATH
# rocblas
export
LD_LIBRARY_PATH
=
/data/blas/rocblas-install-0331-release/lib:
$LD_LIBRARY_PATH
# torch控制多流转单流
export
ALLREDUCE_STREAM_WITH_COMPUTE
=
1
export
SENDRECV_STREAM_WITH_COMPUTE
=
1
#增加编译缓存
export
cache_size_limit
=
64
# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH
=
./tmp_7b
TENSORBOARD_LOGS_PATH
=
./tmp_7b
#$2 #<Specify path>
DATA_PATH
=
"/data/datasets/oscar-1GB/oscar-1GB-llama2_text_document"
#<Specify path and file prefix>_text_document
GPT_MODEL_ARGS
=(
--num-layers
32
--hidden-size
4096
--ffn-hidden-size
11008
--num-attention-heads
32
--max-position-embeddings
4096
--normalization
RMSNorm
# LightopRMSNorm
--position-embedding-type
rope
# none #
--untie-embeddings-and-output-weights
# 分开处理embed和输出权重, 增加灵活性
)
export
NVTE_FLASH_ATTN
=
1
# 走cutlass
# export NVTE_FLASH_ATTN_TRITON=1 # 走triton_fa
# --transformer-impl transformer_engine # 走core用这两组参数
# --use-mcore-models
# --transformer-impl local # 走legacy用这两组参数
# --use-legacy-models
TRAINING_ARGS=(
    --transformer-impl local # use these two options for the legacy path
    --use-legacy-models
    --micro-batch-size 1
    --global-batch-size 256 #256 #240 #60 #512 #64
    --train-iters 50
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss-scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in DP communication, gradients or params are averaged directly instead of being summed (onto one device) and then averaged
    # --recompute-granularity full # enable recompute: lowers memory use at the cost of extra time
    # --recompute-num-layers 5 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not adapted yet
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM
    --use-flash-attn
)

# environment variables for the torch flash-attention path
# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 2
    # --context-parallel-size 2
    # --num-layers-per-virtual-pipeline-stage 4
    # --microbatch-group-size-per-virtual-pipeline-stage 1
    # --no-overlap-p2p-communication # when enabled
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama2Tokenizer
    --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-throughput
    --eval-iters 50
    --log-interval 1
    --save-interval 1000
    --eval-interval 1000
    --save $SAVE_PATH
    --load $SAVE_PATH
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7
    --profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u ${MEGATRON_PATH}/pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
    # enable profiling
    # ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
# ${APP}
case ${LOCAL_RANK} in
    [0])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [1])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        numactl --cpunodebind=1 --membind=1 ${APP}
        ;;
    [2])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        numactl --cpunodebind=2 --membind=2 ${APP}
        ;;
    [3])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=3 --membind=3 ${APP}
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [4])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=4 --membind=4 ${APP}
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [5])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=5 --membind=5 ${APP}
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [6])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=6 --membind=6 ${APP}
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
    [7])
        export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        numactl --cpunodebind=7 --membind=7 ${APP}
        # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
        ;;
esac
\ No newline at end of file
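The script above reads its rank layout from the OMPI_COMM_WORLD_* variables and takes the master address as its first positional argument, so it is expected to be launched once per GPU under Open MPI. A rough launch sketch, where the process count, the hostfile name hosts, and the address 10.0.0.1 are placeholders rather than values taken from the repository:
```
# assumed launch pattern: 8 processes per node on 2 nodes, master address passed as $1
mpirun -np 16 -N 8 --hostfile hosts bash examples/llama/Llama2_7b.sh 10.0.0.1
```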
examples/llama/Llama3_405b.sh 0 → 100644
#!/bin/bash
# set -eux
#export FLASH_ATTENTION_PRINT_PARAM=1
# Runs the "405B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=PXB # SYS
#export HIP_ALLOC_INITIALIZE=0
# export GPU_MAX_HW_QUEUES=10
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=32 # 20
export NCCL_MAX_NCHANNELS=32 # 20
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_TOPO_FILE="/public/home/wangxj/Projects/rccl-test/rccl-tests-0204/topo-input.xml"
# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml"
export GLOG_minloglevel=3 # print error-level NCCL logs only

source /opt/dtk/env.sh

# load the hipblaslt library
# export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH

# updated rocblas
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/data/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=/public/home/wangxj/Downloads/blas/rocblas-install-0203-release/lib:$LD_LIBRARY_PATH

# torch: collapse multiple streams into a single stream
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export SENDRECV_STREAM_WITH_COMPUTE=1

# add synchronization for profiling captures to avoid stalls
# export GPU_FLUSH_ON_EXECUTION=1
# export HIP_DIRECT_DISPATCH=0
# capture rocblas sizes
# export ROCBLAS_LAYER=3
# capture flash-attention sizes
# export FLASH_ATTENTION_PRINT_PARAM=1

# increase the compile cache
export cache_size_limit=64

# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b #
SAVE_PATH=./tmp_7b
TENSORBOARD_LOGS_PATH=./tmp_7b #$2 #<Specify path>
DATA_PATH="/public/home/wangxj/Downloads/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama3.2_text_document" #<Specify path and file prefix>_text_document
GPT_MODEL_ARGS=(
    --num-layers 126 #96 #8 # 126
    --hidden-size 16384
    --ffn-hidden-size 53248
    --num-attention-heads 128
    --max-position-embeddings 16384
    --group-query-attention
    --num-query-groups 16
    --normalization RMSNorm
    --position-embedding-type rope
    --untie-embeddings-and-output-weights # keep the embedding and output weights separate for more flexibility
)

export NVTE_FLASH_ATTN=1 # cutlass flash-attention path
# export NVTE_FLASH_ATTN_TRITON=1 # triton flash-attention path

# --transformer-impl transformer_engine # use these two options for the mcore path
# --use-mcore-models
# --transformer-impl local # use these two options for the legacy path
# --use-legacy-models
TRAINING_ARGS=(
    --transformer-impl transformer_engine # use these two options for the mcore path
    --use-mcore-models
    --micro-batch-size 1
    --global-batch-size 6912 # 252 #32 # 64 #240 #60 #512 #64
    --train-iters 100
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.006
    --clip-grad 1.0
    --bf16
    # --fp16 # fp16 requires an explicit loss-scale
    # --loss-scale 1024
    --use-distributed-optimizer
    --disable-bias-linear
    --attention-dropout 0
    --hidden-dropout 0
    # --no-gradient-accumulation-fusion
    --swiglu
    --lr 3.0e-5
    --lr-decay-style cosine
    --min-lr 3.0e-6
    --lr-warmup-iters 1
    --ckpt-format torch
    --ddp-average-in-collective # in DP communication, gradients or params are averaged directly instead of being summed (onto one device) and then averaged
    # --recompute-granularity full # enable recompute: lowers memory use at the cost of extra time
    # --recompute-num-layers 5 #0 #
    # --recompute-method block
    --overlap-grad-reduce # overlap the DDP grad reduce
    # --tp-comm-overlap # overlap tensor-parallel comm with GEMM; this optimization is not adapted yet
    # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter with the dgrad GEMM; this optimization is not adapted yet
    --use-flash-attn-cutlass
)

# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
# export TORCHINDUCTOR_BENCHMARK_FUSION=1
# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1
# export TORCHINDUCTOR_MAX_AUTOTUNE=1
# export TORCHINDUCTOR_CACHE_DIR=./cache
# --use-flash-attn-cutlass # cutlass fa
# --use-flash-attn-triton # triton fa
# --use-flash-attn-torch # torch fa
MODEL_PARALLEL_ARGS=(
    --sequence-parallel
    --tensor-model-parallel-size 8
    --pipeline-model-parallel-size 18 # 7 layer/gpu
    --context-parallel-size 2
)

DATA_ARGS=(
    --data-path $DATA_PATH
    --seq-length 4096 #4096
    --split 949,50,1
    --tokenizer-type Llama3Tokenizer
    --tokenizer-model /public/home/wangxj/Downloads/model_weights/llama3.2/tokenizer.model
    # --tokenizer-model /data/model_weights/llama3.2/tokenizer.model
)

EVAL_AND_LOGGING_ARGS=(
    --log-interval 1
    --log-throughput
    --save-interval 1000
    --eval-interval 1000
    --save $SAVE_PATH
    --load $SAVE_PATH
    --eval-iters 10
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
)

# FINETUNE_ARGS=(
#     # --finetune
#     # --pretrained-checkpoint $CHECKPOINT_PATH
#     --load $CHECKPOINT_PATH
#     --no-load-optim
#     --no-load-rng
# )

PROFILE_ARGS=(
    --profile
    --profile-step-start 4
    --profile-step-end 5
    --use-pytorch-profiler
    --profile-ranks 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
    --profile-dir prof_data
)
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
DIST_URL=${1}
DIST_PORT=34577

DISTRIBUTED_ARGS=(
    --rank ${RANK}
    --world-size ${WORLD_SIZE}
    --local-rank ${LOCAL_RANK}
    --dist-url tcp://${DIST_URL}:${DIST_PORT}
)

APP="python -u pretrain_gpt.py \
    ${GPT_MODEL_ARGS[@]} \
    ${TRAINING_ARGS[@]} \
    ${MODEL_PARALLEL_ARGS[@]} \
    ${DATA_ARGS[@]} \
    ${EVAL_AND_LOGGING_ARGS[@]} \
    ${DISTRIBUTED_ARGS[@]} \
    "
    # enable profiling
    # ${PROFILE_ARGS[@]} \

# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #,
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #,
# export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3,
${APP}
# case ${LOCAL_RANK} in
# [0])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [1])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=1 --membind=1 ${APP}
# ;;
# [2])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# numactl --cpunodebind=2 --membind=2 ${APP}
# ;;
# [3])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=3 --membind=3 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [4])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=4 --membind=4 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [5])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=5 --membind=5 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [6])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=6 --membind=6 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# [7])
# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# numactl --cpunodebind=7 --membind=7 ${APP}
# # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP}
# ;;
# esac
\ No newline at end of file
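The parallel layout in MODEL_PARALLEL_ARGS pins the minimum world size: tensor-parallel 8 × pipeline-parallel 18 × context-parallel 2 gives 288 ranks per data-parallel replica, and with 126 layers each of the 18 pipeline stages holds 7 layers, matching the inline comment. A small arithmetic check (the data-parallel size of 2 is only an example, not a value from the script):
```
# world-size arithmetic implied by MODEL_PARALLEL_ARGS and --num-layers 126
TP=8; PP=18; CP=2; LAYERS=126
echo $((TP * PP * CP))      # 288 ranks per data-parallel replica
echo $((LAYERS / PP))       # 7 layers per pipeline stage
echo $((TP * PP * CP * 2))  # 576 ranks at data-parallel size 2 (example)
```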