lim / dcu_megatron_core_v0.15.0 · Commits

Commit 01bcbb1e, authored Feb 12, 2026 by lim
Initial commit
Parent: 187361d1 · Pipeline #3395 canceled with stages
371 changes · 1 pipeline

Showing 20 changed files with 697 additions and 0 deletions (+697 -0)
dcu_megatron/adaptor/__pycache__/megatron_adaptor.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/__pycache__/patch_utils.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/__init__.py  +19 -0
dcu_megatron/adaptor/features_manager/__pycache__/__init__.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/__pycache__/feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/communication/__pycache__/gradient_compress_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/communication/__pycache__/quantize_comm_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/communication/gradient_compress_feature.py  +56 -0
dcu_megatron/adaptor/features_manager/communication/quantize_comm_feature.py  +52 -0
dcu_megatron/adaptor/features_manager/feature.py  +38 -0
dcu_megatron/adaptor/features_manager/memory/__pycache__/cpu_offload_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/memory/__pycache__/swap_attention_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/memory/cpu_offload_feature.py  +89 -0
dcu_megatron/adaptor/features_manager/memory/swap_attention_feature.py  +57 -0
dcu_megatron/adaptor/features_manager/optimizer/__pycache__/optimizer_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/optimizer/optimizer_feature.py  +49 -0
dcu_megatron/adaptor/features_manager/pipeline_parallel/__pycache__/pipeline_feature.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/pipeline_parallel/pipeline_feature.py  +304 -0
dcu_megatron/adaptor/features_manager/recompute/__pycache__/activation_function.cpython-310.pyc  +0 -0
dcu_megatron/adaptor/features_manager/recompute/activation_function.py  +33 -0
dcu_megatron/adaptor/__pycache__/megatron_adaptor.cpython-310.pyc  (new file, mode 100644)
File added (binary).

dcu_megatron/adaptor/__pycache__/patch_utils.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/__init__.py  (new file, mode 100644)

from .pipeline_parallel.pipeline_feature import PipelineFeature
from .tensor_parallel.parallel_linear_feature import ParallelLinearFeature
from .optimizer.optimizer_feature import OptimizerFeature
from .communication.gradient_compress_feature import GradientCompressFeature
from .communication.quantize_comm_feature import QuantizeCommFeature
from .memory.swap_attention_feature import SwapAttentionFeature
from .memory.cpu_offload_feature import CPUOffloadFeature
from .recompute.activation_function import RecomputeActivationFeature

ADAPTOR_FEATURES = [
    PipelineFeature(),
    OptimizerFeature(),
    ParallelLinearFeature(),
    GradientCompressFeature(),
    QuantizeCommFeature(),
    SwapAttentionFeature(),
    CPUOffloadFeature(),
    RecomputeActivationFeature(),
]
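For orientation, a minimal sketch of how a feature list like ADAPTOR_FEATURES is typically consumed by an adaptor entry point. The driver function and the `patch_manager` parameter here are illustrative assumptions, not this repo's actual API (the real entry point, megatron_adaptor.py, appears on this page only as a compiled .pyc):

# Hypothetical driver illustrating the AbstractFeature life-cycle
# (register_args -> pre_validate_args -> validate_args -> register_patches);
# `apply_features` and `patch_manager` are assumed names, not this repo's API.
from argparse import ArgumentParser

from dcu_megatron.adaptor.features_manager import ADAPTOR_FEATURES


def apply_features(patch_manager):
    parser = ArgumentParser()
    for feature in ADAPTOR_FEATURES:    # 1. each feature contributes CLI flags
        feature.register_args(parser)
    args = parser.parse_args()
    for feature in ADAPTOR_FEATURES:    # 2. args may be rewritten before checking
        args = feature.pre_validate_args(args)
    for feature in ADAPTOR_FEATURES:    # 3. cross-flag consistency checks
        feature.validate_args(args)
    for feature in ADAPTOR_FEATURES:    # 4. monkey-patch Megatron where enabled
        feature.register_patches(patch_manager, args)
    return args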
dcu_megatron/adaptor/features_manager/__pycache__/__init__.cpython-310.pyc  (new file, mode 100644)
File added (binary).

dcu_megatron/adaptor/features_manager/__pycache__/feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).

dcu_megatron/adaptor/features_manager/communication/__pycache__/gradient_compress_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).

dcu_megatron/adaptor/features_manager/communication/__pycache__/quantize_comm_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/communication/gradient_compress_feature.py  (new file, mode 100755)

from argparse import ArgumentParser

from ..feature import AbstractFeature


class GradientCompressFeature(AbstractFeature):

    def __init__(self):
        super().__init__('enable-dynamic-grad-comp')

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group = parser.add_argument_group(title='grad comp args')
        group.add_argument('--enable-dynamic-grad-comp', dest='enable_dynamic_grad_comp',
                           action='store_true',
                           help='Enable dynamic gradient compression (e.g., adaptive rank/sparsity '
                                'based on training phase or gradient statistics).')
        group.add_argument('--grad-comp', dest='grad_comp', action='store_true',
                           help='Use the gradient compression algorithm for data parallelism.')
        group.add_argument('--grad-comp-warm-up', type=float, default=0.1,
                           help='PowerSGD warm-up period for accuracy gain.')
        group.add_argument('--rank-adjust-window-size', type=int, default=1000,
                           help='Window size (in iterations) for adjusting the compression rank.')
        group.add_argument('--iteration-sample-ratio', type=float, default=0.01,
                           help='Ratio of iterations to sample.')
        group.add_argument('--gradient-sample-ratio', type=float, default=1.0,
                           help='Ratio of gradient entries to sample.')
        group.add_argument('--collect-log-path', type=str, default='./logs',
                           help='If set, collect data during the iteration process, '
                                'such as the time and loss of each iteration.')

    def register_patches(self, patch_manager, args):
        from dcu_megatron.core.distributed.finalize_model_grads import finalize_model_grads
        from dcu_megatron.core.distributed.param_and_grad_buffer import _ParamAndGradBucketGroup, _ParamAndGradBuffer, \
            _ParamAndGradBucket
        from dcu_megatron.training.training import save_checkpoint_and_time_wrapper
        from dcu_megatron.training.training import pretrain

        # Swap in the EDGC-related functions.
        if args.enable_dynamic_grad_comp:
            patch_manager.register_patch(
                'megatron.core.distributed.finalize_model_grads.finalize_model_grads',
                finalize_model_grads)
            patch_manager.register_patch(
                'megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup',
                _ParamAndGradBucketGroup)
            patch_manager.register_patch(
                'megatron.core.distributed.param_and_grad_buffer._ParamAndGradBuffer._new_bucket',
                _ParamAndGradBuffer._new_bucket)
            patch_manager.register_patch(
                'megatron.core.distributed.param_and_grad_buffer._ParamAndGradBucket',
                _ParamAndGradBucket)
            patch_manager.register_patch(
                'megatron.training.training.save_checkpoint_and_time',
                save_checkpoint_and_time_wrapper, apply_wrapper=True)
            patch_manager.register_patch('megatron.training.training.pretrain', pretrain)
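The --grad-comp-warm-up help text references PowerSGD, a low-rank gradient compression scheme. The repo's own implementation lives in the dcu_megatron.core.distributed modules patched above; as a self-contained illustration of the same idea, here is PyTorch's built-in PowerSGD DDP communication hook, which likewise runs uncompressed for a warm-up period before compressing:

# Unrelated-to-this-repo sketch: PowerSGD-style gradient compression with
# plain PyTorch DDP.  Gradients are all-reduced uncompressed for the first
# `start_powerSGD_iter` steps (the warm-up that --grad-comp-warm-up alludes
# to), then approximated by rank-`matrix_approximation_rank` factor matrices.
import torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook as powerSGD
from torch.nn.parallel import DistributedDataParallel as DDP


def enable_powersgd(ddp_model: DDP, total_iters: int, warm_up_ratio: float = 0.1):
    state = powerSGD.PowerSGDState(
        process_group=None,                           # default process group
        matrix_approximation_rank=4,                  # compression rank
        start_powerSGD_iter=max(2, int(total_iters * warm_up_ratio)),
    )
    ddp_model.register_comm_hook(state, powerSGD.powerSGD_hook)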
dcu_megatron/adaptor/features_manager/communication/quantize_comm_feature.py  (new file, mode 100644)

from argparse import ArgumentParser

from ..feature import AbstractFeature

QUANT_BIT_DEFAULT_GROUP_SIZE_MAP = {
    4: 32,
    8: 128,
}

QUANT_BIT_GROUP_SIZE_CHOICES_MAP = {
    4: {16, 32},
    8: {64, 128},
}


class QuantizeCommFeature(AbstractFeature):

    def __init__(self):
        super().__init__('use-quantize-comm', 2)

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--use-quantize-comm', default=False, action="store_true",
                           help='use quantized communication')
        group.add_argument('--quant-comm-bits', type=int, default=8, choices=[4, 8],
                           help='the number of bits to quantize to; supported values are 4 and 8')
        group.add_argument('--quant-group-size', type=int, default=None,
                           help='the group size to use for quantization; if not specified, '
                                'uses per-column quantization')
        group.add_argument('--quant-scale-dtype', type=str, default="bf16",
                           choices=["bf16", "fp16", "fp32"],
                           help='the dtype of the quantization scale')

    def validate_args(self, args):
        assert args.quant_comm_bits in {4, 8}, \
            f"quant_comm_bits {args.quant_comm_bits} only accepts values from [4, 8]"
        if (
            args.quant_group_size is not None
            and args.quant_group_size not in QUANT_BIT_GROUP_SIZE_CHOICES_MAP[args.quant_comm_bits]
        ):
            raise ValueError(
                f"quant_group_size {args.quant_group_size} only accepts values from "
                f"{QUANT_BIT_GROUP_SIZE_CHOICES_MAP[args.quant_comm_bits]}"
            )

    def register_patches(self, patch_manager, args):
        from dcu_megatron.core.tensor_parallel.mappings import all_to_all

        if args.use_quantize_comm:
            patch_manager.register_patch('megatron.core.tensor_parallel.mappings.all_to_all', all_to_all)
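To make --quant-comm-bits, --quant-group-size, and --quant-scale-dtype concrete: group-wise quantization stores one scale per group of group_size consecutive elements, with the scale kept in a configurable dtype. The sketch below is an assumption-level illustration of symmetric 8-bit group quantization, not the repo's patched all_to_all:

# Illustrative symmetric group-wise quantization (8-bit case).  One scale per
# group of `group_size` elements, kept in a configurable dtype -- mirroring
# --quant-group-size and --quant-scale-dtype.  Not this repo's implementation.
import torch


def quantize_groupwise(x: torch.Tensor, group_size: int, scale_dtype=torch.bfloat16):
    groups = x.reshape(-1, group_size)
    scale = groups.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
    q = torch.round(groups / scale).clamp(-127, 127).to(torch.int8)
    return q, scale.to(scale_dtype)


def dequantize_groupwise(q: torch.Tensor, scale: torch.Tensor, shape):
    return (q.to(torch.float32) * scale.to(torch.float32)).reshape(shape)


x = torch.randn(4, 128)
q, s = quantize_groupwise(x, group_size=128)   # 128 is the default group size for 8 bits
x_hat = dequantize_groupwise(q, s, x.shape)
print((x - x_hat).abs().max())                 # small quantization error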
dcu_megatron/adaptor/features_manager/feature.py  (new file, mode 100644)

import argparse


class AbstractFeature:

    def __init__(self, feature_name: str, optimization_level: int = 2):
        self.feature_name = feature_name.strip().replace('-', '_')
        self.optimization_level = optimization_level
        self.default_patches = self.optimization_level == 0

    def register_args(self, parser):
        pass

    def pre_validate_args(self, args):
        return args

    def validate_args(self, args):
        pass

    def post_validate_args(self, args):
        pass

    def register_patches(self, patch_manager, args):
        ...

    def incompatible_check(self, global_args, check_args):
        if getattr(global_args, self.feature_name, None) and getattr(global_args, check_args, None):
            raise AssertionError('{} and {} are incompatible.'.format(self.feature_name, check_args))

    def dependency_check(self, global_args, check_args):
        if getattr(global_args, self.feature_name, None) and not getattr(global_args, check_args, None):
            raise AssertionError('{} requires {}.'.format(self.feature_name, check_args))

    @staticmethod
    def add_parser_argument_choices_value(parser, argument_name, new_choice):
        for action in parser._actions:
            exist_arg = isinstance(action, argparse.Action) and argument_name in action.option_strings
            if exist_arg and action.choices is not None and new_choice not in action.choices:
                action.choices.append(new_choice)
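A short usage example of the base class above; the feature itself is hypothetical, but the contract (flag name normalization, dependency_check) is exactly what feature.py defines:

# Hypothetical subclass exercising the AbstractFeature contract end to end.
from argparse import ArgumentParser

from dcu_megatron.adaptor.features_manager.feature import AbstractFeature


class FusedRoPEFeature(AbstractFeature):       # made-up feature for illustration
    def __init__(self):
        super().__init__('use-fused-rope')      # stored as 'use_fused_rope'

    def register_args(self, parser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--use-fused-rope', action='store_true')

    def validate_args(self, args):
        # e.g. a fused kernel that only works alongside some other flag:
        self.dependency_check(args, 'bf16')


feature = FusedRoPEFeature()
parser = ArgumentParser()
parser.add_argument('--bf16', action='store_true')
feature.register_args(parser)
args = parser.parse_args(['--use-fused-rope', '--bf16'])
feature.validate_args(args)                     # passes; drop --bf16 and it raises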
dcu_megatron/adaptor/features_manager/memory/__pycache__/cpu_offload_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).

dcu_megatron/adaptor/features_manager/memory/__pycache__/swap_attention_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/memory/cpu_offload_feature.py  (new file, mode 100644)

import os
from argparse import ArgumentParser

from ..feature import AbstractFeature


class CPUOffloadFeature(AbstractFeature):

    def __init__(self):
        super().__init__('fine-grained-activation-offloading', 2)

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--fine-grained-activation-offloading', action='store_true',
                           help='Offload activations to the CPU.')
        group.add_argument('--offload-modules', nargs='*', type=str, default=None,
                           help='The submodules to offload. '
                                'choices: "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", '
                                '"expert_fc1", "expert_fc2", "shared_fc1", "shared_fc2", "moe_act". '
                                'default: ["core_attn"]. '
                                '"attn_norm": offload the input of the normalization in the attention part. '
                                '"qkv_linear": offload the qkv_linear part of the transformer layer. '
                                '"core_attn": offload the core attention part of the transformer layer. '
                                '"attn_proj": offload the input of the attention linear projection part. '
                                '"mlp_norm": offload the input of the normalization in the MLP part. '
                                '"expert_fc1": offload the input of the expert fc1 part. '
                                '"expert_fc2": offload the input of the expert fc2 part. '
                                '"shared_fc1": offload the shared_fc1 part of the transformer layer. '
                                '"shared_fc2": offload the shared_fc2 part of the transformer layer. '
                                '"moe_act": offload the activation function part of the MoE layer.')
        group.add_argument('--min-offloaded-tensor-size', type=int, default=1024 * 1024,
                           help='The minimum size of a tensor for it to be offloaded.')

    def register_patches(self, patch_manager, args):
        from dcu_megatron.core.models.gpt.gpt_model import GPTModel
        from dcu_megatron.core.transformer.attention import Attention
        from dcu_megatron.core.transformer.multi_latent_attention import MultiLatentAttention
        from dcu_megatron.core.transformer.moe.experts import TEGroupedMLP
        from dcu_megatron.core.transformer.mlp import MLP
        from dcu_megatron.core.transformer.transformer_layer import TransformerLayer
        from dcu_megatron.core.transformer.transformer_block import TransformerBlock
        from dcu_megatron.core.extensions.transformer_engine import te_module_init_wrapper
        from dcu_megatron.core.pipeline_parallel.schedules import forward_backward_pipelining_wrapper
        from dcu_megatron.core.transformer.multi_token_prediction import MultiTokenPredictionBlock
        from dcu_megatron.core.tensor_parallel.random import CheckpointWithoutOutput
        from dcu_megatron.core.models.gpt.fine_grained_callables import build_layer_callables_without_split_attn

        patch_manager.register_patch(
            'megatron.core.models.gpt.gpt_model.GPTModel.preprocess_for_fine_grained_offloading',
            GPTModel.preprocess_for_fine_grained_offloading, create_dummy=True)
        patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
                                     GPTModel.__init__)
        patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.build_schedule_plan',
                                     GPTModel.build_schedule_plan)
        patch_manager.register_patch('megatron.core.transformer.attention.Attention.forward',
                                     Attention.forward)
        patch_manager.register_patch(
            'megatron.core.transformer.multi_latent_attention.MultiLatentAttention.forward',
            MultiLatentAttention.forward)
        patch_manager.register_patch('megatron.core.transformer.moe.experts.TEGroupedMLP.forward',
                                     TEGroupedMLP.forward)
        patch_manager.register_patch(
            'megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_with_interleaving',
            forward_backward_pipelining_wrapper, apply_wrapper=True)
        patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward', MLP.forward)
        patch_manager.register_patch(
            'megatron.core.pipeline_parallel.schedules.forward_backward_pipelining_without_interleaving',
            forward_backward_pipelining_wrapper, apply_wrapper=True)
        patch_manager.register_patch(
            'megatron.core.transformer.transformer_layer.TransformerLayer._forward_attention',
            TransformerLayer._forward_attention)
        patch_manager.register_patch(
            'megatron.core.transformer.transformer_block.TransformerBlock.forward',
            TransformerBlock.forward)
        patch_manager.register_patch(
            'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.forward',
            MultiTokenPredictionBlock.forward)
        patch_manager.register_cls_funcs(
            'megatron.core.tensor_parallel.random.CheckpointWithoutOutput',
            [CheckpointWithoutOutput.checkpoint, CheckpointWithoutOutput._recompute],
            create_dummy=True)
        patch_manager.register_patch(
            'megatron.core.models.gpt.fine_grained_callables.build_layer_callables',
            build_layer_callables_without_split_attn)
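The patched classes above implement the offloading itself. As a standalone sketch of the underlying mechanism, PyTorch's saved-tensor hooks can move activations that are saved for backward to the CPU, gated by a size threshold like --min-offloaded-tensor-size:

# Standalone sketch of fine-grained activation offloading via PyTorch's
# saved-tensor hooks (not this repo's implementation).  Tensors saved for
# backward with at least `min_size` elements are parked on the CPU and
# copied back on demand, mirroring the --min-offloaded-tensor-size threshold.
import torch


def offload_to_cpu(min_size=1024 * 1024):
    def pack(t):
        if t.numel() >= min_size and t.is_cuda:
            return (t.device, t.to('cpu', non_blocking=True))
        return (None, t)                       # small tensors stay where they are

    def unpack(packed):
        device, t = packed
        return t if device is None else t.to(device, non_blocking=True)

    return torch.autograd.graph.saved_tensors_hooks(pack, unpack)

# usage:
#   with offload_to_cpu():
#       y = block(x)        # activations saved for backward go to CPU
#   y.sum().backward()      # they are pulled back during the backward pass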
dcu_megatron/adaptor/features_manager/memory/swap_attention_feature.py  (new file, mode 100644)

from argparse import ArgumentParser

from ..feature import AbstractFeature
from megatron.core.utils import is_te_min_version


class SwapAttentionFeature(AbstractFeature):

    def __init__(self):
        super().__init__('swap-attention', 2)

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--swap-attention', action='store_true', default=False,
                           help='Switch to enable the swap-attention feature. '
                                'The default is False.')
        # input_layernorm, self_attention, post_attention_norm
        group.add_argument('--swap-modules', type=str, default="self_attention",
                           help='Swap modules for the model. Can be used together with "--swap-attention".')
        group.add_argument('--specify-layers', type=str, default=None,
                           help='Specify the layers to swap, e.g. "0, 2, 4, 6". '
                                'Can be used together with "--swap-attention".')
        group.add_argument('--reduce-recompute-for-last-chunk', action='store_true', default=False,
                           help='Reduce recomputation for the last chunk; used together with recompute. '
                                'The default is False.')

    def validate_args(self, args):
        adaptive_recompute_device_size = getattr(args, 'adaptive_recompute_device_size', -1)
        adaptive_recompute_device_swap = getattr(args, 'adaptive_recompute_device_swap', False)
        if (adaptive_recompute_device_size > 0 or adaptive_recompute_device_swap) and args.swap_attention:
            raise AssertionError('adaptive selective recompute is not compatible with the swap_attention feature')
        self.incompatible_check(args, 'adaptive_memory_optimization')
        is_enable_lora = hasattr(args, "lora_target_modules") and args.lora_target_modules
        if is_enable_lora:
            raise AssertionError('swap attention is not compatible with LoRA')

    def register_patches(self, patch_manager, args):
        if getattr(args, self.feature_name, None):
            if hasattr(args, "use_mcore_models") and args.use_mcore_models:
                if not is_te_min_version("2.5.0") and hasattr(args, "overlap_grad_reduce") and args.overlap_grad_reduce:
                    raise AssertionError(
                        "overlap_grad_reduce requires at least transformer-engine version 2.5.0")
            from dcu_megatron.core.memory.swap_attention.adaptor_swap_atten import allowed_recomputing_swap_module_wrapper
            from megatron.legacy.model.transformer import ParallelTransformerLayer
            from megatron.core.transformer.transformer_layer import TransformerLayer
            if hasattr(args, "use_legacy_models") and not args.use_legacy_models:
                allowed_recomputing_swap_module_wrapper(TransformerLayer)
            else:
                allowed_recomputing_swap_module_wrapper(ParallelTransformerLayer)
            from dcu_megatron.core.memory.swap_attention.adaptor_swap_atten import setup_model_and_optimizer_wrapper
            patch_manager.register_patch('megatron.training.training.setup_model_and_optimizer',
                                         setup_model_and_optimizer_wrapper)
            from dcu_megatron.core.memory.common import linear_forward_main_grad_wrapper, linear_backward_main_grad_wrapper
            patch_manager.register_patch(
                'megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.forward',
                linear_forward_main_grad_wrapper)
            patch_manager.register_patch(
                'megatron.core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication.backward',
                linear_backward_main_grad_wrapper)
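--specify-layers takes a comma-separated string of layer indices. A sketch of how such a value might be parsed into the set of layers to swap; this is an assumption for illustration, since the actual parsing lives in dcu_megatron.core.memory.swap_attention.adaptor_swap_atten:

# Assumed parsing of --specify-layers (the real logic is in
# dcu_megatron.core.memory.swap_attention.adaptor_swap_atten, not shown here).
def parse_specify_layers(spec, num_layers):
    if spec is None:
        return set(range(num_layers))          # swap every layer by default
    layers = {int(tok) for tok in spec.split(',') if tok.strip()}
    assert all(0 <= i < num_layers for i in layers), "layer index out of range"
    return layers


print(parse_specify_layers("0, 2, 4, 6", num_layers=8))   # {0, 2, 4, 6}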
dcu_megatron/adaptor/features_manager/optimizer/__pycache__/optimizer_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/optimizer/optimizer_feature.py  (new file, mode 100644)

from argparse import ArgumentParser

from megatron.core.utils import is_te_min_version

from ..feature import AbstractFeature


class OptimizerFeature(AbstractFeature):

    def __init__(self):
        super().__init__('use-optimizer-feature')

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--use-optimizer-feature', action='store_true',
                           help='Whether to use optimizer-related features.')
        group.add_argument('--reuse-fp32-param', action='store_true',
                           help='The distributed training optimizer frees up '
                                'param copies of FP32 to save memory.')

    def validate_args(self, args):
        if args.reuse_fp32_param and not args.bf16:
            raise AssertionError('--reuse-fp32-param is only supported with `bf16`')

    def register_patches(self, patch_manager, args):
        if args.reuse_fp32_param:
            from dcu_megatron.core.memory.reuse_param.adaptor import (
                step_with_ready_grads, prepare_grads,
                reuse_fp32_param_init_wrapper, optimizer_config_init_wrapper)
            from dcu_megatron.core.memory.reuse_param.adaptor import reuse_fp32_param_distrib_optimizer_init_wrapper
            from dcu_megatron.core.memory.reuse_param.adaptor import reuse_fp32_param_param_and_grad_buffer_init_wrapper

            patch_manager.register_patch(
                'megatron.core.optimizer.optimizer.MixedPrecisionOptimizer.prepare_grads',
                prepare_grads)
            patch_manager.register_patch(
                'megatron.core.optimizer.optimizer.MixedPrecisionOptimizer.step_with_ready_grads',
                step_with_ready_grads)
            patch_manager.register_patch(
                'megatron.core.optimizer.optimizer.Float16OptimizerWithFloat16Params.__init__',
                reuse_fp32_param_init_wrapper)
            patch_manager.register_patch(
                'megatron.core.optimizer.optimizer_config.OptimizerConfig.__init__',
                optimizer_config_init_wrapper)
            patch_manager.register_patch(
                'megatron.core.optimizer.distrib_optimizer.DistributedOptimizer.__init__',
                reuse_fp32_param_distrib_optimizer_init_wrapper)
            patch_manager.register_patch(
                'megatron.core.distributed.param_and_grad_buffer._ParamAndGradBuffer.__init__',
                reuse_fp32_param_param_and_grad_buffer_init_wrapper)
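Why --reuse-fp32-param requires bf16: an FP32 value is bit-for-bit a BF16 value (its high 16 bits) plus a 16-bit mantissa residual (its low 16 bits), so the optimizer's FP32 master copy need not be stored separately from the BF16 model weights. The demonstration below shows only that lossless split; it assumes a little-endian platform and is not the repo's wrappers, which restructure the actual optimizer buffers:

# Demonstrates the bf16 + 16-bit-residual identity behind --reuse-fp32-param.
# Little-endian assumption: viewing fp32 as pairs of int16 puts the low half
# first and the high half (the bf16 payload) second.
import torch

master = torch.randn(8, dtype=torch.float32)
halves = master.view(torch.int16).view(-1, 2)
residual = halves[:, 0].clone()                                # low 16 bits
bf16_weight = halves[:, 1].contiguous().view(torch.bfloat16)   # truncated bf16 value

# Exact reconstruction of the fp32 master from the two 16-bit halves:
rebuilt = torch.stack([residual, bf16_weight.view(torch.int16)], dim=1) \
               .view(torch.float32).flatten()
assert torch.equal(rebuilt, master)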
dcu_megatron/adaptor/features_manager/pipeline_parallel/__pycache__/pipeline_feature.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/pipeline_parallel/pipeline_feature.py  (new file, mode 100644)

import os
import re
from argparse import ArgumentParser

from megatron.core import parallel_state
from megatron.core.utils import is_te_min_version, is_torch_min_version

from ..feature import AbstractFeature


def _eval_pattern(pattern):
    """Validate and evaluate a string containing a Python list expression."""
    assert isinstance(pattern, str)
    # Validate the input: only commas, digits, [, ], (, ), +, and * are allowed.
    if bool(re.compile(r'[^,\d\[\]\(\)\+\*]').search(pattern)):
        raise ValueError(f"Invalid pattern: {pattern}")
    return eval(pattern)


def num_layers_build_type(x):
    """Number of layers to build.

    Accepts either:
    - An integer N: meaning N layers for each model block
    - A string "N": same as above, but provided as a string
    - A string containing a Python list expression that defines a custom pattern, e.g.:
      "([1]*3+[2]*1)*3" evaluates to [1,1,1,2,1,1,1,2,1,1,1,2]
      The pattern length must match the total number of transformer blocks.
    """
    if isinstance(x, int):
        return x
    assert isinstance(x, str)
    if '[' in x:
        # It's a custom pattern.
        return _eval_pattern(x)
    else:
        # It's a single int, but given as a string.
        return int(x)


class PipelineFeature(AbstractFeature):

    def __init__(self):
        super().__init__('schedule-method')

    def register_args(self, parser: ArgumentParser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--schedule-method', type=str, default='vanilla',
                           choices=['vanilla', 'dualpipev', 'seq1f1b', 'interleaved_seq1f1b'],
                           help='Use the pipeline schedule provided by Megatron if '
                                'schedule-method is set to vanilla.')
        # MoE communication overlap arguments.
        group.add_argument('--overlap-ep-comm-with-split-attn', action="store_true", default=False,
                           help='Whether to split attention.')
        group.add_argument('--num-layers-to-build', type=num_layers_build_type, default=None,
                           help='Number of layers to build: '
                                '- An integer N: meaning N layers for each model block. '
                                '- A string containing a Python list expression that defines a custom pattern.')
        # Vocabulary parallelism.
        group.add_argument('--enable-vocab-parallel', action='store_true',
                           help='Enables vocabulary parallelism at the vocabulary layers. '
                                'Must be enabled together with pipeline model parallelism.')
        group.add_argument('--disable-backward-fusion', action='store_true',
                           help='Disables the forward-backward fusion for the output '
                                'layer; requires two communication barriers instead of one.')
        group.add_argument('--schedule-timer-start', type=int, default=10,
                           help='Start iteration of the vocabulary parallelism schedule timer.')
        group.add_argument('--schedule-timer-end', type=int, default=20,
                           help='End iteration of the vocabulary parallelism schedule timer.')

    def pre_validate_args(self, args):
        if args.schedule_method != "dualpipev":
            return args
        pp_size = args.pipeline_model_parallel_size * 2
        if args.num_layers is None and args.num_layers_to_build is not None:
            pp_size = args.pipeline_model_parallel_size
            if isinstance(args.num_layers_to_build, int):
                args.num_layers = args.num_layers_to_build * pp_size * 2
            else:
                assert len(args.num_layers_to_build) == pp_size * 2, \
                    "The pattern length must match the total number of transformer blocks"
                args.num_layers = sum(args.num_layers_to_build)
        return args

    def validate_args(self, args):
        if args.schedule_method == "dualpipev":
            if args.delay_wgrad_compute and args.overlap_grad_reduce:
                assert bool(int(os.getenv("NVTE_OVERLAP_GRAD_REDUCE", "0"))), \
                    "NVTE_OVERLAP_GRAD_REDUCE should be set to 1 when --delay-wgrad-compute and --overlap-grad-reduce are set"
        if args.schedule_method == "dualpipev":
            if args.num_layers_per_virtual_pipeline_stage is not None or \
                    args.num_virtual_stages_per_pipeline_rank is not None:
                raise AssertionError("The dualpipev and virtual_pipeline are incompatible.")
            layers_to_distribute = args.num_layers
            pipeline_stages_left = args.pipeline_model_parallel_size * 2
            if args.num_layers_to_build is not None:
                assert args.decoder_first_pipeline_num_layers is None and \
                    args.decoder_last_pipeline_num_layers is None, \
                    "--decoder-first-pipeline-num-layers and --decoder-last-pipeline-num-layers should NOT be set when using --num-layers-to-build"
                if isinstance(args.num_layers_to_build, int):
                    assert args.num_layers_to_build * pipeline_stages_left == layers_to_distribute, \
                        "num-layers-to-build mismatch with num-layers"
                else:
                    assert len(args.num_layers_to_build) == pipeline_stages_left, \
                        "The pattern length must match the total number of transformer blocks"
                    assert sum(args.num_layers_to_build) == args.num_layers
            if args.decoder_first_pipeline_num_layers is not None and \
                    args.decoder_last_pipeline_num_layers is not None:
                if args.decoder_first_pipeline_num_layers is not None:
                    layers_to_distribute -= args.decoder_first_pipeline_num_layers
                    pipeline_stages_left -= 1
                if args.decoder_last_pipeline_num_layers is not None:
                    layers_to_distribute -= args.decoder_last_pipeline_num_layers
                    pipeline_stages_left -= 1
                if layers_to_distribute < pipeline_stages_left:
                    raise AssertionError('number of layers must be at least 2*pipeline_model_parallel_size in dualpipe')
            num_micro_batch = args.global_batch_size // args.micro_batch_size // args.data_parallel_size
            if num_micro_batch < args.pipeline_model_parallel_size:
                raise AssertionError("num_micro_batch should NOT be smaller than pipeline_model_parallel_size")
            if not args.delay_wgrad_compute:
                raise AssertionError("delay-wgrad-compute should be True")
            if not is_te_min_version("2.4.0"):
                raise AssertionError("Must have at least transformer-engine version 2.4.0")
        if args.overlap_moe_expert_parallel_comm:
            assert args.transformer_impl == "transformer_engine", \
                "moe a2a overlap is only supported with the transformer_engine implementation"
            assert args.schedule_method == "dualpipev" or \
                args.num_layers_per_virtual_pipeline_stage is not None or \
                args.num_virtual_stages_per_pipeline_rank is not None, \
                'moe a2a overlap is only supported with vpp or dualpipev'
        # Vocabulary parallelism.
        if args.enable_vocab_parallel:
            assert args.pipeline_model_parallel_size > 1, \
                'pipeline parallel size must be > 1 when vocab parallel is enabled'
            assert args.virtual_pipeline_model_parallel_size is None, \
                'vocab parallel with interleaved schedule is not supported yet'
            assert (
                args.make_vocab_size_divisible_by %
                (args.tensor_model_parallel_size * args.pipeline_model_parallel_size) == 0
            ), f'vocab size must be divisible by model parallel size ({args.tensor_model_parallel_size * args.pipeline_model_parallel_size}) for vocab parallel'
            assert args.untie_embeddings_and_output_weights, \
                '--enable-vocab-parallel requires untied embeddings and output weights'
        else:
            args.disable_backward_fusion = False

    def register_patches(self, patch_manager, args):
        from dcu_megatron.core.pipeline_parallel.schedules import get_forward_backward_func_wrapper
        patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.get_forward_backward_func',
                                     get_forward_backward_func_wrapper, apply_wrapper=True)

        if args.schedule_method == "dualpipev":
            from megatron.training.utils import print_rank_0
            from dcu_megatron.core.pipeline_parallel.dualpipev.dualpipev_chunks import (
                dualpipev_fp16forward, get_num_layers_to_build, _allreduce_embedding_grads_wrapper)
            from dcu_megatron.training.training import evaluate
            from dcu_megatron.core.transformer.transformer_layer import get_transformer_layer_offset
            from dcu_megatron.training.training import pretrain
            from dcu_megatron.core.models.gpt.gpt_model import GPTModel
            from dcu_megatron.training.global_vars import _set_tensorboard_writer, _set_wandb_writer, _set_one_logger
            from dcu_megatron.core.models.common.language_module.language_module import LanguageModule
            from dcu_megatron.core.transformer.multi_token_prediction import get_mtp_num_layers_to_build
            from dcu_megatron.core.tensor_parallel.layers import VocabParallelEmbedding
            from dcu_megatron.core.transformer.multi_token_prediction import tie_word_embeddings_state_dict_wrapper
            from dcu_megatron.core.pipeline_parallel.schedules import forward_step_calc_loss
            from dcu_megatron.core.distributed.distributed_data_parallel import DistributedDataParallel

            patch_manager.register_patch('megatron.core.transformer.module.Float16Module.forward',
                                         dualpipev_fp16forward)
            patch_manager.register_patch('megatron.core.transformer.transformer_block.get_num_layers_to_build',
                                         get_num_layers_to_build)
            patch_manager.register_patch('megatron.training.utils.print_rank_last', print_rank_0)
            patch_manager.register_patch('megatron.core.distributed.finalize_model_grads._allreduce_embedding_grads',
                                         _allreduce_embedding_grads_wrapper)
            # Use the first rank.
            patch_manager.register_patch('megatron.training.training.evaluate', evaluate)
            patch_manager.register_patch('megatron.core.transformer.transformer_layer.get_transformer_layer_offset',
                                         get_transformer_layer_offset)
            # Support dualpipev with two data iterators.
            patch_manager.register_patch('megatron.training.training.pretrain', pretrain)
            # (1) Introduce an attribute dualpipev_first_chunk. (2) Remove the embedding when using dualpipev.
            patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.__init__',
                                         GPTModel.__init__)
            patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.shared_embedding_or_output_weight',
                                         GPTModel.shared_embedding_or_output_weight)
            # Set _GLOBAL_TENSORBOARD_WRITER, _GLOBAL_WANDB_WRITER, _GLOBAL_ONE_LOGGER.
            patch_manager.register_patch('megatron.training.global_vars._set_tensorboard_writer',
                                         _set_tensorboard_writer)
            patch_manager.register_patch('megatron.training.global_vars._set_wandb_writer', _set_wandb_writer)
            patch_manager.register_patch('megatron.training.global_vars._set_one_logger', _set_one_logger)
            # Support MTP.
            patch_manager.register_patch(
                'megatron.core.models.common.language_module.language_module.LanguageModule.setup_embeddings_and_output_layer',
                LanguageModule.setup_embeddings_and_output_layer)
            patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.get_mtp_num_layers_to_build',
                                         get_mtp_num_layers_to_build)
            patch_manager.register_patch('megatron.core.tensor_parallel.layers.VocabParallelEmbedding.__init__',
                                         VocabParallelEmbedding.__init__)
            patch_manager.register_patch('megatron.core.transformer.multi_token_prediction.tie_word_embeddings_state_dict',
                                         tie_word_embeddings_state_dict_wrapper, apply_wrapper=True)
            patch_manager.register_patch('megatron.core.pipeline_parallel.schedules.forward_step_calc_loss',
                                         forward_step_calc_loss)
            patch_manager.register_patch(
                'megatron.core.distributed.distributed_data_parallel.DistributedDataParallel._make_backward_post_hook',
                DistributedDataParallel._make_backward_post_hook)

            if args.enable_vocab_parallel:
                from dcu_megatron.core.parallel_state import destroy_model_parallel_wrapper
                from dcu_megatron.core.pipeline_parallel.p2p_communication import P2PCommunicator
                from dcu_megatron.core.transformer.module import Float16Module

                patch_manager.register_patch('megatron.core.parallel_state.destroy_model_parallel',
                                             destroy_model_parallel_wrapper, create_dummy=True)
                patch_manager.register_cls_funcs(
                    'megatron.core.pipeline_parallel.p2p_communication.P2PCommunicator',
                    [P2PCommunicator._communicate, P2PCommunicator.recv_forward,
                     P2PCommunicator.send_backward_recv_forward])
                # Embedding/output layer.
                patch_manager.register_cls_funcs('megatron.core.transformer.module.Float16Module',
                                                 [Float16Module.__init__, Float16Module.forward])

        from dcu_megatron.core.transformer.transformer_layer import TransformerLayer
        from dcu_megatron.core.transformer.transformer_block import TransformerBlock
        from dcu_megatron.core.models.gpt.gpt_model import GPTModel
        from dcu_megatron.core.transformer.multi_latent_attention import MLASelfAttention
        from dcu_megatron.core.transformer.attention import Attention
        from dcu_megatron.core.transformer.moe.moe_layer import MoELayer
        from dcu_megatron.core.distributed.data_parallel_base import _BaseDataParallel
        from dcu_megatron.core.transformer.module import Float16Module
        from dcu_megatron.core.transformer.multi_token_prediction import MultiTokenPredictionLayer, MultiTokenPredictionBlock
        from dcu_megatron.core.pipeline_parallel.utils import ScheduleNode

        patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.backward_dw',
                                     TransformerLayer.backward_dw, create_dummy=True)

        if args.schedule_method == "dualpipev" or args.overlap_ep_comm_with_split_attn:
            patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.build_schedule_plan',
                                         GPTModel.build_schedule_plan)
            patch_manager.register_patch('megatron.core.models.gpt.gpt_model.GPTModel.backward_dw',
                                         GPTModel.backward_dw, create_dummy=True)
            patch_manager.register_patch('megatron.core.distributed.data_parallel_base._BaseDataParallel.backward_dw',
                                         _BaseDataParallel.backward_dw, create_dummy=True)
            patch_manager.register_patch('megatron.core.transformer.module.Float16Module.backward_dw',
                                         Float16Module.backward_dw, create_dummy=True)
            patch_manager.register_cls_funcs(
                'megatron.core.transformer.multi_latent_attention.MLASelfAttention',
                [MLASelfAttention.compute_qkv, MLASelfAttention.compute_attn, MLASelfAttention.compute_proj],
                create_dummy=True)
            patch_manager.register_cls_funcs(
                'megatron.core.transformer.attention.Attention',
                [Attention.compute_qkv, Attention.compute_attn, Attention.compute_proj],
                create_dummy=True)
            patch_manager.register_patch('megatron.core.transformer.transformer_block.TransformerBlock.backward_dw',
                                         TransformerBlock.backward_dw, create_dummy=True)
            patch_manager.register_cls_funcs(
                'megatron.core.transformer.moe.moe_layer.MoELayer',
                [MoELayer.backward_dw, MoELayer.backward_shared_expert_dw, MoELayer.backward_routed_expert_dw],
                create_dummy=True)
            patch_manager.register_patch(
                'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionLayer.backward_dw',
                MultiTokenPredictionLayer.backward_dw, create_dummy=True)
            patch_manager.register_patch(
                'megatron.core.transformer.multi_token_prediction.MultiTokenPredictionBlock.backward_dw',
                MultiTokenPredictionBlock.backward_dw, create_dummy=True)
            patch_manager.register_cls_funcs(
                'megatron.core.pipeline_parallel.utils.ScheduleNode',
                [ScheduleNode.forward, ScheduleNode._forward, ScheduleNode.backward, ScheduleNode._backward])
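A quick check of the --num-layers-to-build pattern grammar documented in num_layers_build_type above:

# The pattern grammar accepted by num_layers_build_type / _eval_pattern:
print(num_layers_build_type("4"))                # -> 4 (uniform: 4 layers per block)
print(num_layers_build_type("([1]*3+[2]*1)*3"))
# -> [1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2]
#    (12 entries, i.e. 2 * pipeline_model_parallel_size stages for dualpipev
#     with pipeline_model_parallel_size == 6)
print(num_layers_build_type("[1,2,3,4]"))        # explicit per-stage layer counts
# num_layers_build_type("__import__('os')")      # raises ValueError: letters are
#                                                # rejected by _eval_pattern's filter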
dcu_megatron/adaptor/features_manager/recompute/__pycache__/activation_function.cpython-310.pyc  (new file, mode 100644)
File added (binary).
dcu_megatron/adaptor/features_manager/recompute/activation_function.py  (new file, mode 100644)

from ..feature import AbstractFeature


class RecomputeActivationFeature(AbstractFeature):

    def __init__(self):
        super().__init__('recompute-activation-function')

    def register_args(self, parser):
        group = parser.add_argument_group(title=self.feature_name)
        group.add_argument('--recompute-activation-function', action='store_true',
                           help='Recompute the activation function in MLP layers.')
        group.add_argument('--recompute-activation-function-num-layers', type=int, default=None,
                           help='Can be used together with "--recompute-method block" '
                                'and "--recompute-num-layers".')

    def validate_args(self, args):
        if args.recompute_activation_function_num_layers is not None:
            if not isinstance(args.recompute_activation_function_num_layers, int):
                raise TypeError('--recompute-activation-function-num-layers must be an integer.')
            if args.recompute_activation_function_num_layers < 0:
                raise AssertionError('--recompute-activation-function-num-layers cannot be less than 0.')
            if args.recompute_activation_function_num_layers > args.num_layers:
                raise ValueError(
                    f'--recompute-activation-function-num-layers ({args.recompute_activation_function_num_layers}) '
                    f'cannot be greater than --num-layers ({args.num_layers}).')

    def register_patches(self, patch_manager, args):
        from dcu_megatron.core.memory.recompute.activation.adaptor import dcu_activation_recompute_forward
        from dcu_megatron.core.transformer.transformer import parallel_transformer_layer_init_wrapper

        if getattr(args, self.feature_name, None):
            patch_manager.register_patch('megatron.core.transformer.transformer_layer.TransformerLayer.__init__',
                                         parallel_transformer_layer_init_wrapper)
            patch_manager.register_patch('megatron.core.transformer.mlp.MLP.forward',
                                         dcu_activation_recompute_forward)
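The patched MLP.forward recomputes the activation function during backward instead of storing its output. A generic illustration of that trade-off using torch.utils.checkpoint; this is not the repo's dcu_activation_recompute_forward, just the same technique on a plain PyTorch MLP:

# Generic illustration of activation-function recomputation in an MLP.
# Only the nonlinearity is checkpointed, so its (often 4x hidden-size)
# output is not kept for backward; it is recomputed from the fc1 output.
import torch
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint


class RecomputeActMLP(torch.nn.Module):
    def __init__(self, hidden, ffn):
        super().__init__()
        self.fc1 = torch.nn.Linear(hidden, ffn)
        self.fc2 = torch.nn.Linear(ffn, hidden)

    def forward(self, x):
        h = self.fc1(x)
        # keep `h`; recompute gelu(h) in backward rather than saving it:
        a = checkpoint(F.gelu, h, use_reentrant=False)
        return self.fc2(a)


mlp = RecomputeActMLP(16, 64)
y = mlp(torch.randn(2, 16, requires_grad=True))
y.sum().backward()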