evt_fugx1 / dcu_megatron · Commits
Commit 4c942eaf
authored Apr 26, 2025 by silencealiang

bug fix

parent 770fa304
Showing 6 changed files with 17 additions and 12 deletions
dcu_megatron/adaptor/megatron_adaptor.py              +9  -9
dcu_megatron/core/models/gpt/gpt_layer_specs.py       +1  -1
dcu_megatron/core/models/gpt/gpt_model.py             +3  -1
dcu_megatron/core/transformer/transformer_config.py   +1  -0
dcu_megatron/legacy/model/transformer.py               +2  -0
dcu_megatron/training/arguments.py                     +1  -1
dcu_megatron/adaptor/megatron_adaptor.py
...
@@ -24,13 +24,13 @@ class MegatronAdaptation:
         adaptation.execute()
         MegatronAdaptation.apply()
-        from .patch_utils import MegatronPatchesManager
+        # from .patch_utils import MegatronPatchesManager
-        args = get_adaptor_args()
-        for feature in FEATURES_LIST:
-            if (getattr(args, feature.feature_name, None) and feature.optimization_level > 0) or feature.optimization_level == 0:
-                feature.register_patches(MegatronPatchesManager, args)
-        MindSpeedPatchesManager.apply_patches()
+        # args = get_adaptor_args()
+        # for feature in FEATURES_LIST:
+        #     if (getattr(args, feature.feature_name, None) and feature.optimization_level > 0) or feature.optimization_level == 0:
+        #         feature.register_patches(MegatronPatchesManager, args)
+        # MindSpeedPatchesManager.apply_patches()
         # MegatronAdaptation.post_execute()
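For context on what is being disabled above: the loop registers a feature's patches either unconditionally (optimization_level == 0) or only when the matching command-line flag is set and the level is above 0. A minimal, self-contained sketch of that gating rule follows; the Feature and PatchesManager classes here are invented stand-ins for illustration, not the project's real patch_utils API.

from dataclasses import dataclass, field
from types import SimpleNamespace
from typing import Callable, Dict


def noop_patch():
    # Dummy replacement function used only for the demo.
    pass


@dataclass
class Feature:
    feature_name: str            # name of the CLI flag the feature is gated on
    optimization_level: int      # 0 = always on, >0 = opt-in via the flag
    patches: Dict[str, Callable] = field(default_factory=dict)

    def register_patches(self, manager, args):
        for target, replacement in self.patches.items():
            manager.register(target, replacement)


class PatchesManager:
    # Collects (target, replacement) pairs; a real manager would monkey-patch them.
    def __init__(self):
        self.registered = {}

    def register(self, target, replacement):
        self.registered[target] = replacement

    def apply_patches(self):
        for target, replacement in self.registered.items():
            print(f"patching {target} -> {replacement.__name__}")


FEATURES_LIST = [
    Feature("use_fused_moe", optimization_level=2, patches={"megatron.moe.fn": noop_patch}),
    Feature("baseline", optimization_level=0, patches={"megatron.core.fn": noop_patch}),
]

args = SimpleNamespace(use_fused_moe=False, baseline=True)
manager = PatchesManager()
for feature in FEATURES_LIST:
    # Same condition as in the diff: mandatory features (level 0) always register,
    # optional ones only when their flag is set.
    if (getattr(args, feature.feature_name, None) and feature.optimization_level > 0) or feature.optimization_level == 0:
        feature.register_patches(manager, args)
manager.apply_patches()  # only "megatron.core.fn" gets patched in this run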
...
@@ -142,9 +142,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
                                     torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
                                     apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
-                                    torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation": True}),
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
+        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
+        #                             apply_wrapper=True)
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
                                     torch.compile(mode='max-autotune-no-cudagraphs'),
                                     apply_wrapper=True)
...
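A note on the apply_wrapper registrations above: calling torch.compile(...) with only mode or options returns a wrapper that can then be applied to the target callable, which is the standard decorator-factory form of the PyTorch API. A rough sketch under that assumption, using a made-up permute stand-in rather than the real moe_utils implementation:

import torch


def permute(tokens, indices):
    # Made-up stand-in for a MoE token-permutation helper.
    return tokens[indices]


# torch.compile called without a function returns a callable wrapper,
# which is then applied to the target function (the apply_wrapper idea).
compiled_permute = torch.compile(mode='max-autotune-no-cudagraphs')(permute)

tokens = torch.randn(8, 16)
indices = torch.randperm(8)
print(compiled_permute(tokens, indices).shape)  # torch.Size([8, 16])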
dcu_megatron/core/models/gpt/gpt_layer_specs.py
 import warnings
-from typing import Optional
+from typing import Optional, Union
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
...
dcu_megatron/core/models/gpt/gpt_model.py
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+import os
+from collections import OrderedDict
 from typing import Dict, Literal, Optional
...
@@ -320,7 +322,7 @@ class GPTModel(LanguageModule):
         )
         if (
-            self.num_nextn_predict_layers
+            self.mtp_process
             is not None
             and getattr(self.decoder, "main_final_layernorm", None) is not None
         ):
             # move block main model final norms here
...
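The guard in the hunk above only fires when MTP is in use and the decoder actually exposes a main_final_layernorm. A stripped-down illustration of that defensive getattr pattern with toy modules follows; only the attribute names come from the diff, and the surrounding classes are invented for the example.

import torch
from torch import nn


class ToyDecoder(nn.Module):
    # Toy stand-in for a transformer block stack that may own the final norm.
    def __init__(self, hidden_size):
        super().__init__()
        self.main_final_layernorm = nn.LayerNorm(hidden_size)


class ToyGPTModel(nn.Module):
    def __init__(self, hidden_size=16, mtp_process=True):
        super().__init__()
        self.decoder = ToyDecoder(hidden_size)
        self.mtp_process = mtp_process if mtp_process else None
        self.final_layernorm = None

        # Same guard shape as the diff: act only when MTP is enabled *and*
        # the decoder really carries a main_final_layernorm attribute.
        if (
            self.mtp_process is not None
            and getattr(self.decoder, "main_final_layernorm", None) is not None
        ):
            # "move block main model final norms here": take ownership of the
            # norm and detach it from the decoder.
            self.final_layernorm = self.decoder.main_final_layernorm
            self.decoder.main_final_layernorm = None


model = ToyGPTModel()
print(type(model.final_layernorm).__name__)  # LayerNorm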
dcu_megatron/core/transformer/transformer_config.py
+from typing import Optional
 from functools import wraps
 from dataclasses import dataclass
...
dcu_megatron/legacy/model/transformer.py
import torch
import torch.nn.functional as F
from functools import wraps
from megatron.training import get_args
from megatron.core import tensor_parallel
from megatron.legacy.model.enums import AttnType
...
dcu_megatron/training/arguments.py
...
@@ -175,7 +175,7 @@ def _add_mtp_args(parser):
                             'MTP extends the prediction scope to multiple future tokens at each position.'
                             'This MTP implementation sequentially predict additional tokens '
                             'by using D sequential modules to predict D additional tokens.')
-    group.add_argument('--mtp-loss-scaling-factor', type=float, default=0.1,
+    group.add_argument('--mtp-loss-scaling-factor', type=float, default=0.3,
                        help='Scaling factor of Multi-Token Prediction (MTP) loss. '
                             'We compute the average of the MTP losses across all depths, '
                             'and multiply it the scaling factor to obtain the overall MTP loss, '
...
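The help string describes the intended arithmetic: average the MTP losses across all depths, then multiply by the scaling factor (now defaulting to 0.3) to obtain the overall MTP loss. A small illustration of that formula; the helper below is written for this page, not taken from the repository.

import torch


def overall_mtp_loss(mtp_losses, scaling_factor=0.3):
    # Average the per-depth MTP losses, then scale, as the
    # --mtp-loss-scaling-factor help text describes.
    return scaling_factor * torch.stack(mtp_losses).mean()


# One scalar loss per MTP depth (D = 3 here, purely for illustration).
losses = [torch.tensor(2.0), torch.tensor(1.0), torch.tensor(3.0)]
print(overall_mtp_loss(losses))  # tensor(0.6000) = 0.3 * mean([2.0, 1.0, 3.0])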