Commit 4c942eaf authored by silencealiang

bug fix

parent 770fa304
@@ -24,13 +24,13 @@ class MegatronAdaptation:
             adaptation.execute()
         MegatronAdaptation.apply()
-        from .patch_utils import MegatronPatchesManager
+        # from .patch_utils import MegatronPatchesManager
-        args = get_adaptor_args()
-        for feature in FEATURES_LIST:
-            if (getattr(args, feature.feature_name, None) and feature.optimization_level > 0) or feature.optimization_level == 0:
-                feature.register_patches(MegatronPatchesManager, args)
-        MindSpeedPatchesManager.apply_patches()
+        # args = get_adaptor_args()
+        # for feature in FEATURES_LIST:
+        #     if (getattr(args, feature.feature_name, None) and feature.optimization_level > 0) or feature.optimization_level == 0:
+        #         feature.register_patches(MegatronPatchesManager, args)
+        # MindSpeedPatchesManager.apply_patches()
         # MegatronAdaptation.post_execute()
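The loop being commented out above gates each feature's patches on its CLI flag and its optimization level. A minimal standalone sketch of that gating rule, with Feature and the argument object as stand-ins for the real MindSpeed types (names beyond those in the diff are assumptions):

    from dataclasses import dataclass
    from types import SimpleNamespace

    @dataclass
    class Feature:                    # stand-in for a MindSpeed feature spec
        feature_name: str
        optimization_level: int

    def should_apply(feature, args):
        # Level-0 features are always-on; higher levels also need their flag set.
        flagged = getattr(args, feature.feature_name, None)
        return bool(flagged and feature.optimization_level > 0) or feature.optimization_level == 0

    args = SimpleNamespace(recompute_activations=True)
    print(should_apply(Feature('recompute_activations', 1), args))       # True
    print(should_apply(Feature('swap_attention', 2), SimpleNamespace())) # False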
@@ -142,9 +142,9 @@ class CoreAdaptation(MegatronAdaptationABC):
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.topk_softmax_with_capacity',
                                     torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False}),
                                     apply_wrapper=True)
-        MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
-                                    torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
-                                    apply_wrapper=True)
+        # MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.switch_load_balancing_loss_func',
+        #                             torch.compile(options={"triton.cudagraphs": True, "triton.cudagraph_trees": False, "triton.cudagraph_support_input_mutation":True}),
+        #                             apply_wrapper=True)
         MegatronAdaptation.register('megatron.core.transformer.moe.moe_utils.permute',
                                     torch.compile(mode='max-autotune-no-cudagraphs'),
                                     apply_wrapper=True)
......
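For context on the apply_wrapper=True entries kept and removed above: torch.compile, when called without a function, returns a decorator, which is presumably why it can be registered as a wrapper and applied to the patch target later. A minimal sketch; the permute body is a simplified stand-in, not the Megatron implementation:

    import torch

    # Calling torch.compile with only keyword arguments yields a decorator.
    compile_wrapper = torch.compile(mode='max-autotune-no-cudagraphs')

    def permute(tokens, sorted_indices):
        # Simplified stand-in for megatron.core.transformer.moe.moe_utils.permute.
        return tokens.index_select(0, sorted_indices)

    permute = compile_wrapper(permute)  # what apply_wrapper=True presumably does

    tokens = torch.randn(4, 8)
    order = torch.tensor([2, 0, 3, 1])
    print(permute(tokens, order).shape)  # torch.Size([4, 8])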
 import warnings
-from typing import Optional
+from typing import Optional, Union
 from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
 from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec
......
 # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
 import os
 from collections import OrderedDict
 from typing import Dict, Literal, Optional
@@ -320,7 +322,7 @@ class GPTModel(LanguageModule):
         )
         if (
-            self.num_nextn_predict_layers
+            self.mtp_process is not None
             and getattr(self.decoder, "main_final_layernorm", None) is not None
         ):
             # move block main model final norms here
......
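A small illustration of the guard change above, under the assumption (suggested by the surrounding code, not stated in the diff) that mtp_process is set only when the MTP path is actually built, whereas num_nextn_predict_layers is a configured count whose truthiness served as the old proxy for the same condition:

    class Decoder:
        main_final_layernorm = object()   # stand-in for the real norm module

    decoder = Decoder()
    num_nextn_predict_layers = 0          # configured MTP depth D
    mtp_process = None                    # assumed None when MTP is disabled

    # Old guard: truthiness of the configured count.
    old = bool(num_nextn_predict_layers) and getattr(decoder, "main_final_layernorm", None) is not None
    # New guard: whether the MTP module actually exists.
    new = mtp_process is not None and getattr(decoder, "main_final_layernorm", None) is not None
    print(old, new)                       # False False here; the two diverge when
                                          # the count is set but no MTP module was built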
 from typing import Optional
 from functools import wraps
 from dataclasses import dataclass
......
 import torch
 import torch.nn.functional as F
 from functools import wraps
 from megatron.training import get_args
 from megatron.core import tensor_parallel
 from megatron.legacy.model.enums import AttnType
......
@@ -175,7 +175,7 @@ def _add_mtp_args(parser):
                             'MTP extends the prediction scope to multiple future tokens at each position.'
                             'This MTP implementation sequentially predict additional tokens '
                             'by using D sequential modules to predict D additional tokens.')
-    group.add_argument('--mtp-loss-scaling-factor', type=float, default=0.1,
+    group.add_argument('--mtp-loss-scaling-factor', type=float, default=0.3,
                        help='Scaling factor of Multi-Token Prediction (MTP) loss. '
                             'We compute the average of the MTP losses across all depths, '
                             'and multiply it the scaling factor to obtain the overall MTP loss, '
......
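A worked example of the scaling described in the help text, with illustrative per-depth losses; the default changing from 0.1 to 0.3 triples the MTP contribution to the overall loss:

    mtp_losses = [0.8, 1.2]        # illustrative MTP losses at depths 1..D (D = 2)
    scaling_factor = 0.3           # new --mtp-loss-scaling-factor default

    # Average across depths, then scale, per the help text above.
    overall_mtp_loss = scaling_factor * sum(mtp_losses) / len(mtp_losses)
    print(overall_mtp_loss)        # 0.3 * 1.0 = 0.3 (was 0.1 with the old default)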