Commit de64c444 authored by dongcl

bug fix

parent 9daa400a
Pipeline #2465 passed
@@ -185,7 +185,6 @@ class TransformerBlock(MegatronModule):
         self.post_layer_norm = post_layer_norm
         self.pre_process = pre_process
         self.post_process = post_process
-        self.num_nextn_predict_layers = num_nextn_predict_layers
         # Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers).
         # Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the
         # number of microbatches. Multiple CUDA graphs per layer is required to support
@@ -225,7 +224,7 @@ class TransformerBlock(MegatronModule):
         self.num_layers_per_pipeline_rank = len(self.layers)
         self.tp_only_amax_red = config.tp_only_amax_red
         # mtp require seperate layernorms for main model and mtp modules, thus move finalnorm out of block
-        self.move_final_norm_out_of_block = getattr(config, num_nextn_predict_layers, 0) > 0
+        self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0

     def _build_layers(self):
         # Transformer layers.
...
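Note (not part of the commit): the second hunk quotes the attribute name passed to getattr. A minimal sketch below shows why the string form is required; the SimpleNamespace config is only a stand-in for the real TransformerConfig.

# Minimal sketch of the getattr fix; SimpleNamespace stands in for the config object.
from types import SimpleNamespace

config = SimpleNamespace(num_nextn_predict_layers=1)

# Fixed form: getattr takes the attribute *name* as a string, with 0 as the
# default when the attribute is absent, so the MTP check works on any config.
move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0
print(move_final_norm_out_of_block)  # True

# Buggy form from before the commit: the bare name is evaluated first, so it
# either raises NameError (name undefined in that scope) or passes a non-string
# value to getattr, which then raises TypeError.
try:
    getattr(config, num_nextn_predict_layers, 0)  # noqa: F821
except (NameError, TypeError) as exc:
    print(type(exc).__name__)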