Commit de64c444 authored by dongcl

bug fix

parent 9daa400a
Pipeline #2465 passed
@@ -185,7 +185,6 @@ class TransformerBlock(MegatronModule):
         self.post_layer_norm = post_layer_norm
         self.pre_process = pre_process
         self.post_process = post_process
-        self.num_nextn_predict_layers = num_nextn_predict_layers
         # Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers).
         # Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the
         # number of microbatches. Multiple CUDA graphs per layer is required to support
@@ -225,7 +224,7 @@ class TransformerBlock(MegatronModule):
         self.num_layers_per_pipeline_rank = len(self.layers)
         self.tp_only_amax_red = config.tp_only_amax_red
         # mtp require seperate layernorms for main model and mtp modules, thus move finalnorm out of block
-        self.move_final_norm_out_of_block = getattr(config, num_nextn_predict_layers, 0) > 0
+        self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0

     def _build_layers(self):
         # Transformer layers.
...
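Note (not part of the commit): the second hunk quotes the attribute name passed to getattr. A minimal sketch below shows why the string form is required; the SimpleNamespace config is only a stand-in for the real TransformerConfig.

# Minimal sketch of the getattr fix; SimpleNamespace stands in for the config object.
from types import SimpleNamespace

config = SimpleNamespace(num_nextn_predict_layers=1)

# Fixed form: getattr takes the attribute *name* as a string, with 0 as the
# default when the attribute is absent, so the MTP check works on any config.
move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0
print(move_final_norm_out_of_block)  # True

# Buggy form from before the commit: the bare name is evaluated first, so it
# either raises NameError (name undefined in that scope) or passes a non-string
# value to getattr, which then raises TypeError.
try:
    getattr(config, num_nextn_predict_layers, 0)  # noqa: F821
except (NameError, TypeError) as exc:
    print(type(exc).__name__)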