Commit de64c444 authored by dongcl

Bug fix: pass the attribute name to getattr as a string in TransformerBlock

parent 9daa400a
Pipeline #2465 passed
@@ -185,7 +185,6 @@ class TransformerBlock(MegatronModule):
         self.post_layer_norm = post_layer_norm
         self.pre_process = pre_process
         self.post_process = post_process
-        self.num_nextn_predict_layers = num_nextn_predict_layers
         # Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers).
         # Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the
         # number of microbatches. Multiple CUDA graphs per layer is required to support
@@ -225,7 +224,7 @@ class TransformerBlock(MegatronModule):
         self.num_layers_per_pipeline_rank = len(self.layers)
         self.tp_only_amax_red = config.tp_only_amax_red
         # MTP requires separate layernorms for the main model and the MTP modules, so move the final norm out of the block.
-        self.move_final_norm_out_of_block = getattr(config, num_nextn_predict_layers, 0) > 0
+        self.move_final_norm_out_of_block = getattr(config, "num_nextn_predict_layers", 0) > 0

     def _build_layers(self):
         # Transformer layers.