Commit f00f0256 authored by dongcl

Fix DeepSeek MTP bug

parent 627a739f
Pipeline #2462 passed
......@@ -17,9 +17,11 @@ from megatron.core.models.common.language_module.language_module import Language
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.transformer.enums import ModelType
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlock
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.mtp.multi_token_predictor import MultiTokenPredictor
from megatron.core.extensions.transformer_engine import TENorm
class GPTModel(LanguageModule):
......@@ -137,6 +139,7 @@ class GPTModel(LanguageModule):
spec=transformer_layer_spec,
pre_process=self.pre_process,
post_process=self.post_process,
num_nextn_predict_layers=num_nextn_predict_layers
)
# Output
......
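Editor's note: the two hunks above add the MTP-related imports and forward the new num_nextn_predict_layers argument from GPTModel into the TransformerBlock it builds. A toy, self-contained sketch of that wiring (illustrative names, not the real Megatron classes):

class MiniBlock:
    def __init__(self, pre_process=True, post_process=True, num_nextn_predict_layers=0):
        # The block keeps its own copy instead of reading global args later.
        self.pre_process = pre_process
        self.post_process = post_process
        self.num_nextn_predict_layers = num_nextn_predict_layers


class MiniGPTModel:
    def __init__(self, pre_process=True, post_process=True, num_nextn_predict_layers=0):
        self.decoder = MiniBlock(
            pre_process=pre_process,
            post_process=post_process,
            num_nextn_predict_layers=num_nextn_predict_layers,
        )


assert MiniGPTModel(num_nextn_predict_layers=1).decoder.num_nextn_predict_layers == 1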
......@@ -178,6 +178,7 @@ class TransformerBlock(MegatronModule):
post_layer_norm: bool = True,
pre_process: bool = True,
post_process: bool = True,
num_nextn_predict_layers: int = 0
):
super().__init__(config=config)
......@@ -185,6 +186,7 @@ class TransformerBlock(MegatronModule):
self.post_layer_norm = post_layer_norm
self.pre_process = pre_process
self.post_process = post_process
self.num_nextn_predict_layers = num_nextn_predict_layers
# Dictionary to store CUDA graphs. Number of items in the dictionary = len(self.layers).
# Item `i` in the dictionary is a list of `N` CUDA graphs for layer 'i' where N is the
# number of microbatches. Multiple CUDA graphs per layer is required to support
......@@ -246,7 +248,7 @@ class TransformerBlock(MegatronModule):
# In pipeline parallelism, we want to add this LN only to the last stage of the pipeline
# self.post_process and self.post_layer_norm guide this behavior
# MTP requires separate layernorms for the main model and the MTP modules, so the final norm is moved out of the block
move_final_norm_out_of_block = args.num_nextn_predict_layers > 0
move_final_norm_out_of_block = self.num_nextn_predict_layers > 0
if self.submodules.layer_norm and self.post_process and self.post_layer_norm and not move_final_norm_out_of_block:
self.final_layernorm = build_module(
self.submodules.layer_norm,
......
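Editor's note: the one-line fix here swaps the global args lookup for the value stored on the block instance in __init__, so each TransformerBlock decides based on the argument it was actually constructed with. A minimal sketch of the resulting gating (simplified; the real code builds submodules.layer_norm via build_module):

def should_build_final_layernorm(post_process: bool,
                                 post_layer_norm: bool,
                                 num_nextn_predict_layers: int) -> bool:
    # MTP keeps separate final layernorms for the main model and the MTP
    # modules, so the block skips its own final norm when MTP is enabled.
    move_final_norm_out_of_block = num_nextn_predict_layers > 0
    return post_process and post_layer_norm and not move_final_norm_out_of_block


assert should_build_final_layernorm(True, True, 0) is True   # plain GPT: build it
assert should_build_final_layernorm(True, True, 1) is False  # MTP enabled: norm lives outside the block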
......@@ -428,7 +428,6 @@ def get_batch_on_this_tp_rank(data_iterator):
_broadcast(batch['position_ids'])
else:
tokens=torch.empty((args.micro_batch_size, args.seq_length + args.num_nextn_predict_layers),
dtype = torch.int64,
device = torch.cuda.current_device())
......
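Editor's note: this hunk widens the receive buffer on non-source tensor-parallel ranks: the tokens tensor is now seq_length + num_nextn_predict_layers wide, matching what the source rank broadcasts when MTP needs extra future tokens. A small sketch of just the shape logic (simplified, with a CPU fallback added for illustration):

import torch

def empty_token_buffer(micro_batch_size: int, seq_length: int, num_nextn_predict_layers: int) -> torch.Tensor:
    # Every rank participating in the broadcast must allocate an identical
    # shape, so the extra MTP tokens are included here as well.
    device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
    return torch.empty(
        (micro_batch_size, seq_length + num_nextn_predict_layers),
        dtype=torch.int64,
        device=device,
    )

print(empty_token_buffer(2, 4096, 1).shape)  # torch.Size([2, 4097])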
......@@ -37,6 +37,7 @@ from megatron.core.models.gpt.gpt_layer_specs import (
)
from megatron.core.transformer.mtp.mtp_spec import get_mtp_spec
from megatron.core.utils import tensor_slide
import torch._dynamo
torch._dynamo.config.suppress_errors = True
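Editor's note: suppress_errors = True tells TorchDynamo to log compilation failures and fall back to eager execution instead of raising, which keeps training running if a compiled region hits an unsupported pattern. A minimal standalone illustration (assumes PyTorch 2.x):

import torch
import torch._dynamo

torch._dynamo.config.suppress_errors = True  # fall back to eager on compile errors

@torch.compile
def scaled_sum(x: torch.Tensor) -> torch.Tensor:
    return (x * 2).sum()

print(scaled_sum(torch.ones(4)))  # tensor(8.)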
......@@ -190,6 +191,8 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor):
args = get_args()
losses = output_tensor.float()
if args.num_nextn_predict_layers > 0:
loss_mask = tensor_slide(loss_mask, args.num_nextn_predict_layers, return_first=True)[0]
loss_mask = loss_mask.view(-1).float()
total_tokens = loss_mask.sum()
loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
......
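Editor's note: with MTP enabled the batch carries num_nextn_predict_layers extra trailing tokens, so the loss mask is wider than the main-model logits; tensor_slide(..., return_first=True)[0] appears to recover the window aligned with the main model. The helper below is a hypothetical stand-in for tensor_slide (its real signature and semantics in this fork are assumed, not verified) and shows only the slicing idea:

import torch

def tensor_slide_sketch(mask: torch.Tensor, n: int) -> list:
    # Return the n + 1 length-s windows of a [b, s + n] mask; the first window
    # lines up with the main-model logits, later ones with the MTP heads.
    seq_len = mask.size(1) - n
    return [mask[:, i:i + seq_len] for i in range(n + 1)]

loss_mask = torch.ones(2, 10)                      # seq_length 8 plus n = 2 extra tokens
main_mask = tensor_slide_sketch(loss_mask, 2)[0]   # shape [2, 8] for the main-model loss
print(main_mask.shape)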