Commit ed4b88d5 authored by zhuwenwen
Browse files

Merge remote-tracking branch 'origin/v0.9.2-dev-yql-mtp' into v0.9.2-dev

parents fb94600a 080ed180
...@@ -65,7 +65,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter, ...@@ -65,7 +65,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON from vllm.utils import W8a8GetCacheJSON
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '1') os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
class DeepseekV2MLP(nn.Module): class DeepseekV2MLP(nn.Module):
def __init__( def __init__(
...@@ -608,9 +608,13 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -608,9 +608,13 @@ class DeepseekV2DecoderLayer(nn.Module):
residual: Optional[torch.Tensor], residual: Optional[torch.Tensor],
) -> torch.Tensor: ) -> torch.Tensor:
# Self Attention # Self Attention
# Fix residual FP16 overflow
residual_fix_overflow = False
if residual is None: if residual is None:
residual = hidden_states residual = hidden_states
hidden_states = self.input_layernorm(hidden_states) hidden_states = self.input_layernorm(hidden_states)
residual_fix_overflow = True
else: else:
hidden_states, residual = self.input_layernorm( hidden_states, residual = self.input_layernorm(
hidden_states, residual) hidden_states, residual)
...@@ -624,7 +628,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -624,7 +628,7 @@ class DeepseekV2DecoderLayer(nn.Module):
# We scale both hidden_states and residual before # We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0: if self.layer_idx == 0 or residual_fix_overflow:
# The residual is shared by all layers, we only scale it on # The residual is shared by all layers, we only scale it on
# first layer. # first layer.
residual *= 1. / self.routed_scaling_factor residual *= 1. / self.routed_scaling_factor
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment