Commit 1851782d authored by zhuwenwen
Browse files

删除DPSK_FP16_QUICK (Remove DPSK_FP16_QUICK)

parent 8e22ded2
...@@ -89,8 +89,6 @@ if current_platform.is_cuda_alike(): ...@@ -89,8 +89,6 @@ if current_platform.is_cuda_alike():
elif current_platform.is_xpu(): elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops as ops from vllm._ipex_ops import ipex_ops as ops
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -191,7 +189,6 @@ class DeepseekV2MoE(nn.Module): ...@@ -191,7 +189,6 @@ class DeepseekV2MoE(nn.Module):
# Load balancing settings. # Load balancing settings.
eplb_config = parallel_config.eplb_config eplb_config = parallel_config.eplb_config
self.enable_eplb = parallel_config.enable_eplb self.enable_eplb = parallel_config.enable_eplb
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.n_redundant_experts = eplb_config.num_redundant_experts self.n_redundant_experts = eplb_config.num_redundant_experts
self.n_logical_experts = self.n_routed_experts self.n_logical_experts = self.n_routed_experts
...@@ -288,7 +285,7 @@ class DeepseekV2MoE(nn.Module): ...@@ -288,7 +285,7 @@ class DeepseekV2MoE(nn.Module):
# Fix FP16 overflow # Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details. # See DeepseekV2DecoderLayer for more details.
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick: if hidden_states.dtype != torch.float16:
final_hidden_states *= self.routed_scaling_factor final_hidden_states *= self.routed_scaling_factor
elif self.shared_experts is not None: elif self.shared_experts is not None:
assert shared_output is not None assert shared_output is not None
...@@ -1054,7 +1051,6 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1054,7 +1051,6 @@ class DeepseekV2DecoderLayer(nn.Module):
prefix=f"{prefix}.self_attn", prefix=f"{prefix}.self_attn",
topk_indices_buffer=topk_indices_buffer, topk_indices_buffer=topk_indices_buffer,
) )
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
if (config.n_routed_experts is not None if (config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0): and layer_idx % config.moe_layer_freq == 0):
...@@ -1107,7 +1103,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1107,7 +1103,7 @@ class DeepseekV2DecoderLayer(nn.Module):
) )
residual = new_residual residual = new_residual
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
hidden_states *= 1. / self.routed_scaling_factor hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0 or residual_fix_overflow: if self.layer_idx == 0 or residual_fix_overflow:
...@@ -1118,7 +1114,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1118,7 +1114,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual) hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
...@@ -1142,7 +1138,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1142,7 +1138,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states, hidden_states=hidden_states,
) )
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: if hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# We scale both hidden_states and residual before # We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale. # rmsnorm, and rmsnorm result would not affect by scale.
...@@ -1157,7 +1153,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1157,7 +1153,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, residual) hidden_states, residual)
hidden_states = self.mlp(hidden_states) hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp, if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick: DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow # Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of # Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer. # input_layernorm of next decoder layer.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment