Commit 1851782d authored by zhuwenwen
Browse files

删除DPSK_FP16_QUICK

parent 8e22ded2
......@@ -89,8 +89,6 @@ if current_platform.is_cuda_alike():
elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops as ops
os.environ['DPSK_FP16_QUICK'] = os.environ.get('DPSK_FP16_QUICK', '0')
logger = init_logger(__name__)
......@@ -191,7 +189,6 @@ class DeepseekV2MoE(nn.Module):
# Load balancing settings.
eplb_config = parallel_config.eplb_config
self.enable_eplb = parallel_config.enable_eplb
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.n_redundant_experts = eplb_config.num_redundant_experts
self.n_logical_experts = self.n_routed_experts
......@@ -288,7 +285,7 @@ class DeepseekV2MoE(nn.Module):
# Fix FP16 overflow
# See DeepseekV2DecoderLayer for more details.
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
if hidden_states.dtype != torch.float16:
final_hidden_states *= self.routed_scaling_factor
elif self.shared_experts is not None:
assert shared_output is not None
......@@ -1054,7 +1051,6 @@ class DeepseekV2DecoderLayer(nn.Module):
prefix=f"{prefix}.self_attn",
topk_indices_buffer=topk_indices_buffer,
)
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
if (config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0):
......@@ -1107,7 +1103,7 @@ class DeepseekV2DecoderLayer(nn.Module):
)
residual = new_residual
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
if hidden_states.dtype == torch.float16:
# rmsnorm, and the rmsnorm result is not affected by the scale.
hidden_states *= 1. / self.routed_scaling_factor
if self.layer_idx == 0 or residual_fix_overflow:
......@@ -1118,7 +1114,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, new_resi = self.mlp(hidden_states, self.post_attention_layernorm.weight.data, residual)
if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer.
......@@ -1142,7 +1138,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states,
)
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
if hidden_states.dtype == torch.float16:
# Fix FP16 overflow
# We scale both hidden_states and residual before
# rmsnorm, and the rmsnorm result is not affected by the scale.
......@@ -1157,7 +1153,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states, residual)
hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
DeepseekV2MLP) and hidden_states.dtype == torch.float16:
# Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment