Commit cb37537e authored by yangql
Browse files

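修复 block-wise 的错误参数，以及增加 dpsk-fp16-dtype 的性能选择功能
(Fix incorrect block-wise arguments, and add a performance-selection option for the DeepSeek FP16 dtype path.)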

parent 4d479e7e
......@@ -468,13 +468,16 @@ class BlockInt8MoEMethod:
e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
activation: str = "silu",
enable_eplb: bool = False,
use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False,
**_
) -> torch.Tensor:
from vllm.model_executor.layers.fused_moe import fused_experts
if enable_eplb:
raise NotImplementedError(
"EPLB not supported for `MoeWNA16Method` yet.")
# Expert selection
topk_weights, topk_ids = FusedMoE.select_experts(
hidden_states=x,
......
......@@ -483,7 +483,8 @@ class MoeWNA16Method(FusedMoEMethodBase):
def moe_wna16_weight_loader(param: torch.nn.Parameter,
loaded_weight: torch.Tensor,
weight_name: str, shard_id: str,
expert_id: int):
expert_id: int,
return_success: bool = False):
if "g_idx" in weight_name:
return
if not layer.quant_config.has_zp and "qzeros" in weight_name:
......@@ -539,5 +540,6 @@ class MoeWNA16Method(FusedMoEMethodBase):
else:
weight_loader(param, loaded_weight, weight_name, shard_id,
expert_id)
return_success = True
return return_success
return moe_wna16_weight_loader
\ No newline at end of file
......@@ -65,6 +65,7 @@ from .utils import (PPMissingLayer, is_pp_missing_parameter,
from vllm import _custom_ops as ops
from vllm.utils import W8a8GetCacheJSON
# Seed DPSK_FP16_QUICK with "1" unless the caller already exported a value.
# DeepseekV2MoE and DeepseekV2DecoderLayer read this variable in __init__ and
# compare it to '1'; when it matches they skip the "Fix FP16 overflow" scaling
# branches for float16 hidden states.
os.environ.setdefault('DPSK_FP16_QUICK', '1')
class DeepseekV2MLP(nn.Module):
def __init__(
......@@ -138,6 +139,7 @@ class DeepseekV2MoE(nn.Module):
vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config
self.enable_eplb = enable_eplb
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
self.n_redundant_experts = parallel_config.num_redundant_experts
self.n_logical_experts = self.n_routed_experts
......@@ -191,7 +193,7 @@ class DeepseekV2MoE(nn.Module):
# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
if hidden_states.dtype != torch.float16:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
final_hidden_states = self.experts(
hidden_states=hidden_states,
router_logits=router_logits) * self.routed_scaling_factor
......@@ -202,7 +204,7 @@ class DeepseekV2MoE(nn.Module):
router_logits=router_logits)
if shared_output is not None:
if hidden_states.dtype != torch.float16:
if hidden_states.dtype != torch.float16 or self.dpsk_fp16_quick:
final_hidden_states = final_hidden_states + shared_output
else:
# Fix FP16 overflow
......@@ -575,7 +577,7 @@ class DeepseekV2DecoderLayer(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
)
self.dpsk_fp16_quick = os.environ.get('DPSK_FP16_QUICK') == '1'
if (config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0):
......@@ -617,7 +619,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states=hidden_states,
)
if hidden_states.dtype == torch.float16:
if hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
# Fix FP16 overflow
# We scale both hidden_states and residual before
# rmsnorm, and rmsnorm result would not affect by scale.
......@@ -633,7 +635,7 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states = self.mlp(hidden_states)
if isinstance(self.mlp,
DeepseekV2MLP) and hidden_states.dtype == torch.float16:
DeepseekV2MLP) and hidden_states.dtype == torch.float16 and not self.dpsk_fp16_quick:
# Fix FP16 overflow
# Scaling the DeepseekV2MLP output, it is the input of
# input_layernorm of next decoder layer.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment