Commit beae085a authored by yangql
Browse files

处理VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD导致的awq推理bug问题

parent 06185134
......@@ -2073,27 +2073,23 @@ class FusedMoE(CustomOp):
else 1.0
),
)
else:
elif envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD and shared_output is not None:
final_hidden_states = self.quant_method.apply(
layer=self,
x=x, # The type signture of this is wrong due to the hack.
topk_weights=topk_weights,
topk_ids=topk_ids,
use_nn_moe=self.use_nn_moe,
shared_output=(
shared_output
if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
and shared_output is not None
else None
),
routed_scaling_factor=(
routed_scaling_factor
if envs.VLLM_USE_LIGHTOP_MOE_SUM_MUL_ADD
and shared_output is not None
else 1.0
),
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor,
)
else:
final_hidden_states = self.quant_method.apply(
layer=self,
x=x, # The type signture of this is wrong due to the hack.
topk_weights=topk_weights,
topk_ids=topk_ids,
use_nn_moe=self.use_nn_moe,)
if has_separate_shared_experts:
assert self.shared_experts is not None
......
......@@ -381,6 +381,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
topk_ids: torch.Tensor,
use_nn_moe: bool | None = False,
use_fused_gate: bool | None = False,
**_
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment