Commit 6ef5d322 authored by laibao
Browse files

fix(moe): 补齐非Marlin量化路径 shared_output/routed_scaling_factor 透传 (pass shared_output/routed_scaling_factor through on the non-Marlin quantization paths)

parent 06185134
...@@ -1881,6 +1881,8 @@ def fused_experts_impl( ...@@ -1881,6 +1881,8 @@ def fused_experts_impl(
a2_scale=a2_scale, a2_scale=a2_scale,
block_shape=block_shape, block_shape=block_shape,
use_nn_moe=False, use_nn_moe=False,
routed_scaling_factor=routed_scaling_factor,
shared_output=shared_output,
i_q=i_q, i_q=i_q,
i_s=i_s i_s=i_s
) )
...@@ -1903,7 +1905,9 @@ def fused_experts_impl( ...@@ -1903,7 +1905,9 @@ def fused_experts_impl(
a1_scale=a1_scale, a1_scale=a1_scale,
a2_scale=a2_scale, a2_scale=a2_scale,
block_shape=block_shape, block_shape=block_shape,
use_nn_moe=False use_nn_moe=False,
routed_scaling_factor=routed_scaling_factor,
shared_output=shared_output
) )
if use_int4_w4a16: if use_int4_w4a16:
......
...@@ -1111,7 +1111,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): ...@@ -1111,7 +1111,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
use_nn_moe: bool | None = False, use_nn_moe: bool | None = False,
use_fused_gate: bool | None = False, use_fused_gate: bool | None = False,
shared_output: torch.Tensor | None = None,
routed_scaling_factor: float = 1.0,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert not self.is_monolithic assert not self.is_monolithic
assert self.kernel is not None assert self.kernel is not None
...@@ -1131,6 +1132,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): ...@@ -1131,6 +1132,8 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
quant_config=self.moe_quant_config, quant_config=self.moe_quant_config,
use_fused_gate=use_fused_gate, use_fused_gate=use_fused_gate,
use_nn_moe=False, use_nn_moe=False,
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor,
) )
@property @property
...@@ -1256,7 +1259,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ...@@ -1256,7 +1259,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
use_nn_moe: bool | None = False, use_nn_moe: bool | None = False,
i_q: torch.Tensor | None = None, i_q: torch.Tensor | None = None,
i_s: torch.Tensor | None = None i_s: torch.Tensor | None = None,
shared_output: torch.Tensor | None = None,
routed_scaling_factor: float = 1.0,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
...@@ -1274,7 +1279,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): ...@@ -1274,7 +1279,9 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
quant_config=self.moe_quant_config, quant_config=self.moe_quant_config,
use_nn_moe=use_nn_moe, use_nn_moe=use_nn_moe,
i_q=i_q, i_q=i_q,
i_s=i_s i_s=i_s,
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor,
) )
...@@ -2515,4 +2522,4 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): ...@@ -2515,4 +2522,4 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
@property @property
def supports_eplb(self) -> bool: def supports_eplb(self) -> bool:
return False return False
\ No newline at end of file
...@@ -1028,6 +1028,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): ...@@ -1028,6 +1028,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
topk_ids: torch.Tensor, topk_ids: torch.Tensor,
use_nn_moe: bool | None = False, use_nn_moe: bool | None = False,
use_fused_gate: bool | None = False, use_fused_gate: bool | None = False,
shared_output: torch.Tensor | None = None,
routed_scaling_factor: float = 1.0,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert self.kernel is not None assert self.kernel is not None
assert not self.is_monolithic assert not self.is_monolithic
...@@ -1047,6 +1049,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): ...@@ -1047,6 +1049,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
quant_config=self.moe_quant_config, quant_config=self.moe_quant_config,
use_fused_gate=use_fused_gate, use_fused_gate=use_fused_gate,
use_nn_moe=False, use_nn_moe=False,
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor,
) )
......
...@@ -308,7 +308,9 @@ class SlimQuantW4A8Int8MoEMethod: ...@@ -308,7 +308,9 @@ class SlimQuantW4A8Int8MoEMethod:
use_nn_moe: bool | None = False, use_nn_moe: bool | None = False,
use_fused_gate: bool | None = False, use_fused_gate: bool | None = False,
i_q: torch.Tensor | None = None, i_q: torch.Tensor | None = None,
i_s: torch.Tensor | None = None i_s: torch.Tensor | None = None,
shared_output: torch.Tensor | None = None,
routed_scaling_factor: float = 1.0,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
return fused_experts( return fused_experts(
...@@ -324,4 +326,6 @@ class SlimQuantW4A8Int8MoEMethod: ...@@ -324,4 +326,6 @@ class SlimQuantW4A8Int8MoEMethod:
global_num_experts=layer.global_num_experts, global_num_experts=layer.global_num_experts,
quant_config=self.moe_quant_config, quant_config=self.moe_quant_config,
use_nn_moe=use_nn_moe, use_nn_moe=use_nn_moe,
shared_output=shared_output,
routed_scaling_factor=routed_scaling_factor,
) )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment