[feat]W8A8适配deepseek以及mtp

530e785f · zhuwenwen · 3c35c466 · 530e785f · 530e785f · 530e785f
Commit 530e785f authored Feb 06, 2026 by zhuwenwen
7 changed files
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -913,6 +913,7 @@ class ModelConfig:
                "mxfp4",
                "cpu_awq",
                "slimquant_w4a8_marlin",
+                "slimquant_marlin",
                "slimquant_compressed_tensors_marlin",
            ]
            quantization_methods = [

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -280,6 +280,7 @@ if TYPE_CHECKING:
    VLLM_USE_LIGHTOP_MOE_SUM: bool = False
    VLLM_USE_LIGHTOP_MOE_ALIGN: bool = False
    VLLM_USE_MERGE_ATTN_STATES_OPT: bool = False
+    USE_FUSED_RMS_QUANT: bool = False
    VLLM_USE_PD_SPLIT: bool = False
    VLLM_USE_PP_SYNC: bool = False
    VLLM_USE_PIECEWISE: bool = False
@@ -1788,6 +1789,9 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_USE_MERGE_ATTN_STATES_OPT":
        lambda: (os.environ.get("VLLM_USE_MERGE_ATTN_STATES_OPT", "True").lower() in
                 ("true", "1")),  
+    # If set to a non-zero integer, vLLM will use the fused rmsquant op (default: 0, disabled)
+    "USE_FUSED_RMS_QUANT": 
+        lambda: bool(int(os.getenv("USE_FUSED_RMS_QUANT", "0"))),
    # vLLM will split prefill and decode, not mix up
    "VLLM_USE_PD_SPLIT":
        lambda: (os.environ.get("VLLM_USE_PD_SPLIT", "False").lower() in

--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -872,7 +872,6 @@ class FusedMoEParallelConfig:

    use_ep: bool  # whether to use EP or not
    all2all_backend: str  # all2all backend for MoE communication
-    is_sequence_parallel: bool  # whether sequence parallelism is used
    enable_eplb: bool  # whether to enable expert load balancing

    @property

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -670,10 +670,6 @@ class FusedMoE(CustomOp):
    # This is called after all weight loading and post-processing, so it
    # should be safe to swap out the quant_method.
    def maybe_init_modular_kernel(self) -> None:
-        # NOTE(rob): WIP refactor. For quant methods that own the MK
-        # we create the MK during process_weights_after_loading.
-        if self.quant_method.supports_internal_mk or self.quant_method.is_monolithic:
-            return None

        self.ensure_moe_quant_config_init()
        # routing_tables only needed for round-robin expert placement with
@@ -1930,7 +1926,7 @@ class FusedMoE(CustomOp):
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    use_nn_moe=self.use_nn_moe,
-                    use_fused_gate=self.use_fused_gate,
+                    # use_fused_gate=self.use_fused_gate,
                )

            if has_separate_shared_experts:

--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -38,6 +38,7 @@ QuantizationMethods = Literal[
    "blockwise_int8",
    "slimquant_w4a8",
    "slimquant_w4a8_marlin",
+    "slimquant_marlin",
    "slimquant_compressed_tensors_marlin",
 ]
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
@@ -177,6 +178,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
        "blockwise_int8": BlockInt8Config,
        "slimquant_w4a8":SlimQuantW4A8Int8Config,
        "slimquant_w4a8_marlin":SlimQuantW4A8Int8MarlinConfig,
+        "slimquant_marlin":SlimQuantCompressedTensorsMarlinConfig,
        "slimquant_compressed_tensors_marlin":SlimQuantCompressedTensorsMarlinConfig,
    }
    # Update the `method_to_config` with customized quantization methods.

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_marlin.py
@@ -50,6 +50,8 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
        kv_cache_scheme: Optional[dict[str, Any]] = None,
        config: Optional[dict[str, Any]] = None,
        transform_config: Optional[dict[str, Any]] = None,
+        total_num_heads: int | None = None,
+        total_num_kv_heads: int | None = None,
    ):
        super().__init__(
            target_scheme_map,
@@ -61,6 +63,9 @@ class SlimQuantCompressedTensorsMarlinConfig(CompressedTensorsConfig):
            config,
            transform_config
        )
+        self.total_num_heads = total_num_heads
+        self.total_num_kv_heads = total_num_kv_heads
+        
    @classmethod
    def override_quantization_method(
            cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]:

--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
@@ -147,52 +147,15 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
        layer.w13_weight = Parameter(w1_marlin, requires_grad=False)
        layer.w2_weight = Parameter(w2_marlin, requires_grad=False)

-
    def apply(
        self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
        x: torch.Tensor,
-        router_logits: torch.Tensor,
-        top_k: int,
-        renormalize: bool,
-        use_grouped_topk: bool = False,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-        enable_eplb: bool = False,
-        use_nn_moe: Optional[bool] = False,
-        routed_scaling_factor: Optional[float] = None,
-        use_fused_gate: Optional[bool] = False,
-        expert_load_view: Optional[torch.Tensor] = None,
-        logical_to_physical_map: Optional[torch.Tensor] = None,
-        logical_replica_count: Optional[torch.Tensor] = None,
-        shared_output: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if enable_eplb:
-            raise NotImplementedError(
-                "EPLB not supported for "
-                "`CompressedTensorsW8A8Int8MoEMethod` yet.")
-
-
-        topk_weights, topk_ids, _ = FusedMoE.select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            use_fused_gate=use_fused_gate,
-            e_score_correction_bias=e_score_correction_bias)
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        use_nn_moe: bool | None = False,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        from vllm.model_executor.layers.fused_moe import fused_experts

        return fused_experts_impl_int8_marlin(
            hidden_states=x,
@@ -201,16 +164,16 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            inplace=True,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
+            activation=layer.activation,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
            use_int8_w8a8=True,
            per_channel_quant=True,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            quant_config=self.moe_quant_config,
            w1_scale=layer.w13_weight_scale,
            w2_scale=layer.w2_weight_scale,
            a1_scale=layer.w13_input_scale,
            a2_scale=layer.w2_input_scale,
            use_nn_moe=False,
-            shared_output=shared_output,
-            routed_scaling_factor=routed_scaling_factor)
\ No newline at end of file
+        )
\ No newline at end of file