Merge branch 'v0.9.2-dev-ds-wm-1210' into 'v0.9.2-dev-ds'

[fix]修复deepep 高吞吐模式vmfault问题

See merge request dcutoolkit/deeplearing/vllm!291

Merge branch 'v0.9.2-dev-ds-wm-1210' into 'v0.9.2-dev-ds'
[fix]修复deepep 高吞吐模式vmfault问题
See merge request dcutoolkit/deeplearing/vllm!291
94c4ca4d · zhuwenwen · 8ae59a9c · 916b5876 · 94c4ca4d · 94c4ca4d
Commit 94c4ca4d authored Dec 11, 2025 by zhuwenwen
3 changed files
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
@@ -285,9 +285,10 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
        use_nn_moe: Optional[bool] = False,
        routed_scaling_factor: Optional[float] = None,
        shared_output: Optional[torch.Tensor] = None,
+        q_x: Optional[torch.Tensor] = None,
        **_  ):
            return fused_experts_impl_int8_marlin(
-                hidden_states=x,
+                hidden_states=x if q_x is None else q_x,
                w1=w1,
                w2=w2,
                topk_weights=topk_weights,

--- a/vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
+++ b/vllm/model_executor/layers/quantization/slimquant_w4a8_marlin.py
@@ -263,7 +263,7 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
        **_  ):
            workspace, global_reduce_buffer = MarlinMoeWorkspace(x.device).get_buffers()
            return fused_experts_impl_w4a8_marlin(
-                x,
+                x if q_x is None else q_x,
                w1,
                w2,
                topk_ids=topk_ids,
@@ -510,6 +510,8 @@ class SlimQuantW4A8Int8MarlinMoEMethod:
                False)
            
            return TritonOrGroupGemmExperts(
+                # use_int4_w4a8=True,
+                # per_act_token_quant=True,
                fused_experts=self.w4a8_fused_moe_marlin_forward
            )

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -717,9 +717,8 @@ class DeepseekV2DecoderLayer(nn.Module):
        self.dp_size = get_dp_group().world_size
        vllm_config = get_current_vllm_config()
        parallel_config = vllm_config.parallel_config
-        self.use_deepep = self.dp_size > 1 and parallel_config.enable_expert_parallel and \
-            (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" or \
-             envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency")
+        self.use_deepep_ll = self.dp_size > 1 and parallel_config.enable_expert_parallel and \
+            envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency"
        self.tp_size = get_tensor_model_parallel_world_size()

        if (config.n_routed_experts is not None
@@ -848,7 +847,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                hidden_states, residual)
            
            if isinstance(self.mlp,
-                        DeepseekV2MoE) and self.use_deepep and self.tp_size > 1:
+                        DeepseekV2MoE) and self.use_deepep_ll and self.tp_size > 1:

                self.tp_rank = get_tensor_model_parallel_rank()
                ori_bs = hidden_states.shape[0]
@@ -861,7 +860,7 @@ class DeepseekV2DecoderLayer(nn.Module):
            hidden_states = self.mlp(hidden_states)

            if isinstance(self.mlp,
-                        DeepseekV2MoE) and self.use_deepep and self.tp_size > 1:
+                        DeepseekV2MoE) and self.use_deepep_ll and self.tp_size > 1:
                hidden_states = tensor_model_parallel_all_gather(hidden_states, dim=0).contiguous()
                hidden_states = hidden_states[:ori_bs, :].contiguous()