update fuse_fill_rms_x2_concat

bac269d7 · zhuwenwen · bdae1255 · bac269d7
Commit bac269d7 authored Dec 23, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 4 deletions

vllm/model_executor/models/deepseek_mtp.py vllm/model_executor/models/deepseek_mtp.py +21 -4

No files found.
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -29,8 +29,7 @@ from .utils import maybe_prefix
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.blockwise_int8 import BlockInt8Config
 import vllm.envs as envs
-from lightop import fuse_fill_rms_x2_concat
+from vllm.utils import direct_register_custom_op
 class SharedHead(nn.Module):
@@ -75,6 +74,24 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
        self.mtp_block = DeepseekV2DecoderLayer(config, prefix, model_config,
                                                cache_config, quant_config)
+    def fuse_fill_rms_x2_concat(hidden_states_fuse: torch.Tensor, positions: torch.Tensor, inputs_embeds: torch.Tensor,
+                                    previous_hidden_states: torch.Tensor, weight_inputs_embeds: torch.Tensor, 
+                                    weight_previous_hidden_states: torch.Tensor, epsilon: float) -> None:
+        """Forward all seven arguments, unchanged and in order, to ``lightop.fuse_fill_rms_x2_concat``.
+
+        This function is what gets passed as ``op_func`` to
+        ``direct_register_custom_op``; ``forward`` then reaches it through
+        ``torch.ops.vllm.fuse_fill_rms_x2_concat``.
+
+        Nothing is returned. The caller allocates ``hidden_states_fuse`` with
+        ``torch.empty`` before the call and feeds it to ``eh_proj`` afterwards,
+        so the result is expected to be written into that buffer.
+
+        NOTE(review): judging by the name and by the non-fused ``else`` branch
+        in ``forward``, the kernel presumably zeroes ``inputs_embeds`` where
+        ``positions == 0``, RMS-normalizes both inputs with their respective
+        weights and concatenates them -- confirm against the lightop
+        documentation; none of that is visible here.
+        """
+        # Imported at call time rather than at module top (the top-level import
+        # was removed in this commit), so lightop is only required when the op
+        # actually runs. Inside this body the imported name shadows the wrapper's
+        # own name, so the call below goes to the lightop kernel, not to itself.
+        from lightop import fuse_fill_rms_x2_concat
+        fuse_fill_rms_x2_concat(hidden_states_fuse, positions, inputs_embeds, previous_hidden_states, weight_inputs_embeds, weight_previous_hidden_states, epsilon)
+    def fuse_fill_rms_x2_concat_fake(hidden_states_fuse: torch.Tensor, positions: torch.Tensor, inputs_embeds: torch.Tensor,
+                                    previous_hidden_states: torch.Tensor, weight_inputs_embeds: torch.Tensor, 
+                                    weight_previous_hidden_states: torch.Tensor, epsilon: float) -> None:
+        """No-op stand-in with the same signature as ``fuse_fill_rms_x2_concat``.
+
+        Passed as ``fake_impl`` to ``direct_register_custom_op``. It touches
+        none of its arguments and returns ``None``.
+
+        NOTE(review): presumably this is what runs when the op is traced with
+        fake/meta tensors; because the real op returns nothing and delivers its
+        result in place, there is no output tensor to fabricate -- confirm
+        against the custom-op registration docs.
+        """
+        pass
+    # Register the wrapper as a custom op named "fuse_fill_rms_x2_concat";
+    # forward() invokes it as torch.ops.vllm.fuse_fill_rms_x2_concat instead of
+    # calling the lightop function directly.
+    # NOTE(review): as indented in this diff the call sits at the same level as
+    # the method definitions, i.e. it would execute once while the enclosing
+    # class body is evaluated -- whitespace in this view may not be reliable,
+    # so confirm the intended placement.
+    direct_register_custom_op(
+        op_name="fuse_fill_rms_x2_concat",
+        op_func=fuse_fill_rms_x2_concat,
+        # Arguments declared as written in place by the op.
+        # NOTE(review): confirm the lightop kernel writes to exactly these two
+        # tensors and to no other argument.
+        mutates_args=["hidden_states_fuse", "inputs_embeds"], 
+        fake_impl=fuse_fill_rms_x2_concat_fake,
+    )
    def forward(
        self,
        input_ids: torch.Tensor,
@@ -88,8 +105,8 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
        assert inputs_embeds is not None
        # masking inputs at position 0, as not needed by MTP
        if envs.VLLM_USE_FUSED_FILL_RMS_CAT:
-            hidden_states_fuse = torch.empty(hidden_states.shape[0], hidden_states.shaope[1]*2, device=hidden_states.device, dtype=hidden_states.dtype)
+            hidden_states_fuse = torch.empty(inputs_embeds.shape[0], inputs_embeds.shape[1]*2, device=inputs_embeds.device, dtype=inputs_embeds.dtype)
-            fuse_fill_rms_x2_concat(hidden_states_fuse, positions, inputs_embeds, previous_hidden_states, self.enorm.weight, self.hnorm.weight, self.enorm.variance_epsilon)
+            torch.ops.vllm.fuse_fill_rms_x2_concat(hidden_states_fuse, positions, inputs_embeds, previous_hidden_states, self.enorm.weight, self.hnorm.weight, self.enorm.variance_epsilon)
            hidden_states = self.eh_proj(hidden_states_fuse)
        else:
            inputs_embeds[positions == 0] = 0