Revert "[Perf] Reduce MLA CPU overheads in V1 (#14384)" (#14471)

ca7a2d5f · Tyler Michael Smith · GitHub · 33368140 · ca7a2d5f · ca7a2d5f
Unverified Commit ca7a2d5f authored Mar 08, 2025 by Tyler Michael Smith Committed by GitHub Mar 07, 2025
Showing with 6 additions and 18 deletions

vllm/model_executor/layers/rotary_embedding.py vllm/model_executor/layers/rotary_embedding.py +2 -7

vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/common.py +4 -11

No files found.
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -161,13 +161,8 @@ class RotaryEmbedding(CustomOp):
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        from vllm import _custom_ops as ops
-        # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
-        # is expensive, so avoid calling it if possible
-        if self.cos_sin_cache.device != query.device or \
-            self.cos_sin_cache.dtype != query.dtype:
-            self.cos_sin_cache = self.cos_sin_cache.to(query.device,
-                                                       dtype=query.dtype)
+        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
+                                                   dtype=query.dtype)
        # ops.rotary_embedding()/batched_rotary_embedding()
        # are in-place operations that update the query and key tensors.
        if offsets is not None:

--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -222,8 +222,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
-from vllm.platforms import current_platform
+from vllm.model_executor.layers.rotary_embedding import (
+    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
 from vllm.utils import cdiv, round_down
 try:
@@ -627,15 +627,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        self.v_head_dim = v_head_dim
        self.rotary_emb = rotary_emb
-        if current_platform.is_cuda():
-            # Hack for V1 for now to avoid torch library overhead (since we are
-            # already inside an attention custom op), pull out the forward
-            # method from the rotary embedding and call it directly (and avoid
-            # calling forward_native, when we can call forward_cuda)
-            # TODO(lucas): we should probably find a cleaner way to do this
-            self.rotary_emb = rotary_emb.forward_cuda
+        self.use_yarn_rope = isinstance(rotary_emb,
+                                        DeepseekScalingRotaryEmbedding)
        self.q_proj = q_proj
        self.kv_b_proj = kv_b_proj
        self.o_proj = o_proj