[Attention] Remove slow setattr in MLA (#14769)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>

[Attention] Remove slow setattr in MLA (#14769)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
d47807ba · Lucas Wilkinson · GitHub · 02fcaa3d · d47807ba
Unverified commit d47807ba, authored Mar 13, 2025 by Lucas Wilkinson; committed by GitHub on Mar 13, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 7 additions and 2 deletions

vllm/model_executor/layers/rotary_embedding.py vllm/model_executor/layers/rotary_embedding.py +7 -2

No files found.
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -161,8 +161,13 @@ class RotaryEmbedding(CustomOp):
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        from vllm import _custom_ops as ops

-        self.cos_sin_cache = self.cos_sin_cache.to(query.device,
-                                                   dtype=query.dtype)
+        # __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
+        # is expensive, so avoid calling it if possible
+        if self.cos_sin_cache.device != query.device or \
+            self.cos_sin_cache.dtype != query.dtype:
+            self.cos_sin_cache = self.cos_sin_cache.to(query.device,
+                                                       dtype=query.dtype)
+
        # ops.rotary_embedding()/batched_rotary_embedding()
        # are in-place operations that update the query and key tensors.
        if offsets is not None: