[Bugfix] DeepSeek Accuracy (#14476)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>

[Bugfix] DeepSeek Accuracy (#14476)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
db84f5eb · Lucas Wilkinson · GitHub · 206e2577 · db84f5eb
Unverified Commit db84f5eb authored Mar 08, 2025 by Lucas Wilkinson Committed by GitHub Mar 08, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 5 deletions

vllm/v1/attention/backends/mla/common.py vllm/v1/attention/backends/mla/common.py +7 -5

No files found.
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -222,8 +222,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import (
-    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
 from vllm.utils import cdiv, round_down
 try:
@@ -626,9 +625,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
        self.qk_head_dim = qk_head_dim
        self.v_head_dim = v_head_dim
-        self.rotary_emb = rotary_emb
-        self.use_yarn_rope = isinstance(rotary_emb,
-                                        DeepseekScalingRotaryEmbedding)
+        # Hack for V1 for now to avoid torch library overhead (since we are
+        # already inside an attention custom op), pull out the forward
+        # method from the rotary embedding and call it directly
+        # TODO(lucas): we should probably find a cleaner way to do this
+        self.rotary_emb = rotary_emb._forward_method
        self.q_proj = q_proj
        self.kv_b_proj = kv_b_proj
        self.o_proj = o_proj