Unverified commit db84f5eb, authored by Lucas Wilkinson, committed by GitHub
Browse files

[Bugfix] DeepSeek Accuracy (#14476)


Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
parent 206e2577
...@@ -222,8 +222,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ...@@ -222,8 +222,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8) Fp8LinearGenericOp, current_platform_fp8_dtype, is_fp8)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
scaled_quantize) scaled_quantize)
-from vllm.model_executor.layers.rotary_embedding import (
-    DeepseekScalingRotaryEmbedding, RotaryEmbedding)
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.utils import cdiv, round_down from vllm.utils import cdiv, round_down
try: try:
...@@ -626,9 +625,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): ...@@ -626,9 +625,12 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
self.qk_head_dim = qk_head_dim self.qk_head_dim = qk_head_dim
self.v_head_dim = v_head_dim self.v_head_dim = v_head_dim
-self.rotary_emb = rotary_emb
-self.use_yarn_rope = isinstance(rotary_emb,
-                                DeepseekScalingRotaryEmbedding)
+# Hack for V1 for now to avoid torch library overhead (since we are
+# already inside an attention custom op), pull out the forward
+# method from the rotary embedding and call it directly
+# TODO(lucas): we should probably find a cleaner way to do this
+self.rotary_emb = rotary_emb._forward_method
self.q_proj = q_proj self.q_proj = q_proj
self.kv_b_proj = kv_b_proj self.kv_b_proj = kv_b_proj
self.o_proj = o_proj self.o_proj = o_proj
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment