Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ca7a2d5f
Unverified
Commit
ca7a2d5f
authored
Mar 08, 2025
by
Tyler Michael Smith
Committed by
GitHub
Mar 07, 2025
Browse files
Revert "[Perf] Reduce MLA CPU overheads in V1 (#14384)" (#14471)
parent
33368140
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
18 deletions
+6
-18
vllm/model_executor/layers/rotary_embedding.py
vllm/model_executor/layers/rotary_embedding.py
+2
-7
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+4
-11
No files found.
vllm/model_executor/layers/rotary_embedding.py
View file @
ca7a2d5f
...
@@ -161,13 +161,8 @@ class RotaryEmbedding(CustomOp):
...
@@ -161,13 +161,8 @@ class RotaryEmbedding(CustomOp):
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
# __setattr__ in nn.Module (called by `self.cos_sin_cache = ...`)
self
.
cos_sin_cache
=
self
.
cos_sin_cache
.
to
(
query
.
device
,
# is expensive, so avoid calling it if possible
dtype
=
query
.
dtype
)
if
self
.
cos_sin_cache
.
device
!=
query
.
device
or
\
self
.
cos_sin_cache
.
dtype
!=
query
.
dtype
:
self
.
cos_sin_cache
=
self
.
cos_sin_cache
.
to
(
query
.
device
,
dtype
=
query
.
dtype
)
# ops.rotary_embedding()/batched_rotary_embedding()
# ops.rotary_embedding()/batched_rotary_embedding()
# are in-place operations that update the query and key tensors.
# are in-place operations that update the query and key tensors.
if
offsets
is
not
None
:
if
offsets
is
not
None
:
...
...
vllm/v1/attention/backends/mla/common.py
View file @
ca7a2d5f
...
@@ -222,8 +222,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
...
@@ -222,8 +222,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
Fp8LinearGenericOp
,
current_platform_fp8_dtype
,
is_fp8
)
Fp8LinearGenericOp
,
current_platform_fp8_dtype
,
is_fp8
)
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
scaled_quantize
)
scaled_quantize
)
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
from
vllm.model_executor.layers.rotary_embedding
import
(
from
vllm.platforms
import
current_platform
DeepseekScalingRotaryEmbedding
,
RotaryEmbedding
)
from
vllm.utils
import
cdiv
,
round_down
from
vllm.utils
import
cdiv
,
round_down
try
:
try
:
...
@@ -627,15 +627,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
...
@@ -627,15 +627,8 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
self
.
v_head_dim
=
v_head_dim
self
.
v_head_dim
=
v_head_dim
self
.
rotary_emb
=
rotary_emb
self
.
rotary_emb
=
rotary_emb
self
.
use_yarn_rope
=
isinstance
(
rotary_emb
,
if
current_platform
.
is_cuda
():
DeepseekScalingRotaryEmbedding
)
# Hack for V1 for now to avoid torch library overhead (since we are
# already inside an attention custom op), pull out the forward
# method from the rotary embedding and call it directly (and avoid
# calling forward_native, when we can call forward_cuda)
# TODO(lucas): we should probably find a cleaner way to do this
self
.
rotary_emb
=
rotary_emb
.
forward_cuda
self
.
q_proj
=
q_proj
self
.
q_proj
=
q_proj
self
.
kv_b_proj
=
kv_b_proj
self
.
kv_b_proj
=
kv_b_proj
self
.
o_proj
=
o_proj
self
.
o_proj
=
o_proj
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment