Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
302ef403
Unverified
Commit
302ef403
authored
Oct 15, 2025
by
Mengqing Cao
Committed by
GitHub
Oct 15, 2025
Browse files
[DSA][MLA] Tiny refactor on DeepSeek to make it reusable for different backends (#26656)
Signed-off-by:
MengqingCao
<
cmq0113@163.com
>
parent
8865da15
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
3 deletions
+12
-3
vllm/attention/layer.py
vllm/attention/layer.py
+2
-0
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+8
-2
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+2
-1
No files found.
vllm/attention/layer.py
View file @
302ef403
...
...
@@ -587,6 +587,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
prefix
:
str
=
""
,
use_sparse
:
bool
=
False
,
indexer
:
object
|
None
=
None
,
**
extra_impl_args
,
):
super
().
__init__
()
self
.
num_heads
=
num_heads
...
...
@@ -639,6 +640,7 @@ class MLAAttention(nn.Module, AttentionLayerBase):
v_head_dim
=
self
.
v_head_dim
,
kv_b_proj
=
kv_b_proj
,
indexer
=
indexer
,
**
extra_impl_args
,
)
self
.
use_direct_call
=
not
current_platform
.
opaque_attention_op
()
...
...
vllm/model_executor/models/deepseek_mtp.py
View file @
302ef403
...
...
@@ -17,9 +17,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
,
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.platforms
import
current_platform
from
vllm.sequence
import
IntermediateTensors
from
.deepseek_v2
import
DeepseekV2DecoderLayer
,
get_spec_layer_idx_from_weight_name
from
.deepseek_v2
import
(
DeepseekV2DecoderLayer
,
get_spec_layer_idx_from_weight_name
,
)
from
.interfaces
import
SupportsPP
from
.utils
import
maybe_prefix
...
...
@@ -56,6 +60,8 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
self
.
hnorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
eh_proj
=
nn
.
Linear
(
config
.
hidden_size
*
2
,
config
.
hidden_size
,
bias
=
False
)
self
.
device
=
current_platform
.
device_type
self
.
is_v32
=
hasattr
(
config
,
"index_topk"
)
if
self
.
is_v32
:
topk_tokens
=
config
.
index_topk
...
...
@@ -63,7 +69,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
vllm_config
.
scheduler_config
.
max_num_batched_tokens
,
topk_tokens
,
dtype
=
torch
.
int32
,
device
=
"cuda"
,
device
=
self
.
device
,
)
else
:
topk_indices_buffer
=
None
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
302ef403
...
...
@@ -1165,6 +1165,7 @@ class DeepseekV2Model(nn.Module):
config
=
vllm_config
.
model_config
.
hf_config
quant_config
=
vllm_config
.
quant_config
self
.
config
=
config
self
.
device
=
current_platform
.
device_type
self
.
vocab_size
=
config
.
vocab_size
self
.
is_v32
=
hasattr
(
config
,
"index_topk"
)
...
...
@@ -1174,7 +1175,7 @@ class DeepseekV2Model(nn.Module):
vllm_config
.
scheduler_config
.
max_num_batched_tokens
,
topk_tokens
,
dtype
=
torch
.
int32
,
device
=
"cuda"
,
device
=
self
.
device
,
)
else
:
topk_indices_buffer
=
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment