Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c5fa1992
Commit
c5fa1992
authored
Mar 11, 2026
by
liuchy5
Browse files
支持fp8 mqa&&跳过VLLM_USE_FUSED_FILL_RMS_CAT&&跳过load_error
parent
3824b261
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
1 deletion
+25
-1
vllm/model_executor/layers/sparse_attn_indexer.py
vllm/model_executor/layers/sparse_attn_indexer.py
+23
-0
vllm/model_executor/model_loader/default_loader.py
vllm/model_executor/model_loader/default_loader.py
+1
-0
vllm/model_executor/models/deepseek_mtp.py
vllm/model_executor/models/deepseek_mtp.py
+1
-1
No files found.
vllm/model_executor/layers/sparse_attn_indexer.py
View file @
c5fa1992
...
...
@@ -112,6 +112,29 @@ def sparse_attn_indexer(
chunk
.
cu_seqlen_ks
,
chunk
.
cu_seqlen_ke
,
)
elif
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
.
split
(
':'
)[
0
]
==
"gfx938"
:
k_fp8
=
k_fp8_full
[:
chunk
.
total_seq_lens
]
k_scale
=
k_scale_full
[:
chunk
.
total_seq_lens
]
ops
.
cp_gather_indexer_k_quant_cache
(
kv_cache
,
k_fp8
,
k_scale
,
chunk
.
block_table
,
chunk
.
cu_seq_lens
,
)
logits
=
op
.
mqa_logits
(
q_fp8
[
chunk
.
token_start
:
chunk
.
token_end
],
k_fp8
,
weights
[
chunk
.
token_start
:
chunk
.
token_end
],
chunk
.
cu_seqlen_ks
,
chunk
.
cu_seqlen_ke
,
q_fp8
[
chunk
.
token_start
:
chunk
.
token_end
].
shape
[
0
],
k_fp8
.
shape
[
0
],
q_fp8
.
shape
[
1
],
q_fp8
.
shape
[
2
],
k_scale
.
view
(
torch
.
float32
).
flatten
(),
True
)
else
:
logits
=
op
.
mqa_logits
(
q_fp8
[
chunk
.
token_start
:
chunk
.
token_end
],
...
...
vllm/model_executor/model_loader/default_loader.py
View file @
c5fa1992
...
...
@@ -298,6 +298,7 @@ class DefaultModelLoader(BaseModelLoader):
if
model_config
.
quantization
is
None
and
loaded_weights
is
not
None
:
weights_not_loaded
=
weights_to_load
-
loaded_weights
weights_not_loaded
=
{
k
for
k
in
weights_not_loaded
if
not
k
.
endswith
(
"indexer.weights_proj.bias"
)}
weights_not_loaded
=
{
k
for
k
in
weights_not_loaded
if
k
not
in
[
'model.layers.78.shared_head.head.weight'
,
'model.embed_tokens.weight'
]}
if
weights_not_loaded
:
raise
ValueError
(
"Following weights were not initialized from "
...
...
vllm/model_executor/models/deepseek_mtp.py
View file @
c5fa1992
...
...
@@ -112,7 +112,7 @@ class DeepSeekMultiTokenPredictorLayer(nn.Module):
)
->
torch
.
Tensor
:
assert
inputs_embeds
is
not
None
# masking inputs at position 0, as not needed by MTP
if
envs
.
VLLM_USE_FUSED_FILL_RMS_CAT
:
if
False
:
hidden_states_fuse
=
torch
.
empty
(
inputs_embeds
.
shape
[
0
],
inputs_embeds
.
shape
[
1
]
*
2
,
device
=
inputs_embeds
.
device
,
dtype
=
inputs_embeds
.
dtype
)
torch
.
ops
.
vllm
.
fuse_fill_rms_x2_concat
(
hidden_states_fuse
,
positions
,
inputs_embeds
,
previous_hidden_states
,
self
.
enorm
.
weight
,
self
.
hnorm
.
weight
,
self
.
enorm
.
variance_epsilon
)
hidden_states
=
self
.
eh_proj
(
hidden_states_fuse
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment