Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
de889cb6
Commit
de889cb6
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1
parent
c721b814
Changes
24
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
11 additions
and
3 deletions
+11
-3
vllm/model_executor/models/mistral.py
vllm/model_executor/models/mistral.py
+9
-1
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+1
-0
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
+0
-1
vllm/v1/attention/ops/flashmla.py
vllm/v1/attention/ops/flashmla.py
+1
-1
No files found.
vllm/model_executor/models/mistral.py
View file @
de889cb6
...
@@ -156,8 +156,16 @@ class MistralDecoderLayer(LlamaDecoderLayer):
...
@@ -156,8 +156,16 @@ class MistralDecoderLayer(LlamaDecoderLayer):
)
)
self
.
layer_idx
=
int
(
prefix
.
split
(
sep
=
"."
)[
-
1
])
self
.
layer_idx
=
int
(
prefix
.
split
(
sep
=
"."
)[
-
1
])
quant_config
=
self
.
get_quant_config
(
vllm_config
)
config
=
config
or
vllm_config
.
model_config
.
hf_config
config
=
config
or
vllm_config
.
model_config
.
hf_config
do_fusion
=
getattr
(
quant_config
,
"enable_quantization_scaling_fusion"
,
False
)
and
vllm_config
.
cache_config
.
cache_dtype
.
startswith
(
"fp8"
)
if
do_fusion
:
self
.
input_layernorm
.
quant_scaling_from
=
self
.
self_attn
.
qkv_proj
self
.
post_attention_layernorm
.
quant_scaling_from
=
self
.
mlp
.
gate_up_proj
if
getattr
(
config
,
"ada_rms_norm_t_cond"
,
False
):
if
getattr
(
config
,
"ada_rms_norm_t_cond"
,
False
):
self
.
ada_rms_norm_t_cond
=
nn
.
Sequential
(
self
.
ada_rms_norm_t_cond
=
nn
.
Sequential
(
ColumnParallelLinear
(
ColumnParallelLinear
(
...
@@ -339,4 +347,4 @@ class MistralForCausalLM(LlamaForCausalLM):
...
@@ -339,4 +347,4 @@ class MistralForCausalLM(LlamaForCausalLM):
elif
item
in
mapping
and
mapping
[
item
]
not
in
name
:
elif
item
in
mapping
and
mapping
[
item
]
not
in
name
:
name
=
name
.
replace
(
item
,
mapping
[
item
])
name
=
name
.
replace
(
item
,
mapping
[
item
])
return
name
,
loaded_weight
return
name
,
loaded_weight
\ No newline at end of file
vllm/v1/attention/backends/mla/flashmla.py
View file @
de889cb6
...
@@ -284,6 +284,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
...
@@ -284,6 +284,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
scheduler_metadata
.
tile_scheduler_metadata
=
tile_scheduler_metadata
scheduler_metadata
.
tile_scheduler_metadata
=
tile_scheduler_metadata
scheduler_metadata
.
num_splits
=
num_splits
scheduler_metadata
.
num_splits
=
num_splits
if
self
.
kv_cache_dtype
.
startswith
(
"fp8"
):
if
self
.
kv_cache_dtype
.
startswith
(
"fp8"
):
o
,
lse
=
flash_mla_with_kvcache_fp8
(
o
,
lse
=
flash_mla_with_kvcache_fp8
(
q
=
q
,
q
=
q
,
...
...
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
View file @
de889cb6
...
@@ -302,7 +302,6 @@ def chunked_prefill_paged_decode(
...
@@ -302,7 +302,6 @@ def chunked_prefill_paged_decode(
block_size
=
value_cache
.
shape
[
3
]
block_size
=
value_cache
.
shape
[
3
]
num_seqs
=
len
(
seq_lens
)
num_seqs
=
len
(
seq_lens
)
num_query_heads
=
query
.
shape
[
1
]
num_query_heads
=
query
.
shape
[
1
]
# key may be None in cross-attention decode (already cached from encoder)
num_kv_heads
=
key
.
shape
[
1
]
num_kv_heads
=
key
.
shape
[
1
]
num_queries_per_kv
=
query
.
shape
[
1
]
//
key
.
shape
[
1
]
num_queries_per_kv
=
query
.
shape
[
1
]
//
key
.
shape
[
1
]
head_size
=
query
.
shape
[
2
]
head_size
=
query
.
shape
[
2
]
...
...
vllm/v1/attention/ops/flashmla.py
View file @
de889cb6
...
@@ -22,7 +22,7 @@ else:
...
@@ -22,7 +22,7 @@ else:
if
current_platform
.
is_cuda
():
if
current_platform
.
is_cuda
():
try
:
try
:
import
vllm._flashmla_extension_C
# noqa: F401
import
vllm._flashmla_extension_C
# noqa: F401
_flashmla_extension_C_AVAILABLE
=
True
_flashmla_extension_C_AVAILABLE
=
True
except
ImportError
:
except
ImportError
:
_flashmla_extension_C_AVAILABLE
=
False
_flashmla_extension_C_AVAILABLE
=
False
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment