Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
da85feb7
Commit
da85feb7
authored
Dec 17, 2025
by
zhuwenwen
Browse files
convert q to float8_e4m3fn
parent
99981972
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
4 additions
and
4 deletions
+4
-4
vllm/attention/backends/flashmla.py
vllm/attention/backends/flashmla.py
+1
-1
vllm/attention/layer.py
vllm/attention/layer.py
+1
-1
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+1
-1
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+1
-1
No files found.
vllm/attention/backends/flashmla.py
View file @
da85feb7
...
...
@@ -238,7 +238,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
if
torch
.
cuda
.
get_device_properties
(
"cuda"
).
gcnArchName
.
split
(
':'
)[
0
]
==
"gfx938"
and
kv_cache_dtype
==
"fp8_e4m3"
and
envs
.
VLLM_USE_FLASH_MLA_FP8
:
o
,
_
=
flash_mla_with_kvcache_fp8
(
q
=
q
,
q
=
q
.
to
(
torch
.
float8_e4m3fn
)
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
block_table
=
decode_meta
.
block_tables
,
cache_seqlens
=
decode_meta
.
seq_lens_tensor
,
...
...
vllm/attention/layer.py
View file @
da85feb7
...
...
@@ -198,7 +198,7 @@ class Attention(nn.Module):
# For some alternate attention backends like MLA the attention output
# shape does not match the query shape, so we optionally let the model
# definition specify the output tensor shape.
num_local_heads
:
Optional
[
int
]
=
None
,
output_shape
:
Optional
[
torch
.
Size
]
=
None
,
q_ori
:
Optional
[
torch
.
Tensor
]
=
None
,
key_normed
:
Optional
[
torch
.
Tensor
]
=
None
,
positions
:
Optional
[
torch
.
Tensor
]
=
None
,
...
...
vllm/v1/attention/backends/mla/common.py
View file @
da85feb7
...
...
@@ -1163,7 +1163,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
kv_cache_dtype_str
=
"bf16"
else
:
kv_cache_dtype_str
=
self
.
kv_cache_dtype
from
lightop
import
fused_rms_norm_rope_contiguous
fused_rms_norm_rope_contiguous
(
positions
[:
num_actual_toks
,
...],
q
,
...
...
vllm/v1/attention/backends/mla/flashmla.py
View file @
da85feb7
...
...
@@ -185,7 +185,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
.
unsqueeze
(
1
)
# Add seqlen dim of 1 (decode)
o
,
_
=
flash_mla_with_kvcache_fp8
(
q
=
q
,
q
=
q
.
to
(
torch
.
float8_e4m3fn
)
,
k_cache
=
kv_c_and_k_pe_cache
.
unsqueeze
(
-
2
),
# Add head dim of 1
block_table
=
attn_metadata
.
decode
.
block_table
,
cache_seqlens
=
attn_metadata
.
decode
.
seq_lens
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment