Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b5e383cd
Unverified
Commit
b5e383cd
authored
Sep 10, 2025
by
Chen Zhang
Committed by
GitHub
Sep 10, 2025
Browse files
[gpt-oss] raise error for flashinfer backend without trtllm (#24482)
Signed-off-by:
Chen Zhang
<
zhangch99@outlook.com
>
parent
9a161307
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
10 additions
and
2 deletions
+10
-2
vllm/v1/attention/backends/flashinfer.py
vllm/v1/attention/backends/flashinfer.py
+10
-2
No files found.
vllm/v1/attention/backends/flashinfer.py
View file @
b5e383cd
...
@@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
...
@@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
self
.
window_left
=
self
.
global_hyperparameters
.
window_left
self
.
window_left
=
self
.
global_hyperparameters
.
window_left
self
.
logits_soft_cap
=
self
.
global_hyperparameters
.
logits_soft_cap
self
.
logits_soft_cap
=
self
.
global_hyperparameters
.
logits_soft_cap
self
.
has_sinks
=
self
.
global_hyperparameters
.
has_sinks
self
.
has_sinks
=
self
.
global_hyperparameters
.
has_sinks
if
self
.
has_sinks
and
not
supports_trtllm_attention
()[
0
]:
raise
NotImplementedError
(
"FlashInfer backend currently does not support attention "
"sinks, please use trtllm on blackwell or flash attention on "
"earlier GPUs."
)
# Preparing persistent buffers (device-side)
# Preparing persistent buffers (device-side)
self
.
paged_kv_indptr
=
torch
.
zeros
(
max_num_reqs
+
1
,
self
.
paged_kv_indptr
=
torch
.
zeros
(
max_num_reqs
+
1
,
dtype
=
torch
.
int32
,
dtype
=
torch
.
int32
,
...
@@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
...
@@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
self
.
q_data_type
,
self
.
q_data_type
,
is_prefill
=
False
,
is_prefill
=
False
,
has_sinks
=
self
.
has_sinks
)
has_sinks
=
self
.
has_sinks
)
if
self
.
has_sinks
and
not
(
prefill_use_trtllm
and
decode_use_trtllm
):
raise
NotImplementedError
(
"FlashInfer backend currently does not support attention "
"sinks, please use trtllm on blackwell or flash attention on "
"earlier GPUs."
)
attn_metadata
=
FlashInferMetadata
(
attn_metadata
=
FlashInferMetadata
(
num_actual_tokens
=
num_actual_tokens
,
num_actual_tokens
=
num_actual_tokens
,
q_data_type
=
self
.
q_data_type
,
q_data_type
=
self
.
q_data_type
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment