zhaoyu6 / sglang · commit cbdfb771
"projects/vscode:/vscode.git/clone" did not exist on "8ea2381334b21ca1d16c82dcdf6b54a845d3e0e5"
Unverified commit cbdfb771, authored Jul 20, 2025 by Clay, committed by GitHub Jul 19, 2025.

Enable FlashInfer support encoder models and add head_dim padding workaround (#6230)

Parent: 282eb59f
Showing 2 changed files with 25 additions and 3 deletions (+25, -3):

python/sglang/srt/layers/attention/flashinfer_backend.py (+10, -1)
test/srt/models/test_encoder_embedding_models.py (+15, -2)
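Before the diffs, a bit of context: encoder-only embedding models (BERT-style) attend bidirectionally, so their prefill attention must run without a causal mask and there is nothing worth saving into the KV cache for later decoding steps. The sketch below is illustrative only (plain PyTorch, not part of the commit) and shows the masking difference the backend change below accounts for.

import torch

# Illustrative sketch: decoder-style (causal) masking vs. encoder-only
# (bidirectional) attention over a 4-token sequence.
seq_len = 4
scores = torch.randn(seq_len, seq_len)

causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
decoder_scores = scores.masked_fill(~causal_mask, float("-inf"))  # causal=True
encoder_scores = scores  # causal=False: every token attends to every token

decoder_probs = torch.softmax(decoder_scores, dim=-1)  # lower-triangular rows
encoder_probs = torch.softmax(encoder_scores, dim=-1)  # full rows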
python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -25,6 +25,7 @@ from sglang.global_config import global_config
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
+from sglang.srt.layers.radix_attention import AttentionType
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
@@ -486,12 +487,20 @@ class FlashInferAttnBackend(AttentionBackend):
                 v_scale=layer.v_scale,
             )
         else:
+            causal = True
+            if layer.attn_type == AttentionType.ENCODER_ONLY:
+                save_kv_cache = False
+                causal = False
+
             if self.forward_metadata.extend_no_prefix:
+                # NOTE: FlashInfer currently has limitations with head_dim = 32 or other dimensions
+                # The FlashInfer head_dim limitation itself is tracked here:
+                # https://github.com/flashinfer-ai/flashinfer/issues/1048
                 o = self.prefill_wrapper_ragged.forward(
                     q.view(-1, layer.tp_q_head_num, layer.head_dim),
                     k.view(-1, layer.tp_k_head_num, layer.head_dim),
                     v.view(-1, layer.tp_v_head_num, layer.head_dim),
-                    causal=True,
+                    causal=causal,
                     sm_scale=layer.scaling,
                     logits_soft_cap=logits_soft_cap,
                 )
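The commit message also mentions a head_dim padding workaround; that part of flashinfer_backend.py is collapsed out of this view, so the following is only a sketch of the idea rather than the commit's code, and both helper names are hypothetical. Zero-padding q, k, and v along head_dim up to a size FlashInfer handles leaves the attention result unchanged, provided sm_scale is still derived from the original head_dim and the padded output columns are sliced off afterwards.

import torch
import torch.nn.functional as F

def pad_head_dim(x: torch.Tensor, target: int) -> torch.Tensor:
    # Hypothetical helper: zero-pad the last (head_dim) axis up to `target`.
    # Zero columns in q/k do not change the q @ k^T scores, and zero columns
    # in v only add zero output columns, which are dropped again below.
    pad = target - x.shape[-1]
    return F.pad(x, (0, pad)) if pad > 0 else x

def ragged_forward_with_padded_head_dim(wrapper, q, k, v, head_dim, sm_scale, causal):
    # Assumes 64 is a head_dim the kernels accept; the ragged wrapper would
    # also need to be planned with the padded head_dim beforehand.
    target = max(head_dim, 64)
    o = wrapper.forward(
        pad_head_dim(q, target),
        pad_head_dim(k, target),
        pad_head_dim(v, target),
        causal=causal,
        sm_scale=sm_scale,  # keep the scale derived from the original head_dim
    )
    return o[..., :head_dim]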
test/srt/models/test_encoder_embedding_models.py
@@ -27,9 +27,9 @@ from sglang.test.test_utils import CustomTestCase, get_similarities, is_in_ci
 MODELS = [("BAAI/bge-small-en", 1, 1e-5), ("BAAI/bge-m3", 1, 1e-5)]
-ATTENTION_BACKEND = ["torch_native", "triton"]
+ATTENTION_BACKEND = ["torch_native", "triton", "flashinfer"]
 BATCH_SIZE = [1, 2]
-TORCH_DTYPES = [torch.float32]
+TORCH_DTYPES = [torch.float32, torch.float16]
 sgl_to_st_ratio = []
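With "flashinfer" added to ATTENTION_BACKEND and float16 added to TORCH_DTYPES above, an equivalent manual run might look like the sketch below. It assumes sglang's offline Engine forwards server arguments such as attention_backend, is_embedding, and dtype; it is not part of the commit.

import sglang as sgl

# Sketch: run an encoder embedding model with the FlashInfer backend.
engine = sgl.Engine(
    model_path="BAAI/bge-m3",        # head_dim 64, so not skipped for FlashInfer
    is_embedding=True,
    attention_backend="flashinfer",
    dtype="float16",                 # the flashinfer path skips float32 below
)
result = engine.encode("The quick brown fox jumps over the lazy dog")
# `result` holds the pooled embedding output for the prompt.
engine.shutdown()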
@@ -126,6 +126,19 @@ class TestEncoderEmbeddingModels(CustomTestCase):
             for attention_backend in ATTENTION_BACKEND:
                 for batch_size in BATCH_SIZE:
                     for torch_dtype in TORCH_DTYPES:
+                        # NOTE: FlashInfer currently has limitations with head_dim = 32 or
+                        # other dimensions.
+                        # The FlashInfer head_dim limitation itself is tracked here:
+                        # https://github.com/flashinfer-ai/flashinfer/issues/1048
+                        #
+                        # Flashinfer does not support torch.float32 for dtype_q, so skip it
+                        if attention_backend == "flashinfer":
+                            if (
+                                model == "BAAI/bge-small-en"
+                                or torch_dtype == torch.float32
+                            ):
+                                continue
+
                         self.assert_close_prefill_logits(
                             DEFAULT_PROMPTS,
                             model,
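The skip above lines up with the per-head dimensions of the two test models: assuming their published configs (384 hidden units over 12 heads for BAAI/bge-small-en, 1024 over 16 for BAAI/bge-m3), only bge-m3 reaches head_dim 64. A quick check with the transformers library, not part of the commit:

from transformers import AutoConfig

# Illustrative only: head_dim = hidden_size / num_attention_heads for the
# two embedding models exercised by the test above.
for name in ("BAAI/bge-small-en", "BAAI/bge-m3"):
    cfg = AutoConfig.from_pretrained(name)
    print(name, cfg.hidden_size // cfg.num_attention_heads)
# Expected with the configs assumed above: 32 (skipped) and 64.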