sglang · Commit 38af4f68 (unverified)

Fix grammar abort & Minor style fixes (#7204)

Authored by Lianmin Zheng on Jun 14, 2025; committed via GitHub on Jun 14, 2025.
Parent commit: a6305c7d
Showing 8 changed files with 19 additions and 24 deletions.
Changed files:

- python/sglang/srt/layers/attention/flashinfer_mla_backend.py (+5, -6)
- python/sglang/srt/layers/attention/flashmla_backend.py (+1, -3)
- python/sglang/srt/layers/attention/triton_backend.py (+5, -4)
- python/sglang/srt/layers/attention/triton_ops/decode_attention.py (+2, -2)
- python/sglang/srt/layers/radix_attention.py (+2, -3)
- python/sglang/srt/managers/scheduler.py (+2, -1)
- python/sglang/srt/mem_cache/memory_pool.py (+0, -3)
- python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py (+2, -2)
python/sglang/srt/layers/attention/flashinfer_mla_backend.py

```diff
@@ -15,7 +15,6 @@ from functools import partial
 from typing import TYPE_CHECKING, Callable, Optional, Union

 import torch
-import triton

 if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
     import logging
@@ -33,7 +32,7 @@ from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available, next_power_of_2

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -756,7 +755,7 @@ class FlashInferMLAMultiStepDraftBackend:
         if topk > 1:
             raise ValueError(
-                f"Currently Flashinfer MLA only supports topk=1 for speculative decoding"
+                "Currently Flashinfer MLA only supports topk=1 for speculative decoding"
             )
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
@@ -815,9 +814,9 @@ class FlashInferMLAMultiStepDraftBackend:
             self.pool_len,
             kv_indices_buffer.shape[1],
             self.kv_indptr.shape[1],
-            triton.next_power_of_2(num_seqs),
-            triton.next_power_of_2(self.speculative_num_steps),
-            triton.next_power_of_2(bs),
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
         )

         assert forward_batch.spec_info is not None
```
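The recurring style change in this commit: the host-side rounding helper is now imported from `sglang.srt.utils` instead of going through `triton.next_power_of_2`, so this backend no longer needs `import triton` at all. As a hedged sketch (not necessarily sglang's actual implementation), such a helper only needs integer bit arithmetic:

```python
# Illustrative sketch only: sglang.srt.utils.next_power_of_2 may be implemented
# differently; this just shows that the rounding needs no Triton dependency.
def next_power_of_2(n: int) -> int:
    """Smallest power of two greater than or equal to n (and at least 1)."""
    return 1 << (n - 1).bit_length() if n > 0 else 1


assert next_power_of_2(1) == 1
assert next_power_of_2(5) == 8
assert next_power_of_2(64) == 64
```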
python/sglang/srt/layers/attention/flashmla_backend.py

```diff
@@ -464,11 +464,9 @@ class FlashMLAMultiStepDraftBackend:
         topk: int,
         speculative_num_steps: int,
     ):
         from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices

         if topk > 1:
             raise ValueError(
-                f"Currently FlashMLA only supports topk=1 for speculative decoding"
+                "Currently FlashMLA only supports topk=1 for speculative decoding"
             )
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
```
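The only wording change here (and in the analogous FlashInfer MLA hunk above) is dropping a redundant `f` prefix: the message has no placeholders, so the f-string adds nothing and linters such as flake8/ruff flag it as F541. For illustration:

```python
# Redundant f-prefix (no placeholders); flagged by flake8/ruff as F541.
msg = f"Currently FlashMLA only supports topk=1 for speculative decoding"
# Equivalent plain string, lint-clean.
msg = "Currently FlashMLA only supports topk=1 for speculative decoding"
```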
python/sglang/srt/layers/attention/triton_backend.py

```diff
@@ -12,7 +12,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.layers.radix_attention import AttentionType
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
-from sglang.srt.utils import get_bool_env_var, get_device_core_count
+from sglang.srt.utils import get_bool_env_var, get_device_core_count, next_power_of_2

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -766,6 +766,7 @@ class TritonMultiStepDraftBackend:
         self.device = model_runner.device
+        # Cached variables for generate_draft_decode_kv_indices
         self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
         self.page_size = model_runner.server_args.page_size

     def common_template(
         self, forward_batch: ForwardBatch, kv_indices_buffer: torch.Tensor, call_fn: int
@@ -788,9 +789,9 @@ class TritonMultiStepDraftBackend:
             self.pool_len,
             kv_indices_buffer.shape[1],
             self.kv_indptr.shape[1],
-            triton.next_power_of_2(num_seqs),
-            triton.next_power_of_2(self.speculative_num_steps),
-            triton.next_power_of_2(bs),
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
         )

         for i in range(self.speculative_num_steps):
```
python/sglang/srt/layers/attention/triton_ops/decode_attention.py

```diff
@@ -708,7 +708,7 @@ def decode_attention_fwd(
             num_kv_splits,
             max_kv_splits,
             sm_scale,
-            logit_cap,
+            logit_cap=logit_cap,
         )
     else:
         # GQA/MQA/MLA
@@ -724,5 +724,5 @@ def decode_attention_fwd(
             num_kv_splits,
             max_kv_splits,
             sm_scale,
-            logit_cap,
+            logit_cap=logit_cap,
         )
```
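Both call sites now bind `logit_cap` by keyword instead of by position. The benefit, shown here with a hypothetical wrapper (the real kernel launcher has a longer signature), is that a later insertion or reordering of trailing parameters cannot silently receive the wrong value:

```python
# Hypothetical launcher used only to illustrate positional-vs-keyword binding;
# it is not the actual decode-attention kernel wrapper in sglang.
def launch_kernel(num_kv_splits, max_kv_splits, sm_scale, page_size=1, logit_cap=0.0):
    return {"page_size": page_size, "logit_cap": logit_cap}


# Positional: a newly added `page_size` parameter silently absorbs the logit cap.
assert launch_kernel(8, 16, 0.5, 30.0) == {"page_size": 30.0, "logit_cap": 0.0}

# Keyword: the value stays attached to the parameter it was meant for.
assert launch_kernel(8, 16, 0.5, logit_cap=30.0) == {"page_size": 1, "logit_cap": 30.0}
```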
python/sglang/srt/layers/radix_attention.py

```diff
@@ -18,7 +18,6 @@ from typing import Optional
 from torch import nn

-from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -52,9 +51,9 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
-        attn_type=AttentionType.DECODER,
-        prefix: str = "",
+        attn_type: AttentionType = AttentionType.DECODER,
         use_irope: bool = False,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
```
python/sglang/srt/managers/scheduler.py

```diff
@@ -2108,7 +2108,8 @@ class Scheduler(
                 # In this case, we change the input_ids to be only one token to make this prefill cheap.
                 if req.rid.startswith(recv_req.rid):
                     logger.debug(f"Abort grammar queue request. {req.rid=}")
-                    req.grammar.cancel()
+                    if req.grammar:
+                        req.grammar.cancel()
                     req.set_finish_with_abort("Aborted by AbortReq.")

         # Delete requests in the running batch
```
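This hunk is the "fix grammar abort" part of the commit title: a request that is still waiting in the grammar queue may not have a grammar object attached yet, so calling `req.grammar.cancel()` unconditionally can fail; guarding the call lets the abort complete either way. A minimal sketch of the guarded pattern, with hypothetical stand-ins rather than sglang's real `Req` and grammar classes:

```python
# Hypothetical stand-ins for illustration; the real sglang Req and grammar
# objects carry much more state and a different construction path.
class FakeGrammar:
    def __init__(self):
        self.cancelled = False

    def cancel(self):
        self.cancelled = True


class FakeReq:
    def __init__(self, grammar=None):
        self.grammar = grammar  # may still be None while compilation is pending
        self.finished_reason = None

    def set_finish_with_abort(self, message):
        self.finished_reason = message


def abort_queued_request(req):
    # Cancel only if a grammar object actually exists, then mark the abort.
    if req.grammar:
        req.grammar.cancel()
    req.set_finish_with_abort("Aborted by AbortReq.")


abort_queued_request(FakeReq())               # no grammar yet: no AttributeError
abort_queued_request(FakeReq(FakeGrammar()))  # grammar present: cancelled, then aborted
```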
python/sglang/srt/mem_cache/memory_pool.py

```diff
@@ -141,15 +141,12 @@ class KVCache(abc.ABC):
     ) -> None:
         raise NotImplementedError()

-    @abc.abstractmethod
     def get_flat_data(self, indices):
         raise NotImplementedError()

-    @abc.abstractmethod
     def transfer(self, indices, flat_data):
         raise NotImplementedError()

-    @abc.abstractmethod
     def transfer_per_layer(self, indices, flat_data, layer_id):
         raise NotImplementedError()
```
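Removing the `@abc.abstractmethod` decorators turns these three methods into optional hooks: a concrete `KVCache` subclass that never uses them no longer has to override them to become instantiable (any remaining abstract methods on the class still apply), and callers only see `NotImplementedError` if one is actually invoked. A toy example of the difference, not the real `KVCache` hierarchy:

```python
import abc


class StrictBase(abc.ABC):
    @abc.abstractmethod
    def transfer(self, indices, flat_data):
        raise NotImplementedError()


class RelaxedBase(abc.ABC):
    # No decorator: subclasses may simply not support this operation.
    def transfer(self, indices, flat_data):
        raise NotImplementedError()


class StrictImpl(StrictBase):
    pass


class RelaxedImpl(RelaxedBase):
    pass


try:
    StrictImpl()  # TypeError: abstract method 'transfer' is not overridden
except TypeError as exc:
    print("strict base:", exc)

RelaxedImpl()  # fine; NotImplementedError only surfaces if transfer() is called
```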
python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py

```diff
@@ -86,8 +86,8 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
         self.extend_seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
-        self.accept_length = (
-            torch.ones((self.max_bs,), dtype=torch.int32) * self.num_tokens_per_bs
+        self.accept_length = torch.full(
+            (self.max_bs,), self.num_tokens_per_bs, dtype=torch.int32
         )

         # Capture
```
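`torch.full((n,), value, dtype=...)` writes the fill value directly, giving the same result as `torch.ones(...) * value` without allocating an intermediate all-ones tensor and running an extra multiply. A quick equivalence check with illustrative sizes (not sglang's actual defaults):

```python
import torch

max_bs, num_tokens_per_bs = 4, 3  # illustrative values only

old = torch.ones((max_bs,), dtype=torch.int32) * num_tokens_per_bs
new = torch.full((max_bs,), num_tokens_per_bs, dtype=torch.int32)

assert torch.equal(old, new)  # both are tensor([3, 3, 3, 3], dtype=torch.int32)
```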