zhaoyu6 / sglang · Commit 38af4f68

Fix grammar abort & Minor style fixes (#7204)

Unverified commit, authored Jun 14, 2025 by Lianmin Zheng; committed by GitHub on Jun 14, 2025. Parent: a6305c7d
Showing 8 changed files with 19 additions and 24 deletions:

python/sglang/srt/layers/attention/flashinfer_mla_backend.py   +5 -6
python/sglang/srt/layers/attention/flashmla_backend.py   +1 -3
python/sglang/srt/layers/attention/triton_backend.py   +5 -4
python/sglang/srt/layers/attention/triton_ops/decode_attention.py   +2 -2
python/sglang/srt/layers/radix_attention.py   +2 -3
python/sglang/srt/managers/scheduler.py   +2 -1
python/sglang/srt/mem_cache/memory_pool.py   +0 -3
python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py   +2 -2
python/sglang/srt/layers/attention/flashinfer_mla_backend.py

@@ -15,7 +15,6 @@ from functools import partial
 from typing import TYPE_CHECKING, Callable, Optional, Union

 import torch
-import triton

 if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
     import logging
@@ -33,7 +32,7 @@ from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import is_flashinfer_available, next_power_of_2

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -756,7 +755,7 @@ class FlashInferMLAMultiStepDraftBackend:
         if topk > 1:
             raise ValueError(
-                f"Currently Flashinfer MLA only supports topk=1 for speculative decoding"
+                "Currently Flashinfer MLA only supports topk=1 for speculative decoding"
             )
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
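The change above only drops a redundant `f` prefix: the literal contains no placeholders, so the f-string machinery buys nothing, and linters such as Ruff flag it (rule F541). A minimal illustration:

```python
# An f-string with no {placeholders} is identical to a plain literal at
# runtime; linters (e.g. Ruff's F541) flag the prefix as redundant.
redundant = f"Currently Flashinfer MLA only supports topk=1 for speculative decoding"
plain = "Currently Flashinfer MLA only supports topk=1 for speculative decoding"
assert redundant == plain
```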
@@ -815,9 +814,9 @@ class FlashInferMLAMultiStepDraftBackend:
             self.pool_len,
             kv_indices_buffer.shape[1],
             self.kv_indptr.shape[1],
-            triton.next_power_of_2(num_seqs),
-            triton.next_power_of_2(self.speculative_num_steps),
-            triton.next_power_of_2(bs),
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
         )

         assert forward_batch.spec_info is not None
python/sglang/srt/layers/attention/flashmla_backend.py

@@ -464,11 +464,9 @@ class FlashMLAMultiStepDraftBackend:
         topk: int,
         speculative_num_steps: int,
     ):
         from sglang.srt.speculative.eagle_utils import generate_draft_decode_kv_indices

         if topk > 1:
             raise ValueError(
-                f"Currently FlashMLA only supports topk=1 for speculative decoding"
+                "Currently FlashMLA only supports topk=1 for speculative decoding"
             )
         self.topk = topk
         self.speculative_num_steps = speculative_num_steps
python/sglang/srt/layers/attention/triton_backend.py

@@ -12,7 +12,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.layers.radix_attention import AttentionType
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
-from sglang.srt.utils import get_bool_env_var, get_device_core_count
+from sglang.srt.utils import get_bool_env_var, get_device_core_count, next_power_of_2

 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -766,6 +766,7 @@ class TritonMultiStepDraftBackend:
         self.device = model_runner.device
         # Cached variables for generate_draft_decode_kv_indices
         self.pool_len = model_runner.req_to_token_pool.req_to_token.shape[1]
+        self.page_size = model_runner.server_args.page_size

     def common_template(
         self,
         forward_batch: ForwardBatch,
         kv_indices_buffer: torch.Tensor,
         call_fn: int,
@@ -788,9 +789,9 @@ class TritonMultiStepDraftBackend:
             self.pool_len,
             kv_indices_buffer.shape[1],
             self.kv_indptr.shape[1],
-            triton.next_power_of_2(num_seqs),
-            triton.next_power_of_2(self.speculative_num_steps),
-            triton.next_power_of_2(bs),
+            next_power_of_2(num_seqs),
+            next_power_of_2(self.speculative_num_steps),
+            next_power_of_2(bs),
         )

         for i in range(self.speculative_num_steps):
python/sglang/srt/layers/attention/triton_ops/decode_attention.py

@@ -708,7 +708,7 @@ def decode_attention_fwd(
             num_kv_splits,
             max_kv_splits,
             sm_scale,
-            logit_cap,
+            logit_cap=logit_cap,
         )
     else:
         # GQA/MQA/MLA

@@ -724,5 +724,5 @@ def decode_attention_fwd(
             num_kv_splits,
             max_kv_splits,
             sm_scale,
-            logit_cap,
+            logit_cap=logit_cap,
         )
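Both hunks pass `logit_cap` by keyword instead of by position. The motivation (my reading, consistent with the surrounding parameters) is robustness: with a long positional tail like `num_kv_splits, max_kv_splits, sm_scale, logit_cap`, any future insertion or reordering of parameters silently rebinds the positional arguments, while a keyword argument keeps its meaning. A toy illustration with a hypothetical signature, not sglang's actual kernel:

```python
# Hypothetical callee for illustration only.
def reduce_kernel(num_kv_splits, max_kv_splits, sm_scale, logit_cap=0.0):
    return logit_cap

# Positionally, the fourth argument binds to whatever the fourth parameter
# happens to be; by keyword, it always binds to logit_cap.
assert reduce_kernel(8, 16, 0.125, 30.0) == 30.0
assert reduce_kernel(8, 16, 0.125, logit_cap=30.0) == 30.0
```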
python/sglang/srt/layers/radix_attention.py

@@ -18,7 +18,6 @@ from typing import Optional
 from torch import nn

-from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -52,9 +51,9 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
-        attn_type=AttentionType.DECODER,
-        prefix: str = "",
+        attn_type: AttentionType = AttentionType.DECODER,
         use_irope: bool = False,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
python/sglang/srt/managers/scheduler.py

@@ -2108,6 +2108,7 @@ class Scheduler(
             # In this case, we change the input_ids to be only one token to make this prefill cheap.
             if req.rid.startswith(recv_req.rid):
                 logger.debug(f"Abort grammar queue request. {req.rid=}")
-                req.grammar.cancel()
+                if req.grammar:
+                    req.grammar.cancel()
                 req.set_finish_with_abort("Aborted by AbortReq.")
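This is the "fix grammar abort" part of the commit: a request sitting in the grammar queue may not have a grammar object attached yet, so the abort path now calls `cancel()` only when one exists instead of dereferencing `None`. A minimal sketch of the pattern with stand-in classes (not sglang's actual `Req`):

```python
class Grammar:  # stand-in for sglang's grammar object
    def cancel(self) -> None:
        print("cancelled in-flight grammar compilation")

class Req:  # stand-in for sglang's Req
    def __init__(self, grammar=None):
        self.grammar = grammar  # may still be None while compiling

def abort_grammar_request(req: Req) -> None:
    if req.grammar:  # the guard added by this commit
        req.grammar.cancel()
    # ... then mark the request finished with an abort error

abort_grammar_request(Req())                   # safe: nothing to cancel
abort_grammar_request(Req(grammar=Grammar()))  # cancels the grammar
```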
python/sglang/srt/mem_cache/memory_pool.py

@@ -141,15 +141,12 @@ class KVCache(abc.ABC):
     ) -> None:
         raise NotImplementedError()

-    @abc.abstractmethod
     def get_flat_data(self, indices):
         raise NotImplementedError()

-    @abc.abstractmethod
     def transfer(self, indices, flat_data):
         raise NotImplementedError()

-    @abc.abstractmethod
     def transfer_per_layer(self, indices, flat_data, layer_id):
         raise NotImplementedError()
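Dropping `@abc.abstractmethod` turns `get_flat_data`, `transfer`, and `transfer_per_layer` into optional hooks: subclasses that never use flat-data transfer no longer need stub overrides just to be instantiable, while an unimplemented hook still fails loudly if called. A sketch of the effect with illustrative class names:

```python
import abc

class KVCacheBase(abc.ABC):  # illustrative stand-in for sglang's KVCache
    def get_flat_data(self, indices):
        # Optional hook: no longer abstract, but still fails loudly if used.
        raise NotImplementedError()

class MinimalKVCache(KVCacheBase):
    pass  # instantiable now; with @abc.abstractmethod this raised TypeError

cache = MinimalKVCache()    # OK after this change
# cache.get_flat_data([0])  # would still raise NotImplementedError
```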
python/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py

@@ -86,8 +86,8 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
         self.extend_seq_lens = torch.ones((self.max_bs,), dtype=torch.int32)
-        self.accept_length = (
-            torch.ones((self.max_bs,), dtype=torch.int32) * self.num_tokens_per_bs
+        self.accept_length = torch.full(
+            (self.max_bs,), self.num_tokens_per_bs, dtype=torch.int32
         )

         # Capture
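`torch.full` builds the constant tensor in one step, which states the intent directly and skips the intermediate `ones()` allocation and elementwise multiply; the result is identical. A quick check with illustrative sizes:

```python
import torch

max_bs, num_tokens_per_bs = 4, 3  # illustrative values
before = torch.ones((max_bs,), dtype=torch.int32) * num_tokens_per_bs
after = torch.full((max_bs,), num_tokens_per_bs, dtype=torch.int32)
assert torch.equal(before, after)  # tensor([3, 3, 3, 3], dtype=torch.int32)
```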