change / sglang / Commits / 9d61205d

Unverified commit 9d61205d, authored Oct 22, 2025 by Liangsheng Yin, committed by GitHub on Oct 22, 2025.

[lint] improve ruff check (#11922)

Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>

Parent: 590bc4b7

Showing 19 changed files with 73 additions and 31 deletions (+73, -31).
.pre-commit-config.yaml (+3, -1)
benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_decode.py (+1, -0)
benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_prefill.py (+4, -0)
python/sglang/bench_one_batch_server.py (+3, -0)
python/sglang/srt/disaggregation/common/conn.py (+2, -2)
python/sglang/srt/disaggregation/mooncake/conn.py (+1, -1)
python/sglang/srt/entrypoints/openai/serving_responses.py (+2, -1)
python/sglang/srt/layers/attention/flashinfer_backend.py (+1, -1)
python/sglang/srt/layers/attention/flashinfer_mla_backend.py (+4, -1)
python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py (+20, -19)
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py (+7, -0)
python/sglang/srt/layers/quantization/fpgemm_fp8.py (+1, -1)
python/sglang/srt/mem_cache/memory_pool.py (+3, -1)
python/sglang/srt/model_loader/weight_utils.py (+3, -0)
python/sglang/srt/models/deepseek_v2.py (+2, -1)
python/sglang/srt/models/glm4v.py (+1, -0)
python/sglang/srt/models/opt.py (+4, -0)
python/sglang/srt/utils/common.py (+10, -2)
python/sglang/test/few_shot_gsm8k_engine.py (+1, -0)
.pre-commit-config.yaml

@@ -27,7 +27,9 @@ repos:
     rev: v0.11.7
     hooks:
       - id: ruff
-        args: [--select=F401, F821, --fixable=F401]
+        args:
+          - --select=F401,F821
+          - --fix
         files: ^(benchmark/|docs/|examples/|python/sglang/)
         exclude: __init__\.py$|\.ipynb$|^python/sglang/srt/grpc/.*_pb2\.py$|^python/sglang/srt/grpc/.*_pb2_grpc\.py$|^python/sglang/srt/grpc/.*_pb2\.pyi$|^python/sglang/srt/grpc/.*_pb2_grpc\.pyi$
 - repo: https://github.com/psf/black
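For context on the two rule families the hook now selects: F401 flags imports that are never used, and F821 flags names that are used but never defined. `--fix` lets ruff apply autofixes, which covers F401 removals; F821 findings must be corrected by hand, which is what the file diffs below do. A hypothetical sketch, not from this commit:

```python
import math  # previously missing: ruff would flag `math.fsum` below as F821 (undefined name)

# An unused `import os` that once sat here is the F401 case: imported but
# never used, and safe for the hook's `--fix` to delete automatically.


def total(values):
    return math.fsum(values)


print(total([0.1, 0.2, 0.3]))
```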
benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_decode.py

@@ -167,6 +167,7 @@ class MiniMaxText01LightningAttention(nn.Module):
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
+        do_eval: bool = False,
         **kwargs,
     ):
         if (not self.training) and (not do_eval):
benchmark/kernels/minmax-text-01-lightning_attention/benchmark_lightning_attention_prefill.py

@@ -1,4 +1,5 @@
 import itertools
+import logging
 import math
 import os
 from typing import Optional, Tuple

@@ -10,6 +11,8 @@ import triton
 import triton.language as tl
 from einops import rearrange
 
+logger = logging.getLogger(__name__)
+
 # Adapted from https://github.com/OpenNLPLab/lightning-attention/blob/main/lightning_attn/ops/triton/lightning_attn2.py
 @triton.jit

@@ -302,6 +305,7 @@ class MiniMaxText01LightningAttention(nn.Module):
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         use_cache: bool = False,
         slope_rate: Optional[torch.Tensor] = None,
+        do_eval: bool = False,
         **kwargs,
     ):
         if (not self.training) and (not do_eval):
python/sglang/bench_one_batch_server.py

@@ -16,6 +16,7 @@ import argparse
 import dataclasses
 import itertools
 import json
+import logging
 import multiprocessing
 import os
 import random

@@ -39,6 +40,8 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import is_blackwell, kill_process_tree
 from sglang.test.test_utils import is_in_ci, write_github_step_summary
 
+logger = logging.getLogger(__name__)
+
 
 class ProfileLinks(BaseModel):
     """Pydantic model for profile trace links."""
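Several files in this commit (this one, the lightning-attention prefill benchmark, and opt.py) gain the same two lines because `logger` was referenced in the module without ever being created, an F821-class error. For reference, the standard module-logger pattern, shown here as a generic sketch rather than sglang's code:

```python
import logging

# One logger per module, named after the module's dotted import path, so an
# application can configure levels and handlers per subsystem.
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger.info("module ready")
```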
python/sglang/srt/disaggregation/common/conn.py

@@ -77,8 +77,8 @@ class CommonKVManager(BaseKVManager):
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             self._register_to_bootstrap()
-            self.transfer_infos: Dict[int, Dict[str, TransferInfo]] = {}
-            self.decode_kv_args_table: Dict[str, KVArgsRegisterInfo] = {}
+            self.transfer_infos = {}
+            self.decode_kv_args_table = {}
             self.pp_group = get_pp_group()
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
             self.connection_pool: Dict[str, Dict[str, Union[str, int]]] = {}
python/sglang/srt/disaggregation/mooncake/conn.py

@@ -9,7 +9,7 @@ import struct
 import threading
 import time
 from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Set, Tuple
 
 import numpy as np
 import numpy.typing as npt
python/sglang/srt/entrypoints/openai/serving_responses.py

@@ -14,6 +14,7 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterator, Optional,
 import jinja2
 import openai.types.responses as openai_responses_types
+import orjson
 from fastapi import Request
 from fastapi.responses import ORJSONResponse
 from openai.types.responses import (

@@ -1063,7 +1064,7 @@ class OpenAIServingResponses(OpenAIServingChat):
                 ):
                     function_name = previous_item.recipient[len("browser.") :]
                     action = None
-                    parsed_args = json.loads(previous_item.content[0].text)
+                    parsed_args = orjson.loads(previous_item.content[0].text)
                     if function_name == "search":
                         action = openai_responses_types.response_function_web_search.ActionSearch(
                             type="search",
python/sglang/srt/layers/attention/flashinfer_backend.py

@@ -194,7 +194,7 @@ class FlashInferAttnBackend(AttentionBackend):
         )
         if init_new_workspace:
             self.workspace_buffer = torch.empty(
-                global_config.flashinfer_workspace_size,
+                envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
                 dtype=torch.uint8,
                 device=model_runner.device,
             )
python/sglang/srt/layers/attention/flashinfer_mla_backend.py

@@ -38,6 +38,9 @@ from sglang.srt.utils import (
 )
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.attention.flashinfer_mla_backend import (
+        FlashInferMlaAttnBackend,
+    )
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.speculative.spec_info import SpecInput

@@ -66,7 +69,7 @@ global_workspace_buffer = None
 class FlashInferMhaChunkKVRunner:
     def __init__(
-        self, model_runner: ModelRunner, attn_backend: "FlashInferMlaAttnBackend"
+        self, model_runner: ModelRunner, attn_backend: FlashInferMlaAttnBackend
     ):
         # Parse Constants
         self.num_local_heads = (
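The pattern above keeps annotation-only imports out of the runtime import graph: names imported under `if TYPE_CHECKING:` exist for type checkers and linters but are never imported when the module actually runs, and once the name is importable that way, the quoted annotation `"FlashInferMlaAttnBackend"` can become a bare name, as the second hunk does. A minimal sketch of the idea, using a standard-library stand-in rather than sglang's classes:

```python
from __future__ import annotations  # all annotations become lazy strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers and linters, never at runtime, so it
    # adds no import cost and cannot create an import cycle.
    from decimal import Decimal  # stand-in for an expensive or cyclic import


def half(value: Decimal) -> Decimal:
    # A bare name is legal here: the annotation is never executed at runtime,
    # yet ruff and mypy can still resolve it through the guarded import.
    return value / 2


if __name__ == "__main__":
    from decimal import Decimal  # runtime import only where actually needed

    print(half(Decimal("3")))  # 1.5
```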
python/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py

@@ -13,7 +13,8 @@ from triton_kernels.matmul_ogs import (
     PrecisionConfig,
     matmul_ogs,
 )
-from triton_kernels.numerics import InFlexData
+from triton_kernels.numerics import InFlexData, MicroscalingCtx
+from triton_kernels.quantization import downcast_to_mxfp
 from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
 from triton_kernels.swiglu import swiglu_fn

@@ -119,14 +120,14 @@ def triton_kernel_fused_experts(
     block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
-    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
-    assert per_channel_quant == False, "per_channel_quant is not supported"
-    assert expert_map == None, "expert_map is not supported"
-    assert w1_scale == None, "w1_scale is not supported"
-    assert w2_scale == None, "w2_scale is not supported"
-    assert a1_scale == None, "a1_scale is not supported"
-    assert a2_scale == None, "a2_scale is not supported"
-    assert block_shape == None, "block_shape is not supported"
+    assert use_fp8_w8a8 is False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant is False, "per_channel_quant is not supported"
+    assert expert_map is None, "expert_map is not supported"
+    assert w1_scale is None, "w1_scale is not supported"
+    assert w2_scale is None, "w2_scale is not supported"
+    assert a1_scale is None, "a1_scale is not supported"
+    assert a2_scale is None, "a2_scale is not supported"
+    assert block_shape is None, "block_shape is not supported"
 
     # type check
     assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"

@@ -143,7 +144,7 @@ def triton_kernel_fused_experts(
     ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
 
     # feature check
-    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+    assert inplace is False, "Inplace is not supported in new triton MoE kernel"
 
     M, K = hidden_states.shape
     E, _, N = w1.shape

@@ -264,14 +265,14 @@ def triton_kernel_fused_experts_with_bias(
     gemm1_alpha: Optional[float] = None,
     gemm1_clamp_limit: Optional[float] = None,
 ) -> torch.Tensor:
-    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
-    assert per_channel_quant == False, "per_channel_quant is not supported"
-    assert expert_map == None, "expert_map is not supported"
-    assert w1_scale == None, "w1_scale is not supported"
-    assert w2_scale == None, "w2_scale is not supported"
-    assert a1_scale == None, "a1_scale is not supported"
-    assert a2_scale == None, "a2_scale is not supported"
-    assert block_shape == None, "block_shape is not supported"
+    assert use_fp8_w8a8 is False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant is False, "per_channel_quant is not supported"
+    assert expert_map is None, "expert_map is not supported"
+    assert w1_scale is None, "w1_scale is not supported"
+    assert w2_scale is None, "w2_scale is not supported"
+    assert a1_scale is None, "a1_scale is not supported"
+    assert a2_scale is None, "a2_scale is not supported"
+    assert block_shape is None, "block_shape is not supported"
 
     # type check
     assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"

@@ -290,7 +291,7 @@ def triton_kernel_fused_experts_with_bias(
     ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
 
     # feature check
-    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+    assert inplace is False, "Inplace is not supported in new triton MoE kernel"
 
     E, _, _ = w1.shape
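The assert rewrites replace `==` comparisons against singletons with `is`. This matches the usual lint guidance (E711/E712 in pycodestyle/ruff terms, although this commit's hook only selects F401 and F821): `is` tests identity against the singleton object, while `==` dispatches to `__eq__`, which an operand can override. A small sketch, unrelated to the kernel code, of why the distinction matters:

```python
class AlwaysEqual:
    """Pathological __eq__ that claims equality with everything."""

    def __eq__(self, other):
        return True


obj = AlwaysEqual()
print(obj == None)   # True  -- __eq__ hijacks the comparison (the E711 trap)
print(obj is None)   # False -- identity check cannot be overridden

flag = False
print(flag is False)  # True -- unambiguous singleton check (E712-clean)
```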
python/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py

@@ -44,6 +44,13 @@ from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 
 try:
+    from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_24 import (
+        CompressedTensors24,
+    )
+    from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_w4a16_sparse24 import (
+        W4A16SPARSE24_SUPPORTED_BITS,
+        CompressedTensorsW4A16Sparse24,
+    )
     from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (
         WNA16_SUPPORTED_BITS,
         CompressedTensorsWNA16,
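The new imports land inside the file's existing `try:` block, the usual pattern for optional dependencies: if vllm is missing, or is a build without these schemes, the corresponding `except` arm can degrade gracefully instead of crashing at import time. A minimal sketch of the pattern, with a made-up package name, `fancy_accelerator`, standing in for vllm:

```python
# Guarded optional import; `fancy_accelerator` is hypothetical.
try:
    from fancy_accelerator import fast_matmul  # optional, may not be installed

    HAS_FANCY = True
except ImportError:
    HAS_FANCY = False


def matmul(a, b):
    # Use the optional kernel when available, otherwise a plain fallback.
    if HAS_FANCY:
        return fast_matmul(a, b)
    return [[sum(x * y for x, y in zip(row, col)) for col in zip(*b)] for row in a]


print(matmul([[1, 2]], [[3], [4]]))  # [[11]]
```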
python/sglang/srt/layers/quantization/fpgemm_fp8.py

@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Any, Optional
+from typing import Any, List, Optional
 
 import torch
 from torch.nn import Module
python/sglang/srt/mem_cache/memory_pool.py

@@ -47,6 +47,8 @@ from sglang.srt.utils import get_bool_env_var, is_cuda, is_npu, next_power_of_2
 if TYPE_CHECKING:
     from sglang.srt.managers.cache_controller import LayerDoneCounter
+    from sglang.srt.managers.schedule_batch import Req
+
 logger = logging.getLogger(__name__)

@@ -341,7 +343,7 @@ class HybridReqToTokenPool(ReqToTokenPool):
     # For chunk prefill req, we do not need to allocate mamba cache,
     # We could use allocated mamba cache instead.
     def alloc(
-        self, need_size: int, reqs: Optional[List["Req"]] = None
+        self, need_size: int, reqs: Optional[List[Req]] = None
     ) -> Optional[List[int]]:
         select_index = super().alloc(need_size)
         if select_index == None:
python/sglang/srt/model_loader/weight_utils.py

@@ -110,6 +110,9 @@ def convert_bin_to_safetensor_file(
     dirname = os.path.dirname(sf_filename)
     os.makedirs(dirname, exist_ok=True)
+
+    from safetensors.torch import save_file
+
     save_file(loaded, sf_filename, metadata={"format": "pt"})
 
     # check file size
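Here the fix imports `save_file` inside the function body just before its only use; previously the name was never imported at all, an F821. A function-local import defers the dependency to call time, so modules that never call this helper never pay for (or require) safetensors. A generic sketch of the deferred-import shape, using the standard library's `json` as a stand-in:

```python
def dump_metadata(obj, path):
    # Deferred import: the dependency is resolved when the function runs,
    # not when the module is imported.
    import json  # stand-in for a heavier optional dependency

    with open(path, "w", encoding="utf-8") as fh:
        json.dump(obj, fh)


dump_metadata({"format": "pt"}, "meta.json")
```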
python/sglang/srt/models/deepseek_v2.py

@@ -25,6 +25,7 @@ from typing import Any, Dict, Iterable, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
+import tqdm
 from torch import nn
 from transformers import PretrainedConfig

@@ -3499,7 +3500,7 @@ class DeepseekV2ForCausalLM(nn.Module):
         # temporarily only support DeepSeek V3/R1
         weight_block_size = [128, 128]
 
-        for layer_id in trange(
+        for layer_id in tqdm.trange(
             self.config.num_hidden_layers + int(is_nextn),
             desc="quant attn to fp8 ue8m0",
         ):
python/sglang/srt/models/glm4v.py

@@ -9,6 +9,7 @@ from transformers.models.glm4v.configuration_glm4v import Glm4vConfig, Glm4vVisionConfig
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.attention import vision_utils
+from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
python/sglang/srt/models/opt.py

@@ -13,6 +13,7 @@
 # ==============================================================================
 """Inference-only OPT model compatible with HuggingFace weights."""
+import logging
 from collections.abc import Iterable
 from typing import Optional, Union

@@ -46,6 +47,9 @@ from sglang.srt.model_loader.weight_utils import (
     kv_cache_scales_loader,
 )
 from sglang.srt.utils import add_prefix, make_layers
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
 
 
 def get_activation(name="relu"):
python/sglang/srt/utils/common.py

@@ -42,6 +42,7 @@ import tempfile
 import threading
 import time
 import traceback
+import types
 import uuid
 import warnings
 from collections import OrderedDict, defaultdict

@@ -55,6 +56,7 @@ from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,

@@ -62,6 +64,7 @@ from typing import (
     List,
     Optional,
     Protocol,
+    Sequence,
     Set,
     Tuple,
     TypeVar,

@@ -91,6 +94,9 @@ from typing_extensions import Literal
 from sglang.srt.environ import envs
 from sglang.srt.metrics.func_timer import enable_func_timer
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizeMethodBase
+
 logger = logging.getLogger(__name__)
 
 show_time_cost = False

@@ -1076,7 +1082,7 @@ def monkey_patch_vllm_gguf_config():
     def get_quant_method_with_embedding_replaced(
         self, layer: torch.nn.Module, prefix: str
-    ) -> Optional["QuantizeMethodBase"]:
+    ) -> Optional[QuantizeMethodBase]:
         if isinstance(layer, LinearBase):
             return GGUFLinearMethod(self)
         elif isinstance(layer, VocabParallelEmbedding):

@@ -1956,7 +1962,9 @@ def direct_register_custom_op(
         if fake_impl is not None:
             my_lib._register_fake(op_name, fake_impl)
     except RuntimeError as error:
-        if "Tried to register an operator" in str(e) and "multiple times" in str(e):
+        if "Tried to register an operator" in str(error) and "multiple times" in str(
+            error
+        ):
             # Silently ignore duplicate registration errors
             # This can happen in multi-engine scenarios
             pass
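The last hunk fixes a real bug that F821 surfaced: the handler binds the exception as `error`, but the body inspected `str(e)`, so any RuntimeError reaching that branch would have died with a NameError instead of being filtered. A minimal sketch of the corrected shape, with generic names rather than sglang's registration code:

```python
def register_once(registry, name, value):
    try:
        if name in registry:
            raise RuntimeError(f"Tried to register an operator {name} multiple times")
        registry[name] = value
    except RuntimeError as error:
        # The bound name and the name used in the body must match; the old
        # code bound `error` but inspected `e`, an F821 undefined name.
        if "Tried to register an operator" in str(error) and "multiple times" in str(
            error
        ):
            pass  # silently ignore duplicate registrations
        else:
            raise


ops = {}
register_once(ops, "add", sum)
register_once(ops, "add", sum)  # duplicate is swallowed, no exception escapes
print(sorted(ops))  # ['add']
```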
python/sglang/test/few_shot_gsm8k_engine.py

@@ -3,6 +3,7 @@ import ast
 import asyncio
 import re
 import time
+from typing import Optional
 
 import numpy as np