sglang · Commit c7d57d5b (unverified)

Fix CI and style (#12658)

Authored Nov 05, 2025 by Lianmin Zheng; committed by GitHub on Nov 05, 2025.
Parent: 80802c4c

Showing 11 changed files with 96 additions and 70 deletions (+96 −70).
python/sglang/srt/configs/kimi_linear.py (+2 −1)
python/sglang/srt/disaggregation/decode.py (+1 −0)
python/sglang/srt/distributed/device_communicators/pynccl_allocator.py (+17 −5)
python/sglang/srt/entrypoints/engine.py (+21 −23)
python/sglang/srt/layers/linear.py (+1 −0)
python/sglang/srt/managers/scheduler.py (+17 −23)
python/sglang/srt/managers/scheduler_output_processor_mixin.py (+0 −1)
python/sglang/srt/managers/tokenizer_manager.py (+5 −9)
python/sglang/srt/server_args.py (+27 −4)
python/sglang/test/test_utils.py (+3 −3)
test/srt/test_request_queue_validation.py (+2 −1)
python/sglang/srt/configs/kimi_linear.py

```diff
@@ -2,7 +2,6 @@
 from transformers.configuration_utils import PretrainedConfig

 from sglang.srt.configs.mamba_utils import KimiLinearCacheParams, KimiLinearStateShape
-from sglang.srt.layers.dp_attention import get_attention_tp_size


 class KimiLinearConfig(PretrainedConfig):
@@ -150,6 +149,8 @@ class KimiLinearConfig(PretrainedConfig):
     @property
     def mamba2_cache_params(self) -> KimiLinearCacheParams:
+        from sglang.srt.layers.dp_attention import get_attention_tp_size
+
         shape = KimiLinearStateShape.create(
             tp_world_size=get_attention_tp_size(),
             num_heads=self.linear_attn_config["num_heads"],
```
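The top-level get_attention_tp_size import is pushed down into the mamba2_cache_params property, the standard way to break a circular import between a config module and a layers module. A minimal runnable sketch of the pattern, with hypothetical module and function names:

```python
import sys
import types

# Stand-in for the other half of the import cycle; "layers_module" and
# "get_tp_size" are hypothetical names used only for this illustration.
layers_module = types.ModuleType("layers_module")
layers_module.get_tp_size = lambda: 4
sys.modules["layers_module"] = layers_module


class Config:
    @property
    def cache_params(self):
        # Deferred import: resolved on first property access instead of at
        # module load, so config -> layers -> config cycles cannot fail.
        from layers_module import get_tp_size

        return {"tp_world_size": get_tp_size()}


print(Config().cache_params)  # {'tp_world_size': 4}
```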
python/sglang/srt/disaggregation/decode.py

```diff
@@ -156,6 +156,7 @@ class HybridMambaDecodeReqToTokenPool(HybridReqToTokenPool):
             enable_memory_saver=enable_memory_saver,
             pre_alloc_size=pre_alloc_size,
         )
+        self.enable_memory_saver = enable_memory_saver
         self._init_mamba_pool(
             size + pre_alloc_size, cache_params, device, speculative_num_draft_tokens
         )
```
python/sglang/srt/distributed/device_communicators/pynccl_allocator.py

```diff
@@ -3,9 +3,14 @@ import tempfile
 from contextlib import nullcontext

 import torch
 import torch.utils.cpp_extension
+from packaging import version
 from torch.cuda.memory import CUDAPluggableAllocator

 from sglang.srt.distributed.parallel_state import GroupCoordinator
+from sglang.srt.server_args import get_global_server_args
+
+after_2_8_0 = version.parse(torch.__version__) >= version.parse("2.8.0")

 nccl_allocator_source = """
@@ -60,9 +65,6 @@ _cur_device = None
 def is_symmetric_memory_enabled():
-    # Import here to avoid circular import
-    from sglang.srt.server_args import get_global_server_args
-
     return get_global_server_args().enable_symm_mem
@@ -123,7 +125,12 @@ class SymmetricMemoryContext:
             _graph_pool_id is not None
         ), "graph_pool_id is not set under graph capture"
         # Pause graph memory pool to use symmetric memory with cuda graph
-        torch._C._cuda_endAllocateToPool(_cur_device, _graph_pool_id)
+        if after_2_8_0:
+            torch._C._cuda_endAllocateToPool(_cur_device, _graph_pool_id)
+        else:
+            torch._C._cuda_endAllocateCurrentStreamToPool(_cur_device, _graph_pool_id)
         self._mem_pool_ctx.__enter__()
@@ -137,7 +144,12 @@ class SymmetricMemoryContext:
         self._mem_pool_ctx.__exit__(exc_type, exc_val, exc_tb)
         if self.is_graph_capture:
-            torch._C._cuda_beginAllocateCurrentThreadToPool(_cur_device, _graph_pool_id)
+            if after_2_8_0:
+                torch._C._cuda_beginAllocateCurrentThreadToPool(_cur_device, _graph_pool_id)
+            else:
+                torch._C._cuda_beginAllocateToPool(_cur_device, _graph_pool_id)


 def use_symmetric_memory(group_coordinator: GroupCoordinator, disabled: bool = False):
```
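The gate mirrors a rename of torch's private allocator hooks around release 2.8.0 and is evaluated once at import time. A hedged sketch of the same dispatch pattern, reusing the hook names that appear in the diff (these are private torch._C internals and may change again):

```python
import torch
from packaging import version

# Compute the version gate once, at import time, as the diff does.
after_2_8_0 = version.parse(torch.__version__) >= version.parse("2.8.0")


def end_allocate_to_pool(device: int, pool_id) -> None:
    """Dispatch to whichever private allocator hook this torch build exposes.

    Sketch only: torch._C hooks are internal and unversioned, which is exactly
    why the commit wraps them in a version check instead of calling one name.
    """
    if after_2_8_0:
        torch._C._cuda_endAllocateToPool(device, pool_id)
    else:
        torch._C._cuda_endAllocateCurrentStreamToPool(device, pool_id)
```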
python/sglang/srt/entrypoints/engine.py

```diff
@@ -31,8 +31,6 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
 import zmq

-from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
-
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -67,6 +65,7 @@ from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 from sglang.srt.utils import (
     MultiprocessingSerializer,
     assert_pkg_version,
@@ -513,6 +512,21 @@ class Engine(EngineBase):
             self.tokenizer_manager.update_weights_from_disk(obj, None)
         )

+    def update_weights_from_ipc(
+        self,
+        zmq_handles: Dict[str, str],
+        flush_cache: bool = True,
+    ):
+        """Update weights from IPC for checkpoint-engine integration."""
+        obj = UpdateWeightsFromIPCReqInput(
+            zmq_handles=zmq_handles,
+            flush_cache=flush_cache,
+        )
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_ipc(obj, None)
+        )
+
     def get_weights_by_name(self, name: str, truncate_size: int = 100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -658,21 +672,6 @@ class Engine(EngineBase):
             request=None,
         )

-    def update_weights_from_ipc(
-        self,
-        zmq_handles: Dict[str, str],
-        flush_cache: bool = True,
-    ):
-        """Update weights from IPC for checkpoint-engine integration."""
-        obj = UpdateWeightsFromIPCReqInput(
-            zmq_handles=zmq_handles,
-            flush_cache=flush_cache,
-        )
-        loop = asyncio.get_event_loop()
-        return loop.run_until_complete(
-            self.tokenizer_manager.update_weights_from_ipc(obj, None)
-        )
-

 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
@@ -881,14 +880,14 @@ def _launch_subprocesses(
     detoken_proc.start()

     # Init tokenizer manager first, as the bootstrap server is initialized here
-    if server_args.tokenizer_worker_num > 1:
-        # Launch multi-tokenizer router
-        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
-        template_manager = None
-    else:
+    if server_args.tokenizer_worker_num == 1:
         tokenizer_manager, template_manager = _init_tokenizer_manager(
             server_args, port_args
         )
+    else:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None

     # Wait for the model to finish loading
     scheduler_infos = []
@@ -911,7 +910,6 @@ def _launch_subprocesses(
     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
-
     return tokenizer_manager, template_manager, scheduler_info, port_args
```
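update_weights_from_ipc itself is unchanged; it only moves up next to update_weights_from_disk so the weight-update entry points sit together. A hedged usage sketch, assuming an already-initialized Engine and an external checkpoint engine that has published ZMQ IPC endpoints (the handle map below is a made-up placeholder, not a documented format):

```python
# Assumes `engine` is a constructed sglang.srt.entrypoints.engine.Engine.
# The key/endpoint layout of zmq_handles comes from the checkpoint-engine
# integration; this mapping is illustrative only.
zmq_handles = {"rank_0": "ipc:///tmp/checkpoint_engine_rank0.sock"}

# Blocks on the event loop until the tokenizer manager reports the IPC
# update finished; flush_cache=True additionally requests a cache flush
# after the update (per the flag name).
engine.update_weights_from_ipc(zmq_handles, flush_cache=True)
```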
python/sglang/srt/layers/linear.py

```diff
@@ -162,6 +162,7 @@ class LinearBase(torch.nn.Module):
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
+        self.quant_config = quant_config
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod()
         else:
```
python/sglang/srt/managers/scheduler.py

```diff
@@ -269,10 +269,11 @@ class Scheduler(
             server_args.speculative_algorithm
         )
         self.gpu_id = gpu_id
-        self.page_size = server_args.page_size
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.enable_hicache_storage = server_args.hicache_storage_backend is not None
+        self.page_size = server_args.page_size
+
         # Distributed rank info
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
                 server_args.enable_dp_attention,
@@ -298,22 +299,12 @@ class Scheduler(
         # Init moe config
         self.init_moe_config()

-        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
-        if self.server_args.reasoning_parser and self.tokenizer:
-            reasoning_parser = ReasoningParser(
-                model_type=self.server_args.reasoning_parser, stream_reasoning=False
-            )
-            self.tokenizer.think_end_id = self.tokenizer.encode(
-                reasoning_parser.detector.think_end_token, add_special_tokens=False
-            )[0]
-
         # Check whether overlap can be enabled
         if not self.is_generation:
             self.enable_overlap = False
             logger.info("Overlap scheduler is disabled for embedding models.")

         # Launch a tensor parallel worker
         from sglang.srt.managers.tp_worker import TpModelWorker

         self.tp_worker = TpModelWorker(
@@ -327,7 +318,6 @@ class Scheduler(
         )

         # Launch a draft worker for speculative decoding
-
         draft_worker_kwargs = dict(
             gpu_id=gpu_id,
             tp_rank=tp_rank,
@@ -481,10 +471,6 @@ class Scheduler(
         )
         # Enable preemption for priority scheduling.
         self.try_preemption = self.enable_priority_scheduling

-        assert (
-            server_args.schedule_conservativeness >= 0
-        ), "Invalid schedule_conservativeness"
-
         self.init_new_token_ratio = min(
             envs.SGLANG_INIT_NEW_TOKEN_RATIO.get()
             * server_args.schedule_conservativeness,
@@ -511,7 +497,6 @@ class Scheduler(
         )
         self.offload_tags = set()
         self.init_profiler()
-
         self.recv_skipper = SchedulerRecvSkipper.maybe_create(server_args)
         self.input_blocker = (
             SchedulerInputBlocker(noop=self.attn_tp_rank != 0)
@@ -519,18 +504,15 @@ class Scheduler(
             else None
         )

+        # Init disaggregation
+        self.init_disaggregation()
+
         # Init metrics stats
         self.init_metrics(tp_rank, pp_rank, dp_rank)
         if self.enable_kv_cache_events:
             self.init_kv_events(server_args.kv_events_config)

-        # Init disaggregation
-        self.disaggregation_mode = DisaggregationMode(
-            self.server_args.disaggregation_mode
-        )
-        self.init_disaggregation()
-
         if envs.SGLANG_LOG_GC.get():
             configure_gc_logger()
@@ -695,6 +677,15 @@ class Scheduler(
             revision=server_args.revision,
         )

+        # Set reasoning_parser and think_end_id if --reasoning_parser is enabled
+        if self.server_args.reasoning_parser and self.tokenizer:
+            reasoning_parser = ReasoningParser(
+                model_type=self.server_args.reasoning_parser, stream_reasoning=False
+            )
+            self.tokenizer.think_end_id = self.tokenizer.encode(
+                reasoning_parser.detector.think_end_token, add_special_tokens=False
+            )[0]
+
     def init_memory_pool_and_cache(self):
         server_args = self.server_args
@@ -835,6 +826,9 @@ class Scheduler(
         init_embedding_cache(embedding_cache_size * 1024 * 1024)

     def init_disaggregation(self):
+        self.disaggregation_mode = DisaggregationMode(
+            self.server_args.disaggregation_mode
+        )
         self.transfer_backend = TransferBackend(
             self.server_args.disaggregation_transfer_backend
         )
```
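The reasoning-parser block moves out of the constructor and into the method that sets up the tokenizer, since it writes think_end_id onto self.tokenizer and therefore must run after the tokenizer exists. What it computes is just the token id of the parser's end-of-thinking marker; a standalone sketch (the model name and marker here are examples, not what the scheduler hard-codes):

```python
from transformers import AutoTokenizer

# Example model whose vocabulary contains an end-of-thinking marker; the
# real marker comes from ReasoningParser(...).detector.think_end_token.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
think_end_token = "</think>"

# add_special_tokens=False keeps BOS/EOS out of the encoding; taking [0]
# assumes the marker maps to a single vocabulary entry.
think_end_id = tokenizer.encode(think_end_token, add_special_tokens=False)[0]
print(think_end_token, "->", think_end_id)
```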
python/sglang/srt/managers/scheduler_output_processor_mixin.py

```diff
@@ -858,7 +858,6 @@ class SchedulerOutputProcessorMixin:
                 prompt_tokens.append(len(req.origin_input_ids))
                 completion_tokens.append(len(output_ids_))
                 cached_tokens.append(req.cached_tokens)
-                retraction_counts.append(req.retraction_count)

                 if not self.spec_algorithm.is_none():
```
python/sglang/srt/managers/tokenizer_manager.py

```diff
@@ -196,9 +196,9 @@ class TokenizerManager(TokenizerCommunicatorMixin):
             else server_args.speculative_num_draft_tokens
         )

-        # Initialize tokenizer and processor
         set_global_server_args_for_tokenizer(server_args)
+        # Initialize tokenizer and processor
         if self.model_config.is_multimodal:
             import_processors("sglang.srt.multimodal.processors")
             try:
@@ -370,6 +370,7 @@ class TokenizerManager(TokenizerCommunicatorMixin):
         if self.server_args.gc_warning_threshold_secs > 0.0:
             configure_gc_warning(self.server_args.gc_warning_threshold_secs)

+        # Dispatcher and communicators
         self._result_dispatcher = TypeBasedDispatcher(
             [
                 (
@@ -387,15 +388,11 @@ class TokenizerManager(TokenizerCommunicatorMixin):
                 (
                     UpdateWeightFromDiskReqOutput,
                     self._handle_update_weights_from_disk_req_output,
                 ),
-                (
-                    FreezeGCReq,
-                    lambda x: None,
-                ),
+                (FreezeGCReq, lambda x: None),
                 # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
                 (HealthCheckOutput, lambda x: None),
             ]
         )
-
         self.init_communicators(server_args)

     async def generate_request(
@@ -407,9 +404,8 @@ class TokenizerManager(TokenizerCommunicatorMixin):
         self.auto_create_handle_loop()
         obj.normalize_batch_and_arguments()

-        if request:
-            if "trace_context" in request.headers:
-                trace_set_remote_propagate_context(request.headers["trace_context"])
+        if request and "trace_context" in request.headers:
+            trace_set_remote_propagate_context(request.headers["trace_context"])

         if self.server_args.tokenizer_worker_num > 1:
             self._attach_multi_http_worker_info(obj)
```
python/sglang/srt/server_args.py

```diff
@@ -58,6 +58,7 @@ from sglang.srt.utils.common import (
     json_list_type,
     nullable_str,
     parse_connector_type,
+    wait_port_available,
     xpu_has_xmx_support,
 )
 from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
@@ -3763,6 +3764,10 @@ class ServerArgs:
                 "Please set --chunked-prefill-size -1 when using --multi-item-scoring-delimiter."
             )

+        assert (
+            self.schedule_conservativeness >= 0
+        ), "schedule_conservativeness must be non-negative"
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -3956,9 +3961,7 @@ def set_global_server_args_for_scheduler(server_args: ServerArgs):
     _global_server_args = server_args


-def set_global_server_args_for_tokenizer(server_args: ServerArgs):
-    global _global_server_args
-    _global_server_args = server_args
+set_global_server_args_for_tokenizer = set_global_server_args_for_scheduler


 def get_global_server_args() -> ServerArgs:
@@ -4082,7 +4085,8 @@ class PortArgs:
             ), "please provide --dist-init-addr as host:port of head node"
             dist_init_host, dist_init_port = dist_init_addr
-            port_base = int(dist_init_port) + 1
+            dist_init_port = int(dist_init_port)
+            port_base = dist_init_port + 1
             detokenizer_port = port_base + 1
             rpc_port = port_base + 2
             metrics_ipc_name = port_base + 3
@@ -4092,6 +4096,25 @@ class PortArgs:
             else:
                 assert worker_ports is not None
                 scheduler_input_port = worker_ports[dp_rank]

+            try:
+                if dp_rank is None:
+                    wait_port_available(dist_init_port, "dist_init_port")
+                wait_port_available(port_base, "port_base")
+                wait_port_available(detokenizer_port, "detokenizer_port")
+                wait_port_available(nccl_port, "nccl_port")
+                wait_port_available(rpc_port, "rpc_port")
+                wait_port_available(metrics_ipc_name, "metrics_ipc_name")
+                # Check scheduler_input_port only for dp.
+                # Skip check when using worker_ports since the port is already bound by our ZMQ socket
+                if dp_rank is None or worker_ports is None:
+                    wait_port_available(scheduler_input_port, "scheduler_input_port")
+            except ValueError as e:
+                logger.exception(
+                    f"Port is already in use. {dist_init_port=} {port_base=} "
+                    f"{detokenizer_port=} {nccl_port=} {scheduler_input_port=}"
+                )
+                raise
+
             return PortArgs(
                 tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
                 scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
```
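wait_port_available is an existing sglang helper (note the new entry in the import block above); its implementation is not part of this diff. A hedged stand-in showing the kind of check the call sites rely on, raising the ValueError that the except clause catches:

```python
import socket
import time


def wait_port_available(port: int, name: str, timeout_s: float = 10.0) -> None:
    """Hypothetical stand-in for sglang's helper of the same name: poll until
    a TCP port can be bound, raising ValueError (as caught above) on timeout."""
    deadline = time.monotonic() + timeout_s
    while True:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                sock.bind(("127.0.0.1", port))
                return  # bind succeeded, so the port is currently free
            except OSError:
                if time.monotonic() >= deadline:
                    raise ValueError(f"{name} (port {port}) is already in use")
                time.sleep(0.5)
```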
python/sglang/test/test_utils.py

```diff
@@ -1557,7 +1557,7 @@ def send_generate_requests(base_url: str, num_requests: int) -> List[str]:
             "text": prompt,
             "sampling_params": {
                 "temperature": 0,
-                "max_new_tokens": 50,
+                "max_new_tokens": 500,
             },
         },
     )
@@ -1584,7 +1584,7 @@ async def send_concurrent_generate_requests(
             "text": prompt,
             "sampling_params": {
                 "temperature": 0,
-                "max_new_tokens": 50,
+                "max_new_tokens": 500,
             },
         },
     ) as response:
@@ -1608,7 +1608,7 @@ async def send_concurrent_generate_requests_with_custom_params(
             """,
             "sampling_params": {
                 "temperature": 0,
-                "max_new_tokens": 50,
+                "max_new_tokens": 500,
             },
         }
```
test/srt/test_request_queue_validation.py

```diff
@@ -2,7 +2,6 @@ import asyncio
 import os
 import re
 import unittest
-from concurrent.futures import ThreadPoolExecutor

 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
@@ -37,6 +36,8 @@ class TestMaxQueuedRequests(CustomTestCase):
                 "1",
                 "--max-queued-requests",
                 # Enforce max queued request number is 1
                 "1",
+                "--attention-backend",
+                "triton",
             ),
             return_stdout_stderr=(cls.stdout, cls.stderr),
         )
```