Unverified commit f4941906, authored Oct 12, 2025 by Liangsheng Yin; committed by GitHub on Oct 12, 2025.

Move args from `global_config` to `environ` (#11332)

Parent: 01e59e82
Showing 6 changed files with 34 additions and 46 deletions (+34, -46):
python/sglang/global_config.py  (+1, -22)
python/sglang/srt/environ.py  (+9, -0)
python/sglang/srt/layers/attention/flashinfer_backend.py  (+10, -10)
python/sglang/srt/layers/attention/flashinfer_mla_backend.py  (+2, -2)
python/sglang/srt/managers/schedule_batch.py  (+4, -4)
python/sglang/srt/managers/scheduler.py  (+8, -8)
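
The change follows the same pattern in every file: values that GlobalConfig.__init__ used to read from os.environ at import time are now declared as typed entries on the Envs class in sglang/srt/environ.py and accessed through envs.<NAME>.get(), or overridden in-process with .set(). A minimal sketch of the before/after call sites, using the retract-decode-steps value from the diffs below:

    # Before this commit: module-level singleton populated from os.environ at import time
    from sglang.global_config import global_config
    steps = global_config.retract_decode_steps  # plain int, default 20

    # After this commit: typed entry with a default, read through the envs accessor
    from sglang.srt.environ import envs
    steps = envs.SGLANG_RETRACT_DECODE_STEPS.get()  # default 20, overridable via the environment
    envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)  # programmatic override, as used below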
python/sglang/global_config.py

 """Global configurations"""

-import os
+# FIXME: deprecate this file and move all usage to sglang.srt.environ or sglang.__init__.py

 class GlobalConfig:
     ...
@@ -20,27 +20,6 @@ class GlobalConfig:
         # Default backend of the language
         self.default_backend = None

-        # Runtime constants: New generation token ratio estimation
-        self.default_init_new_token_ratio = float(
-            os.environ.get("SGLANG_INIT_NEW_TOKEN_RATIO", 0.7)
-        )
-        self.default_min_new_token_ratio_factor = float(
-            os.environ.get("SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR", 0.14)
-        )
-        self.default_new_token_ratio_decay_steps = float(
-            os.environ.get("SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS", 600)
-        )
-        self.torch_empty_cache_interval = float(
-            os.environ.get("SGLANG_EMPTY_CACHE_INTERVAL", -1)
-            # in seconds. Set if you observe high memory accumulation over a long serving period.
-        )
-
-        # Runtime constants: others
-        self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = int(
-            os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
-        )

         # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True
     ...
python/sglang/srt/environ.py

 ...
@@ -128,6 +128,14 @@ class Envs:
     SGLANG_SIMULATE_ACC_METHOD = EnvStr("multinomial")
     SGLANG_TORCH_PROFILER_DIR = EnvStr("/tmp")

+    # Scheduler: new token ratio hyperparameters
+    SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7)
+    SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = EnvFloat(0.14)
+    SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = EnvInt(600)
+    SGLANG_RETRACT_DECODE_STEPS = EnvInt(20)
+
+    # Scheduler: others:
+    SGLANG_EMPTY_CACHE_INTERVAL = EnvFloat(-1)  # in seconds. Set if you observe high memory accumulation over a long serving period.

     # Test: pd-disaggregation
     SGLANG_TEST_PD_DISAGG_BACKEND = EnvStr("mooncake")
     SGLANG_TEST_PD_DISAGG_DEVICES = EnvStr(None)
     ...
@@ -159,6 +167,7 @@ class Envs:
     # Flashinfer
     SGLANG_IS_FLASHINFER_AVAILABLE = EnvBool(True)
     SGLANG_ENABLE_FLASHINFER_GEMM = EnvBool(False)
+    SGLANG_FLASHINFER_WORKSPACE_SIZE = EnvInt(384 * 1024 * 1024)

     # Triton
     SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS = EnvBool(False)
     ...
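
The helpers EnvStr, EnvInt, EnvFloat, and EnvBool are defined elsewhere in environ.py and are not part of this diff. As a rough mental model only (a hypothetical sketch, not the actual sglang implementation), each entry can be thought of as a typed wrapper around a single environment variable with a default:

    import os

    class EnvFloat:
        # Hypothetical sketch; the real helper lives in python/sglang/srt/environ.py.
        def __init__(self, default):
            self.default = default
            self.name = None

        def __set_name__(self, owner, name):
            # Picks up the attribute name, e.g. "SGLANG_INIT_NEW_TOKEN_RATIO".
            self.name = name

        def get(self) -> float:
            raw = os.environ.get(self.name)
            return float(raw) if raw is not None else float(self.default)

        def set(self, value) -> None:
            os.environ[self.name] = str(value)

    class Envs:
        SGLANG_INIT_NEW_TOKEN_RATIO = EnvFloat(0.7)

    envs = Envs()
    print(envs.SGLANG_INIT_NEW_TOKEN_RATIO.get())  # 0.7 unless the variable is set in the environment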
python/sglang/srt/layers/attention/flashinfer_backend.py

 ...
@@ -16,13 +16,7 @@ from typing import TYPE_CHECKING, Callable, List, Optional, Union
 import torch

-if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
-    torch._logging.set_logs(dynamo=logging.ERROR)
-    torch._dynamo.config.suppress_errors = True
-logger = logging.getLogger(__name__)
-from sglang.global_config import global_config
+from sglang.srt.environ import envs
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_triton
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 ...
@@ -41,6 +35,12 @@ if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner

+logger = logging.getLogger(__name__)
+
+if envs.SGLANG_ENABLE_TORCH_COMPILE.get():
+    torch._logging.set_logs(dynamo=logging.ERROR)
+    torch._dynamo.config.suppress_errors = True
+
 if is_flashinfer_available():
     from flashinfer import (
     ...
@@ -160,7 +160,7 @@ class FlashInferAttnBackend(AttentionBackend):
             or "Qwen3ForCausalLM" in model_runner.model_config.hf_config.architectures
             or "MiMoForCausalLM" in model_runner.model_config.hf_config.architectures
         ):
-            global_config.flashinfer_workspace_size = 512 * 1024 * 1024
+            envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)

         # When deterministic inference is enabled, tensor cores should be used for decode
         # Also set split tile sizes for prefill and decode from environment variables, and disable kv split for cuda graph
     ...
@@ -180,13 +180,13 @@ class FlashInferAttnBackend(AttentionBackend):
                 "SGLANG_FLASHINFER_DECODE_SPLIT_TILE_SIZE", 2048
             )
             self.disable_cuda_graph_kv_split = True
-            global_config.flashinfer_workspace_size = 2048 * 1024 * 1024
+            envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(2048 * 1024 * 1024)

         # Allocate buffers
         global global_workspace_buffer
         if global_workspace_buffer is None:
             # different from flashinfer zero_init_global_workspace_buffer
-            global_workspace_size = global_config.flashinfer_workspace_size
+            global_workspace_size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get()
             global_workspace_buffer = torch.empty(
                 global_workspace_size,
                 dtype=torch.uint8,
     ...
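
Two details in this file are easy to miss. First, the controlling variable is renamed: the old global_config read FLASHINFER_WORKSPACE_SIZE, while the new entry is SGLANG_FLASHINFER_WORKSPACE_SIZE (same 384 * 1024 * 1024 default), so deployments that set the old name would presumably need to switch. Second, the backend still bumps the workspace size in-process with .set() (512 MiB for architectures such as Qwen3ForCausalLM/MiMoForCausalLM, 2048 MiB on the deterministic-inference path) before allocating the shared buffer. A condensed sketch of the resulting allocation flow (allocate_workspace is a hypothetical name; the real logic sits inside FlashInferAttnBackend.__init__ and caches a module-level buffer):

    import torch
    from sglang.srt.environ import envs

    def allocate_workspace(model_runner) -> torch.Tensor:
        archs = model_runner.model_config.hf_config.architectures
        if "Qwen3ForCausalLM" in archs or "MiMoForCausalLM" in archs:
            # Larger workspace for these architectures, as in the hunk above.
            envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.set(512 * 1024 * 1024)

        size = envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get()  # 384 MiB unless overridden
        return torch.empty(size, dtype=torch.uint8, device=model_runner.device)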
python/sglang/srt/layers/attention/flashinfer_mla_backend.py

 ...
@@ -22,7 +22,7 @@ if os.environ["SGLANG_ENABLE_TORCH_COMPILE"] == "1":
     torch._logging.set_logs(dynamo=logging.ERROR)
     torch._dynamo.config.suppress_errors = True

-from sglang.global_config import global_config
+from sglang.srt.environ import envs
 from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
 from sglang.srt.layers.attention.flashinfer_backend import (
     create_flashinfer_kv_indices_triton,
 ...
@@ -204,7 +204,7 @@ class FlashInferMLAAttnBackend(AttentionBackend):
         if global_workspace_buffer is None:
             # different from flashinfer zero_init_global_workspace_buffer
             global_workspace_buffer = torch.empty(
-                global_config.flashinfer_workspace_size,
+                envs.SGLANG_FLASHINFER_WORKSPACE_SIZE.get(),
                 dtype=torch.uint8,
                 device=model_runner.device,
             )
     ...
python/sglang/srt/managers/schedule_batch.py

 ...
@@ -37,7 +37,6 @@ import copy
 import dataclasses
 import logging
 import re
-import threading
 import time
 from enum import Enum, auto
 from http import HTTPStatus
 ...
@@ -47,7 +46,6 @@ from typing import TYPE_CHECKING, Any, List, Optional, Set, Tuple, Union
 import numpy as np
 import torch

-from sglang.global_config import global_config
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
 from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
 ...
@@ -55,6 +53,7 @@ from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
 )
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.environ import envs
 from sglang.srt.mem_cache.allocator import (
     BaseTokenToKVPoolAllocator,
     SWATokenToKVPoolAllocator,
 ...
@@ -1481,7 +1480,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         total_max_new_tokens = sum(r.sampling_params.max_new_tokens for r in self.reqs)
         new_estimate_ratio = (
-            total_decoded_tokens + global_config.retract_decode_steps * len(self.reqs)
+            total_decoded_tokens
+            + envs.SGLANG_RETRACT_DECODE_STEPS.get() * len(self.reqs)
         ) / total_max_new_tokens
         new_estimate_ratio = min(1.0, new_estimate_ratio)
 ...
@@ -1520,7 +1520,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
                 self.tree_cache.dec_lock_ref(req.last_node)

             # NOTE(lsyin): we should use the newly evictable memory instantly.
-            num_tokens = remaing_req_count * global_config.retract_decode_steps
+            num_tokens = remaing_req_count * envs.SGLANG_RETRACT_DECODE_STEPS.get()
             self._evict_tree_cache_if_needed(num_tokens)

             req.reset_for_retract()
     ...
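
The retraction budget that used to live in global_config.retract_decode_steps (20) now comes from SGLANG_RETRACT_DECODE_STEPS and feeds the new-token-ratio estimate above. A small worked example with made-up batch numbers, following the formula in the hunk:

    # Hypothetical batch: 8 requests, 1200 tokens decoded so far, 512 max_new_tokens each.
    retract_decode_steps = 20              # envs.SGLANG_RETRACT_DECODE_STEPS.get()
    total_decoded_tokens = 1200
    num_reqs = 8
    total_max_new_tokens = 512 * num_reqs  # 4096

    new_estimate_ratio = (
        total_decoded_tokens + retract_decode_steps * num_reqs
    ) / total_max_new_tokens               # (1200 + 160) / 4096 ≈ 0.332
    new_estimate_ratio = min(1.0, new_estimate_ratio)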
python/sglang/srt/managers/scheduler.py

 ...
@@ -35,7 +35,6 @@ from torch.cuda import Stream as CudaStream
 from torch.cuda import StreamContext as CudaStreamContext
 from torch.distributed import barrier

-from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import (
     INVALID_GRAMMAR_OBJ,
 ...
@@ -61,6 +60,7 @@ from sglang.srt.disaggregation.utils import (
     prepare_abort,
 )
 from sglang.srt.distributed import get_pp_group, get_world_group
+from sglang.srt.environ import envs
 from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 ...
@@ -556,18 +556,17 @@ class Scheduler(
             server_args.schedule_conservativeness >= 0
         ), "Invalid schedule_conservativeness"
         self.init_new_token_ratio = min(
-            global_config.default_init_new_token_ratio
+            envs.SGLANG_INIT_NEW_TOKEN_RATIO.get()
             * server_args.schedule_conservativeness,
             1.0,
         )
         self.min_new_token_ratio = min(
             self.init_new_token_ratio
-            * global_config.default_min_new_token_ratio_factor,
+            * envs.SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR.get(),
             1.0,
         )
         self.new_token_ratio_decay = (
             self.init_new_token_ratio - self.min_new_token_ratio
-        ) / global_config.default_new_token_ratio_decay_steps
+        ) / envs.SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS.get()
         self.new_token_ratio = self.init_new_token_ratio

         # Init watchdog thread
 ...
@@ -2897,12 +2896,13 @@ class IdleSleeper:
         for s in sockets:
             self.poller.register(s, zmq.POLLIN)
+        self.empty_cache_interval = envs.SGLANG_EMPTY_CACHE_INTERVAL.get()

     def maybe_sleep(self):
         self.poller.poll(1000)
         if (
-            global_config.torch_empty_cache_interval > 0
+            self.empty_cache_interval > 0
             and time.time() - self.last_empty_time
-            > global_config.torch_empty_cache_interval
+            > self.empty_cache_interval
         ):
             self.last_empty_time = time.time()
             torch.cuda.empty_cache()
     ...
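
With the defaults now declared in environ.py (SGLANG_INIT_NEW_TOKEN_RATIO = 0.7, SGLANG_MIN_NEW_TOKEN_RATIO_FACTOR = 0.14, SGLANG_NEW_TOKEN_RATIO_DECAY_STEPS = 600) and a hypothetical schedule_conservativeness of 1.0, the scheduler initialization above works out as follows:

    schedule_conservativeness = 1.0  # hypothetical server_args value

    init_new_token_ratio = min(0.7 * schedule_conservativeness, 1.0)   # 0.7
    min_new_token_ratio = min(init_new_token_ratio * 0.14, 1.0)        # 0.098
    new_token_ratio_decay = (init_new_token_ratio - min_new_token_ratio) / 600
    # ≈ 0.001 per step: the ratio relaxes from 0.7 toward 0.098 over 600 steps

The IdleSleeper change is mechanical: the interval is read once in the constructor from SGLANG_EMPTY_CACHE_INTERVAL (default -1, which disables the periodic torch.cuda.empty_cache() since the guard requires a value greater than 0) instead of consulting global_config.torch_empty_cache_interval on every poll.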