Commit 73600673 (unverified)
Clean logs for DeepSeek-V3 launching (#6079)
Authored by Baizhou Zhang on May 07, 2025; committed by GitHub on May 07, 2025
Parent: 8f508cc7
Showing 7 changed files with 61 additions and 39 deletions (+61, -39)
python/sglang/srt/distributed/device_communicators/pynccl.py (+2, -1)
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py (+4, -1)
python/sglang/srt/layers/quantization/fp8.py (+2, -4)
python/sglang/srt/layers/quantization/fp8_kernel.py (+4, -3)
python/sglang/srt/model_executor/model_runner.py (+35, -26)
python/sglang/srt/models/deepseek_v2.py (+7, -4)
python/sglang/srt/utils.py (+7, -0)
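Note: the common thread across these files is that launch-time messages previously fired once per tensor-parallel rank, so a multi-GPU DeepSeek-V3 launch repeated the same line for every GPU process. The toy reproduction below is not sglang code; it uses multiprocessing to stand in for a tensor-parallel launch and shows where the duplicate lines come from.

import logging
import multiprocessing as mp

def init_worker(rank: int) -> None:
    # Every "rank" runs the same initialization path, so an unguarded logger.info
    # prints once per process and the launch log fills with identical lines.
    logging.basicConfig(level=logging.INFO, format=f"[rank {rank}] %(message)s")
    logging.getLogger(__name__).info("Attention backend not set. Use fa3 backend by default.")

if __name__ == "__main__":
    workers = [mp.Process(target=init_worker, args=(r,)) for r in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()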
python/sglang/srt/distributed/device_communicators/pynccl.py

@@ -75,7 +75,8 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
             # get the unique id from NCCL
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()

@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
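Note: the old call site above used logging's lazy %s formatting, while log_info_on_rank0 takes a single pre-formatted message, so converted call sites switch to f-strings. A minimal standalone sketch of the two styles follows; the rank check is stubbed to a constant so it runs outside a distributed launch.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sglang.example")
config_file_path = "/tmp/fused_moe_config.json"  # placeholder path for this sketch

def log_info_on_rank0(logger, msg):
    # Stub of the helper added in this commit; the real one checks
    # sglang.srt.distributed.get_tensor_model_parallel_rank() == 0.
    rank = 0
    if rank == 0:
        logger.info(msg)

# Before: lazy %-style formatting, emitted by every rank.
logger.info("Using MoE kernel config from %s.", config_file_path)

# After: message pre-formatted with an f-string, emitted by rank 0 only.
log_info_on_rank0(logger, f"Using MoE kernel config from {config_file_path}.")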
python/sglang/srt/layers/quantization/fp8.py

@@ -66,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )

@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
python/sglang/srt/layers/quantization/fp8_kernel.py

@@ -30,6 +30,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )

@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
python/sglang/srt/model_executor/model_runner.py

@@ -278,9 +278,10 @@ class ModelRunner:
                 server_args.attention_backend = "fa3"
             else:
                 server_args.attention_backend = "triton"
-            logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                )
         elif self.use_mla_backend:
             if server_args.device != "cpu":
                 if server_args.attention_backend in [

@@ -290,9 +291,10 @@ class ModelRunner:
                     "flashmla",
                     "cutlass_mla",
                 ]:
-                    logger.info(
-                        f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                    )
+                    if self.should_log:
+                        logger.info(
+                            f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                        )
                 else:
                     raise ValueError(
                         f"Invalid attention backend for MLA: {server_args.attention_backend}"

@@ -311,9 +313,10 @@ class ModelRunner:
                 server_args.attention_backend = "triton"
 
         if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )
             server_args.attention_backend = "triton"
             server_args.disable_cuda_graph = True
             if server_args.ds_heavy_channel_type is None:

@@ -324,23 +327,26 @@ class ModelRunner:
         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
-            )
-            logger.info(
-                "Automatically turn off --chunked-prefill-size for multimodal model."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
+                )
+                logger.info(
+                    "Automatically turn off --chunked-prefill-size for multimodal model."
+                )
             server_args.chunked_prefill_size = -1
 
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True
 
         if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")
 
     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")

@@ -433,9 +439,10 @@ class ModelRunner:
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )
                 self.server_args.dtype = "float16"
                 self.model_config.dtype = torch.float16
                 if torch.cuda.get_device_capability()[1] < 5:

@@ -471,10 +478,11 @@ class ModelRunner:
                 self.model.load_kv_cache_scales(
                     self.server_args.quantization_param_path
                 )
-                logger.info(
-                    "Loaded KV cache scaling factors from %s",
-                    self.server_args.quantization_param_path,
-                )
+                if self.should_log:
+                    logger.info(
+                        "Loaded KV cache scaling factors from %s",
+                        self.server_args.quantization_param_path,
+                    )
             else:
                 raise RuntimeError(
                     "Using FP8 KV cache and scaling factors provided but "

@@ -1021,7 +1029,8 @@ class ModelRunner:
         )
 
     def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
         from sglang.srt.model_parallel import tensor_parallel
 
         device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
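Note: model_runner.py gates its messages on self.should_log rather than the new helper. The flag's definition is not part of this diff; the sketch below is a hypothetical illustration of how such a flag is commonly derived from the tensor-parallel rank, not the actual sglang implementation.

import logging

class ModelRunnerSketch:
    # Hypothetical stand-in for sglang's ModelRunner; the real should_log is defined elsewhere.
    def __init__(self, tp_rank: int):
        self.tp_rank = tp_rank
        # Only the first tensor-parallel rank reports launch-time configuration.
        self.should_log = tp_rank == 0

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sglang.example")
runner = ModelRunnerSketch(tp_rank=0)
if runner.should_log:
    logger.info("MLA optimization is turned on. Use triton backend.")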
python/sglang/srt/models/deepseek_v2.py

@@ -88,6 +88,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()

@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = 0
                 global_server_args_dict["n_share_experts_fusion"] = 0
-                logger.info(
-                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+                log_info_on_rank0(
+                    logger,
+                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                 )
             else:
                 assert (

@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = self.tp_size
                 global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-                logger.info(
-                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+                log_info_on_rank0(
+                    logger,
+                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
                )
 
     def get_input_embeddings(self) -> nn.Embedding:
python/sglang/srt/utils.py

@@ -2096,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
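Note: the helper imports get_tensor_model_parallel_rank lazily inside the function body, and that lookup presumably requires sglang's tensor-parallel group to be initialized. The behavioral sketch below therefore stubs the rank function, then simulates a non-zero rank with unittest.mock to show that only rank 0 emits the message.

import logging
from unittest import mock

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("sglang.example")

def get_tensor_model_parallel_rank() -> int:
    # Stand-in for sglang.srt.distributed.get_tensor_model_parallel_rank().
    return 0

def log_info_on_rank0(logger, msg):
    if get_tensor_model_parallel_rank() == 0:
        logger.info(msg)

log_info_on_rank0(logger, "Detected fp8 checkpoint.")  # rank 0: logged
with mock.patch(f"{__name__}.get_tensor_model_parallel_rank", return_value=1):
    log_info_on_rank0(logger, "suppressed on rank 1")  # non-zero rank: silent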