Commit 73600673 (unverified)
Authored May 07, 2025 by Baizhou Zhang; committed by GitHub, May 07, 2025

Clean logs for DeepSeek-V3 launching (#6079)

Parent: 8f508cc7
Showing 7 changed files with 61 additions and 39 deletions

  python/sglang/srt/distributed/device_communicators/pynccl.py   +2 -1
  python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py     +4 -1
  python/sglang/srt/layers/quantization/fp8.py                   +2 -4
  python/sglang/srt/layers/quantization/fp8_kernel.py            +4 -3
  python/sglang/srt/model_executor/model_runner.py               +35 -26
  python/sglang/srt/models/deepseek_v2.py                        +7 -4
  python/sglang/srt/utils.py                                     +7 -0
python/sglang/srt/distributed/device_communicators/pynccl.py

@@ -75,6 +75,7 @@ class PyNcclCommunicator:
         self.available = True
         self.disabled = False
 
-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
 
         if self.rank == 0:
python/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py

@@ -29,6 +29,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()

@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
python/sglang/srt/layers/quantization/fp8.py

@@ -66,6 +66,7 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )

@@ -104,10 +105,7 @@ class Fp8Config(QuantizationConfig):
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
python/sglang/srt/layers/quantization/fp8_kernel.py

@@ -30,6 +30,7 @@ from sglang.srt.utils import (
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )

@@ -698,9 +699,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
python/sglang/srt/model_executor/model_runner.py

@@ -278,6 +278,7 @@ class ModelRunner:
                 server_args.attention_backend = "fa3"
             else:
                 server_args.attention_backend = "triton"
-            logger.info(
-                f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                )

@@ -290,6 +291,7 @@ class ModelRunner:
             "flashmla",
             "cutlass_mla",
         ]:
-            logger.info(
-                f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-            )
+            if self.should_log:
+                logger.info(
+                    f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                )

@@ -311,6 +313,7 @@ class ModelRunner:
            server_args.attention_backend = "triton"
 
        if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )

@@ -324,6 +327,7 @@ class ModelRunner:
        if self.is_multimodal:
            self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
+                )

@@ -336,10 +340,12 @@ class ModelRunner:
        if not self.use_mla_backend:
            server_args.disable_chunked_prefix_cache = True
        elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
            server_args.disable_chunked_prefix_cache = True
 
        if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")
 
    def init_torch_distributed(self):

@@ -433,6 +439,7 @@ class ModelRunner:
        torch.set_num_threads(1)
        if self.device == "cuda":
            if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )

@@ -471,6 +478,7 @@ class ModelRunner:
            self.model.load_kv_cache_scales(self.server_args.quantization_param_path)
-            logger.info(
-                "Loaded KV cache scaling factors from %s",
-                self.server_args.quantization_param_path,
-            )
+            if self.should_log:
+                logger.info(
+                    "Loaded KV cache scaling factors from %s",
+                    self.server_args.quantization_param_path,
+                )

@@ -1021,6 +1029,7 @@ class ModelRunner:
            )
 
    def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
        from sglang.srt.model_parallel import tensor_parallel
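Every model_runner.py hunk above applies the same guard: the existing logger.info call is nested under "if self.should_log:", so only one process emits the message. The should_log flag itself is defined elsewhere in model_runner.py and is not part of this diff; the sketch below is a self-contained illustration of the pattern and merely assumes a rank-0-only flag derived from the torchrun-style RANK environment variable.

import logging
import os

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ModelRunnerSketch:
    """Illustrative stand-in for sglang's ModelRunner, not the real class."""

    def __init__(self) -> None:
        # Assumption for illustration only: log from tensor-parallel rank 0.
        # The real definition of should_log lives outside this diff.
        self.tp_rank = int(os.environ.get("RANK", "0"))
        self.should_log = self.tp_rank == 0
        self.mem_fraction_static = 0.95

    def adjust_for_multimodal(self) -> None:
        self.mem_fraction_static *= 0.90
        if self.should_log:
            logger.info(
                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
                f"because this is a multimodal model."
            )


if __name__ == "__main__":
    # With RANK=0 (or RANK unset) this prints once; with RANK=1 it stays silent.
    ModelRunnerSketch().adjust_for_multimodal()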
python/sglang/srt/models/deepseek_v2.py

@@ -88,6 +88,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )
 
 _is_hip = is_hip()

@@ -1485,8 +1486,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = 0
                 global_server_args_dict["n_share_experts_fusion"] = 0
-                logger.info(
-                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+                log_info_on_rank0(
+                    logger,
+                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                 )
             else:
                 assert (

@@ -1501,8 +1503,9 @@ class DeepseekV2ForCausalLM(nn.Module):
             ):
                 self.n_share_experts_fusion = self.tp_size
                 global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-                logger.info(
-                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+                log_info_on_rank0(
+                    logger,
+                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
                 )
 
     def get_input_embeddings(self) -> nn.Embedding:
python/sglang/srt/utils.py

@@ -2096,3 +2096,10 @@ class BumpAllocator:
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
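With the helper in place, call sites simply pass their module logger and a message, as in the fused_moe.py and fp8_kernel.py hunks above. A usage sketch follows; it assumes an sglang environment where tensor parallelism has been initialized, and the config path is a placeholder used only for illustration.

import logging

from sglang.srt.utils import log_info_on_rank0

logger = logging.getLogger(__name__)

# Placeholder path for illustration; in sglang the real value comes from the
# kernel-config lookup in get_moe_configs() / get_w8a8_block_fp8_configs().
config_file_path = "/path/to/moe_config.json"

# Emitted once by tensor-parallel rank 0 instead of once per GPU process.
log_info_on_rank0(logger, f"Using MoE kernel config from {config_file_path}.")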