change / sglang · Commit 6c88f6c8 (unverified)
Authored Aug 01, 2025 by Cheng Wan; committed by GitHub on Aug 01, 2025

[5/N] MoE Refactor: Update MoE parallelism arguments (#8658)

Parent: c8d3a402

Showing 18 changed files with 181 additions and 168 deletions (+181, -168)
python/sglang/srt/models/grok.py                   +3   -3
python/sglang/srt/models/mixtral.py                +3   -3
python/sglang/srt/models/qwen2_moe.py              +1   -4
python/sglang/srt/models/qwen3_moe.py              +7   -8
python/sglang/srt/models/step3_vl.py               +1   -1
python/sglang/srt/operations_strategy.py           +1   -1
python/sglang/srt/server_args.py                   +47  -20
python/sglang/srt/two_batch_overlap.py             +5   -4
python/sglang/srt/utils.py                         +2   -23
python/sglang/test/runners.py                      +0   -2
test/srt/test_deepep_large.py                      +4   -2
test/srt/test_deepep_small.py                      +14  -7
test/srt/test_eplb.py                              +3   -3
test/srt/test_hybrid_dp_ep_tp_mtp.py               +80  -80
test/srt/test_moe_deepep.py                        +4   -2
test/srt/test_moe_deepep_eval_accuracy_large.py    +2   -1
test/srt/test_moe_ep.py                            +0   -2
test/srt/test_two_batch_overlap.py                 +4   -2
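In short, the commit retires the boolean MoE flags in favor of explicit parallelism arguments. Going by the deprecation warnings added in server_args.py further down, launch commands migrate as follows (the old flags are still parsed and mapped automatically, with a warning):

    --enable-ep-moe      ->  --ep-size <same value as --tp-size>
    --enable-deepep-moe  ->  --moe-a2a-backend deepep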
python/sglang/srt/models/grok.py

@@ -29,6 +29,7 @@ from torch import nn
 from transformers import PretrainedConfig
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
@@ -117,7 +118,7 @@ class Grok1MoE(nn.Module):
         )
         kwargs = {}
-        if global_server_args_dict["enable_ep_moe"]:
+        if get_moe_expert_parallel_world_size() > 1:
             MoEImpl = EPMoE
         else:
             MoEImpl = FusedMoE
@@ -616,8 +617,7 @@ class Grok1ForCausalLM(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
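The model files now derive the MoE implementation from the distributed state instead of reading a server flag. Below is a minimal sketch of the selection pattern used in the hunk above; the import paths for EPMoE and FusedMoE are assumptions, since only get_moe_expert_parallel_world_size is imported in this diff:

from sglang.srt.distributed import get_moe_expert_parallel_world_size
from sglang.srt.layers.moe.ep_moe.layer import EPMoE  # assumed path
from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE  # assumed path

# Expert parallelism now kicks in whenever the MoE expert-parallel group has more
# than one rank (i.e. --ep-size > 1); no boolean flag is consulted anymore.
MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE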
python/sglang/srt/models/mixtral.py

@@ -24,6 +24,7 @@ from torch import nn
 from transformers import MixtralConfig
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
@@ -94,7 +95,7 @@ class MixtralMoE(nn.Module):
             renormalize=True,
         )
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
+        MoEImpl = EPMoE if get_moe_expert_parallel_world_size() > 1 else FusedMoE
         self.experts = MoEImpl(
             num_experts=num_experts,
             top_k=top_k,
@@ -398,8 +399,7 @@ class MixtralForCausalLM(nn.Module):
         # Params for weights, fp8 weight scales, fp8 activation scales
         # (param_name, weight_name, expert_id, shard_id)
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="w1",
             ckpt_down_proj_name="w2",
             ckpt_up_proj_name="w3",
python/sglang/srt/models/qwen2_moe.py

@@ -148,7 +148,6 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -616,9 +615,7 @@ class Qwen2MoeForCausalLM(nn.Module):
             ("gate_up_proj", "up_proj", 1),
         ]
-        MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
-        expert_params_mapping = MoEImpl.make_expert_params_mapping(
+        expert_params_mapping = FusedMoE.make_expert_params_mapping(
             ckpt_gate_proj_name="gate_proj",
             ckpt_down_proj_name="down_proj",
             ckpt_up_proj_name="up_proj",
python/sglang/srt/models/qwen3_moe.py

@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_world_size,
     get_pp_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
@@ -51,7 +52,6 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
 from sglang.srt.layers.moe.ep_moe.layer import get_moe_impl_class
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -72,7 +72,7 @@ from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2_moe import Qwen2MoeMLP as Qwen3MoeMLP
 from sglang.srt.models.qwen2_moe import Qwen2MoeModel
 from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
-from sglang.srt.utils import DeepEPMode, add_prefix, is_cuda, is_non_idle_and_non_empty
+from sglang.srt.utils import add_prefix, is_cuda, is_non_idle_and_non_empty

 Qwen3MoeConfig = None
@@ -113,15 +113,14 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             quant_config=quant_config,
             prefix=add_prefix("experts", prefix),
             **(
-                dict(deepep_mode=DeepEPMode[global_server_args_dict["deepep_mode"]])
-                if global_server_args_dict["enable_deepep_moe"]
+                dict(deepep_mode=global_server_args_dict["deepep_mode"])
+                if global_server_args_dict["moe_a2a_backend"].is_deepep()
                 else {}
             ),
             # Additional args for FusedMoE
             **(
                 dict(
                     enable_flashinfer_cutlass_moe=True,
-                    enable_ep_moe=global_server_args_dict["enable_ep_moe"],
                 )
                 if global_server_args_dict["enable_flashinfer_cutlass_moe"]
                 else {}
@@ -136,9 +135,9 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             # TODO: we will support tp < ep in the future
-            self.ep_size = get_tensor_model_parallel_world_size()
+            self.ep_size = get_moe_expert_parallel_world_size()
             self.num_experts = (
                 config.num_experts + global_server_args_dict["ep_num_redundant_experts"]
             )
@@ -148,7 +147,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
         self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
     ) -> torch.Tensor:
-        if not global_server_args_dict["enable_deepep_moe"]:
+        if not global_server_args_dict["moe_a2a_backend"].is_deepep():
             return self.forward_normal(hidden_states)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
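Note that global_server_args_dict["moe_a2a_backend"] is no longer a plain boolean; the model code calls .is_deepep() on it. The actual type is defined outside this diff, so the following is only a hypothetical sketch of what such a wrapper could look like (the name MoeA2ABackend and its members are illustrative, not taken from the source):

from enum import Enum

class MoeA2ABackend(Enum):  # hypothetical name, for illustration only
    NONE = "none"
    DEEPEP = "deepep"

    def is_deepep(self) -> bool:
        # Qwen3MoeSparseMoeBlock branches on this to choose forward_deepep()
        # over forward_normal().
        return self is MoeA2ABackend.DEEPEP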
python/sglang/srt/models/step3_vl.py

@@ -146,7 +146,7 @@ class Step3TextMoEMLP(nn.Module):
             prefix=add_prefix("gate", prefix),
         )
-        if global_server_args_dict["enable_deepep_moe"]:
+        if global_server_args_dict["moe_a2a_backend"].is_deepep():
             raise NotImplementedError("DeepEP MoE is not supported yet in Step3 model.")

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
python/sglang/srt/operations_strategy.py

@@ -4,7 +4,7 @@ from typing import List, Optional
 import torch
 from sglang.srt import operations
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.operations import Operation
python/sglang/srt/server_args.py

@@ -172,12 +172,11 @@ class ServerArgs:
     # Expert parallelism
     ep_size: int = 1
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -272,7 +271,27 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
@@ -455,14 +474,13 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            if self.enable_ep_moe:
-                assert self.ep_size in [
-                    1,
-                    self.tp_size,
-                ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+            self.ep_size = self.tp_size
+            logger.warning(
+                f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )

         # DeepEP MoE
-        if self.enable_deepep_moe:
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -486,7 +504,7 @@ class ServerArgs:
             )
         if self.enable_eplb:
-            assert self.enable_ep_moe or self.enable_deepep_moe
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -1354,30 +1372,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
        )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
         parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
-        parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1839,6 +1854,18 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
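A quick way to exercise the new argument surface is to feed the parser the new-style flags. This is a sketch, not taken from the diff; it assumes the existing ServerArgs.add_cli_args entry point and uses a placeholder model path:

import argparse

from sglang.srt.server_args import ServerArgs

# Build the same parser the launch scripts use and parse the new-style flags.
parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args([
    "--model-path", "placeholder/model",
    "--tp", "8",
    "--ep-size", "8",               # replaces the deprecated --enable-ep-moe
    "--moe-a2a-backend", "deepep",  # replaces the deprecated --enable-deepep-moe
])
print(args.moe_a2a_backend)  # "deepep"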
python/sglang/srt/two_batch_overlap.py

@@ -13,17 +13,18 @@ from sglang.srt.layers.communicator import (
     CommunicateSummableTensorPairFn,
     ScatterMode,
 )
-from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.token_dispatcher import DeepEPDispatcher
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.operations import execute_operations, execute_overlapped_operations
 from sglang.srt.operations_strategy import OperationsStrategy
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import BumpAllocator, DeepEPMode, get_bool_env_var
+from sglang.srt.utils import BumpAllocator, get_bool_env_var

 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DispatchOutput
+    from sglang.srt.layers.moe.token_dispatcher import DispatchOutput

 _tbo_debug = get_bool_env_var("SGLANG_TBO_DEBUG")
@@ -310,7 +311,7 @@ class TboDPAttentionPreparer:
                 and not local_batch.forward_mode.is_target_verify()
             )
             and enable_deepep_moe
-            and (resolved_deepep_mode == DeepEPMode.low_latency)
+            and (resolved_deepep_mode == DeepEPMode.LOW_LATENCY)
         )
     else:
         self.local_tbo_split_seq_index = 0
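For downstream code, the DeepEP-related helpers now import from the MoE package rather than from the ep_moe subpackage or sglang.srt.utils. The new locations, taken directly from the hunks in this commit (here and in operations_strategy.py above):

from sglang.srt.layers.moe.token_dispatcher import DeepEPConfig, DeepEPDispatcher
from sglang.srt.layers.moe.utils import DeepEPMode  # members are now upper-case, e.g. DeepEPMode.LOW_LATENCY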
python/sglang/srt/utils.py

@@ -2205,27 +2205,6 @@ def flatten_nested_list(nested_list):
         return [nested_list]

-class DeepEPMode(Enum):
-    normal = "normal"
-    low_latency = "low_latency"
-    auto = "auto"
-
-    def enable_normal(self):
-        return self in [DeepEPMode.normal, DeepEPMode.auto]
-
-    def enable_low_latency(self):
-        return self in [DeepEPMode.low_latency, DeepEPMode.auto]
-
-    def resolve(self, is_extend_in_batch: bool):
-        if self != DeepEPMode.auto:
-            return self
-
-        if is_extend_in_batch:
-            return DeepEPMode.normal
-        else:
-            return DeepEPMode.low_latency
-
 def is_non_idle_and_non_empty(forward_mode, hidden_states):
     return (
         (forward_mode is not None)
@@ -2414,7 +2393,7 @@ def require_mlp_tp_gather(server_args):
             return True
         elif not server_args.enable_dp_lm_head:
             return True
-        elif not server_args.enable_deepep_moe:
+        elif server_args.moe_a2a_backend is None:
             return True
         else:
             return (
@@ -2430,7 +2409,7 @@ def require_attn_tp_gather(server_args):
     Check if the input of attention is scattered.
     """
     assert server_args.moe_dense_tp_size in [1, None]
-    if server_args.enable_deepep_moe or server_args.moe_dense_tp_size == 1:
+    if server_args.moe_a2a_backend is not None or server_args.moe_dense_tp_size == 1:
         if server_args.enable_dp_attention:
             return server_args.dp_size < server_args.tp_size
         else:
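The enum removed here is not gone; per the two_batch_overlap.py hunk it now lives in sglang.srt.layers.moe.utils with upper-case members. A sketch of the behavior the relocated enum presumably keeps, reconstructed from the removed code above (the new definition itself is not part of this diff):

from enum import Enum

class DeepEPMode(Enum):
    NORMAL = "normal"
    LOW_LATENCY = "low_latency"
    AUTO = "auto"

    def resolve(self, is_extend_in_batch: bool) -> "DeepEPMode":
        # "auto" picks the dispatch kernel per batch: normal kernels for
        # prefill/extend batches, low-latency kernels for decode.
        if self != DeepEPMode.AUTO:
            return self
        return DeepEPMode.NORMAL if is_extend_in_batch else DeepEPMode.LOW_LATENCY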
python/sglang/test/runners.py

@@ -499,7 +499,6 @@ class SRTRunner:
         chunked_prefill_size: Optional[int] = None,
         dp_size: int = 1,
         tokenizer_path: Optional[str] = None,
-        enable_ep_moe: bool = False,
         mem_fraction_static: float = 0.65,
         trust_remote_code: bool = False,
         speculative_draft_model_path: Optional[str] = None,
@@ -550,7 +549,6 @@ class SRTRunner:
             enable_dp_attention=enable_dp_attention,
             dp_size=dp_size,
             tokenizer_path=tokenizer_path,
-            enable_ep_moe=enable_ep_moe,
             disable_overlap_schedule=disable_overlap_schedule,
             cuda_graph_max_bs=cuda_graph_max_bs,
             disable_custom_all_reduce=disable_custom_all_reduce,
test/srt/test_deepep_large.py

@@ -33,7 +33,8 @@ class TestDeepseek(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--ep-num-redundant-experts",
                 "32",
@@ -88,7 +89,8 @@ class TestDeepseekMTP(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--ep-num-redundant-experts",
                 "32",
test/srt/test_deepep_small.py

@@ -31,7 +31,8 @@ class TestPureDP(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "4",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -77,7 +78,8 @@ class TestHybridDPTP(CustomTestCase):
                 "--enable-dp-attention",
                 "--dp",
                 "2",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -118,7 +120,8 @@ class TestTP(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "4",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
                 "--max-running-requests",
@@ -166,7 +169,8 @@ class TestNoGatherdBuffer(CustomTestCase):
                 "--moe-dense-tp-size",
                 "1",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "32",
                 "--max-running-requests",
@@ -212,7 +216,8 @@ class TestTBO(CustomTestCase):
                 "4",
                 "--moe-dense-tp-size",
                 "1",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--enable-two-batch-overlap",
                 "--cuda-graph-max-bs",
                 "128",
@@ -259,7 +264,8 @@ class TestMTP(CustomTestCase):
                 "--dp",
                 "2",
                 "--enable-dp-lm-head",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--speculative-algo",
                 "EAGLE",
                 "--speculative-draft",
@@ -326,7 +332,8 @@ class TestMTPWithTBO(CustomTestCase):
                 "--dp-size",
                 "4",
                 "--enable-two-batch-overlap",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--trust-remote-code",
                 "--speculative-algorithm",
                 "EAGLE",
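For a concrete single-node launch, the smallest DeepEP configuration exercised by these tests now boils down to the argument list below (a sketch mirroring TestPureTP in test_moe_deepep.py further down; the model path and the popen_launch_server helper are the ones the tests already use):

other_args = [
    "--trust-remote-code",
    "--tp", "2",
    "--moe-a2a-backend", "deepep",  # was: --enable-deepep-moe
    "--disable-cuda-graph",
]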
test/srt/test_eplb.py

@@ -34,7 +34,8 @@ class _BaseTestDynamicEPLB(CustomTestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",
@@ -96,8 +97,7 @@ class TestStaticEPLB(CustomTestCase):
                 trust_remote_code=True,
                 ep_num_redundant_experts=4,
                 enable_dp_attention=True,
-                enable_deepep_moe=True,
-                deepep_mode="normal",
+                moe_a2a_backend="deepep",
                 disable_cuda_graph=True,
                 expert_distribution_recorder_mode="stat",
                 tp_size=2,
test/srt/test_hybrid_dp_ep_tp_mtp.py

All 40 hunks in this file update the server launch arguments of the test classes; the surrounding per-test arguments (--tp/--dp sizes, --moe-dense-tp-size, --enable-dp-lm-head, cuda-graph and speculative settings) are unchanged.

The DeepEP-based tests drop the deprecated flag together with the now-default "--deepep-mode auto" in favor of the new A2A backend argument:

-                "--enable-deepep-moe",
-                "--deepep-mode",
-                "auto",
+                "--moe-a2a-backend",
+                "deepep",

applied in:
@@ -407,9 +407,8 @@ class Test10(CustomTestCase):
@@ -449,9 +448,8 @@ class Test11(CustomTestCase):
@@ -491,9 +489,8 @@ class Test12(CustomTestCase):
@@ -532,9 +529,8 @@ class Test13(CustomTestCase):
@@ -576,9 +572,8 @@ class Test14(CustomTestCase):
@@ -620,9 +615,8 @@ class Test15(CustomTestCase):
@@ -663,9 +657,8 @@ class Test16(CustomTestCase):
@@ -706,9 +699,8 @@ class Test17(CustomTestCase):
@@ -751,9 +743,8 @@ class Test18(CustomTestCase):
@@ -796,9 +787,8 @@ class Test19(CustomTestCase):
@@ -1701,9 +1701,8 @@ class Test40(CustomTestCase):
@@ -1755,9 +1754,8 @@ class Test41(CustomTestCase):
@@ -1809,9 +1807,8 @@ class Test42(CustomTestCase):
@@ -1862,9 +1859,8 @@ class Test43(CustomTestCase):
@@ -1918,9 +1914,8 @@ class Test44(CustomTestCase):
@@ -1974,9 +1969,8 @@ class Test45(CustomTestCase):
@@ -2029,9 +2023,8 @@ class Test46(CustomTestCase):
@@ -2084,9 +2077,8 @@ class Test47(CustomTestCase):
@@ -2141,9 +2133,8 @@ class Test48(CustomTestCase):
@@ -2198,9 +2189,8 @@ class Test49(CustomTestCase):

The EP MoE tests replace the deprecated flag with an explicit expert-parallel size:

-                "--enable-ep-moe",
+                "--ep",
+                "8",

applied in:
@@ -835,7 +825,8 @@ class Test20(CustomTestCase):
@@ -873,7 +864,8 @@ class Test21(CustomTestCase):
@@ -911,7 +903,8 @@ class Test22(CustomTestCase):
@@ -948,7 +941,8 @@ class Test23(CustomTestCase):
@@ -988,7 +982,8 @@ class Test24(CustomTestCase):
@@ -1028,7 +1023,8 @@ class Test25(CustomTestCase):
@@ -1067,7 +1063,8 @@ class Test26(CustomTestCase):
@@ -1106,7 +1103,8 @@ class Test27(CustomTestCase):
@@ -1147,7 +1145,8 @@ class Test28(CustomTestCase):
@@ -1188,7 +1187,8 @@ class Test29(CustomTestCase):
@@ -2249,7 +2239,8 @@ class Test50(CustomTestCase):
@@ -2297,7 +2288,8 @@ class Test51(CustomTestCase):
@@ -2345,7 +2337,8 @@ class Test52(CustomTestCase):
@@ -2392,7 +2385,8 @@ class Test53(CustomTestCase):
@@ -2442,7 +2436,8 @@ class Test54(CustomTestCase):
@@ -2492,7 +2487,8 @@ class Test55(CustomTestCase):
@@ -2541,7 +2537,8 @@ class Test56(CustomTestCase):
@@ -2590,7 +2587,8 @@ class Test57(CustomTestCase):
@@ -2641,7 +2639,8 @@ class Test58(CustomTestCase):
@@ -2692,7 +2691,8 @@ class Test59(CustomTestCase):
test/srt/test_moe_deepep.py

@@ -27,7 +27,8 @@ class TestPureTP(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "2",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--disable-cuda-graph",
             ],
         )
@@ -65,7 +66,8 @@ class TestDPAttn(unittest.TestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",
test/srt/test_moe_deepep_eval_accuracy_large.py

@@ -31,7 +31,8 @@ class TestMoEDeepEPEvalAccuracyLarge(CustomTestCase):
                 "--trust-remote-code",
                 "--tp",
                 "8",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--cuda-graph-max-bs",
                 "128",
             ],
test/srt/test_moe_ep.py

@@ -27,7 +27,6 @@ class TestEpMoE(CustomTestCase):
                 "2",
                 "--ep-size",
                 "2",
-                "--enable-ep-moe",
             ],
         )
@@ -75,7 +74,6 @@ class TestEpMoEFP8(CustomTestCase):
                 "2",
                 "--ep-size",
                 "2",
-                "--enable-ep-moe",
                 "--quantization",
                 "fp8",
             ],
test/srt/test_two_batch_overlap.py

@@ -33,7 +33,8 @@ class TestTwoBatchOverlap(unittest.TestCase):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph
@@ -122,7 +123,8 @@ class TestQwen3TwoBatchOverlap(TestTwoBatchOverlap):
                 "--dp",
                 "2",
                 "--enable-dp-attention",
-                "--enable-deepep-moe",
+                "--moe-a2a-backend",
+                "deepep",
                 "--deepep-mode",
                 "normal",
                 "--disable-cuda-graph",  # DeepEP normal does not support CUDA Graph