Commit faa80947 authored by Roy Wang, committed by khluu
Browse files

[Performance] Add --enable-ep-weight-filter CLI option (#37351)


Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7)
parent eeabf740
...@@ -138,6 +138,13 @@ class ParallelConfig: ...@@ -138,6 +138,13 @@ class ParallelConfig:
"""Whether the deployed model is MoE (if known).""" """Whether the deployed model is MoE (if known)."""
enable_expert_parallel: bool = False enable_expert_parallel: bool = False
"""Use expert parallelism instead of tensor parallelism for MoE layers.""" """Use expert parallelism instead of tensor parallelism for MoE layers."""
enable_ep_weight_filter: bool = False
"""Skip non-local expert weights during model loading when expert
parallelism is active. Each rank only reads its own expert shard from
disk, which can drastically reduce storage I/O for MoE models with
per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no
effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
models."""
enable_eplb: bool = False enable_eplb: bool = False
"""Enable expert parallelism load balancing for MoE layers.""" """Enable expert parallelism load balancing for MoE layers."""
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig) eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
......
...@@ -419,6 +419,7 @@ class EngineArgs: ...@@ -419,6 +419,7 @@ class EngineArgs:
data_parallel_external_lb: bool = False data_parallel_external_lb: bool = False
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
moe_backend: MoEBackend = KernelConfig.moe_backend moe_backend: MoEBackend = KernelConfig.moe_backend
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
...@@ -901,6 +902,10 @@ class EngineArgs: ...@@ -901,6 +902,10 @@ class EngineArgs:
"-ep", "-ep",
**parallel_kwargs["enable_expert_parallel"], **parallel_kwargs["enable_expert_parallel"],
) )
parallel_group.add_argument(
"--enable-ep-weight-filter",
**parallel_kwargs["enable_ep_weight_filter"],
)
parallel_group.add_argument( parallel_group.add_argument(
"--all2all-backend", **parallel_kwargs["all2all_backend"] "--all2all-backend", **parallel_kwargs["all2all_backend"]
) )
...@@ -1727,6 +1732,7 @@ class EngineArgs: ...@@ -1727,6 +1732,7 @@ class EngineArgs:
data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
is_moe_model=model_config.is_moe, is_moe_model=model_config.is_moe,
enable_expert_parallel=self.enable_expert_parallel, enable_expert_parallel=self.enable_expert_parallel,
enable_ep_weight_filter=self.enable_ep_weight_filter,
all2all_backend=self.all2all_backend, all2all_backend=self.all2all_backend,
enable_elastic_ep=self.enable_elastic_ep, enable_elastic_ep=self.enable_elastic_ep,
enable_dbo=self.enable_dbo, enable_dbo=self.enable_dbo,
......
...@@ -313,7 +313,11 @@ class DefaultModelLoader(BaseModelLoader): ...@@ -313,7 +313,11 @@ class DefaultModelLoader(BaseModelLoader):
vllm_config = get_current_vllm_config() vllm_config = get_current_vllm_config()
parallel_config = vllm_config.parallel_config parallel_config = vllm_config.parallel_config
if not (model_config.is_moe and parallel_config.enable_expert_parallel): if not (
model_config.is_moe
and parallel_config.enable_expert_parallel
and parallel_config.enable_ep_weight_filter
):
return return
num_experts = model_config.get_num_experts() num_experts = model_config.get_num_experts()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment