Unverified commit e6f114ac, authored by Sage Moore and committed by GitHub
Browse files

[Bugfix][EPLB] Prevent user-provided EPLB config from being overwritten with defaults (#29911)


Signed-off-by: Sage Moore <sage@neuralmagic.com>
parent 6fc5841d
...@@ -22,7 +22,14 @@ def get_model_args( ...@@ -22,7 +22,14 @@ def get_model_args(
"num_speculative_tokens": 1, "num_speculative_tokens": 1,
"max_model_len": model_max_len, "max_model_len": model_max_len,
} }
eplb_config = {
"num_redundant_experts": tp_size,
"window_size": 128,
"step_interval": 1024,
"log_balancedness": False,
}
if use_async:
eplb_config["use_async"] = True
model_args = { model_args = {
"pretrained": model_name, "pretrained": model_name,
"dtype": "auto", "dtype": "auto",
...@@ -31,15 +38,10 @@ def get_model_args( ...@@ -31,15 +38,10 @@ def get_model_args(
"gpu_memory_utilization": 0.7, "gpu_memory_utilization": 0.7,
"speculative_config": speculative_config, "speculative_config": speculative_config,
"enable_expert_parallel": True, "enable_expert_parallel": True,
"num_redundant_experts": tp_size, "eplb_config": eplb_config,
"eplb_window_size": 128,
"eplb_step_interval": 1024,
"eplb_log_balancedness": False,
"enable_eplb": True, "enable_eplb": True,
"max_model_len": model_max_len, "max_model_len": model_max_len,
} }
if use_async:
model_args["eplb_config"] = {"use_async": True}
return model_args return model_args
......
...@@ -421,10 +421,6 @@ class EngineArgs: ...@@ -421,10 +421,6 @@ class EngineArgs:
) )
_api_process_count: int = ParallelConfig._api_process_count _api_process_count: int = ParallelConfig._api_process_count
_api_process_rank: int = ParallelConfig._api_process_rank _api_process_rank: int = ParallelConfig._api_process_rank
num_redundant_experts: int = EPLBConfig.num_redundant_experts
eplb_window_size: int = EPLBConfig.window_size
eplb_step_interval: int = EPLBConfig.step_interval
eplb_log_balancedness: bool = EPLBConfig.log_balancedness
max_parallel_loading_workers: int | None = ( max_parallel_loading_workers: int | None = (
ParallelConfig.max_parallel_loading_workers ParallelConfig.max_parallel_loading_workers
) )
...@@ -1582,16 +1578,6 @@ class EngineArgs: ...@@ -1582,16 +1578,6 @@ class EngineArgs:
) )
self.disable_nccl_for_dp_synchronization = True self.disable_nccl_for_dp_synchronization = True
# Forward the deprecated CLI args to the EPLB config.
if self.num_redundant_experts is not None:
self.eplb_config.num_redundant_experts = self.num_redundant_experts
if self.eplb_window_size is not None:
self.eplb_config.window_size = self.eplb_window_size
if self.eplb_step_interval is not None:
self.eplb_config.step_interval = self.eplb_step_interval
if self.eplb_log_balancedness is not None:
self.eplb_config.log_balancedness = self.eplb_log_balancedness
parallel_config = ParallelConfig( parallel_config = ParallelConfig(
pipeline_parallel_size=self.pipeline_parallel_size, pipeline_parallel_size=self.pipeline_parallel_size,
tensor_parallel_size=self.tensor_parallel_size, tensor_parallel_size=self.tensor_parallel_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment