"git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "faf8f1cc06995c56fe06237fd2e485ab7b571546"
Unverified Commit 7e5071c9 authored by fzyzcjy's avatar fzyzcjy Committed by GitHub
Browse files

Super tiny enable sole usage of expert distribution metrics and update doc (#6680)

parent 78689d33
...@@ -27,7 +27,8 @@ class EPLBManager: ...@@ -27,7 +27,8 @@ class EPLBManager:
<= self._server_args.expert_distribution_recorder_buffer_size <= self._server_args.expert_distribution_recorder_buffer_size
), "eplb_rebalance_num_iterations must be less than expert_distribution_recorder_buffer_size" ), "eplb_rebalance_num_iterations must be less than expert_distribution_recorder_buffer_size"
get_global_expert_distribution_recorder().start_record() if not get_global_expert_distribution_recorder().recording:
get_global_expert_distribution_recorder().start_record()
logger.info( logger.info(
f"[EPLBManager] system started, will rebalance per {self._server_args.eplb_rebalance_num_iterations} iterations." f"[EPLBManager] system started, will rebalance per {self._server_args.eplb_rebalance_num_iterations} iterations."
......
...@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC): ...@@ -91,6 +91,10 @@ class ExpertDistributionRecorder(ABC):
def dump_record(self, output_mode: _OutputMode = "file"): def dump_record(self, output_mode: _OutputMode = "file"):
self._on_not_implemented() self._on_not_implemented()
@property
def recording(self):
return False
def _on_not_implemented(self): def _on_not_implemented(self):
raise Exception( raise Exception(
"Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder." "Please set ServerArgs.expert_distribution_recorder_mode to use ExpertDistributionRecorder."
...@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder): ...@@ -123,6 +127,12 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
for k in self._accumulator.get_single_pass_gatherer_keys() for k in self._accumulator.get_single_pass_gatherer_keys()
} }
if server_args.enable_expert_distribution_metrics:
logger.info(
"ExpertDistributionRecorder auto start record since enable_expert_distribution_metrics"
)
self.start_record()
def with_current_layer(self, layer_idx): def with_current_layer(self, layer_idx):
return self._current_layer_idx.with_value(layer_idx) return self._current_layer_idx.with_value(layer_idx)
...@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder): ...@@ -221,6 +231,10 @@ class _ExpertDistributionRecorderReal(ExpertDistributionRecorder):
self._reset() self._reset()
return output return output
@property
def recording(self):
return self._recording
_global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = ( _global_expert_distribution_recorder: Optional[ExpertDistributionRecorder] = (
_ExpertDistributionRecorderNoop() _ExpertDistributionRecorderNoop()
......
...@@ -1355,7 +1355,7 @@ class ServerArgs: ...@@ -1355,7 +1355,7 @@ class ServerArgs:
"--deepep-config", "--deepep-config",
type=str, type=str,
default=ServerArgs.deepep_config, default=ServerArgs.deepep_config,
help="Tuned DeepEP config suitable for your own cluster.", help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
) )
parser.add_argument( parser.add_argument(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment