Unverified commit 6e78ed6b, authored by Wentao Ye, committed by GitHub
Browse files
parent 7c16f3fb
...@@ -885,12 +885,11 @@ def get_moe_configs( ...@@ -885,12 +885,11 @@ def get_moe_configs(
# If no optimized configuration is available, we will use the default # If no optimized configuration is available, we will use the default
# configuration # configuration
logger.warning( logger.warning_once(
( "Using default MoE config. Performance might be sub-optimal! "
"Using default MoE config. Performance might be sub-optimal! " "Config file not found at %s",
"Config file not found at %s" ", ".join(config_file_paths),
), scope="local",
config_file_paths,
) )
return None return None
......
...@@ -369,7 +369,9 @@ class FusedMoE(CustomOp): ...@@ -369,7 +369,9 @@ class FusedMoE(CustomOp):
# aux_stream() returns None on non-cuda-alike platforms. # aux_stream() returns None on non-cuda-alike platforms.
self.shared_experts_stream = aux_stream() self.shared_experts_stream = aux_stream()
if self.shared_experts_stream is not None: if self.shared_experts_stream is not None:
logger.info_once("Enabled separate cuda stream for MoE shared_experts") logger.info_once(
"Enabled separate cuda stream for MoE shared_experts", scope="local"
)
if params_dtype is None: if params_dtype is None:
params_dtype = torch.get_default_dtype() params_dtype = torch.get_default_dtype()
......
...@@ -409,10 +409,11 @@ class CudaPlatformBase(Platform): ...@@ -409,10 +409,11 @@ class CudaPlatformBase(Platform):
) )
selected_index = sorted_indices[0] selected_index = sorted_indices[0]
selected_backend = valid_backends_priorities[selected_index][0] selected_backend = valid_backends_priorities[selected_index][0]
logger.info( logger.info_once(
"Using %s attention backend out of potential backends: %s", "Using %s attention backend out of potential backends: %s",
selected_backend.name, selected_backend.name,
[b[0].name for b in valid_backends_priorities], tuple(b[0].name for b in valid_backends_priorities),
scope="local",
) )
return selected_backend.get_path() return selected_backend.get_path()
......
...@@ -61,7 +61,7 @@ class WorkerProfiler(ABC): ...@@ -61,7 +61,7 @@ class WorkerProfiler(ABC):
"""Call _stop with error handling but no safeguards.""" """Call _stop with error handling but no safeguards."""
try: try:
self._stop() self._stop()
logger.info("Profiler stopped successfully.") logger.info_once("Profiler stopped successfully.", scope="local")
except Exception as e: except Exception as e:
logger.warning("Failed to stop profiler: %s", e) logger.warning("Failed to stop profiler: %s", e)
self._running = False # Always mark as not running, assume stop worked self._running = False # Always mark as not running, assume stop worked
...@@ -91,7 +91,7 @@ class WorkerProfiler(ABC): ...@@ -91,7 +91,7 @@ class WorkerProfiler(ABC):
and self._delay_iters > 0 and self._delay_iters > 0
and self._active_iteration_count == self._delay_iters and self._active_iteration_count == self._delay_iters
): ):
logger.info("Starting profiler after delay...") logger.info_once("Starting profiler after delay...", scope="local")
self._call_start() self._call_start()
if self._running: if self._running:
...@@ -105,7 +105,9 @@ class WorkerProfiler(ABC): ...@@ -105,7 +105,9 @@ class WorkerProfiler(ABC):
# Automatically stop the profiler after max iters # Automatically stop the profiler after max iters
# will be marked as not running, but leave as active so that stop # will be marked as not running, but leave as active so that stop
# can clean up properly # can clean up properly
logger.info("Max profiling iterations reached. Stopping profiler...") logger.info_once(
"Max profiling iterations reached. Stopping profiler...", scope="local"
)
self._call_stop() self._call_stop()
return return
...@@ -125,7 +127,7 @@ class WorkerProfiler(ABC): ...@@ -125,7 +127,7 @@ class WorkerProfiler(ABC):
def shutdown(self) -> None: def shutdown(self) -> None:
"""Ensure profiler is stopped when shutting down.""" """Ensure profiler is stopped when shutting down."""
logger.info_once("Shutting down profiler") logger.info_once("Shutting down profiler", scope="local")
if self._running: if self._running:
self.stop() self.stop()
...@@ -156,9 +158,10 @@ class TorchProfilerWrapper(WorkerProfiler): ...@@ -156,9 +158,10 @@ class TorchProfilerWrapper(WorkerProfiler):
self.profiler_config = profiler_config self.profiler_config = profiler_config
torch_profiler_trace_dir = profiler_config.torch_profiler_dir torch_profiler_trace_dir = profiler_config.torch_profiler_dir
if local_rank in (None, 0): if local_rank in (None, 0):
logger.info( logger.info_once(
"Torch profiling enabled. Traces will be saved to: %s", "Torch profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir, torch_profiler_trace_dir,
scope="local",
) )
logger.debug( logger.debug(
"Profiler config: record_shapes=%s," "Profiler config: record_shapes=%s,"
......
...@@ -706,7 +706,7 @@ class WorkerProc: ...@@ -706,7 +706,7 @@ class WorkerProc:
death_pipe.recv() death_pipe.recv()
except EOFError: except EOFError:
# Parent process has exited, terminate this worker # Parent process has exited, terminate this worker
logger.info("Parent process exited, terminating worker") logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown # Send signal to self to trigger clean shutdown
shutdown_event.set() shutdown_event.set()
except Exception as e: except Exception as e:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment