Unverified commit b2d8b422, authored by Ilya Markov, committed by GitHub
Browse files

[EPLB] Enforce sync eplb for NCCL-based all2all backend (#35212)


Signed-off-by: ilmarkov <markovilya197@gmail.com>
parent 1d5ab5d6
@@ -774,6 +774,17 @@ class ParallelConfig:
"backend is mp, uni or external_launcher." "backend is mp, uni or external_launcher."
) )
if (
self.all2all_backend in ("allgather_reducescatter", "naive")
and self.eplb_config.use_async
):
logger.warning(
"Async EPLB causes hangs with the '%s' all2all backend. "
"Forcing synchronous EPLB.",
self.all2all_backend,
)
self.eplb_config.use_async = False
@property @property
def use_ray(self) -> bool: def use_ray(self) -> bool:
return self.distributed_executor_backend == "ray" or ( return self.distributed_executor_backend == "ray" or (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment