Unverified Commit c9a53307 authored by Ilya Markov's avatar Ilya Markov Committed by GitHub
Browse files

[EPLB][BugFix]Possible deadlock fix (#32418)


Signed-off-by: default avatarilmarkov <markovilya197@gmail.com>
Signed-off-by: default avatarTyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: default avatarTyler Michael Smith <tlrmchlsmth@gmail.com>
parent 6ca4f400
......@@ -970,8 +970,23 @@ class EplbState:
ep_group: ProcessGroup,
is_profile: bool = False,
):
if not model_state.buffer_lock.acquire(blocking=False):
return
# We call move_to_workspace only when ep_buffer_ready is 1.
# It means we only need to wait for the lock for a short time.
max_retries = 6 # 1 minute max
retries = 0
while not model_state.buffer_lock.acquire(blocking=True, timeout=10.0):
retries += 1
if retries >= max_retries:
raise RuntimeError(
f"Rank {ep_group.rank()}: buffer_lock timeout after "
"{max_retries * 10}s"
)
logger.warning(
"Rank %d: EPLB buffer_lock acquire failed, retrying (%d/%d)",
ep_group.rank(),
retries,
max_retries,
)
try:
assert model_state.new_physical_to_logical_map is not None
device_index = model_state.cuda_device_index or self.cuda_device_index
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment