Unverified Commit 5ccf8fe1 authored by fzyzcjy's avatar fzyzcjy Committed by GitHub
Browse files

Hint users when weight update times out (#6570)

parent 3f23d8cd
......@@ -338,8 +338,14 @@ def update_expert_weights_single_layer(
return
reqs = torch.distributed.batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
try:
for req in reqs:
req.wait(timeout=30)
except RuntimeError:
logger.error(
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
)
raise
def _execute_buffer2weight_copies(buffer2weight_copy_infos):
for (
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment