"...text-generation-inference.git" did not exist on "c8c7ccd31e1e760d216c9d2f2b17b0d984ed033b"
Unverified commit 5ccf8fe1, authored by fzyzcjy, committed by GitHub
Browse files

Hint users when a weight update times out (#6570)

parent 3f23d8cd
...@@ -338,8 +338,14 @@ def update_expert_weights_single_layer( ...@@ -338,8 +338,14 @@ def update_expert_weights_single_layer(
return return
reqs = torch.distributed.batch_isend_irecv(p2p_ops) reqs = torch.distributed.batch_isend_irecv(p2p_ops)
for req in reqs: try:
req.wait() for req in reqs:
req.wait(timeout=30)
except RuntimeError:
logger.error(
f"Context: {rank=} {old_physical_to_logical_map=} {new_physical_to_logical_map=} {num_local_physical_experts=} {num_gpu_per_node=}"
)
raise
def _execute_buffer2weight_copies(buffer2weight_copy_infos): def _execute_buffer2weight_copies(buffer2weight_copy_infos):
for ( for (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment