Commit 90ce932d authored by mshoeybi

Addressed Jared's and Patrick's comments.

parent 37181ef4
@@ -92,6 +92,11 @@ def gather_split_1d_tensor(tensor):
gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
device=torch.cuda.current_device(),
requires_grad=False)
# TODO: This API is experimental in pytorch (as of Feb 2022) and
# might break in future pytorch releases. We chose this API
# over torch.distributed.all_gather for efficiency reasons:
# it calls the NCCL all-gather directly, whereas the latter does
# internal copies and can potentially cause a slowdown.
torch.distributed._all_gather_base(gathered, tensor,
group=get_tensor_model_parallel_group())
return gathered
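
For context, here is a minimal standalone sketch (not part of this commit) contrasting the two collectives the comment refers to. It assumes torch.distributed has already been initialized with an NCCL backend; the helper function names are hypothetical.

import torch
import torch.distributed as dist

def gather_with_all_gather_base(tensor, group=None):
    # Gathers directly into one preallocated contiguous buffer; this maps
    # to a single NCCL all-gather call (experimental API as of Feb 2022).
    world_size = dist.get_world_size(group=group)
    output = torch.empty(world_size * tensor.numel(),
                         dtype=tensor.dtype, device=tensor.device)
    dist._all_gather_base(output, tensor, group=group)
    return output

def gather_with_all_gather(tensor, group=None):
    # Public API: gathers into a list of per-rank tensors, which involves
    # extra internal copies around the collective and can be slower.
    world_size = dist.get_world_size(group=group)
    tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
    dist.all_gather(tensor_list, tensor, group=group)
    return torch.cat(tensor_list)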