Commit 09c2f6e4 authored by Markel Ausin, committed by Jared Casper

Add contiguous to grad_output in layers.py

parent b7f3c263
@@ -269,6 +269,11 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
         if ctx.sequence_parallel:
             handle.wait()

+        # Doing gather + slicing during the NeMo forward pass can make this tensor
+        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
+        # clones it if it's not contiguous:
+        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
+        grad_output = grad_output.contiguous()
         # Convert the tensor shapes to 2D for execution compatibility
         grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                                        grad_output.shape[2])
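As a side note, here is a minimal standalone sketch (not part of the commit) of why the added contiguous() call matters: slicing a tensor can leave its original strides in place, in which case .view() cannot merge the leading dimensions, while .contiguous() returns the tensor unchanged when no copy is needed.

    import torch

    # Slicing along a middle dimension keeps the parent's strides, so the
    # result is non-contiguous and .view() cannot merge dims 0 and 1.
    grad_output = torch.randn(4, 6, 8)[:, 1:, :]   # shape (4, 5, 8), strides (48, 8, 1)
    print(grad_output.is_contiguous())             # False

    try:
        grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                         grad_output.shape[2])
    except RuntimeError as e:
        print("view failed:", e)

    # .contiguous() returns the tensor itself when it is already contiguous
    # and copies it into dense memory otherwise, so calling it unconditionally
    # is cheap in the common case and makes the .view() below always valid.
    grad_output = grad_output.contiguous()
    flat = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                            grad_output.shape[2])
    print(flat.shape)                              # torch.Size([20, 8])

An alternative with the same effect would be grad_output.reshape(...), which falls back to a copy only when .view() would fail; the commit instead keeps the existing .view() call and makes the potential copy explicit with .contiguous().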