Commit 09c2f6e4 authored by Markel Ausin, committed by Jared Casper

Add contiguous to grad_output in layers.py

parent b7f3c263
@@ -269,6 +269,11 @@ class LinearWithGradAccumulationAndAsyncCommunication(torch.autograd.Function):
         if ctx.sequence_parallel:
             handle.wait()

+        # Doing gather + slicing during the NeMo forward pass can make this tensor
+        # not be contiguous. PyTorch only checks if the tensor is contiguous, and only
+        # clones it if it's not contiguous:
+        # https://github.com/pytorch/pytorch/blob/c47cf9bc7f9e02f649ab4ed53fe4d35732c92ab6/torch/_refs/__init__.py#L2761
+        grad_output = grad_output.contiguous()
         # Convert the tensor shapes to 2D for execution compatibility
         grad_output = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                                        grad_output.shape[2])
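As a side note, here is a minimal standalone sketch (not part of the commit) of why the added contiguous() call matters: slicing a tensor can leave its original strides in place, in which case .view() cannot merge the leading dimensions, while .contiguous() returns the tensor unchanged when no copy is needed.

    import torch

    # Slicing along a middle dimension keeps the parent's strides, so the
    # result is non-contiguous and .view() cannot merge dims 0 and 1.
    grad_output = torch.randn(4, 6, 8)[:, 1:, :]   # shape (4, 5, 8), strides (48, 8, 1)
    print(grad_output.is_contiguous())             # False

    try:
        grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                         grad_output.shape[2])
    except RuntimeError as e:
        print("view failed:", e)

    # .contiguous() returns the tensor itself when it is already contiguous
    # and copies it into dense memory otherwise, so calling it unconditionally
    # is cheap in the common case and makes the .view() below always valid.
    grad_output = grad_output.contiguous()
    flat = grad_output.view(grad_output.shape[0] * grad_output.shape[1],
                            grad_output.shape[2])
    print(flat.shape)                              # torch.Size([20, 8])

An alternative with the same effect would be grad_output.reshape(...), which falls back to a copy only when .view() would fail; the commit instead keeps the existing .view() call and makes the potential copy explicit with .contiguous().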