Commit cf1c7848 authored by Lawrence McAfee's avatar Lawrence McAfee
Browse files

fixed for new jit warmup changes.

parent aff99e32
......@@ -683,6 +683,17 @@ class ParallelTransformerLayer(MegatronModule):
mlp_bias.expand_as(residual),
residual,
self.hidden_dropout)
# Jit compiled function creates 'view' tensor. This tensor
# potentially gets saved in the MPU checkpoint function context,
# which rejects view tensors. While making a viewless tensor here
# won't result in memory savings (like the data loader, or
# p2p_communication), it serves to document the origin of this
# 'view' tensor.
output = mpu.make_viewless_tensor(inp = output,
requires_grad = output.requires_grad,
keep_graph = True)
else:
out = torch.nn.functional.dropout(mlp_output + mlp_bias,
p=self.hidden_dropout,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment