Commit 6625e8df authored by Nikita Smetanin
Browse files

Improve TriangleMultiplicativeUpdate stability in fp16 mode

parent 7c32b79f
......@@ -393,6 +393,11 @@ class TriangleMultiplicativeUpdate(nn.Module):
b = b * self.sigmoid(self.linear_b_g(z))
b = b * self.linear_b_p(z)
# Prevents overflow of torch.matmul in combine projections in
# reduced-precision modes
a = a / a.std()
b = b / b.std()
if(is_fp16_enabled()):
with torch.cuda.amp.autocast(enabled=False):
x = self._combine_projections(a.float(), b.float())
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment