Unverified Commit 4f1d70fb authored by LadyRick, committed by GitHub
Browse files

fix amax -> abs max in fp8_calibration (#534)



[PyTorch] fix amax calculate during fp8 calibration
Signed-off-by: ladyrick <ladyrick@qq.com>
parent 4d444db1
......@@ -220,11 +220,13 @@ class _LayerNormLinear(torch.autograd.Function):
if fp8_calibration:
# amax of input
amin, amax = ln_out_total.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_INPUT] = \
torch.amax(ln_out_total).float()
torch.max(-amin, amax).float()
# amax of weight
amin, amax = weight.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_WEIGHT] = \
torch.amax(weight).float()
torch.max(-amin, amax).float()
out, _, _ = tex.gemm(
weight,
......
......@@ -345,11 +345,13 @@ class _LayerNormMLP(torch.autograd.Function):
if fp8_calibration:
# amax of fc1 input
amin, amax = ln_out_total.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_INPUT] = \
torch.amax(ln_out_total).float()
torch.max(-amin, amax).float()
# amax of fc1 weight
amin, amax = fc1_weight.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_WEIGHT] = \
torch.amax(fc1_weight).float()
torch.max(-amin, amax).float()
fc1_outputs = tex.gemm(
fc1_weight,
......@@ -383,11 +385,13 @@ class _LayerNormMLP(torch.autograd.Function):
if fp8_calibration:
# amax of fc2 input
amin, amax = gelu_out.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM2_INPUT] = \
torch.amax(gelu_out).float()
torch.max(-amin, amax).float()
# amax of fc2 weight
amin, amax = fc2_weight.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM2_WEIGHT] = \
torch.amax(fc2_weight).float()
torch.max(-amin, amax).float()
if ub_split_rs:
ub_obj_fc2out = get_ub("fc2_fprop")
......
......@@ -226,11 +226,13 @@ class _Linear(torch.autograd.Function):
if fp8_calibration:
# amax of input
amin, amax = inputmat_total.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_INPUT] = \
torch.amax(inputmat_total).float()
torch.max(-amin, amax).float()
# amax of weight
amin, amax = weight.aminmax()
fp8_meta["scaling_fwd"].amax_history[0][tex.FP8FwdTensors.GEMM1_WEIGHT] = \
torch.amax(weight).float()
torch.max(-amin, amax).float()
if ub_split_rs:
ub_obj_projout = get_ub("proj_fprop")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment