Unverified Commit 5898702e authored by Tim Moon, committed by GitHub

Support arbitrary output dtypes in PyT GEMM functions (#75)



* Deprecate fp32_output option for PyT linear layers

Automatically detect dtype for user-provided output tensors.
Signed-off-by: Tim Moon <tmoon@nvidia.com>

* Remove deprecated options
Signed-off-by: Tim Moon <tmoon@nvidia.com>

---------
Signed-off-by: Tim Moon <tmoon@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 5c7c6016
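
In practice the change is caller-facing: instead of requesting FP32 results through a flag, the caller provides (or receives) an output tensor and the GEMM helpers read the dtype off that tensor. A rough before/after sketch; the argument lists below are abbreviated, not the exact `gemm()` signature:

```python
import torch

# Hypothetical before/after, with most gemm() arguments elided:
#   before: gemm(A, B, dtype, workspace, fp32_output=True)
#   after:  gemm(A, B, dtype, workspace, out=out)  # output dtype taken from out.dtype
out = torch.empty(128, 256, dtype=torch.float32)  # caller-chosen output precision
print(out.dtype)  # torch.float32 -- the dtype the GEMM will now write/accumulate in
```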
@@ -371,7 +371,6 @@ def test_export_gemm(
             get_workspace(),
             bias=self.bias,
             use_bias=self.use_bias,
-            fp32_output=(self.precision==torch.float32),
             use_split_accumulator=False)
         return ret
...
@@ -26,9 +26,8 @@ def fp8_gemm(
     fp8_meta_tensor: tex.FP8TensorMeta = None,
     bias: Optional[torch.Tensor] = None,
     use_bias: bool = False,
-    fp32_output: bool = False,
     use_split_accumulator: bool = False,
-    D_dtype: tex.DType = None,
+    D_dtype: Optional[tex.DType] = None,
 ) -> torch.Tensor:
     """TN layout GEMM with fp8 inputs."""
@@ -41,15 +40,14 @@ def fp8_gemm(
         out = torch.empty(
             B.shape[0],
             A.shape[0],
-            dtype=torch.float32 if fp32_output else out_dtype,
+            dtype=out_dtype,
             device="cuda",
         )
         return_output = True
-    out_dtype = tex.DType.kFloat32 if fp32_output else TE_DType[out_dtype]
+    out_dtype = TE_DType[out.dtype] if D_dtype is None else D_dtype
     # Use bfloat16 as default bias_dtype
     bias_dtype = tex.DType.kBFloat16 if bias is None else TE_DType[bias.dtype]
-    out_dtype = D_dtype if D_dtype is not None else out_dtype
 
     _ = torch.ops.tex_ts.te_gemm_ts(
         A,
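
The core of the `fp8_gemm` change is the single precedence rule `out_dtype = TE_DType[out.dtype] if D_dtype is None else D_dtype`. A minimal standalone sketch of that rule, with a dict standing in for Transformer Engine's `TE_DType` mapping:

```python
import torch

# Stand-in for the torch-dtype -> tex.DType mapping used by fp8_gemm.
TE_DType = {
    torch.float32: "kFloat32",
    torch.float16: "kFloat16",
    torch.bfloat16: "kBFloat16",
}

def resolve_out_dtype(out, D_dtype=None):
    # An explicit D_dtype override wins; otherwise use the output tensor's dtype.
    return TE_DType[out.dtype] if D_dtype is None else D_dtype

out = torch.empty(4, 4, dtype=torch.bfloat16)
print(resolve_out_dtype(out))                      # kBFloat16 (from out.dtype)
print(resolve_out_dtype(out, D_dtype="kFloat32"))  # kFloat32 (explicit override)
```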
@@ -94,7 +92,6 @@ def gemm(
     out: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
     use_bias: bool = False,
-    fp32_output: bool = False,
 ) -> Tuple[Union[torch.Tensor, None], ...]:
     """Non FP8 GEMM."""
@@ -104,16 +101,12 @@ def gemm(
     empty_tensor = torch.Tensor()
     fp8_index = -1  # dummy index
-    input_dtype = TE_DType[dtype]
-    output_dtype = tex.DType.kFloat32 if fp32_output else input_dtype
-    bias_dtype = output_dtype if bias is None else TE_DType[bias.dtype]
     return_output = False
     if out is None:
         out = torch.empty(
             B.shape[1] if transb else B.shape[0],
             A.shape[0] if transa else A.shape[1],
-            dtype=torch.float32 if fp32_output else dtype,
+            dtype=dtype,
             device="cuda",
         )
         return_output = True
@@ -124,14 +117,21 @@ def gemm(
         gelu_input = empty_tensor
     if grad and use_bias:
-        grad_bias = torch.empty(
-            B.shape[1], dtype=torch.float32 if fp32_output else dtype, device="cuda"
-        )
+        grad_bias = torch.empty(B.shape[1], dtype=out.dtype, device="cuda")
     else:
         grad_bias = empty_tensor
     bias = bias if use_bias else empty_tensor
 
+    assert A.dtype == dtype and B.dtype == dtype, \
+        f'Expected dtype={dtype}, but found A.dtype={A.dtype} and B.dtype={B.dtype}'
+    input_dtype = TE_DType[dtype]
+    output_dtype = TE_DType[out.dtype]
+    if use_bias:
+        bias_dtype = TE_DType[grad_bias.dtype] if grad else TE_DType[bias.dtype]
+    else:
+        bias_dtype = output_dtype
+
     _ = torch.ops.tex_ts.te_gemm_ts(
         A,
         empty_tensor,
...
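
The non-FP8 `gemm()` now derives input, output, and bias dtypes from the tensors themselves rather than from `fp32_output`. A small self-contained model of the added bookkeeping, with `TE_DType` again mocked as a dict and toy CPU tensors:

```python
import torch

TE_DType = {torch.float32: "kFloat32", torch.float16: "kFloat16", torch.bfloat16: "kBFloat16"}

def resolve_gemm_dtypes(A, B, dtype, out, bias=None, grad_bias=None, use_bias=False, grad=False):
    # Inputs must match the requested compute dtype; the output dtype follows
    # whatever tensor the caller passed (or that was freshly allocated) as out.
    assert A.dtype == dtype and B.dtype == dtype, \
        f"Expected dtype={dtype}, but found A.dtype={A.dtype} and B.dtype={B.dtype}"
    input_dtype = TE_DType[dtype]
    output_dtype = TE_DType[out.dtype]
    if use_bias:
        bias_dtype = TE_DType[grad_bias.dtype] if grad else TE_DType[bias.dtype]
    else:
        bias_dtype = output_dtype
    return input_dtype, output_dtype, bias_dtype

A = torch.randn(8, 16, dtype=torch.bfloat16)
B = torch.randn(8, 16, dtype=torch.bfloat16)
out = torch.empty(16, 16, dtype=torch.float32)  # e.g. an FP32 wgrad accumulation buffer
print(resolve_gemm_dtypes(A, B, torch.bfloat16, out))  # ('kBFloat16', 'kFloat32', 'kFloat32')
```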
@@ -960,7 +960,6 @@ class _LayerNormLinear(torch.autograd.Function):
                 ctx.activation_dtype,
                 get_workspace(),
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
                 use_split_accumulator=_2X_ACC_WGRAD,
             )
@@ -980,7 +979,6 @@ class _LayerNormLinear(torch.autograd.Function):
                 layout="NT",
                 grad=True,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
         else:
@@ -994,7 +992,6 @@ class _LayerNormLinear(torch.autograd.Function):
                 grad=True,
                 use_bias=ctx.use_bias,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
@@ -1652,7 +1649,6 @@ class _Linear(torch.autograd.Function):
                 ctx.activation_dtype,
                 get_workspace(),
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
                 use_split_accumulator=_2X_ACC_WGRAD,
             )
@@ -1665,7 +1661,6 @@ class _Linear(torch.autograd.Function):
                 layout="NT",
                 grad=True,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
         else:
@@ -1679,7 +1674,6 @@ class _Linear(torch.autograd.Function):
                 grad=True,
                 use_bias=ctx.use_bias,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
@@ -2358,7 +2352,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 ctx.activation_dtype,
                 get_workspace(),
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc2_weight.main_grad
                 if ctx.fuse_wgrad_accumulation
                 else None,
@@ -2390,7 +2383,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 grad=True,
                 use_bias=ctx.use_bias,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc2_weight.main_grad
                 if ctx.fuse_wgrad_accumulation
                 else None,
@@ -2446,7 +2438,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 grad=True,
                 use_bias=ctx.use_bias,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc2_weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
@@ -2491,7 +2482,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 ctx.activation_dtype,
                 get_workspace(),
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc1_weight.main_grad
                 if ctx.fuse_wgrad_accumulation
                 else None,
@@ -2513,7 +2503,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 layout="NT",
                 grad=True,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc1_weight.main_grad
                 if ctx.fuse_wgrad_accumulation
                 else None,
@@ -2529,7 +2518,6 @@ class _LayerNormMLP(torch.autograd.Function):
                 grad=True,
                 use_bias=not ctx.bias_gelu_nvfusion,
                 accumulate=accumulate_wgrad_into_param_main_grad,
-                fp32_output=ctx.fuse_wgrad_accumulation,
                 out=fc1_weight.main_grad if ctx.fuse_wgrad_accumulation else None,
             )
...
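
Dropping `fp32_output=ctx.fuse_wgrad_accumulation` at these call sites works because the `main_grad` buffer passed as `out` already carries the FP32 dtype, which the GEMM wrappers now pick up automatically. A hedged illustration; the `main_grad` attribute is attached by hand here, the way Megatron-style training loops typically set it up:

```python
import torch

# Toy parameter with an FP32 main_grad accumulation buffer (shapes illustrative).
weight = torch.nn.Parameter(torch.randn(16, 32, dtype=torch.bfloat16))
weight.main_grad = torch.zeros(16, 32, dtype=torch.float32)

fuse_wgrad_accumulation = True
out = weight.main_grad if fuse_wgrad_accumulation else None
print(out.dtype)  # torch.float32 -- the wgrad GEMM accumulates in FP32 without a flag
```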