Unverified Commit 487871e2 authored by Tim Moon's avatar Tim Moon Committed by GitHub
Browse files

Revert "Fix BF16 ONNX export for successful ONNX Runtime Verification (#271)" (#275)

This reverts commit 914f3841

.
Signed-off-by: Tim Moon <tmoon@nvidia.com>
parent 914f3841
......@@ -173,7 +173,7 @@ class UnfusedDotProductAttention(torch.nn.Module):
output_size[0] * output_size[1],
output_size[2],
output_size[3],
dtype=torch.float32 if is_in_onnx_export_mode() else query_layer.dtype,
dtype=query_layer.dtype,
device=torch.cuda.current_device(),
)
......
......@@ -176,7 +176,6 @@ def onnx_te_gemm(
"""ONNX graph for te_gemm"""
# pylint: disable=unused-argument
is_fp16 = is_dtype_fp16(inputs)
is_bf16 = is_dtype_bf16(inputs)
if input_type == int(tex.DType.kFloat8E4M3):
inputs = dequantize(g, inputs, input_scale_inverse, input_fp8_tensor, out_type)
......@@ -201,8 +200,6 @@ def onnx_te_gemm(
else:
if is_fp16:
output = g.op("Cast", output, to_i=_C_onnx.TensorProtoDataType.FLOAT16)
elif is_bf16:
output = g.op("Cast", output, to_i=_C_onnx.TensorProtoDataType.BFLOAT16)
return output
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment