"...git@developer.sourcefind.cn:guobj/qwen_lmdeploy.git" did not exist on "d592fbea9f1fd3ed15b6d7836d217ecfb5711b5a"
Unverified commit 52c3fe05, authored by Chao Liu, committed by GitHub

Refactor for MIOpen integration (#4)

Refactor so that multi-index transformation and padding support can be brought into MIOpen
parent 9aaeacc8
...@@ -5,14 +5,12 @@ ...@@ -5,14 +5,12 @@
#include "utility.hpp" #include "utility.hpp"
#include "integral_constant.hpp" #include "integral_constant.hpp"
#include "number.hpp" #include "number.hpp"
#include "float_type.hpp"
#include "type.hpp" #include "type.hpp"
#include "tuple.hpp" #include "tuple.hpp"
#include "math.hpp" #include "math.hpp"
#include "vector_type.hpp"
#include "sequence.hpp" #include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp" #include "array.hpp"
#include "array_helper.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "functional2.hpp" #include "functional2.hpp"
#include "functional3.hpp" #include "functional3.hpp"
...@@ -22,8 +20,8 @@ ...@@ -22,8 +20,8 @@
#include "amd_inline_asm.hpp" #include "amd_inline_asm.hpp"
#endif #endif
#if CK_USE_AMD_INTRINSIC #if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_intrinsic.hpp" #include "amd_buffer_addressing.hpp"
#endif #endif
#endif #endif
...@@ -3,23 +3,58 @@ ...@@ -3,23 +3,58 @@
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "hip/hip_fp16.h" #include "hip/hip_fp16.h"
#include "bfloat16_dev.hpp"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0 #define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_AMD 1 #define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
// AMD inline asm
#ifndef CK_USE_AMD_INLINE_ASM
#define CK_USE_AMD_INLINE_ASM 1 #define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 #endif
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #ifndef CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
// AMD buffer addressing
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1
#endif
#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
#endif
// AMD XDLOPS
#ifndef CK_USE_AMD_XDLOPS
#define CK_USE_AMD_XDLOPS 1
#endif
#ifndef CK_USE_AMD_XDLOPS_INLINE_ASM
#define CK_USE_AMD_XDLOPS_INLINE_ASM 1
#endif
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
// workaround
#define CK_WORKAROUND_SWDEV_202749 1
namespace ck { namespace ck {
enum address_space_t enum AddressSpace
{ {
generic = 0, generic,
global = 3 global
}; };
#if CK_UNSIGNED_INDEX_TYPE #if CK_UNSIGNED_INDEX_TYPE
...@@ -28,24 +63,8 @@ using index_t = uint32_t; ...@@ -28,24 +63,8 @@ using index_t = uint32_t;
using index_t = int32_t; using index_t = int32_t;
#endif #endif
// For some reason, HIP compiler need this definition to generate optimal load and store // int32x4_t use by buffer_load and buffer_store llvm intrinsic
// instruction
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck } // namespace ck
#endif #endif
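Because the AMD inline-asm, buffer-addressing and XDLOPS switches above are now wrapped in #ifndef guards, an integrating build such as MIOpen can pre-define them instead of patching config.hpp. A minimal sketch of what that enables; the file name and the chosen values are illustrative, not part of this commit:

    // hypothetical_kernel.cpp -- illustration only
    // Either pass overrides on the compile line, e.g.
    //   hipcc -DCK_USE_AMD_XDLOPS=0 -DCK_USE_AMD_INLINE_ASM=0 hypothetical_kernel.cpp ...
    // or define them before the headers are pulled in:
    #define CK_USE_AMD_XDLOPS 0     // e.g. a target without XDLOPS
    #define CK_USE_AMD_INLINE_ASM 0 // fall back to the generic code paths
    #include "common_header.hpp"    // the #ifndef guards keep these values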
...@@ -6,22 +6,34 @@ ...@@ -6,22 +6,34 @@
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "helper_cuda.h" #include "helper_cuda.h"
// index type: unsigned or signed
#define CK_UNSIGNED_INDEX_TYPE 0 #define CK_UNSIGNED_INDEX_TYPE 0
// device backend
#define CK_DEVICE_BACKEND_NVIDIA 1 #define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
// disable AMD inline asm and intrinsic
#define CK_USE_AMD_INLINE_ASM 0 #define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0 #define CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_BUFFER_ADDRESSING 0
#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 0
#define CK_USE_AMD_XDLOPS 0
#define CK_USE_AMD_XDLOPS_INLINE_ASM 0
// experimental implementation
#define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 0
#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
namespace ck { namespace ck {
enum address_space_t enum AddressSpace
{ {
generic = 0, generic,
global = generic global = generic
}; };
#if CK_UNSIGNED_INDEX_TYPE #if CK_UNSIGNED_INDEX_TYPE
...@@ -30,24 +42,5 @@ using index_t = uint32_t; ...@@ -30,24 +42,5 @@ using index_t = uint32_t;
using index_t = int32_t; using index_t = int32_t;
#endif #endif
// For some reason, CUDA need this definition, otherwise
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct
// instruction,
using float2_t = float2;
using float4_t = float4;
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
};
} // namespace ck } // namespace ck
#endif #endif
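Both backends also rename address_space_t to AddressSpace and drop the hard-coded enumerator values (generic = 0, global = 3 on the AMD path). A schematic sketch of using the tag for compile-time dispatch, assuming only the definitions above; this is an illustration, not the library's actual copy code:

    namespace ck {

    template <AddressSpace AS>
    __device__ float load_scalar(const float* p, index_t offset)
    {
        // A real path would route AddressSpace::global through AMD buffer
        // addressing when CK_USE_AMD_BUFFER_ADDRESSING is enabled; here both
        // tags simply dereference the pointer.
        return p[offset];
    }

    } // namespace ck

A caller would then pick the flavour statically, e.g. load_scalar<ck::AddressSpace::global>(p_global, i).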
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
namespace ck {
// For some reason, the HIP compiler needs this definition to generate optimal ISA
// float
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
typedef float float32_t __attribute__((ext_vector_type(32)));
// float16
typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
// bfloat16
typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
template <class T, index_t N>
struct vector_type
{
typedef struct
{
T scalar[N];
} MemoryType;
};
template <>
struct vector_type<float, 1>
{
using MemoryType = float;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<float, 2>
{
using MemoryType = float2_t;
union DataType
{
MemoryType vector;
float scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(float s0, float s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<float, 4>
{
using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<float*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 1>
{
using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
};
template <>
struct vector_type<half, 2>
{
using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using MemoryType = half4_t;
union DataType
{
MemoryType vector;
half scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1, half s2, half s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
template <>
struct vector_type<ushort, 1>
{
using MemoryType = ushort;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
};
template <>
struct vector_type<ushort, 2>
{
using MemoryType = ushort2_t;
union DataType
{
MemoryType vector;
ushort scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<ushort, 4>
{
using MemoryType = ushort4_t;
union DataType
{
MemoryType vector;
ushort scalar[4];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
{
static_assert(I < 4, "wrong");
*(reinterpret_cast<ushort*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(ushort s0, ushort s1, ushort s2, ushort s3)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
data.scalar[2] = s2;
data.scalar[3] = s3;
return data.vector;
}
};
// data type conversion
template <typename T>
struct type_convert
{
template <typename X>
__device__ T operator()(X x) const
{
return static_cast<T>(x);
}
};
template <>
template <>
__device__ float type_convert<float>::operator()<ushort>(ushort x) const
{
return bfloat16_to_float(x);
}
template <>
template <>
__device__ ushort type_convert<ushort>::operator()<float>(float x) const
{
return float_to_bfloat16(x);
}
template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(half4_t a, half4_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
__device__ T operator()(ushort2_t a, ushort2_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
__device__ T operator()(ushort4_t a, ushort4_t b) const
{
const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
T acc = 0;
for(index_t v = 0; v < 4; ++v)
{
acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
}
return acc;
}
};
} // namespace ck
#endif
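A minimal usage sketch for the helpers above (illustration only, not part of this commit); it assumes a device translation unit that already pulls in the CK common headers:

    __device__ ck::ushort2_t make_bf16x2(float x, float y)
    {
        // type_convert<ushort> is specialised above to call float_to_bfloat16,
        // and vector_type<ushort, 2>::Pack assembles the two bf16 lanes.
        const auto cvt = ck::type_convert<ushort>{};
        return ck::vector_type<ushort, 2>::Pack(cvt(x), cvt(y));
    }

    __device__ float bf16_dot2(ck::ushort2_t a, ck::ushort2_t b)
    {
        // inner_product_with_conversion widens each bf16 lane back to float
        // via type_convert<float> (i.e. bfloat16_to_float) before multiplying.
        return ck::inner_product_with_conversion<float>{}(a, b);
    }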
#ifndef CK_VECTOR_TYPE_HPP #ifndef CK_FLOAT_TYPE_NVIDIA_HPP
#define CK_VECTOR_TYPE_HPP #define CK_FLOAT_TYPE_NVIDIA_HPP
#include "config.hpp" #include "number.hpp"
#include "integral_constant.hpp"
namespace ck { namespace ck {
// For some reason, CUDA needs this definition; otherwise the compiler won't
// generate optimal load and store instructions, and the kernel would produce
// wrong results, indicating the compiler fails to generate correct instructions.
// float
using float2_t = float2;
using float4_t = float4;
// float16
using half2_t = half2;
template <class T, index_t N> template <class T, index_t N>
struct vector_type struct vector_type
{ {
typedef struct
{
T scalar[N];
} MemoryType;
}; };
template <> template <>
...@@ -29,7 +43,7 @@ struct vector_type<float, 2> ...@@ -29,7 +43,7 @@ struct vector_type<float, 2>
{ {
using MemoryType = float2_t; using MemoryType = float2_t;
union Data union DataType
{ {
MemoryType vector; MemoryType vector;
float scalar[2]; float scalar[2];
...@@ -44,7 +58,7 @@ struct vector_type<float, 2> ...@@ -44,7 +58,7 @@ struct vector_type<float, 2>
__host__ __device__ static MemoryType Pack(float s0, float s1) __host__ __device__ static MemoryType Pack(float s0, float s1)
{ {
Data data; DataType data;
data.scalar[0] = s0; data.scalar[0] = s0;
data.scalar[1] = s1; data.scalar[1] = s1;
return data.vector; return data.vector;
...@@ -56,6 +70,8 @@ struct vector_type<float, 4> ...@@ -56,6 +70,8 @@ struct vector_type<float, 4>
{ {
using MemoryType = float4_t; using MemoryType = float4_t;
__host__ __device__ static constexpr index_t GetSize() { return 4; }
template <index_t I> template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>) __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
{ {
...@@ -65,23 +81,77 @@ struct vector_type<float, 4> ...@@ -65,23 +81,77 @@ struct vector_type<float, 4>
}; };
template <> template <>
struct vector_type<const float, 1> struct vector_type<half, 1>
{ {
using MemoryType = const float; using MemoryType = half;
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 1, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
}; };
template <> template <>
struct vector_type<const float, 2> struct vector_type<half, 2>
{ {
using MemoryType = const float2_t; using MemoryType = half2_t;
union DataType
{
MemoryType vector;
half scalar[2];
};
template <index_t I>
__host__ __device__ static void SetScalar(MemoryType& v, half s, Number<I>)
{
static_assert(I < 2, "wrong");
*(reinterpret_cast<half*>(&v) + I) = s;
}
__host__ __device__ static MemoryType Pack(half s0, half s1)
{
DataType data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
}; };
template <> // data type conversion
struct vector_type<const float, 4> template <typename T>
struct type_convert
{ {
using MemoryType = const float4_t; template <typename X>
__device__ T operator()(const X& x) const
{
return static_cast<T>(x);
}
}; };
} // namespace ck template <typename T>
struct inner_product_with_conversion
{
static constexpr auto convert = type_convert<T>();
__device__ T operator()(float a, float b) const { return convert(a) * convert(b); }
__device__ T operator()(half2_t a, half2_t b) const
{
const half* p_a_half = reinterpret_cast<const half*>(&a);
const half* p_b_half = reinterpret_cast<const half*>(&b);
T acc = 0;
for(index_t v = 0; v < 2; ++v)
{
acc += convert(p_a_half[v]) * convert(p_b_half[v]);
}
return acc;
}
};
} // namespace ck
#endif #endif
#ifndef CK_ARRAY_HELPER_HPP #ifndef CK_PRINT_ARRAY_HPP
#define CK_ARRAY_HELPER_HPP #define CK_PRINT_ARRAY_HPP
#include "array.hpp" #include "array.hpp"
......
#ifndef CK_SEQUENCE_HELPER_HPP #ifndef CK_PRINT_SEQUENCE_HPP
#define CK_SEQUENCE_HELPER_HPP #define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp" #include "sequence.hpp"
......
#ifndef CONV_COMMON_HPP #ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP #define CONV_COMMON_HPP
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d // this is ugly, only for 4d
template <class InDesc, class WeiDesc> template <class InDesc, class WeiDesc>
......
...@@ -3,14 +3,17 @@ ...@@ -3,14 +3,17 @@
#include "device.hpp" #include "device.hpp"
#include "tensor.hpp" #include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_kernel_wrapper.hpp"
#include "convolution_common.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
template <class T, template <typename T,
class InDesc, typename InDesc,
class WeiDesc, typename WeiDesc,
class OutDesc, typename OutDesc,
class ConvStrides, typename ConvStrides,
class ConvDilations> typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw, const Tensor<T>& in_nchw,
WeiDesc, WeiDesc,
...@@ -19,6 +22,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -19,6 +22,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
Tensor<T>& out_nkhw, Tensor<T>& out_nkhw,
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
...@@ -28,9 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -28,9 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{}; constexpr auto in_nchw_desc =
constexpr auto wei_kcyx_desc = WeiDesc{}; make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto out_nkhw_desc = OutDesc{}; constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1); constexpr index_t K = out_nkhw_desc.GetLength(I1);
...@@ -47,7 +55,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -47,7 +55,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1 #if 1
// BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data // BlockSize = 256, each thread hold 64 data
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16; constexpr index_t BPerBlock = 16;
...@@ -84,7 +92,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -84,7 +92,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0 #elif 0
// BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data // BlockSize = 64, each thread hold 64 data
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t BPerBlock = 8; constexpr index_t BPerBlock = 8;
...@@ -120,7 +128,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -120,7 +128,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 1 #elif 0
// BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
...@@ -174,11 +182,15 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc, ...@@ -174,11 +182,15 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
GridSize, GridSize,
BlockSize, BlockSize,
T, T,
T,
decltype(in_nchw_desc), decltype(in_nchw_desc),
decltype(wei_kcyx_desc), decltype(wei_kcyx_desc),
decltype(out_nkhw_desc), decltype(out_nkhw_desc),
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
LeftPads,
RightPads,
ConvolutionDirection::Forward,
BPerBlock, BPerBlock,
KPerBlock, KPerBlock,
EPerBlock, EPerBlock,
......
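With padding folded into the main v4r1 path, a caller now passes the left and right pad sizes as two extra compile-time sequences next to the strides and dilations. A hedged sketch of a call site (the Sequence values are illustrative; the descriptor and tensor names follow the driver further below):

    // e.g. 3x3 filter, stride 1, dilation 1, symmetric padding of 1 on H and W
    using ConvStrides   = ck::Sequence<1, 1>;
    using ConvDilations = ck::Sequence<1, 1>;
    using LeftPads      = ck::Sequence<1, 1>;
    using RightPads     = ck::Sequence<1, 1>;

    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,
                                                         wei_kcyx,
                                                         out_nkhw_desc,
                                                         out_nkhw_device,
                                                         ConvStrides{},
                                                         ConvDilations{},
                                                         LeftPads{},
                                                         RightPads{},
                                                         nrepeat);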
...@@ -3,27 +3,23 @@ ...@@ -3,27 +3,23 @@
#include "device.hpp" #include "device.hpp"
#include "tensor.hpp" #include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp"
template <typename T, template <class T,
typename InDesc, class InDesc,
typename WeiDesc, class WeiDesc,
typename OutDesc, class OutDesc,
typename ConvStrides, class ConvStrides,
typename ConvDilations, class ConvDilations>
typename LeftPads, void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(InDesc,
typename RightPads> const Tensor<T>& in_nchw,
void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, WeiDesc,
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx,
WeiDesc, OutDesc,
const Tensor<T>& wei_kcyx, Tensor<T>& out_nkhw,
OutDesc, ConvStrides,
Tensor<T>& out_nkhw, ConvDilations,
ConvStrides, ck::index_t nrepeat)
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
...@@ -32,12 +28,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, ...@@ -32,12 +28,9 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = constexpr auto in_nchw_desc = InDesc{};
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto wei_kcyx_desc = constexpr auto out_nkhw_desc = OutDesc{};
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1); constexpr index_t K = out_nkhw_desc.GetLength(I1);
...@@ -54,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, ...@@ -54,7 +47,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1 #if 1
// BlockSize = 256, each thread hold 64 data // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
constexpr index_t BPerBlock = 16; constexpr index_t BPerBlock = 16;
...@@ -91,7 +84,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, ...@@ -91,7 +84,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0 #elif 0
// BlockSize = 64, each thread hold 64 data // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
constexpr index_t BlockSize = 64; constexpr index_t BlockSize = 64;
constexpr index_t BPerBlock = 8; constexpr index_t BPerBlock = 8;
...@@ -127,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, ...@@ -127,7 +120,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#elif 0 #elif 1
// BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
constexpr index_t BlockSize = 256; constexpr index_t BlockSize = 256;
...@@ -177,48 +170,44 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, ...@@ -177,48 +170,44 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize); printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_conv = constexpr auto gridwise_conv =
#if 0 GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated<
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded GridSize,
#else BlockSize,
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer T,
#endif T,
<GridSize, decltype(in_nchw_desc),
BlockSize, decltype(wei_kcyx_desc),
T, decltype(out_nkhw_desc),
decltype(in_nchw_desc), ConvStrides,
decltype(wei_kcyx_desc), ConvDilations,
decltype(out_nkhw_desc), ConvolutionDirection::Forward,
ConvStrides, BPerBlock,
ConvDilations, KPerBlock,
LeftPads, EPerBlock,
RightPads, GemmNRepeat,
BPerBlock, GemmMPerThreadSubC,
KPerBlock, GemmNPerThreadSubC,
EPerBlock, GemmMLevel0Cluster,
GemmNRepeat, GemmNLevel0Cluster,
GemmMPerThreadSubC, GemmMLevel1Cluster,
GemmNPerThreadSubC, GemmNLevel1Cluster,
GemmMLevel0Cluster, GemmKPerThreadLoop,
GemmNLevel0Cluster, GemmDataPerReadA,
GemmMLevel1Cluster, GemmDataPerReadB,
GemmNLevel1Cluster, InBlockCopySubLengths_E_N1_B_N2,
GemmKPerThreadLoop, InBlockCopyClusterLengths_E_N1_B_N2,
GemmDataPerReadA, InBlockCopyThreadClusterArrangeOrder,
GemmDataPerReadB, InBlockCopySrcAccessOrder,
InBlockCopySubLengths_E_N1_B_N2, InBlockCopyDstAccessOrder,
InBlockCopyClusterLengths_E_N1_B_N2, InBlockCopySrcDataPerRead_B,
InBlockCopyThreadClusterArrangeOrder, InBlockCopyDstDataPerWrite_N2,
InBlockCopySrcAccessOrder, WeiBlockCopySubLengths_E_K,
InBlockCopyDstAccessOrder, WeiBlockCopyClusterLengths_E_K,
InBlockCopySrcDataPerRead_B, WeiBlockCopyThreadClusterArrangeOrder,
InBlockCopyDstDataPerWrite_N2, WeiBlockCopySrcAccessOrder,
WeiBlockCopySubLengths_E_K, WeiBlockCopyDstAccessOrder,
WeiBlockCopyClusterLengths_E_K, WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyThreadClusterArrangeOrder, WeiBlockCopyDstDataPerWrite_K>{};
WeiBlockCopySrcAccessOrder,
WeiBlockCopyDstAccessOrder,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
for(index_t i = 0; i < nrepeat; ++i) for(index_t i = 0; i < nrepeat; ++i)
{ {
......
...@@ -5,14 +5,14 @@ ...@@ -5,14 +5,14 @@
#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, template <class T,
class InDesc, class InDesc,
class WeiDesc, class WeiDesc,
class OutDesc, class OutDesc,
class ConvStrides, class ConvStrides,
class ConvDilations> class ConvDilations,
class LeftPads,
class RightPads>
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in_nchw, const Tensor<T>& in_nchw,
WeiDesc, WeiDesc,
...@@ -21,8 +21,12 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -21,8 +21,12 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
Tensor<T>& out_nkhw, Tensor<T>& out_nkhw,
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
...@@ -164,7 +168,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -164,7 +168,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr auto gridwise_conv = constexpr auto gridwise_conv =
#if 0 #if 0
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
#else #else
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
#endif #endif
...@@ -176,6 +180,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc, ...@@ -176,6 +180,8 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
decltype(out_nkhw_desc), decltype(out_nkhw_desc),
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
LeftPads,
RightPads,
BPerBlock, BPerBlock,
KPerBlock, KPerBlock,
EPerBlock, EPerBlock,
......
...@@ -3,30 +3,26 @@ ...@@ -3,30 +3,26 @@
#include "device.hpp" #include "device.hpp"
#include "tensor.hpp" #include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp" #include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp"
using namespace ck;
template <class T, template <class T,
class InDesc, class InDesc,
class WeiDesc, class WeiDesc,
class OutDesc, class OutDesc,
class ConvStrides, class ConvStrides,
class ConvDilations, class ConvDilations>
class LeftPads, void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(InDesc,
class RightPads> const Tensor<T>& in_nchw,
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, WeiDesc,
const Tensor<T>& in_nchw, const Tensor<T>& wei_kcyx,
WeiDesc, OutDesc,
const Tensor<T>& wei_kcyx, Tensor<T>& out_nkhw,
OutDesc, ConvStrides,
Tensor<T>& out_nkhw, ConvDilations,
ConvStrides, ck::index_t nrepeat)
ConvDilations,
LeftPads,
RightPads,
ck::index_t nrepeat)
{ {
using namespace ck;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
...@@ -168,9 +164,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, ...@@ -168,9 +164,9 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
constexpr auto gridwise_conv = constexpr auto gridwise_conv =
#if 0 #if 0
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
#else #else
GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated
#endif #endif
<GridSize, <GridSize,
BlockSize, BlockSize,
...@@ -180,8 +176,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc, ...@@ -180,8 +176,6 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
decltype(out_nkhw_desc), decltype(out_nkhw_desc),
ConvStrides, ConvStrides,
ConvDilations, ConvDilations,
LeftPads,
RightPads,
BPerBlock, BPerBlock,
KPerBlock, KPerBlock,
EPerBlock, EPerBlock,
......
#pragma once #pragma once
#include "tensor.hpp" #include "tensor.hpp"
#include "common_header.hpp" #include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
// this is ugly, only for 4d // this is ugly, only for 4d
template <class TConstTensorDesc> template <class TConstTensorDesc>
......
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
#include <cstdlib> #include <cstdlib>
#include <stdlib.h> #include <stdlib.h>
#include "config.hpp" #include "config.hpp"
#include "ConstantTensorDescriptor.hpp" #include "ConstantTensorDescriptor_deprecated.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "device.hpp" #include "device.hpp"
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp" #include "host_conv.hpp"
...@@ -14,12 +16,12 @@ ...@@ -14,12 +16,12 @@
//#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
//#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
//#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
struct GeneratorTensor_1 struct GeneratorTensor_1
{ {
...@@ -438,7 +440,17 @@ int main(int argc, char* argv[]) ...@@ -438,7 +440,17 @@ int main(int argc, char* argv[])
#elif 0 #elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw( device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
(in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 1 #elif 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
...@@ -447,19 +459,9 @@ int main(int argc, char* argv[]) ...@@ -447,19 +459,9 @@ int main(int argc, char* argv[])
out_nkhw_device, out_nkhw_device,
ConvStrides{}, ConvStrides{},
ConvDilations{}, ConvDilations{},
LeftPads{},
RightPads{},
nrepeat); nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0 #elif 0
device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
...@@ -481,6 +483,16 @@ int main(int argc, char* argv[]) ...@@ -481,6 +483,16 @@ int main(int argc, char* argv[])
ConvDilations{}, ConvDilations{},
nrepeat); nrepeat);
#elif 0 #elif 0
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc, device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw, in_nchw,
wei_kcyx_desc, wei_kcyx_desc,
...@@ -489,19 +501,9 @@ int main(int argc, char* argv[]) ...@@ -489,19 +501,9 @@ int main(int argc, char* argv[])
out_nkhw_device, out_nkhw_device,
ConvStrides{}, ConvStrides{},
ConvDilations{}, ConvDilations{},
LeftPads{},
RightPads{},
nrepeat); nrepeat);
#elif 1
device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif #endif
if(do_verification) if(do_verification)
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2019 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef BFLOAT16_DEVICE_HPP
#define BFLOAT16_DEVICE_HPP
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __HIP_PLATFORM_HCC__
#define EXECUTION_SPECIFIER __device__
#else
#define EXECUTION_SPECIFIER
#endif // MIOPEN_BACKEND_HIP
typedef union
{
uint u32;
ushort2 ushortx2;
// Composable kernels are written in the HIP language, which doesn't support
// ushort2.hi or ushort2.low.
#ifdef __HIP_PLATFORM_HCC__
ushort ushortvec[2];
#endif // MIOPEN_BACKEND_HIP
float f32;
} cvt_bf16_fp32_t;
EXECUTION_SPECIFIER float bfloat16_to_float(ushort src_val)
{
cvt_bf16_fp32_t target_val;
#ifdef __HIP_PLATFORM_HCC__
target_val.ushortx2 = make_ushort2(0, src_val);
#else
target_val.ushortx2 = (ushort2)(0, src_val);
#endif
return target_val.f32;
}
EXECUTION_SPECIFIER ushort float_to_bfloat16(float src_val)
{
cvt_bf16_fp32_t target_val;
target_val.f32 = src_val;
// BF16 round and NaN preservation code matches
// https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/library/include/rocblas_bfloat16.h
if((~target_val.u32 & 0x7f800000) == 0) // Inf or NaN
{
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bfloat16's mantissa bits are all 0.
if((target_val.u32 & 0xffff) != 0)
{
target_val.u32 |= 0x10000; // Preserve signaling NaN
}
}
else
{
#ifdef MIOPEN_USE_RNE_BFLOAT16
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
#ifdef __HIP_PLATFORM_HCC__
target_val.u32 += (0x7fff + (target_val.ushortvec[1] & 1));
#else
target_val.u32 +=
(0x7fff + (target_val.ushortx2.hi & 1)); // Round to nearest, round to even
#endif // MIOPEN_BACKEND_HIP
#endif // MIOPEN_USE_RNE_BFLOAT16
}
#ifdef __HIP_PLATFORM_HCC__
return target_val.ushortvec[1];
#else
return target_val.ushortx2.hi;
#endif // MIOPEN_BACKEND_HIP
}
#ifdef __cplusplus
}
#endif
#endif // BFLOAT16_DEVICE_HPP
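A small standalone check of the round-to-nearest-even arithmetic above (illustration only: it restates the RNE branch in plain host C++ rather than calling the function, since the non-HIP branch above is written against the OpenCL ushort2 type):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Host-side restatement of the MIOPEN_USE_RNE_BFLOAT16 path, for checking
    // the arithmetic; it is not the library function itself.
    static uint16_t rne_bf16(float f)
    {
        uint32_t u;
        std::memcpy(&u, &f, sizeof(u));
        if((~u & 0x7f800000) == 0)         // Inf or NaN: preserve signaling NaN
            u |= (u & 0xffff) ? 0x10000 : 0;
        else
            u += 0x7fff + ((u >> 16) & 1); // round to nearest, ties to even
        return static_cast<uint16_t>(u >> 16);
    }

    int main()
    {
        // 1.00390625f = 0x3F808000: a tie with an even bf16 mantissa -> 0x3f80 (1.0)
        // 1.01171875f = 0x3F818000: a tie with an odd bf16 mantissa  -> 0x3f82 (1.015625)
        std::printf("%04x %04x\n", (unsigned)rne_bf16(1.00390625f), (unsigned)rne_bf16(1.01171875f));
        return 0;
    }

And bfloat16_to_float simply re-widens by placing the 16 bits in the high half of a zero-filled float, so bfloat16_to_float(0x3f80) gives 1.0f.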
#!/bin/bash #!/bin/bash
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
export KMDUMPISA=1 export KMDUMPISA=1
export KMDUMPLLVM=1 export KMDUMPLLVM=1
#export KMOPTLLC="-mattr=+enable-ds128" export KMDUMPDIR=$PWD
export KMOPTLLC="-mattr=+enable-ds128 -amdgpu-enable-global-sgpr-addr"
make -j driver make -j driver
/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm #/opt/rocm/hcc/bin/llvm-objdump -mcpu=gfx906 -source -line-numbers driver/dump-gfx906.isabin > driver/dump-gfx906.isabin.asm