MIopen integration (#13)

* update for miopen integration: cosmetic refactor

MIopen integration (#13)
* update for miopen integration: cosmetic refactor
1a66e35b · Chao Liu · GitHub · 3406a114 · 1a66e35b · 1a66e35b
Unverified Commit 1a66e35b authored Feb 17, 2020 by Chao Liu Committed by GitHub Feb 17, 2020
9 changed files
--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -54,21 +54,23 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
+#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0

 namespace ck {

 enum AddressSpace
 {
-    generic,
-    global,
-    lds,
-    vgpr
+    Generic,
+    Global,
+    Lds,
+    Vgpr
 };

 enum InMemoryDataOperation
 {
-    none,
-    atomic_add
+    Set,
+    AtomicAdd
 };

 #if CK_UNSIGNED_INDEX_TYPE

--- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -10,13 +10,14 @@ template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace>
-__device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+__device__ void set_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
 {
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

 #if CK_USE_AMD_BUFFER_ADDRESSING
    // TODO: use static_if::ElseIf, instead of nested static_if
-    static_if<SrcAddressSpace == AddressSpace::global && DstAddressSpace == vgpr>{}([&](auto) {
+    static_if<SrcAddressSpace == AddressSpace::Global &&
+              DstAddressSpace == AddressSpace::Vgpr>{}([&](auto) {
        // buffer_load requires:
        //   1) p_src must be in global memory space, d_dst must be vgpr
        //   2) p_src to be a block-invariant pointer.
@@ -24,7 +25,8 @@ __device__ void copy_data(const T* p_src, index_t src_offset, T* p_dst, index_t
        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
            amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
    }).Else([&](auto) {
-        static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == global>{}([&](auto) {
+        static_if<SrcAddressSpace == AddressSpace::Vgpr &&
+                  DstAddressSpace == AddressSpace::Global>{}([&](auto) {
            // buffer_store requires:
            //   1) p_src must be in vgpr space, d_dst must be global memory
            //   2) p_dst to be a block-invariant pointer.
@@ -50,8 +52,8 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
 {
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto) {
+    static_if<SrcAddressSpace == AddressSpace::Vgpr &&
+              DstAddressSpace == AddressSpace::Global>{}([&](auto) {
 #if CK_USE_AMD_BUFFER_ATOMIC_ADD
        amd_intrinsic_buffer_atomic_add<T, DataPerAccess>(
            *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
@@ -59,8 +61,7 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
        atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
                  *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
 #endif
-        })
-        .Else([&](auto fwd) {
+    }).Else([&](auto fwd) {
        static_assert(fwd(false), "atomic_add doesn't support this memory space");
    });
 }
@@ -72,17 +73,17 @@ template <typename T,
          InMemoryDataOperation DstInMemOp>
 __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
 {
-    static_assert(DstInMemOp == InMemoryDataOperation::none ||
-                      DstInMemOp == InMemoryDataOperation::atomic_add,
+    static_assert(DstInMemOp == InMemoryDataOperation::Set ||
+                      DstInMemOp == InMemoryDataOperation::AtomicAdd,
                  "wrong! InMemoryDataOperation not supported!");

    // TODO: use static_if::ElseIf
-    static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
-        copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
+    static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
+        set_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });

-    static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
+    static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
        atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });

--- a/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in
@@ -23,12 +23,11 @@ __device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, in
 {
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

-    static_if<SrcAddressSpace == AddressSpace::vgpr && DstAddressSpace == AddressSpace::global>{}(
-        [&](auto) {
+    static_if<SrcAddressSpace == AddressSpace::Vgpr &&
+              DstAddressSpace == AddressSpace::Global>{}([&](auto) {
        atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
                  *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
-        })
-        .Else([&](auto fwd) {
+    }).Else([&](auto fwd) {
        static_assert(fwd(false), "atomic_add doesn't support this memory space");
    });
 }
@@ -40,17 +39,17 @@ template <typename T,
          InMemoryDataOperation DstInMemOp>
 __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
 {
-    static_assert(DstInMemOp == InMemoryDataOperation::none ||
-                      DstInMemOp == InMemoryDataOperation::atomic_add,
+    static_assert(DstInMemOp == InMemoryDataOperation::Set ||
+                      DstInMemOp == InMemoryDataOperation::AtomicAdd,
                  "wrong! InMemoryDataOperation not supported!");

    // TODO: use static_if::ElseIf
-    static_if<DstInMemOp == InMemoryDataOperation::none>{}([&](auto) {
+    static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
        copy_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });

-    static_if<DstInMemOp == InMemoryDataOperation::atomic_add>{}([&](auto) {
+    static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
        atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
            p_src, src_offset, p_dst, dst_offset);
    });

--- a/composable_kernel/include/utility/math.hpp
+++ b/composable_kernel/include/utility/math.hpp
@@ -107,27 +107,22 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
 template <typename T>
 __host__ __device__ constexpr T gcd(T x, T y)
 {
-    if(x == 0)
+    if(x == y || x == 0)
    {
        return y;
    }
-
-    if(y == 0)
-    {
-        return x;
-    }
-
-    if(x == y)
+    else if(y == 0)
    {
        return x;
    }
-
-    if(x > y)
+    else if(x > y)
    {
        return gcd(x - y, y);
    }
-
+    else
+    {
        return gcd(x, y - x);
+    }
 }

 template <index_t X, index_t Y>
@@ -150,10 +145,10 @@ __host__ __device__ constexpr T lcm(T x, T y)
    return (x * y) / gcd(x, y);
 }

-template <typename X, typename Y, typename... Zs>
-__host__ __device__ constexpr auto lcm(X x, Y y, Zs... zs)
+template <typename X, typename... Ys>
+__host__ __device__ constexpr auto lcm(X x, Ys... ys)
 {
-    return lcm(x, lcm(y, zs...));
+    return lcm(x, lcm(ys...));
 }

 template <class T>

--- a/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp
@@ -49,20 +49,20 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
    // BlockSize = 256, each thread hold 64 data
    constexpr index_t BlockSize = 256;

    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread         = 4;
+    constexpr index_t GemmNPerThread         = 4;
+    constexpr index_t GemmKPerThread         = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -79,6 +79,36 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
+#elif 1
+    // BlockSize = 256, each thread hold 64 data
+    constexpr index_t BlockSize = 256;
+
+    constexpr index_t GemmMPerBlock              = 128;
+    constexpr index_t GemmNPerBlock              = 128;
+    constexpr index_t GemmKPerBlock              = 16;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
+    constexpr index_t GemmMLevel0Cluster         = 4;
+    constexpr index_t GemmNLevel0Cluster         = 4;
+    constexpr index_t GemmMLevel1Cluster         = 4;
+    constexpr index_t GemmNLevel1Cluster         = 4;
+    constexpr index_t GemmThreadGemmDataPerReadM = 4;
+    constexpr index_t GemmThreadGemmDataPerReadN = 4;
+
+    using GemmABlockCopyThreadSliceLengths_GemmK_GemmM   = Sequence<2, 4>;
+    using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<8, 32>;
+
+    constexpr index_t GemmABlockCopySrcDataPerRead_GemmM  = 4;
+    constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 4;
+
+    using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN   = Sequence<2, 4>;
+    using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<8, 32>;
+
+    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN  = 4;
+    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
+
+    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
 #endif

    constexpr index_t GemmM = C * Y * X;
@@ -104,13 +134,13 @@ void device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw(InDesc i
        GemmMPerBlock,
        GemmNPerBlock,
        GemmKPerBlock,
-        GemmMPerThreadSubC,
-        GemmNPerThreadSubC,
+        GemmMPerThread,
+        GemmNPerThread,
+        GemmKPerThread,
        GemmMLevel0Cluster,
        GemmNLevel0Cluster,
        GemmMLevel1Cluster,
        GemmNLevel1Cluster,
-        GemmKPerThreadLoop,
        GemmThreadGemmDataPerReadM,
        GemmThreadGemmDataPerReadN,
        GemmABlockCopyThreadSliceLengths_GemmK_GemmM,

--- a/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
@@ -66,13 +66,13 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -96,13 +96,13 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -127,13 +127,13 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -152,33 +152,33 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
 #endif

-    constexpr index_t gcd_stride_dilation_h = math::gcd(ConvStrideH, ConvDilationH);
-    constexpr index_t gcd_stride_dilation_w = math::gcd(ConvStrideW, ConvDilationW);
+    constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

-    constexpr index_t Ytilda = ConvStrideH / gcd_stride_dilation_h;
-    constexpr index_t Xtilda = ConvStrideW / gcd_stride_dilation_w;
+    constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+    constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;

-    constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
-    constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
+    constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+    constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-    constexpr index_t Htilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
-    constexpr index_t Wtilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);
+    constexpr index_t HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
+    constexpr index_t WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);

-    constexpr index_t HtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[0] - ConvDilationH * (Ytilda - 1)), ConvStrides{}[0]);
-    constexpr index_t WtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[1] - ConvDilationW * (Xtilda - 1)), ConvStrides{}[1]);
+    constexpr index_t HTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+    constexpr index_t WTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);

-    constexpr index_t HtildaRight = math::min(
-        Htilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
-    constexpr index_t WtildaRight = math::min(
-        Wtilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+    constexpr index_t HTildaRight = math::min(
+        HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+    constexpr index_t WTildaRight = math::min(
+        WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);

-    constexpr index_t HtildaTrim = HtildaRight - HtildaLeft;
-    constexpr index_t WtildaTrim = WtildaRight - WtildaLeft;
+    constexpr index_t HTildaSlice = HTildaRight - HTildaLeft;
+    constexpr index_t WTildaSlice = WTildaRight - WTildaLeft;

-    constexpr index_t GemmM = C * Ytilda * Xtilda;
-    constexpr index_t GemmN = N * HtildaTrim * WtildaTrim;
+    constexpr index_t GemmM = C * YTilda * XTilda;
+    constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;

    constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                 math::integer_divide_ceil(GemmN, GemmNPerBlock);
@@ -200,13 +200,13 @@ void device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw(InDesc i
        GemmMPerBlock,
        GemmNPerBlock,
        GemmKPerBlock,
-        GemmMPerThreadSubC,
-        GemmNPerThreadSubC,
+        GemmMPerThread,
+        GemmNPerThread,
+        GemmKPerThread,
        GemmMLevel0Cluster,
        GemmNLevel0Cluster,
        GemmMLevel1Cluster,
        GemmNLevel1Cluster,
-        GemmKPerThreadLoop,
        GemmThreadGemmDataPerReadM,
        GemmThreadGemmDataPerReadN,
        GemmABlockCopyThreadSliceLengths_GemmK_GemmM,

--- a/driver/include/device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp
@@ -66,13 +66,13 @@ void device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -91,33 +91,33 @@ void device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
 #endif

-    constexpr index_t gcd_stride_dilation_h = math::gcd(ConvStrideH, ConvDilationH);
-    constexpr index_t gcd_stride_dilation_w = math::gcd(ConvStrideW, ConvDilationW);
+    constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

-    constexpr index_t Ytilda = ConvStrideH / gcd_stride_dilation_h;
-    constexpr index_t Xtilda = ConvStrideW / gcd_stride_dilation_w;
+    constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+    constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;

-    constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
-    constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
+    constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+    constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-    constexpr index_t Htilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
-    constexpr index_t Wtilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);
+    constexpr index_t HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
+    constexpr index_t WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);

-    constexpr index_t HtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[0] - ConvDilationH * (Ytilda - 1)), ConvStrides{}[0]);
-    constexpr index_t WtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[1] - ConvDilationW * (Xtilda - 1)), ConvStrides{}[1]);
+    constexpr index_t HTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+    constexpr index_t WTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);

-    constexpr index_t HtildaRight = math::min(
-        Htilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
-    constexpr index_t WtildaRight = math::min(
-        Wtilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+    constexpr index_t HTildaRight = math::min(
+        HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+    constexpr index_t WTildaRight = math::min(
+        WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);

-    constexpr index_t HtildaTrim = HtildaRight - HtildaLeft;
-    constexpr index_t WtildaTrim = WtildaRight - WtildaLeft;
+    constexpr index_t HTildaSlice = HTildaRight - HTildaLeft;
+    constexpr index_t WTildaSlice = WTildaRight - WTildaLeft;

    constexpr index_t GemmM = C;
-    constexpr index_t GemmN = N * HtildaTrim * WtildaTrim;
+    constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;

    constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                 math::integer_divide_ceil(GemmN, GemmNPerBlock);
@@ -139,13 +139,13 @@ void device_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw(InDesc i
        GemmMPerBlock,
        GemmNPerBlock,
        GemmKPerBlock,
-        GemmMPerThreadSubC,
-        GemmNPerThreadSubC,
+        GemmMPerThread,
+        GemmNPerThread,
+        GemmKPerThread,
        GemmMLevel0Cluster,
        GemmNLevel0Cluster,
        GemmMLevel1Cluster,
        GemmNLevel1Cluster,
-        GemmKPerThreadLoop,
        GemmThreadGemmDataPerReadM,
        GemmThreadGemmDataPerReadN,
        GemmABlockCopyThreadSliceLengths_GemmK_GemmM,

--- a/driver/include/device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -69,13 +69,13 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 8;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -99,13 +99,13 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmMPerBlock              = 128;
    constexpr index_t GemmNPerBlock              = 128;
    constexpr index_t GemmKPerBlock              = 16;
-    constexpr index_t GemmMPerThreadSubC         = 4;
-    constexpr index_t GemmNPerThreadSubC         = 4;
+    constexpr index_t GemmMPerThread             = 4;
+    constexpr index_t GemmNPerThread             = 4;
+    constexpr index_t GemmKPerThread             = 1;
    constexpr index_t GemmMLevel0Cluster         = 4;
    constexpr index_t GemmNLevel0Cluster         = 4;
    constexpr index_t GemmMLevel1Cluster         = 4;
    constexpr index_t GemmNLevel1Cluster         = 4;
-    constexpr index_t GemmKPerThreadLoop         = 1;
    constexpr index_t GemmThreadGemmDataPerReadM = 4;
    constexpr index_t GemmThreadGemmDataPerReadN = 4;

@@ -124,33 +124,33 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 1;
 #endif

-    constexpr index_t gcd_stride_dilation_h = math::gcd(ConvStrideH, ConvDilationH);
-    constexpr index_t gcd_stride_dilation_w = math::gcd(ConvStrideW, ConvDilationW);
+    constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+    constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

-    constexpr index_t Ytilda = ConvStrideH / gcd_stride_dilation_h;
-    constexpr index_t Xtilda = ConvStrideW / gcd_stride_dilation_w;
+    constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+    constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;

-    constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
-    constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
+    constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+    constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-    constexpr index_t Htilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
-    constexpr index_t Wtilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);
+    constexpr index_t HTilda = Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
+    constexpr index_t WTilda = Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);

-    constexpr index_t HtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[0] - ConvDilationH * (Ytilda - 1)), ConvStrides{}[0]);
-    constexpr index_t WtildaLeft = math::integer_divide_floor(
-        math::max(0, InLeftPads{}[1] - ConvDilationW * (Xtilda - 1)), ConvStrides{}[1]);
+    constexpr index_t HTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+    constexpr index_t WTildaLeft = math::integer_divide_floor(
+        math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);

-    constexpr index_t HtildaRight = math::min(
-        Htilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
-    constexpr index_t WtildaRight = math::min(
-        Wtilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+    constexpr index_t HTildaRight = math::min(
+        HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+    constexpr index_t WTildaRight = math::min(
+        WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);

-    constexpr index_t HtildaTrim = HtildaRight - HtildaLeft;
-    constexpr index_t WtildaTrim = WtildaRight - WtildaLeft;
+    constexpr index_t HTildaSlice = HTildaRight - HTildaLeft;
+    constexpr index_t WTildaSlice = WTildaRight - WTildaLeft;

    constexpr index_t GemmM = C;
-    constexpr index_t GemmN = N * HtildaTrim * WtildaTrim;
+    constexpr index_t GemmN = N * HTildaSlice * WTildaSlice;

    constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                 math::integer_divide_ceil(GemmN, GemmNPerBlock);
@@ -159,7 +159,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i

    for(index_t i = 0; i < nrepeat; ++i)
    {
-        using GridwiseConv = GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw<
+        using GridwiseConvBwdData = GridwiseConvolutionBackwardDataImplicitGemm_v4r1_nchw_kcyx_nkhw<
            GridSize,
            BlockSize,
            T,
@@ -174,13 +174,13 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
-            GemmMPerThreadSubC,
-            GemmNPerThreadSubC,
+            GemmMPerThread,
+            GemmNPerThread,
+            GemmKPerThread,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
-            GemmKPerThreadLoop,
            GemmThreadGemmDataPerReadM,
            GemmThreadGemmDataPerReadN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
@@ -196,11 +196,18 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
        KernelTimer timer;
        timer.Start();

-        static_for<0, GridwiseConv::GetNumberOfGemm(), 1>{}([&](auto gemm_id_) {
+        static_for<0, GridwiseConvBwdData::GetNumberOfGemm(), 1>{}([&](auto gemm_id_) {
            constexpr index_t gemm_id = decltype(gemm_id_){};

-            launch_kernel(run_gridwise_convolution_backward_data_v4r1<GridwiseConv,
-                                                                      gemm_id,
+            constexpr auto gemm_sizes        = GridwiseConvBwdData::GetGemmSize(gemm_id);
+            constexpr index_t gemm_k         = gemm_sizes.At(2);
+            constexpr bool is_gemm_not_empty = gemm_k > 0;
+
+            // only compile and run if GEMM is no empty
+            static_if<is_gemm_not_empty>{}([&](auto fwd) {
+                launch_kernel(
+                    run_gridwise_convolution_backward_data_v4r1<GridwiseConvBwdData,
+                                                                fwd(gemm_id),
                                                                T* const __restrict__,
                                                                const T* const __restrict__,
                                                                const T* const __restrict__>,
@@ -212,6 +219,7 @@ void device_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc i
                    static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                    static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
            });
+        });

        timer.End();
        float time = timer.GetElapsedTime();

--- a/driver/src/conv_bwd_data_driver.cpp
+++ b/driver/src/conv_bwd_data_driver.cpp
@@ -23,17 +23,16 @@ int main(int argc, char* argv[])
 {
    using namespace launcher;

-#if 0
-    // 3x3 filter, 2x2 stride, 35x35 input
-    constexpr index_t N  = 128;
-    constexpr index_t C  = 1024;
-    constexpr index_t HI = 35;
-    constexpr index_t WI = 35;
-    constexpr index_t K  = 1024;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+#if 1
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 256;
+    constexpr index_t HI = 56;
+    constexpr index_t WI = 56;
+    constexpr index_t K  = 256;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;

-    using ConvStrides   = Sequence<2, 2>;
+    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<0, 0>;
@@ -158,7 +157,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<2, 2>;
    using RightPads = Sequence<2, 2>;
-#elif 1
+#elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
@@ -188,7 +187,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;