sync with miopen

4cd8f454 · Chao Liu · 6fc49f91 · 4cd8f454 · 4cd8f454 · 4cd8f454
Commit 4cd8f454 authored Jun 17, 2020 by Chao Liu
13 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,10 +56,12 @@ if(DEVICE_BACKEND STREQUAL "AMD")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
 endif()

 add_subdirectory(driver)
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp
@@ -36,8 +36,8 @@ template <index_t GridSize,
          index_t ThreadGemmDataPerRead_GemmN,
          typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
          typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
-          index_t GemmABlockCopySrcDataPerRead_GemmN,
-          index_t GemmABlockCopyDstDataPerWrite_GemmN,
+          index_t GemmABlockCopySrcDataPerRead_GemmM,
+          index_t GemmABlockCopyDstDataPerWrite_GemmM,
          typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
          typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
          index_t GemmBBlockCopySrcDataPerRead_GemmN,
@@ -82,13 +82,6 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
        constexpr auto wei_gemmk_gemmm_global_desc =
            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);

-        // output tensor
-        constexpr auto out_gemmk_gemmn_global_desc =
-            transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
-                                        make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
-                                        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-
        // input tensor
        constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
@@ -98,16 +91,15 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

+        constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
+        constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
+
        constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
            in_n_c_hip_wip_global_desc,
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
-                       Embed<Hi + InLeftPads::At(0) + InRightPads::At(0),
-                             Sequence<Y, Ho>,
-                             Sequence<ConvDilationH, ConvStrideH, 0>>{},
-                       Embed<Wi + InLeftPads::At(1) + InRightPads::At(1),
-                             Sequence<X, Wo>,
-                             Sequence<ConvDilationW, ConvStrideW, 0>>{}),
+                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
+                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

@@ -117,6 +109,13 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

+        // output tensor
+        constexpr auto out_gemmk_gemmn_global_desc =
+            transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
+                                        make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
+                                        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
        // GEMM
        // \todo there are more combinations of Y, ConvDilationH and ConvStrideH that don't need
        // atomic, find out all of them
@@ -152,8 +151,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
                                                     Sequence<0, 1>,
                                                     Sequence<0, 1>,
                                                     1,
-                                                     GemmABlockCopySrcDataPerRead_GemmN,
-                                                     GemmABlockCopyDstDataPerWrite_GemmN,
+                                                     GemmABlockCopySrcDataPerRead_GemmM,
+                                                     GemmABlockCopyDstDataPerWrite_GemmM,
                                                     GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
                                                     GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
                                                     Sequence<0, 1>,

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 #endif

        // weight tensor
-        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+        constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

        // input tensor
@@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-        constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
+        constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // output tensor
-        constexpr auto out_k_b_global_desc =
+        constexpr auto out_gemmk_gemmn_global_desc =
            transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
                                        make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
                                        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
@@ -129,9 +129,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                                                     BlockSize,
                                                     Float,
                                                     AccFloat,
-                                                     decltype(wei_e_k_global_desc),
-                                                     decltype(in_e_b_global_desc),
-                                                     decltype(out_k_b_global_desc),
+                                                     decltype(wei_gemmk_gemmm_global_desc),
+                                                     decltype(in_gemmm_gemmn_global_desc),
+                                                     decltype(out_gemmk_gemmn_global_desc),
                                                     InMemoryDataOperation::Set,
                                                     GemmMPerBlock,
                                                     GemmNPerBlock,

--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -31,7 +31,9 @@ template <index_t BlockSize,
          AddressSpace SrcAddressSpace          = AddressSpace::Generic,
          AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic,
          AddressSpace DstAddressSpace          = AddressSpace::Generic,
-          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::Set>
+          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::Set,
+          index_t SrcDataStride                 = 1,
+          index_t DstDataStride                 = 1>
 struct BlockwiseGenericTensorSliceCopy_v4
 {
    static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();
@@ -178,7 +180,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                                 1,
                                                                 SrcAddressSpace,
                                                                 ThreadBufferAddressSpace,
-                                                                 InMemoryDataOperation::Set>;
+                                                                 InMemoryDataOperation::Set,
+                                                                 SrcDataStride,
+                                                                 1>;

    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
                                                                  BlockDstDesc,
@@ -189,7 +193,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                                  DstDataPerWrite,
                                                                  ThreadBufferAddressSpace,
                                                                  DstAddressSpace,
-                                                                  DstInMemOp>;
+                                                                  DstInMemOp,
+                                                                  1,
+                                                                  DstDataStride>;

    static constexpr auto mThreadClusterDesc =
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -23,7 +23,9 @@ template <typename SrcDesc,
          index_t DstDataPerWrite,
          AddressSpace SrcAddressSpace     = AddressSpace::Generic,
          AddressSpace DstAddressSpace     = AddressSpace::Generic,
-          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set>
+          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
+          index_t SrcDataStride            = 1,
+          index_t DstDataStride            = 1>
 struct ThreadwiseGenericTensorSliceCopy_v4r2
 {
    static constexpr index_t nDim = SliceLengths::Size();

--- a/composable_kernel/include/utility/amd_buffer_addressing.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp
--- a/composable_kernel/include/utility/common_header.hpp
+++ b/composable_kernel/include/utility/common_header.hpp
@@ -16,15 +16,12 @@
 #include "functional3.hpp"
 #include "functional4.hpp"
 #include "in_memory_operation.hpp"
+#include "synchronization.hpp"

 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
 #endif

-#if CK_USE_AMD_BUFFER_ADDRESSING
-#include "amd_buffer_addressing.hpp"
-#endif
-
 #if CK_USE_AMD_XDLOPS
 #include "amd_xdlops.hpp"
 #endif

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -25,11 +25,7 @@
 #define CK_USE_AMD_BUFFER_ADDRESSING 1
 #endif

-#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
-#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
-#endif
-
-// only support gfx908
+// only gfx908 support native floating point atomic add
 #ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
 #endif
@@ -47,6 +43,11 @@
 #define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
 #endif

+// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
+#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
+#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
+#endif
+
 // experimental implementation
 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
 #define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
@@ -54,8 +55,24 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+
+#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
+#endif
+
+#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
+#endif
+
+// workaround: put all workaround here
+// workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
+#ifndef CK_WORKAROUND_SWDEV_229564
+#define CK_WORKAROUND_SWDEV_229564 1
+#endif
+// workaround for buffer load/store fp16/bfp16 intrinsic bug
+#ifndef CK_WORKAROUND_SWDEV_231101
+#define CK_WORKAROUND_SWDEV_231101 1
+#endif

 namespace ck {


--- a/composable_kernel/include/utility/float_type.amd.hpp.in
+++ b/composable_kernel/include/utility/float_type.amd.hpp.in
@@ -14,10 +14,12 @@ typedef float float32_t __attribute__((ext_vector_type(32)));
 typedef _Float16 half_t;
 typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
 typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
+typedef _Float16 half8_t __attribute__((ext_vector_type(8)));

 // bfloat16
 typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
 typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
+typedef ushort ushort8_t __attribute__((ext_vector_type(8)));

 template <class T, index_t N>
 struct vector_type
@@ -152,6 +154,25 @@ struct vector_type<half_t, 4>
    }
 };

+template <>
+struct vector_type<half_t, 8>
+{
+    using MemoryType = half8_t;
+
+    union DataType
+    {
+        MemoryType vector;
+        half_t scalar[8];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
+    {
+        static_assert(I < 8, "wrong");
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
+    }
+};
+
 template <>
 struct vector_type<ushort, 1>
 {
@@ -221,6 +242,25 @@ struct vector_type<ushort, 4>
    }
 };

+template <>
+struct vector_type<ushort, 8>
+{
+    using MemoryType = ushort8_t;
+
+    union DataType
+    {
+        MemoryType vector;
+        ushort scalar[8];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
+    {
+        static_assert(I < 8, "wrong");
+        *(reinterpret_cast<ushort*>(&v) + I) = s;
+    }
+};
+
 // data type conversion
 template <typename T>
 struct type_convert
@@ -251,6 +291,34 @@ struct inner_product_with_conversion
 {
    static constexpr auto convert = type_convert<T>();

+    __device__ T operator()(float4_t a, float4_t b) const
+    {
+        const float* p_a_float = reinterpret_cast<const float*>(&a);
+        const float* p_b_float = reinterpret_cast<const float*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 4; ++v)
+        {
+            acc += convert(p_a_float[v]) * convert(p_b_float[v]);
+        }
+
+        return acc;
+    }
+
+    __device__ T operator()(float2_t a, float2_t b) const
+    {
+        const float* p_a_float = reinterpret_cast<const float*>(&a);
+        const float* p_b_float = reinterpret_cast<const float*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 2; ++v)
+        {
+            acc += convert(p_a_float[v]) * convert(p_b_float[v]);
+        }
+
+        return acc;
+    }
+
    __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }

    __device__ T operator()(half2_t a, half2_t b) const
@@ -280,6 +348,19 @@ struct inner_product_with_conversion
        return acc;
    }

+    __device__ T operator()(half8_t a, half8_t b) const
+    {
+        const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
+        const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 8; ++v)
+        {
+            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
+        }
+        return acc;
+    }
+
    __device__ T operator()(ushort2_t a, ushort2_t b) const
    {
        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
@@ -306,6 +387,19 @@ struct inner_product_with_conversion
        }
        return acc;
    }
+
+    __device__ T operator()(ushort8_t a, ushort8_t b) const
+    {
+        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
+        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 8; ++v)
+        {
+            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
+        }
+        return acc;
+    }
 };

 } // namespace ck

--- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -2,91 +2,159 @@
 #define CK_IN_MEMORY_OPERATION_AMD_HPP

 #include "float_type.hpp"
+
+#if CK_USE_AMD_BUFFER_ADDRESSING
 #include "amd_buffer_addressing.hpp"
+#endif

 namespace ck {

-template <typename T,
-          index_t DataPerAccess,
-          AddressSpace SrcAddressSpace,
-          AddressSpace DstAddressSpace>
-__device__ void set_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+template <typename T>
+__device__ void atomic_add_impl(T* p_dst, T src)
+{
+    atomicAdd(p_dst, src);
+}
+
+// atomicAdd for float does not support vector type
+template <>
+__device__ void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
+{
+    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
+    const float* p_src_float = reinterpret_cast<const float*>(&src);
+
+    for(index_t i = 0; i < 2; ++i)
+    {
+        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
+    }
+}
+
+template <>
+__device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
+{
+    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
+    const float* p_src_float = reinterpret_cast<const float*>(&src);
+
+    for(index_t i = 0; i < 4; ++i)
+    {
+        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
+    }
+}
+
+template <typename T, index_t DataPerAccess>
+struct SetData
 {
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

+    // This version is only for compatibility, don't use this version if possible
+    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
+    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    {
+        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+    }
+
 #if CK_USE_AMD_BUFFER_ADDRESSING
-    // TODO: use static_if::ElseIf, instead of nested static_if
-    static_if<SrcAddressSpace == AddressSpace::Global &&
-              DstAddressSpace == AddressSpace::Vgpr>{}([&](auto) {
-        // buffer_load requires:
-        //   1) p_src must be in global memory space, d_dst must be vgpr
-        //   2) p_src to be a block-invariant pointer.
-        // It is user's responsibility to make sure that is true.
+    // buffer_load requires:
+    //   1) p_src must be in global memory space, d_dst must be vgpr
+    //   2) p_src to be a block-invariant pointer.
+    // It is user's responsibility to make sure that is true.
+    template <>
+    __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src,
+                                                                  index_t src_offset,
+                                                                  T* p_dst,
+                                                                  index_t dst_offset) const
+    {
        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            amd_intrinsic_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
-    }).Else([&](auto) {
-        static_if<SrcAddressSpace == AddressSpace::Vgpr &&
-                  DstAddressSpace == AddressSpace::Global>{}([&](auto) {
-            // buffer_store requires:
-            //   1) p_src must be in vgpr space, d_dst must be global memory
-            //   2) p_dst to be a block-invariant pointer.
-            // It is user's responsibility to make sure that is true.
-            amd_intrinsic_buffer_store<T, DataPerAccess>(
-                *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
-        }).Else([&](auto) {
-            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-        });
-    });
-#else
-    *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-        *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+            amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
+    }
+
+    // buffer_store requires:
+    //   1) p_src must be in vgpr space, d_dst must be global memory
+    //   2) p_dst to be a block-invariant pointer.
+    // It is user's responsibility to make sure that is true.
+    template <>
+    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
+                                                                  index_t src_offset,
+                                                                  T* p_dst,
+                                                                  index_t dst_offset) const
+    {
+        amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+    }
 #endif
-}
+};

-template <typename T,
-          index_t DataPerAccess,
-          AddressSpace SrcAddressSpace,
-          AddressSpace DstAddressSpace>
-__device__ void atomic_add_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+template <typename T, index_t DataPerAccess>
+struct AtomicAddData
 {
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

-    static_if<SrcAddressSpace == AddressSpace::Vgpr &&
-              DstAddressSpace == AddressSpace::Global>{}([&](auto) {
-#if CK_USE_AMD_BUFFER_ATOMIC_ADD
-        amd_intrinsic_buffer_atomic_add<T, DataPerAccess>(
-            *reinterpret_cast<const vector_t*>(&p_src[src_offset]), p_dst, dst_offset, 0);
-#else
-        atomicAdd(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                  *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+    // This version is only for compatibility, don't use this version if possible
+    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
+    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    {
+        atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+    }
+
+#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
+    // buffer_atomic_add requires:
+    //   1) p_src must be in vgpr space, d_dst must be global memory
+    //   2) p_dst to be a block-invariant pointer.
+    // It is user's responsibility to make sure that is true.
+    template <>
+    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src,
+                                                                  index_t src_offset,
+                                                                  T* p_dst,
+                                                                  index_t dst_offset) const
+    {
+        amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+    }
 #endif
-    }).Else([&](auto fwd) {
-        static_assert(fwd(false), "atomic_add doesn't support this memory space");
-    });
-}
+};

 template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
-          InMemoryDataOperation DstInMemOp>
+          InMemoryDataOperation DstInMemOp,
+          index_t SrcDataStride = 1,
+          index_t DstDataStride = 1>
 __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
 {
    static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                      DstInMemOp == InMemoryDataOperation::AtomicAdd,
                  "wrong! InMemoryDataOperation not supported!");

-    // TODO: use static_if::ElseIf
-    static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
-        set_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
-            p_src, src_offset, p_dst, dst_offset);
-    });
+    // keep it simple, don't use static_if here, otherwise compiler will do weird things
+    if(SrcDataStride == 1 && DstDataStride == 1)
+    {
+        // TODO: use static_if::ElseIf
+        static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
+            SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
+                p_src, src_offset, p_dst, dst_offset);
+        });
+
+        static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
+            AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
+                p_src, src_offset, p_dst, dst_offset);
+        });
+    }
+    else
+    {
+        for(index_t i = 0; i < DataPerAccess; i++)
+        {
+            // TODO: use static_if::ElseIf
+            static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
+                SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
+                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+            });

-    static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
-        atomic_add_data<T, DataPerAccess, SrcAddressSpace, DstAddressSpace>(
-            p_src, src_offset, p_dst, dst_offset);
-    });
+            static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
+                AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
+                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+            });
+        }
+    }
 }

 } // namespace ck

--- a/composable_kernel/include/utility/synchronization.amd.hpp.in
+++ b/composable_kernel/include/utility/synchronization.amd.hpp.in
+#ifndef CK_SYNCHRONIZATION_AMD_HPP
+#define CK_SYNCHRONIZATION_AMD_HPP
+
+#include "config.hpp"
+
+namespace ck {
+
+__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
+
+__device__ void block_sync_lds()
+{
+#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
+    asm volatile("\
+    s_waitcnt lgkmcnt(0) \n \
+    s_barrier \
+    " ::);
+#else
+    __llvm_amdgcn_s_barrier();
+#endif
+}
+
+__device__ void block_sync_lds_vmem() { __llvm_amdgcn_s_barrier(); }
+
+} // namespace ck
+#endif
--- a/driver/src/conv_bwd_data_driver.cpp
+++ b/driver/src/conv_bwd_data_driver.cpp
@@ -160,10 +160,10 @@ int main(int argc, char* argv[])
 #elif 0
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
+    constexpr index_t C  = 1024;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
+    constexpr index_t K  = 1024;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 7;

@@ -247,7 +247,7 @@ int main(int argc, char* argv[])

 #if 0
    device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
-#elif 0
+#elif 1
    device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
 #elif 0
    device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -19,7 +19,7 @@ int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 1
+#if 0
    // 1x1, 17x17
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
@@ -97,10 +97,10 @@ int main(int argc, char* argv[])
 #elif 0
    // 7x1, 17x17
    constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
-    constexpr index_t K  = 320;
+    constexpr index_t K  = 128;
    constexpr index_t Y  = 7;
    constexpr index_t X  = 1;

@@ -109,13 +109,13 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 1x7, 17x17
    constexpr index_t N  = 128;
-    constexpr index_t C  = 224;
+    constexpr index_t C  = 128;
    constexpr index_t HI = 17;
    constexpr index_t WI = 17;
-    constexpr index_t K  = 224;
+    constexpr index_t K  = 128;
    constexpr index_t Y  = 1;
    constexpr index_t X  = 7;

@@ -124,7 +124,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 3>;
    using RightPads = Sequence<0, 3>;
-#elif 1
+#elif 0
    // 3x3, 299x299 stride=2
    constexpr index_t N  = 128;
    constexpr index_t C  = 3;
@@ -565,7 +565,7 @@ int main(int argc, char* argv[])
 #endif
    }

-#if 0
+#if 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,