add vector load and access order to threadwise copy v1r3

68ea43b1 · Chao Liu · 5fdccfce · 68ea43b1 · 68ea43b1 · 68ea43b1
Commit 68ea43b1 authored Jan 27, 2021 by Chao Liu
9 changed files
--- a/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -492,7 +492,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
            0,
            GemmABlockTransferSrcScalarPerVector_GemmK,
            GemmABlockTransferDstScalarPerVector_GemmM,
-            true, // move back src coordinate after threadwise copy
+            false, // don't move back src coordinate after threadwise copy
            GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
            GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
            Sequence<0, 1>,

--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -502,11 +502,11 @@ struct DynamicMerge
              typename LowIdx,
              typename UpIdx,
              index_t Hack>
-    __host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
+    __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
-                                                const UpIdxDiff& idx_diff_up,
+                                                 const UpIdxDiff& idx_diff_up,
-                                                LowIdx& idx_low,
+                                                 LowIdx& idx_low,
-                                                const UpIdx& /* idx_up_new */,
+                                                 const UpIdx& /* idx_up_new */,
-                                                Number<Hack>) const
+                                                 Number<Hack>) const
    {
        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
@@ -640,6 +640,148 @@ struct DynamicMerge
        }
    }
+    template <typename LowIdxDiff,
+              typename UpIdxDiff,
+              typename LowIdx,
+              typename UpIdx,
+              index_t Hack>
+    __host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
+                                                 const UpIdxDiff& idx_diff_up,
+                                                 LowIdx& idx_low,
+                                                 const UpIdx& /* idx_up_new */,
+                                                 Number<Hack>) const
+    {
+        static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
+                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
+                      "wrong! inconsistent # of dimension");
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
+        // However,
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
+        //   can be calculated at compile-time.
+        //   2) If idx_diff_up is not known at compile-time, but its value
+        //   doesn't change during the whole kernel execution, then
+        //   idx_diff_low_const also
+        //   doesn't change during the whole kernel execution. Compiler generated
+        //   ISA should
+        //   only caclculate idx_diff_low_const once and save it durinng the whole
+        //   kernel execution
+        // If neither 1) nor 2) is satisfied, then the calculation will also be
+        // computed at
+        //   run-time each time this function is called, and can be very expensive.
+        LowerIndex idx_diff_low_const;
+        LowerIndex idx_low_length_minus_idx_diff_low_const;
+        LowerIndex idx_low_length_plus_idx_diff_low_const;
+#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
+            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
+        });
+#else
+        // Hack: this force result into SGPR. Need to make sure the result is thread invariant
+        index_t tmp = idx_diff_up[Number<0>{}];
+        static_for<0, NDimLow - 1, 1>{}([&](auto i) {
+            idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
+            tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
+        });
+        idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
+        static_for<0, NDimLow, 1>{}([&](auto i) {
+            idx_low_length_minus_idx_diff_low_const(i) =
+                __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
+            idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
+        });
+#endif
+        if constexpr(Hack == 1)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+        else if constexpr(Hack == 2)
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t borrow = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t negative_idx_low_tmp = borrow - idx_low[i];
+                bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) -= borrow;
+                borrow = do_borrow ? 1 : 0;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
+            idx_low += idx_diff_low;
+        }
+        else
+        {
+            // do carry check on each low dimension in reversed order
+            // do not need to check the first dimension
+            index_t carry = 0;
+            static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
+                index_t idx_low_tmp = idx_low[i] + carry;
+                bool do_carry  = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
+                bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
+                idx_diff_low(i) =
+                    do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
+                idx_diff_low(i) += carry;
+                carry = do_carry ? 1 : 0;
+                carry = do_borrow ? -1 : carry;
+            });
+            idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
+            idx_low += idx_diff_low;
+        }
+    }
    template <typename LowIdxDiff,
              typename UpIdxDiff,
              typename LowIdx,
@@ -705,11 +847,15 @@ struct DynamicMerge
                do_carry = idx_low_tmp >= low_lengths_[i];
+#if 0
                // TODO: use exec-mask inline asm
                if(do_carry)
                {
                    idx_diff_low(i) -= low_lengths_[i];
                }
+#else
+                idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
+#endif
                idx_low(i) += idx_diff_low[i];
            });
@@ -733,11 +879,15 @@ struct DynamicMerge
                do_borrow = idx_low_tmp < 0;
+#if 0
                // TODO: use exec-mask inline asm
                if(do_borrow)
                {
                    idx_diff_low(i) += low_lengths_[i];
                }
+#else
+                idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
+#endif
                idx_low(i) += idx_diff_low[i];
            });
@@ -765,8 +915,10 @@ struct DynamicMerge
                                              const UpIdx& idx_up_new,
                                              Number<Hack>) const
    {
-#if 1
+#if 0
-        UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+        UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
+#elif 0
+        UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
 #else
        UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
 #endif

--- a/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_dynamic_gemm.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "dynamic_tensor_descriptor.hpp"
 #include "dynamic_tensor_descriptor_helper.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "blockwise_dynamic_tensor_slice_transfer.hpp"
 #include "threadwise_dynamic_tensor_slice_transfer.hpp"
 #include "ConstantMatrixDescriptor.hpp"
@@ -364,9 +365,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            // define input tensor descriptor for threadwise copy
            //     thread input tensor, src of threadwise copy
+#if 0 // debug
            constexpr auto c_m0_m1_n0_n1_thread_desc =
                make_dynamic_naive_tensor_descriptor_packed<4>(
                    make_multi_index(MRepeat, MPerThread, NRepeat, NPerThread));
+#else
+            constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
+                Sequence<MRepeat, MPerThread, NRepeat, NPerThread>{});
+#endif
            // calculate origin of thread input tensor on global memory
            //     blockwise GEMM c matrix starting index
@@ -379,6 +385,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
            const index_t n_thread_data_on_global =
                n_block_data_on_global + c_thread_mtx_on_block.col;
+#if 0
            ThreadwiseDynamicTensorSliceTransfer_v1r2<
                AccFloat,
                Float,
@@ -404,6 +411,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
                                       n_thread_data_on_global % N1))
                .Run_hack(
                    c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
+#else
+            ThreadwiseDynamicTensorSliceTransfer_v1r3<
+                AccFloat,
+                Float,
+                decltype(c_m0_m1_n0_n1_thread_desc),
+                decltype(c_m0_m1_n0_n1_global_desc),
+                Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
+                CThreadTransferSrcDstAccessOrder,
+                CThreadTransferSrcDstVectorDim,
+                CThreadTransferDstScalarPerVector,
+                AddressSpace::Vgpr,
+                AddressSpace::Global,
+                CGlobalMemoryDataOperation,
+                1,
+                true,
+                true>(c_m0_m1_n0_n1_global_desc,
+                      make_multi_index(m_thread_data_on_global / M1,
+                                       m_thread_data_on_global % M1,
+                                       n_thread_data_on_global / N1,
+                                       n_thread_data_on_global % N1))
+                .Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
+#endif
        }
    }

--- a/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
--- a/composable_kernel/include/utility/amd_buffer_addressing.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -97,8 +97,8 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
 #endif
 // buffer_load requires:
-//   1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
+//   1) p_src_wave must be in global memory space
-//   2) p_src_thread to be a wavewise pointer.
+//   2) p_src_wave to be a wavewise pointer.
 // It is user's responsibility to make sure that is true.
 template <typename T, index_t VectorSize>
 __device__ typename vector_type<T, VectorSize>::MemoryType
@@ -118,6 +118,18 @@ __device__ void amd_buffer_store(const T* p_src_thread,
                                 bool dst_thread_data_valid,
                                 index_t dst_data_range);
+// buffer_store requires:
+//   1) p_dst_wave must be global memory
+//   2) p_dst_wave to be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t VectorSize>
+__device__ void
+amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
+                    T* p_dst_wave,
+                    const index_t dst_thread_data_offset,
+                    const bool dst_thread_data_valid,
+                    const index_t dst_data_range);
 // buffer_atomic requires:
 //   1) p_src_thread must be in vgpr space, p_dst_thread must be global memory
 //   2) p_dst_thread to be a wavewise pointer.
@@ -926,6 +938,126 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
 #endif
 }
+template <>
+__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
+                                              float* p_dst_wave,
+                                              const index_t dst_thread_data_offset,
+                                              const bool dst_thread_data_valid,
+                                              const index_t dst_data_range)
+{
+    BufferResourceConstant<float> dst_wave_buffer_resource;
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+    __llvm_amdgcn_buffer_store_f32(src_thread_data,
+                                   dst_wave_buffer_resource.data,
+                                   0,
+                                   dst_addr_shift + dst_thread_addr_offset,
+                                   false,
+                                   false);
+#else
+    if(dst_thread_data_valid)
+    {
+        __llvm_amdgcn_buffer_store_f32(src_thread_data,
+                                       dst_wave_buffer_resource.data,
+                                       0,
+                                       dst_thread_addr_offset,
+                                       false,
+                                       false);
+    }
+#endif
+}
+template <>
+__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
+                                              float* p_dst_wave,
+                                              const index_t dst_thread_data_offset,
+                                              const bool dst_thread_data_valid,
+                                              const index_t dst_data_range)
+{
+    BufferResourceConstant<float> dst_wave_buffer_resource;
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+    __llvm_amdgcn_buffer_store_f32x2(src_thread_data,
+                                     dst_wave_buffer_resource.data,
+                                     0,
+                                     dst_addr_shift + dst_thread_addr_offset,
+                                     false,
+                                     false);
+#else
+    if(dst_thread_data_valid)
+    {
+        __llvm_amdgcn_buffer_store_f32x2(src_thread_data,
+                                         dst_wave_buffer_resource.data,
+                                         0,
+                                         dst_thread_addr_offset,
+                                         false,
+                                         false);
+    }
+#endif
+}
+template <>
+__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
+                                              float* p_dst_wave,
+                                              const index_t dst_thread_data_offset,
+                                              const bool dst_thread_data_valid,
+                                              const index_t dst_data_range)
+{
+    BufferResourceConstant<float> dst_wave_buffer_resource;
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+    __llvm_amdgcn_buffer_store_f32x4(src_thread_data,
+                                     dst_wave_buffer_resource.data,
+                                     0,
+                                     dst_addr_shift + dst_thread_addr_offset,
+                                     false,
+                                     false);
+#else
+    if(dst_thread_data_valid)
+    {
+        __llvm_amdgcn_buffer_store_f32x4(src_thread_data,
+                                         dst_wave_buffer_resource.data,
+                                         0,
+                                         dst_thread_addr_offset,
+                                         false,
+                                         false);
+    }
+#endif
+}
 #if CK_USE_AMD_BUFFER_ATOMIC_FADD
 template <>
 __device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,

--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -91,7 +91,7 @@
 #endif
 // workaround: put all workaround here
-// workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
+// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
 #ifndef CK_WORKAROUND_SWDEV_229564
 #define CK_WORKAROUND_SWDEV_229564 1
 #endif
@@ -123,6 +123,8 @@ using index_t = uint32_t;
 using index_t = int32_t;
 #endif
+typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
 // int32x4_t use by buffer_load and buffer_store llvm intrinsic
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

--- a/composable_kernel/include/utility/functional2.hpp
+++ b/composable_kernel/include/utility/functional2.hpp
@@ -32,7 +32,8 @@ struct static_for
        static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
                      "Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
        static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
-                      "wrongs! should have NBegin <= NEnd");
+                      "wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && "
+                      "NBegin >= NEnd)");
    }
    template <class F>

--- a/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
    constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
-#elif 0
+#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    // b thread copy 2x2
    constexpr index_t BlockSize = 256;
@@ -166,7 +166,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<4, 1>;
    using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
-    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
+    constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
    constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
    using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<2, 2>;
@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
    constexpr auto conv_driver =
-#if 0
+#if 1
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
 #else
        DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -22,7 +22,22 @@ int main(int argc, char* argv[])
 {
    using namespace ck;
-#if 1
+#if 0
+    // 3x3, 36x36, stride 2
+    constexpr index_t N  = 128;
+    constexpr index_t C  = 192;
+    constexpr index_t HI = 37;
+    constexpr index_t WI = 37;
+    constexpr index_t K  = 384;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;
+    using ConvStrides   = Sequence<2, 2>;
+    using ConvDilations = Sequence<1, 1>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
+#elif 0
    // 3x3, 35x35, stride 2
    constexpr index_t N  = 128;
    constexpr index_t C  = 192;