"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "0595084452d6003ce7c74d537df863f07fba8e8c"
Commit 6c2c50b0 authored by Chao Liu

done: explicitly separate offset component into compile-time, block-invariant and per-thread components. Experimenting
parent 51884fc2
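
The commit message describes splitting the copy offset into a compile-time part, a block-invariant part, and a per-thread part, so that only the last component has to be recomputed per thread. A rough, hypothetical sketch of that idea (none of these identifiers appear in the commit):

```cpp
#include <cstdint>

// Hypothetical sketch of the offset decomposition named in the commit message;
// the names are illustrative, not taken from the repository.
struct DecomposedOffset
{
    // Known at compile time (e.g. from the tensor descriptor's constant strides),
    // so the compiler can fold it directly into the addressing arithmetic.
    static constexpr std::uint32_t compile_time = 0;

    // Computed once per workgroup (e.g. from the block index); identical for
    // every thread in the block, so it can be hoisted out of per-thread code.
    std::uint32_t block_invariant = 0;

    // The only part that differs between threads (e.g. from the thread index).
    std::uint32_t per_thread = 0;

    std::uint32_t Total() const { return compile_time + block_invariant + per_thread; }
};
```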
......@@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
0,
b_thread_data_on_global,
0})
.template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
#if 0
.Run_generic
#elif 1
.template Run_generic<Float, address_space_t::generic, address_space_t::global>
#elif 1
.template Run_optimized_dst_address_calculation<Float, address_space_t::global>
#endif
(p_out_thread, p_out_global);
}
}
};
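
The write-back now goes through a chain of preprocessor toggles instead of the single hard-coded Run_amd_experiment call. Only the first branch whose condition is true is compiled; a minimal stand-alone demonstration of that selection behaviour (toy code, not from the repository):

```cpp
#include <cstdio>

int main()
{
    // Mirrors the "#if 0 / #elif 1 / #elif 1" pattern above: the preprocessor
    // keeps only the first branch with a true condition, so the middle branch
    // is built and the final one stays dead until the constants are edited.
#if 0
    std::puts("branch A: disabled");
#elif 1
    std::puts("branch B: the branch that actually gets compiled");
#elif 1
    std::puts("branch C: unreachable while branch B's condition is 1");
#endif
    return 0;
}
```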
......
......@@ -325,14 +325,14 @@ struct TensorCoordinate
private:
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
{
return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
}
template <class... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
{
return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
}
......
......@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
private:
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
{
return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
template <typename... Ts>
__host__ __device__ static constexpr auto
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
{
return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
......
......@@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
mThreadwiseLoad.Run_generic(p_src, p_buffer);
#elif 1
mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer);
#elif 0
// hardcoded: global to register
mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
// hardcoded: src is global memory
mThreadwiseLoad.template Run_generic<TData, address_space_t::global>(p_src, p_buffer);
#elif 1
// hardcoded: src is global memory
mThreadwiseLoad
.template Run_optimized_src_address_calculation<TData, address_space_t::global>(
p_src, p_buffer);
#endif
}
......@@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
mThreadwiseStore.Run_generic(p_buffer, p_dst);
#elif 1
// hardcoded: register to LDS
mThreadwiseStore.template Run_amd_experiment<TData, 0, 1>(p_buffer, p_dst);
// hardcoded: dst is lds
mThreadwiseStore.template Run_generic<TData, address_space_t::lds>(p_buffer, p_dst);
#elif 1
// hardcoded: dst is lds
mThreadwiseStore
.template Run_optimized_dst_address_calculation<TData, address_space_t::lds>(p_buffer,
p_dst);
#endif
}
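
RunLoadRegisterBuffer and RunStoreRegisterBuffer stage data through a per-thread register buffer, with the load hard-coded to a global-memory source and the store to an LDS destination. A host-side mock of that two-phase pattern (all names here are assumptions for illustration, not the kernel code):

```cpp
#include <cstring>

// Host-side stand-in for the staging pattern above: a "blockwise copy" is a
// load into a small per-thread buffer followed by a store out of it.
struct MockBlockwiseCopy
{
    static constexpr int buffer_size = 4;

    void RunLoadRegisterBuffer(const float* p_src, float* p_buffer) const
    {
        std::memcpy(p_buffer, p_src, buffer_size * sizeof(float)); // stands in for the global -> register load
    }

    void RunStoreRegisterBuffer(const float* p_buffer, float* p_dst) const
    {
        std::memcpy(p_dst, p_buffer, buffer_size * sizeof(float)); // stands in for the register -> LDS store
    }
};

int main()
{
    float src[4] = {1.f, 2.f, 3.f, 4.f};
    float buf[4] = {};
    float dst[4] = {};

    MockBlockwiseCopy copy;
    copy.RunLoadRegisterBuffer(src, buf);  // stage 1: source -> register buffer
    copy.RunStoreRegisterBuffer(buf, dst); // stage 2: register buffer -> destination
    return dst[3] == 4.f ? 0 : 1;
}
```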
......
......@@ -22,7 +22,7 @@
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_INTRINCIS
#if CK_USE_AMD_INTRINSIC
#include "amd_intrinsic.hpp"
#endif
......
......@@ -8,7 +8,7 @@
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -16,6 +16,14 @@
namespace ck {
enum address_space_t
{
generic = 0,
vgpr = 1,
lds = 2,
global = 3
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
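
The address_space_t enum added to the AMD config above gives symbolic names to the address spaces that the copy routines take as template parameters, replacing the bare integers of Run_amd_experiment. A hypothetical illustration (not code from this commit) of dispatching on such a parameter at compile time:

```cpp
// Mirrors the enum added above; the dispatch helper below is illustrative only.
enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

// With C++17, only the branch matching the template argument survives
// compilation; older standards would use tag dispatch or specialization
// to the same effect.
template <address_space_t SrcAddressSpace>
const char* DescribeSource()
{
    if constexpr (SrcAddressSpace == global)
        return "source is global memory (a candidate for buffer_load intrinsics)";
    else if constexpr (SrcAddressSpace == lds)
        return "source is LDS";
    else if constexpr (SrcAddressSpace == vgpr)
        return "source already lives in registers";
    else
        return "source uses generic addressing";
}

int main()
{
    return DescribeSource<global>() != nullptr ? 0 : 1;
}
```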
......@@ -10,7 +10,7 @@
#define CK_DEVICE_BACKEND_NVIDIA 1
#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
......@@ -18,6 +18,11 @@
namespace ck {
enum address_space_t
{
generic = 0
};
#if CK_UNSIGNED_INDEX_TYPE
using index_t = uint32_t;
#else
......
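
Note that the NVIDIA config above defines only the generic enumerator, while the AMD config also names vgpr, lds and global. A sketch (macro and namespace names assumed for illustration) of keeping references to the AMD-only enumerators behind a backend switch:

```cpp
// Illustrative only: mimic the two config headers with a single toggle.
#define MY_BACKEND_AMD 1 // set to 0 to mimic the NVIDIA config above

namespace ck_sketch {
enum address_space_t
{
    generic = 0
#if MY_BACKEND_AMD
    ,
    vgpr   = 1,
    lds    = 2,
    global = 3
#endif
};
} // namespace ck_sketch

int main()
{
#if MY_BACKEND_AMD
    constexpr auto dst_space = ck_sketch::lds;     // enumerator only present on the AMD side
#else
    constexpr auto dst_space = ck_sketch::generic; // the one enumerator both configs share
#endif
    return static_cast<int>(dst_space) >= 0 ? 0 : 1;
}
```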