Commit 33d1e0e2 authored by Chao Liu

refactoring for miopen

parent b1cb48a0
......@@ -264,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
#if 1
blockwise_batch_gemm.Run
#else
blockwise_batch_gemm.Run_asm
blockwise_batch_gemm.Run_amd_asm
#endif
(p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
......
......@@ -191,7 +191,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#if 0
return blockwise_batch_gemm.Run(Xs...);
#elif 0
return blockwise_batch_gemm.Run_asm(Xs...);
return blockwise_batch_gemm.Run_amd_asm(Xs...);
#else
return blockwise_batch_gemm.Run_asm_v2(Xs...);
#endif
......
......@@ -206,7 +206,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#if 1
return blockwise_batch_gemm.Run(Xs...);
#elif 0
return blockwise_batch_gemm.Run_asm(Xs...);
return blockwise_batch_gemm.Run_amd_asm(Xs...);
#else
return blockwise_batch_gemm.Run_asm_v2(Xs...);
#endif
......
......@@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#if 1
return blockwise_batch_gemm.Run(Xs...);
#elif 0
return blockwise_batch_gemm.Run_asm(Xs...);
return blockwise_batch_gemm.Run_amd_asm(Xs...);
#else
return blockwise_batch_gemm.Run_asm_v2(Xs...);
#endif
......
......@@ -203,7 +203,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
#if 1
return blockwise_batch_gemm.Run(Xs...);
#elif 0
return blockwise_batch_gemm.Run_asm(Xs...);
return blockwise_batch_gemm.Run_amd_asm(Xs...);
#else
return blockwise_batch_gemm.Run_asm_v2(Xs...);
#endif
......
......@@ -238,7 +238,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 1
blockwise_gemm.Run_asm
blockwise_gemm.Run_amd_asm
#endif
(p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
p_in_block + y * Wi + x,
......
......@@ -271,7 +271,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 0
blockwise_gemm.Run_asm
blockwise_gemm.Run_amd_asm
#endif
(p_wei_block_now +
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
......@@ -313,7 +313,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 0
blockwise_gemm.Run_asm
blockwise_gemm.Run_amd_asm
#endif
(p_wei_block_double +
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
......@@ -339,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 0
blockwise_gemm.Run_RegisterDoubleBuffer
#elif 0
blockwise_gemm.Run_asm
blockwise_gemm.Run_amd_asm
#endif
(p_wei_block_double + wei_block_space +
wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0),
......
......@@ -217,7 +217,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#if 1
return blockwise_gemm.Run(Xs...);
#else
return blockwise_gemm.Run_asm(Xs...);
return blockwise_gemm.Run_amd_asm(Xs...);
#endif
};
......
......@@ -214,7 +214,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
#if 1
return blockwise_gemm.Run(Xs...);
#else
return blockwise_gemm.Run_asm(Xs...);
return blockwise_gemm.Run_amd_asm(Xs...);
#endif
};
......
......@@ -247,7 +247,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
#if 1
return blockwise_gemm.Run(Xs...);
#else
return blockwise_gemm.Run_asm(Xs...);
return blockwise_gemm.Run_amd_asm(Xs...);
#endif
};
......
......@@ -9,10 +9,6 @@
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
namespace ck {
// define B = merge(N0, Ho, Wo)
......@@ -239,15 +235,6 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
GemmDataPerReadA,
GemmDataPerReadB>{};
// choose GEMM implementation here
const auto run_blockwise_gemm = [&](auto... Xs) {
#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
return blockwise_gemm.Run_asm(Xs...);
#else
return blockwise_gemm.Run(Xs...);
#endif
};
// LDS allocation for input and weight: be careful of alignment
constexpr index_t max_align = math::lcm(InBlockCopyDstDataPerWrite_N2,
WeiBlockCopyDstDataPerWrite_K,
......@@ -255,9 +242,11 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
GemmDataPerReadB);
constexpr index_t in_block_space =
in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
math::integer_divide_ceil(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align) *
max_align;
constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number<max_align>{});
constexpr index_t wei_block_space =
math::integer_divide_ceil(wei_e_k_block_desc.GetElementSpace(), max_align) * max_align;
__shared__ Float p_in_block_double[2 * in_block_space];
__shared__ Float p_wei_block_double[2 * wei_block_space];
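For reference, the padding rule used above: `integer_divide_ceil(n, a) * a` rounds an element space up to a multiple of `max_align`, so each LDS buffer starts on an aligned boundary. A standalone sketch (hypothetical sizes, not the ck types):

```cpp
#include <cstddef>

// integer_divide_ceil(n, a) * a rounds n up to the nearest multiple of a, which is
// how in_block_space and wei_block_space are padded above.
constexpr std::size_t integer_divide_ceil(std::size_t n, std::size_t a) { return (n + a - 1) / a; }

constexpr std::size_t align_up(std::size_t element_space, std::size_t alignment)
{
    return integer_divide_ceil(element_space, alignment) * alignment;
}

static_assert(align_up(250, 8) == 256, "250 elements padded to the next multiple of 8");
static_assert(align_up(256, 8) == 256, "already aligned: no extra padding");
```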
......@@ -309,7 +298,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
p_wei_register_clipboard);
// LDS double buffer: GEMM on current data
run_blockwise_gemm(p_wei_block_now, p_in_block_now, p_out_thread);
blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);
// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
......@@ -336,7 +325,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
p_wei_register_clipboard);
// LDS double buffer: GEMM on current data
run_blockwise_gemm(p_wei_block_double, p_in_block_double, p_out_thread);
blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
// LDS double buffer: store next data to LDS
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
......@@ -348,7 +337,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
__syncthreads();
// LDS double buffer: GEMM on current data
run_blockwise_gemm(p_wei_block_double + wei_block_space,
blockwise_gemm.Run(p_wei_block_double + wei_block_space,
p_in_block_double + in_block_space,
p_out_thread);
}
......
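The calls above follow the usual LDS double-buffering (ping-pong) scheme: the GEMM consumes one buffer while the next tile is staged into the other, and the roles swap each iteration. A serial, host-side analogue of that schedule (hypothetical names; GPU synchronization elided):

```cpp
#include <array>
#include <cstddef>

// Serial analogue of the ping-pong schedule: two halves of one buffer; "compute" runs
// on the current half while the next tile is staged into the other half.
void pipelined_gemm(const float* tiles, std::size_t num_tiles, std::size_t tile_size,
                    void (*load)(float* dst, const float* src, std::size_t n),
                    void (*gemm)(const float* tile, std::size_t n))
{
    if(num_tiles == 0)
        return;

    std::array<float, 2 * 256> lds{}; // stands in for the __shared__ double buffer (tile_size <= 256 assumed)
    float* buf[2] = {lds.data(), lds.data() + tile_size};

    load(buf[0], tiles, tile_size); // prologue: stage the first tile
    for(std::size_t t = 0; t + 1 < num_tiles; ++t)
    {
        load(buf[(t + 1) % 2], tiles + (t + 1) * tile_size, tile_size); // stage next tile
        gemm(buf[t % 2], tile_size);                                    // GEMM on current data
    }
    gemm(buf[(num_tiles - 1) % 2], tile_size); // epilogue: last tile
}
```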
......@@ -107,15 +107,12 @@ struct ConstantTensorDescriptor
return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
}
template <class Align = Number<1>>
__host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{})
__host__ __device__ static constexpr index_t GetElementSpace()
{
// This is WRONG! align should be applied to the last memory rank, not the last tensor
// dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
return element_space_unaligned;
}
// emulate constexpr lambda
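The value returned by the now align-free `GetElementSpace()` is `1 + sum_i (length_i - 1) * stride_i`. A small self-contained illustration (hypothetical 2-D shape, not the ck descriptor types):

```cpp
#include <array>
#include <cstddef>

// element_space = 1 + sum_i (length_i - 1) * stride_i, i.e. the offset of the last
// element plus one, with no alignment padding applied.
template <std::size_t N>
constexpr std::size_t element_space(const std::array<std::size_t, N>& lengths,
                                    const std::array<std::size_t, N>& strides)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < N; ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

// A packed (row-major) 4x8 tensor: strides {8, 1}.
constexpr std::array<std::size_t, 2> lengths{{4, 8}};
constexpr std::array<std::size_t, 2> strides{{8, 1}};
static_assert(element_space(lengths, strides) == 32, "1 + 3*8 + 7*1 == 32");
```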
......@@ -234,7 +231,7 @@ struct ConstantTensorDescriptor
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr index_t idim = nDim - 1 - IDimReverse;
constexpr auto IDim = Number<idim>{};
if(carry)
......@@ -260,7 +257,7 @@ struct ConstantTensorDescriptor
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDim, 1>{}([&](auto IDimReverse) {
constexpr index_t idim = nDim - 1 - IDimReverse.Get();
constexpr index_t idim = nDim - 1 - IDimReverse;
constexpr auto IDim = Number<idim>{};
if(borrow)
......
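Both loops above walk the dimensions in reversed order (`idim = nDim - 1 - IDimReverse`) to propagate a carry or borrow from the lowest dimension upward. A plain runtime sketch of the carry case (hypothetical helper; assumes the step is smaller than the lowest dimension's length):

```cpp
#include <array>
#include <cstddef>

// Add `step` to the lowest dimension of a multi-index, then sweep the dimensions in
// reversed order to propagate the carry; the highest dimension is left unchecked.
template <std::size_t N>
void add_with_carry(std::array<std::size_t, N>& idx,
                    const std::array<std::size_t, N>& lengths,
                    std::size_t step)
{
    static_assert(N >= 1, "need at least one dimension");

    idx[N - 1] += step;                    // step lands on the lowest (fastest-varying) dimension
    bool carry = true;                     // the carry sweep starts there
    for(std::size_t r = 0; r < N; ++r)
    {
        const std::size_t dim = N - 1 - r; // idim = nDim - 1 - IDimReverse
        if(carry && dim > 0 && idx[dim] >= lengths[dim])
        {
            idx[dim] -= lengths[dim];      // wrap this digit ...
            ++idx[dim - 1];                // ... and carry into the next higher dimension
        }
        else
        {
            carry = false;                 // no overflow here: nothing left to propagate
        }
    }
}
```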
......@@ -286,7 +286,7 @@ struct Blockwise2dTensorCopy2
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
using Float4 = float4;
using Float2 = float2;
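The `is_same<...>{}` form works because `is_same` derives from `integral_constant`, whose constexpr conversion operator yields the same value as `::value`; the same conversion is what lets `Number<n>`/`IDim` initialize an `index_t` without `.Get()` elsewhere in this commit. A standalone illustration using the std equivalents:

```cpp
#include <cstddef>
#include <type_traits>

static_assert(std::is_same<float, float>::value, "via ::value");
static_assert(std::is_same<float, float>{}, "via integral_constant's constexpr conversion operator");
static_assert(!std::is_same<float, double>{}, "the false case converts too");

// Stand-in for ck::Number<N>: an integral_constant converts implicitly to its value.
template <std::size_t N>
using Index = std::integral_constant<std::size_t, N>;

constexpr std::size_t idim = Index<3>{}; // no .Get() / ::value needed
static_assert(idim == 3, "implicit conversion to the underlying value");
```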
......@@ -565,7 +565,7 @@ struct Blockwise2dTensorCopy3
__device__ constexpr index_t GetRegisterClipboardSize() const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -714,7 +714,7 @@ struct Blockwise2dTensorCopy3
*(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
iloop * src_loop_stride]));
#else
static_assert(is_same<float, Float>::value && DataPerRead == 4,
static_assert(is_same<float, Float>{} && DataPerRead == 4,
"global_load is only for float4");
global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),
......@@ -773,7 +773,7 @@ struct Blockwise2dTensorCopy3
*(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) =
*(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]);
#else
static_assert(is_same<float, Float>::value && DataPerRead == 4,
static_assert(is_same<float, Float>{} && DataPerRead == 4,
"ds_write_b128 is only for float4");
ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
......
......@@ -239,7 +239,7 @@ struct Blockwise3dTensorCopy3
__device__ static constexpr index_t GetRegisterClipboardSize()
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......
......@@ -598,7 +598,7 @@ struct Blockwise4dTensorCopy3
__device__ constexpr index_t GetRegisterClipboardSize() const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
static_assert(is_same<Float, float>{}, "wrong! only support float!\n");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......
......@@ -295,7 +295,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
#if CK_USE_AMD_INLINE_ASM
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_asm(const FloatA* __restrict__ p_a_block,
__device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
......@@ -330,19 +330,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
// assertion for inline asm
static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
is_same<FloatC, float>::value,
"Run_asm only deal with float\n");
static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
is_same<FloatC, float>{},
"Run_amd_asm only deal with float\n");
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
MPerThread == 8 && NPerThread == 8,
"Run_asm cannot deal with this GEMM shape yet\n");
"Run_amd_asm cannot deal with this GEMM shape yet\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
static_assert(
BlockMatrixStrideA == 0 && BatchPerThread == 1,
"Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
"Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
"1 for now\n");
using Float4 = vector_type<float, 4>::MemoryType;
......@@ -421,19 +421,19 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
// assertion for inline asm
static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
is_same<FloatC, float>::value,
"Run_asm only deal with float\n");
static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
is_same<FloatC, float>{},
"Run_amd_asm only deal with float\n");
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
MPerThread == 8 && NPerThread == 8,
"Run_asm cannot deal with this GEMM shape yet\n");
"Run_amd_asm cannot deal with this GEMM shape yet\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read\n");
static_assert(
BlockMatrixStrideA == 0 && BatchPerThread == 1,
"Run_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == 1 for now\n");
static_assert(BlockMatrixStrideA == 0 && BatchPerThread == 1,
"Run_amd_asm can only deal with BlockMatrixStrideA == 0 && BatchPerThread == "
"1 for now\n");
using Float4 = vector_type<float, 4>::MemoryType;
......
......@@ -5,6 +5,10 @@
#include "ConstantMatrixDescriptor.hpp"
#include "threadwise_gemm.hpp"
#ifndef CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
#define CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM 1
#endif
namespace ck {
// if following number are power of 2, index calculation shall be greatly reduced:
......@@ -51,7 +55,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
"wrong! Cannot evenly divide work among\n");
static_assert(is_same_type(ThreadMatrixC::GetLengths(), GetThreadMatrixCLengths()),
static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
decltype(GetThreadMatrixCLengths())>{},
"wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
......@@ -115,9 +120,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
#if CK_USE_AMD_INLINE_ASM
// TODO: this is not working correctly
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_asm(const FloatA* __restrict__ p_a_block,
__device__ void Run_amd_asm(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
......@@ -156,15 +160,15 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr index_t NPerLevel1Cluster = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
// assertion for inline asm
static_assert(is_same<FloatA, float>::value && is_same<FloatB, float>::value &&
is_same<FloatC, float>::value,
"Run_asm only deal with float\n");
static_assert(is_same<FloatA, float>{} && is_same<FloatB, float>{} &&
is_same<FloatC, float>{},
"Run_amd_asm only deal with float");
static_assert(MPerThreadSubC == 4 && NPerThreadSubC == 4 && KPerThreadLoop == 1 &&
MPerThread == 8 && NPerThread == 8,
"Run_asm cannot deal with this GEMM shape yet\n");
"Run_amd_asm cannot deal with this GEMM shape yet");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_asm only do float4 read\n");
static_assert(DataPerReadA == 4 && DataPerReadB == 4, "Run_amd_asm only do float4 read");
using Float4 = vector_type<float, 4>::MemoryType;
......@@ -200,7 +204,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
#endif
template <class FloatA, class FloatB, class FloatC>
__device__ void Run(const FloatA* const __restrict__ p_a_block,
__device__ void Run_source(const FloatA* const __restrict__ p_a_block,
const FloatB* const __restrict__ p_b_block,
FloatC* const __restrict__ p_c_thread) const
{
......@@ -291,7 +295,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
template <class FloatA, class FloatB, class FloatC>
__device__ void Run_RegisterDoubleBuffer(FloatA* const p_a_block,
__device__ void RunRegisterDoubleBuffer_source(FloatA* const p_a_block,
FloatB* const p_b_block,
FloatC* p_c_thread) const
{
......@@ -427,6 +431,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
p_c_thread);
}
}
template <class FloatA, class FloatB, class FloatC>
__device__ void Run(const FloatA* __restrict__ p_a_block,
const FloatB* __restrict__ p_b_block,
FloatC* __restrict__ p_c_thread) const
{
#if CK_USE_AMD_INLINE_ASM && CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM
Run_amd_asm(p_a_block, p_b_block, p_c_thread);
#else
Run_source(p_a_block, p_b_block, p_c_thread);
#endif
}
};
} // namespace ck
......
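The new `Run` wrapper, together with the `CK_BLOCKWISE_GEMM_USE_AMD_INLINE_ASM` default moved into this header, is a default-with-override dispatch: the build can still force the source path by predefining the macro to 0. A compact sketch of the pattern (hypothetical macro and struct names):

```cpp
// The feature macro gets a default here, but the build system may still override it.
#ifndef EXAMPLE_USE_ASM_PATH
#define EXAMPLE_USE_ASM_PATH 1
#endif

struct ExampleGemm
{
    void Run_amd_asm() const { /* hand-written inline-asm path */ }
    void Run_source() const { /* portable C++ path */ }

    // Thin wrapper: callers always call Run(); the macro selects the implementation.
    void Run() const
    {
#if EXAMPLE_USE_ASM_PATH
        Run_amd_asm();
#else
        Run_source();
#endif
    }
};
```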
......@@ -6,6 +6,10 @@
#include "ConstantMergedTensorDescriptor.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace ck {
// slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
......@@ -91,7 +95,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// for now, only support SubLengths.Get() == 1 on a merged dimension that contains
// for now, only support SubLengths == 1 on a merged dimension that contains
// multiple original dimensions
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
......@@ -121,7 +125,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim.Get();
constexpr index_t idim = IDim;
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
......@@ -135,7 +139,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
static_for<0, nDim, 1>{}([&](auto IDim_) {
constexpr auto IDim = decltype(IDim_){};
constexpr index_t idim = IDim.Get();
constexpr index_t idim = IDim;
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
......@@ -153,38 +157,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
#if 0
if(get_block_1d_id() == 0)
{
printf("id %5u %5u: "
"src_block_data_multi_id_begin: %u %u %u %u, "
"thread_cluster_multi_id: %u %u %u %u, "
"data_cluster_multi_id: %u %u %u %u, "
"thread_data_multi_id_begin: %u %u %u %u, "
"mThreadSrcOffset %u, mThreadDstOffset %u \n",
get_block_1d_id(),
get_thread_local_1d_id(),
src_block_data_multi_id_begin[0],
src_block_data_multi_id_begin[1],
src_block_data_multi_id_begin[2],
src_block_data_multi_id_begin[3],
thread_cluster_multi_id[0],
thread_cluster_multi_id[1],
thread_cluster_multi_id[2],
thread_cluster_multi_id[3],
data_cluster_multi_id[0],
data_cluster_multi_id[1],
data_cluster_multi_id[2],
data_cluster_multi_id[3],
thread_data_multi_id_begin[0],
thread_data_multi_id_begin[1],
thread_data_multi_id_begin[2],
thread_data_multi_id_begin[3],
mThreadSrcOffset,
mThreadDstOffset);
}
#endif
}
__device__ static constexpr index_t GetRegisterClipboardSize()
......@@ -210,19 +182,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if 0
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else // HIP compiler performs better with these codes
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto src_thread_data_multi_id_begin =
......@@ -236,6 +196,18 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const index_t src_offset =
SrcDesc{}.GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
......@@ -263,18 +235,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
#if 0
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else // HIP compiler performs better with these codes
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
constexpr auto clipboard_data_multi_id_begin =
......@@ -287,6 +248,17 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr index_t dst_offset =
DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#else
constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;
const index_t clipboard_offset =
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin);
#endif
threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
......@@ -310,7 +282,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are in descending order,
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versa for movement in negative direction).
// As a result, there is the possibility that the offset calculation may result in
......@@ -323,7 +295,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
Number<IDim_>, Number<StepSize>, integral_constant<bool, PositiveDirection> direction)
{
constexpr auto IDim = Number<IDim_>{};
constexpr index_t idim = IDim.Get();
constexpr index_t idim = IDim;
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim)>{}([&](auto fwd) {
// logic for a merged dimension, also works for non-merged dimension, but its logic may
......@@ -350,8 +322,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
constexpr auto I = decltype(I_){};
constexpr index_t idim_original = src_partial_original_dims.Get(I);
mThreadSrcOriginalMultiId(idim_original) =
new_src_partial_original_multi_id[I.Get()];
mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I];
});
// calculate new partial offset on this merged dimension
......
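A worked example of the caveat noted in the `MoveSlicingWindowOnSourceTensor` comment above (hypothetical shapes): take a merged dimension covering two original dimensions with lengths {2, 3} and strides {1, 4}, which are not in descending order. One forward step of the merged index can decrease the linear offset:

```cpp
#include <cstddef>

// Original dimensions inside the merged one: lengths {2, 3}, strides {1, 4}.
constexpr std::size_t offset(std::size_t i0, std::size_t i1) { return i0 * 1 + i1 * 4; }

// merged index 2 -> multi-index (0, 2); merged index 3 -> multi-index (1, 0)
static_assert(offset(0, 2) == 8, "offset at merged index 2");
static_assert(offset(1, 0) == 1, "one step forward, yet the linear offset decreased");
```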
......@@ -49,7 +49,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
make_ConstantTensorDescriptor_packed(thread_cluster_lengths);
// sanity check: data type
static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
static_assert(is_same<Float, float>{}, "wrong! only support float for now!\n");
// sanity check: nDim
static_assert(SrcDesc::GetNumOfDimension() == nDim &&
......@@ -121,12 +121,11 @@ struct BlockwiseTensorSliceReorderCopy_v3
reorder_array_given_old2new(thread_multi_id, map_thread_cluster_2_src_cluster);
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto I = decltype(IDim){};
constexpr index_t i = I.Get();
constexpr index_t idim = IDim;
// compiler: will it really compute index here, or be merged with
// GetOffsetFromMultiIndex and
// optimized away???
src_data_multi_id(i) *= src_sub_lengths.Get(I);
src_data_multi_id(idim) *= src_sub_lengths.Get(IDim);
});
// compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex
......
......@@ -26,16 +26,16 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
constexpr index_t nshift = NShift::mValue;
constexpr index_t did0_end =
is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
is_same<decltype(I0), IDim>{} ? desc.GetLength(I0) - nshift : desc.GetLength(I0);
constexpr index_t did1_end =
is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
is_same<decltype(I1), IDim>{} ? desc.GetLength(I1) - nshift : desc.GetLength(I1);
constexpr index_t did2_end =
is_same<decltype(I2), IDim>::value ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
is_same<decltype(I2), IDim>{} ? desc.GetLength(I2) - nshift : desc.GetLength(I2);
constexpr index_t did3_end =
is_same<decltype(I3), IDim>::value ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
is_same<decltype(I3), IDim>{} ? desc.GetLength(I3) - nshift : desc.GetLength(I3);
for(index_t did0 = 0; did0 < did0_end; ++did0)
{
......
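The `did*_end` bounds above visit only `length - nshift` positions along the shifted dimension. A 1-D sketch of what such a shift plausibly does (an assumption; the element movement direction is not shown in this hunk):

```cpp
#include <array>
#include <cstddef>

// Along the shifted dimension only Len - nshift positions are visited; each is assumed
// to take the value from nshift slots further along, i.e. data "shifts down" toward 0.
template <std::size_t Len>
void shift_down_1d(std::array<float, Len>& a, std::size_t nshift)
{
    for(std::size_t i = 0; i + nshift < Len; ++i)
        a[i] = a[i + nshift];
}
```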