Refactor for MIOpen integration (#4)

Refactor, so can bring multi-index transformation and padding support into MIOpen

Refactor for MIOpen integration (#4)
Refactor, so can bring multi-index transformation and padding support into MIOpen
52c3fe05 · Chao Liu · GitHub · 9aaeacc8 · 52c3fe05 · 52c3fe05
Unverified Commit 52c3fe05 authored Oct 11, 2019 by Chao Liu Committed by GitHub Oct 11, 2019
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,14 +47,17 @@ include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
    ${PROJECT_SOURCE_DIR}/composable_kernel/include/kernel_algorithm
+    ${PROJECT_SOURCE_DIR}/external/include
    ${PROJECT_SOURCE_DIR}/driver/include
    ${PROJECT_BINARY_DIR}/composable_kernel/include/utility
 )
 if(DEVICE_BACKEND STREQUAL "AMD")
-    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
-    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
 endif()
 add_subdirectory(driver)
--- a/composable_kernel/include/kernel_algorithm/convolution_common.hpp
+++ b/composable_kernel/include/kernel_algorithm/convolution_common.hpp
+#ifndef CK_CONVOLUTION_COMMON_HPP
+#define CK_CONVOLUTION_COMMON_HPP
+namespace ck {
+enum ConvolutionDirection
+{
+    Forward,
+    BackwardData,
+    BackwardWeight
+};
+} // namespace ck
+#endif
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_DIRECT_V2_NCHW_KCYX_NKHW
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_4d_tensor_op.hpp"
 #include "threadwise_tensor_slice_copy.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R1_CHWN_CYXK_KHWN
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_4d_tensor_op.hpp"
 #include "blockwise_2d_tensor_op.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R2_CHWN_CYXK_KHWN
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_3d_tensor_op.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_HPP
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
@@ -125,38 +125,38 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
        // blockwise copy
        // input: format is [C, Hi, Wi, N]
-        auto blockwise_in_copy =
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockSize,
-                                               decltype(in_c_h_w_n_global_desc),
+            decltype(in_c_h_w_n_global_desc),
-                                               decltype(in_c_h_w_n_block_desc),
+            decltype(in_c_h_w_n_block_desc),
-                                               decltype(in_c_h_w_n_block_desc.GetLengths()),
+            decltype(in_c_h_w_n_block_desc.GetLengths()),
-                                               InBlockCopySubLengths_CHWN,
+            InBlockCopySubLengths_CHWN,
-                                               InBlockCopyClusterLengths_CHWN,
+            InBlockCopyClusterLengths_CHWN,
-                                               Sequence<0, 1, 2, 3>,
+            Sequence<0, 1, 2, 3>,
-                                               Sequence<0, 1, 2, 3>,
+            Sequence<0, 1, 2, 3>,
-                                               Sequence<0, 1, 2, 3>,
+            Sequence<0, 1, 2, 3>,
-                                               3,
+            3,
-                                               3,
+            3,
-                                               InBlockCopyDataPerAccess_N,
+            InBlockCopyDataPerAccess_N,
-                                               InBlockCopyDataPerAccess_N>({0, 0, 0, 0},
+            InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0});
-                                                                           {0, 0, 0, 0});
        // blockwise wei copy
        //   format is [CPerBlock, X * KPerBlock]
        const auto blockwise_wei_copy =
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockwiseGenericTensorSliceCopy_v1_deprecated<BlockSize,
-                                               decltype(wei_c_k_global_desc),
+                                                          decltype(wei_c_k_global_desc),
-                                               decltype(wei_c_k_block_desc),
+                                                          decltype(wei_c_k_block_desc),
-                                               decltype(wei_c_k_block_desc.GetLengths()),
+                                                          decltype(wei_c_k_block_desc.GetLengths()),
-                                               WeiBlockCopySubLengths_CK,
+                                                          WeiBlockCopySubLengths_CK,
-                                               WeiBlockCopyClusterLengths_CK,
+                                                          WeiBlockCopyClusterLengths_CK,
-                                               Sequence<0, 1>,
+                                                          Sequence<0, 1>,
-                                               Sequence<0, 1>,
+                                                          Sequence<0, 1>,
-                                               Sequence<0, 1>,
+                                                          Sequence<0, 1>,
-                                               1,
+                                                          1,
-                                               1,
+                                                          1,
-                                               WeiBlockCopyDataPerAccess_K,
+                                                          WeiBlockCopyDataPerAccess_K,
-                                               WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0});
+                                                          WeiBlockCopyDataPerAccess_K>({0, 0},
+                                                                                       {0, 0});
        // a series of blockwise batched GEMM
        // C_matrix += transpose(A_matrix) * B_matrix
@@ -318,14 +318,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                n_block_data_begin + n_thread_data_begin);
 #if 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
+            ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
-                                                  decltype(out_10d_global_desc),
+                decltype(out_10d_thread_desc),
-                                                  decltype(out_10d_thread_desc.GetLengths()),
+                decltype(out_10d_global_desc),
-                                                  arithmetic_sequence_gen<0, 10, 1>::type,
+                decltype(out_10d_thread_desc.GetLengths()),
-                                                  9,
+                arithmetic_sequence_gen<0, 10, 1>::type,
-                                                  OutThreadCopyDataPerAccess_N,
+                9,
-                                                  OutThreadCopyDataPerAccess_N>(
+                OutThreadCopyDataPerAccess_N,
-                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
+                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
+                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
@@ -388,14 +389,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                                n_block_data_begin + n_thread_data_begin);
 #if 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
+            ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
-                                                  decltype(out_10d_global_desc),
+                decltype(out_10d_thread_desc),
-                                                  decltype(out_10d_thread_desc.GetLengths()),
+                decltype(out_10d_global_desc),
-                                                  arithmetic_sequence_gen<0, 10, 1>::type,
+                decltype(out_10d_thread_desc.GetLengths()),
-                                                  9,
+                arithmetic_sequence_gen<0, 10, 1>::type,
-                                                  OutThreadCopyDataPerAccess_N,
+                9,
-                                                  OutThreadCopyDataPerAccess_N>(
+                OutThreadCopyDataPerAccess_N,
-                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
+                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
+                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER_HPP
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
@@ -127,9 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
        // input: format is [C, Hi, Wi, N]
        auto blockwise_in_copy =
 #if 0
-            BlockwiseGenericTensorSliceCopy_v1
+            BlockwiseGenericTensorSliceCopy_v1_deprecated
 #else
-            BlockwiseGenericTensorSliceCopy_v2
+            BlockwiseGenericTensorSliceCopy_v2_deprecated
 #endif
            <BlockSize,
             decltype(in_c_h_w_n_global_desc),
@@ -149,9 +149,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
        //   format is [CPerBlock, X * KPerBlock]
        const auto blockwise_wei_copy =
 #if 0
-            BlockwiseGenericTensorSliceCopy_v1
+            BlockwiseGenericTensorSliceCopy_v1_deprecated
 #else
-            BlockwiseGenericTensorSliceCopy_v2
+            BlockwiseGenericTensorSliceCopy_v2_deprecated
 #endif
            <BlockSize,
             decltype(wei_c_k_global_desc),
@@ -406,14 +406,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                n_block_data_begin + n_thread_data_begin);
 #if 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
+            ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
-                                                  decltype(out_10d_global_desc),
+                decltype(out_10d_thread_desc),
-                                                  decltype(out_10d_thread_desc.GetLengths()),
+                decltype(out_10d_global_desc),
-                                                  arithmetic_sequence_gen<0, 10, 1>::type,
+                decltype(out_10d_thread_desc.GetLengths()),
-                                                  9,
+                arithmetic_sequence_gen<0, 10, 1>::type,
-                                                  OutThreadCopyDataPerAccess_N,
+                9,
-                                                  OutThreadCopyDataPerAccess_N>(
+                OutThreadCopyDataPerAccess_N,
-                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
+                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
+                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),
@@ -476,14 +477,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
                                                n_block_data_begin + n_thread_data_begin);
 #if 1
-            ThreadwiseGenericTensorSliceCopy_v1r2<decltype(out_10d_thread_desc),
+            ThreadwiseGenericTensorSliceCopy_v1r2_deprecated<
-                                                  decltype(out_10d_global_desc),
+                decltype(out_10d_thread_desc),
-                                                  decltype(out_10d_thread_desc.GetLengths()),
+                decltype(out_10d_global_desc),
-                                                  arithmetic_sequence_gen<0, 10, 1>::type,
+                decltype(out_10d_thread_desc.GetLengths()),
-                                                  9,
+                arithmetic_sequence_gen<0, 10, 1>::type,
-                                                  OutThreadCopyDataPerAccess_N,
+                9,
-                                                  OutThreadCopyDataPerAccess_N>(
+                OutThreadCopyDataPerAccess_N,
-                make_zero_array<index_t, 10>(), make_zero_array<index_t, 10>())
+                OutThreadCopyDataPerAccess_N>(make_zero_array<index_t, 10>(),
+                                              make_zero_array<index_t, 10>())
                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
            ThreadwiseGenericTensorSliceCopy_v1r1<decltype(out_10d_thread_desc),

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_CHWN_CYXK_KHWN_PADDED_HPP
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_tensor_slice_copy.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V1R3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_tensor_slice_copy.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_4d_tensor_op.hpp"
 #include "blockwise_2d_tensor_op.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp
@@ -2,7 +2,7 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V2_CHWN_CYXK_KHWN_LDS_DOUBLE_BUFFER
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_4d_tensor_op.hpp"
 #include "blockwise_2d_tensor_op.hpp"

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -2,8 +2,8 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -128,7 +128,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        // input blockwise copy
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
-        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
            BlockSize,
            Float,
            decltype(in_c_n1_b_n2_global_merged_desc),
@@ -155,20 +155,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        // operator for blockwise copy of weight into LDS
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already have blockwise offset built-in
-        auto blockwise_wei_copy =
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockSize,
-                                               Float,
+            Float,
-                                               decltype(wei_c_k_global_desc),
+            decltype(wei_c_k_global_desc),
-                                               decltype(wei_c_k_block_desc),
+            decltype(wei_c_k_block_desc),
-                                               decltype(wei_c_k_block_desc.GetLengths()),
+            decltype(wei_c_k_block_desc.GetLengths()),
-                                               WeiBlockCopySubLengths_C_K,
+            WeiBlockCopySubLengths_C_K,
-                                               WeiBlockCopyClusterLengths_C_K,
+            WeiBlockCopyClusterLengths_C_K,
-                                               Sequence<0, 1>, // thread_arrange_order [C, K]
+            Sequence<0, 1>, // thread_arrange_order [C, K]
-                                               Sequence<0, 1>, // src_access_order [C, K]
+            Sequence<0, 1>, // src_access_order [C, K]
-                                               Sequence<0, 1>, // dst_access_order [C, K]
+            Sequence<0, 1>, // dst_access_order [C, K]
-                                               WeiBlockCopyDataPerAccess_K,
+            WeiBlockCopyDataPerAccess_K,
-                                               WeiBlockCopyDataPerAccess_K>(
+            WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0});
-                {0, k_block_data_on_global}, {0, 0});
        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp
@@ -2,8 +2,8 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V3_NCHW_CYXK_NKHW_LDS_DOUBLE_BUFFER
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -125,7 +125,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
        // input blockwise copy
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
-        const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        const auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
            BlockSize,
            Float,
            decltype(in_c_n1_b_n2_global_merged_desc),
@@ -152,20 +152,19 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
        // operator for blockwise copy of weight into LDS
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already have blockwise offset built-in
-        const auto blockwise_wei_copy =
+        const auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockSize,
-                                               Float,
+            Float,
-                                               decltype(wei_c_k_global_desc),
+            decltype(wei_c_k_global_desc),
-                                               decltype(wei_c_k_block_desc),
+            decltype(wei_c_k_block_desc),
-                                               decltype(wei_c_k_block_desc.GetLengths()),
+            decltype(wei_c_k_block_desc.GetLengths()),
-                                               WeiBlockCopySubLengths_C_K,
+            WeiBlockCopySubLengths_C_K,
-                                               WeiBlockCopyClusterLengths_C_K,
+            WeiBlockCopyClusterLengths_C_K,
-                                               Sequence<0, 1>, // thread_arrange_order [C, K]
+            Sequence<0, 1>, // thread_arrange_order [C, K]
-                                               Sequence<0, 1>, // src_access_order [C, K]
+            Sequence<0, 1>, // src_access_order [C, K]
-                                               Sequence<0, 1>, // dst_access_order [C, K]
+            Sequence<0, 1>, // dst_access_order [C, K]
-                                               WeiBlockCopyDataPerAccess_K,
+            WeiBlockCopyDataPerAccess_K,
-                                               WeiBlockCopyDataPerAccess_K>(
+            WeiBlockCopyDataPerAccess_K>({0, k_block_data_on_global}, {0, 0});
-                {0, k_block_data_on_global}, {0, 0});
        // GEMM definition
        // c_mtx += transpose(a_mtx) * b_mtx

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -2,8 +2,8 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R2_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -166,7 +166,7 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
        // input blockwise copy
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
-        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
            BlockSize,
            Float,
            decltype(in_e_n0_ho0_wo0_b_n2_ho2_wo2_global_merged_desc),
@@ -196,18 +196,18 @@ struct GridwiseConvolutionImplicitGemm_v4r2_nchw_kcyx_nkhw_lds_double_buffer
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already have blockwise offset built-in
        auto blockwise_wei_copy =
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockwiseGenericTensorSliceCopy_v1_deprecated<BlockSize,
-                                               Float,
+                                                          Float,
-                                               decltype(wei_e_k_global_desc),
+                                                          decltype(wei_e_k_global_desc),
-                                               decltype(wei_e_k_block_desc),
+                                                          decltype(wei_e_k_block_desc),
-                                               decltype(wei_e_k_block_desc.GetLengths()),
+                                                          decltype(wei_e_k_block_desc.GetLengths()),
-                                               WeiBlockCopySubLengths_E_K,
+                                                          WeiBlockCopySubLengths_E_K,
-                                               WeiBlockCopyClusterLengths_E_K,
+                                                          WeiBlockCopyClusterLengths_E_K,
-                                               WeiBlockCopyThreadClusterArrangeOrder,
+                                                          WeiBlockCopyThreadClusterArrangeOrder,
-                                               WeiBlockCopySrcAccessOrder,
+                                                          WeiBlockCopySrcAccessOrder,
-                                               WeiBlockCopyDstAccessOrder,
+                                                          WeiBlockCopyDstAccessOrder,
-                                               WeiBlockCopySrcDataPerRead_E,
+                                                          WeiBlockCopySrcDataPerRead_E,
-                                               WeiBlockCopyDstDataPerWrite_K>(
+                                                          WeiBlockCopyDstDataPerWrite_K>(
                {0, k_block_data_on_global}, {0, 0});
        // GEMM definition

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp
@@ -2,8 +2,8 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R3_NCHW_KCYX_NKHW_LDS_DOUBLE_BUFFER
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "ConstantTensorDescriptor_deprecated.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "ConstantMergedTensorDescriptor_deprecated.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
 #include "blockwise_gemm.hpp"
@@ -165,7 +165,7 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer
        // input blockwise copy
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
-        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1<
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1_deprecated<
            BlockSize,
            Float,
            decltype(in_e_n1_ho1_wo1_b_n2_ho2_wo2_global_merged_desc),
@@ -195,18 +195,18 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already have blockwise offset built-in
        auto blockwise_wei_copy =
-            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+            BlockwiseGenericTensorSliceCopy_v1_deprecated<BlockSize,
-                                               Float,
+                                                          Float,
-                                               decltype(wei_e_k_global_desc),
+                                                          decltype(wei_e_k_global_desc),
-                                               decltype(wei_e_k_block_desc),
+                                                          decltype(wei_e_k_block_desc),
-                                               decltype(wei_e_k_block_desc.GetLengths()),
+                                                          decltype(wei_e_k_block_desc.GetLengths()),
-                                               WeiBlockCopySubLengths_E_K,
+                                                          WeiBlockCopySubLengths_E_K,
-                                               WeiBlockCopyClusterLengths_E_K,
+                                                          WeiBlockCopyClusterLengths_E_K,
-                                               WeiBlockCopyThreadClusterArrangeOrder,
+                                                          WeiBlockCopyThreadClusterArrangeOrder,
-                                               WeiBlockCopySrcAccessOrder,
+                                                          WeiBlockCopySrcAccessOrder,
-                                               WeiBlockCopyDstAccessOrder,
+                                                          WeiBlockCopyDstAccessOrder,
-                                               WeiBlockCopySrcDataPerRead_E,
+                                                          WeiBlockCopySrcDataPerRead_E,
-                                               WeiBlockCopyDstDataPerWrite_K>(
+                                                          WeiBlockCopyDstDataPerWrite_K>(
                {0, k_block_data_on_global}, {0, 0});
 #if 0

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp