refactor

97ba755f · Chao Liu · 8d460740
Commit 97ba755f authored May 31, 2019 by Chao Liu
20 changed files
--- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // reorder input
-    auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Hi, Wi, N>{});
+    auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
@@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // output
-    auto out_khwn_desc =
-        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp
@@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
@@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
        std::thread::hardware_concurrency());

    // output
-    auto out_khwn_desc =
-        make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, Ho, Wo, N>{});
+    auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

--- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

--- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    // reorder weight
-    auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<C, Y, X, K>{});
+    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

--- a/driver/driver.hip.cpp
+++ b/driver/driver.hip.cpp
@@ -443,7 +443,7 @@ int main(int argc, char* argv[])

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
-#elif 0
+#elif 1
    // 3x3 filter, 28x28 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 256;
@@ -455,7 +455,7 @@ int main(int argc, char* argv[])

    constexpr index_t HPad = 0;
    constexpr index_t WPad = 0;
-#elif 1
+#elif 0
    // 1x1 filter, 28x28 image
    constexpr index_t N  = 128;
    constexpr index_t C  = 512;
@@ -568,8 +568,8 @@ int main(int argc, char* argv[])
    auto lower_pads = Sequence<HPad, WPad>{};
    auto upper_pads = Sequence<HPad, WPad>{};

-    auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, C, HI, WI>{});
-    auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence<K, C, Y, X>{});
+    auto in_nchw_desc  = make_ConstantTensorDescriptor_packed(Sequence<N, C, HI, WI>{});
+    auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence<K, C, Y, X>{});
    auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor(
        in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads);


--- a/src/include/ConstantMergedTensorDescriptor.hip.hpp
+++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp
@@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor

    __host__ __device__ static Array<index_t, nDim> GetMultiIndexFrom1dIndex(index_t id)
    {
-        constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths());
+        constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths());

        return dummy_desc.GetMultiIndexFrom1dIndex(id);
    }
@@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT
 }

 template <class TDesc>
-__host__ __device__ void print_ConstantMergedTensorDescriptor(TDesc, const char* s)
+__host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc)
 {
-    print_ConstantTensorDescriptor(TDesc::GetOriginalTensorDescriptor(), s);
+    print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
 }
--- a/src/include/ConstantTensorDescriptor.hip.hpp
+++ b/src/include/ConstantTensorDescriptor.hip.hpp
--- a/src/include/blockwise_4d_tensor_op.hip.hpp
+++ b/src/include/blockwise_4d_tensor_op.hip.hpp
@@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst

    constexpr auto dst_desc = DstDesc{};

-    constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths());
+    constexpr auto desc = make_ConstantTensorDescriptor_packed(dst_desc.GetLengths());

 #if 0
    if(get_thread_local_1d_id() == 0)
@@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
-    constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{});
+    constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{});

    constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

@@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1
        constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead);

        constexpr auto ref_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(Sequence<L0, L1, L2, read_per_d3>{});
+            make_ConstantTensorDescriptor_packed(Sequence<L0, L1, L2, read_per_d3>{});

        constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize;

@@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded

        constexpr auto src_desc = SrcDesc{};
        constexpr auto dst_desc = DstDesc{};
-        constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{});
+        constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(DstOpLengths{});

        constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0);
        constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1);
@@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3
            }
        }

-        constexpr auto thread_cluster_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{});
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(ThreadPerDims{});
        const auto thread_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

@@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll
@@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3
        constexpr index_t nloop_d2 = L2 / thread_per_d2;
        constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead);

-        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed(
            Sequence<nloop_d0, nloop_d1, nloop_d2, nloop_d3 * DataPerRead>{});

 #pragma unroll

--- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp
@@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                      "wrong!");

        // thread cluster
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
            DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
@@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths);
+            make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);

        return thread_tensor_desc.GetElementSpace();
    }
@@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
-            thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc =
+            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});
@@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1

        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});

-        constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed(
-            thread_sub_tensor_lengths * repeat_lengths);
+        constexpr auto thread_tensor_desc =
+            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){});

--- a/src/include/blockwise_tensor_slice_op.hip.hpp
+++ b/src/include/blockwise_tensor_slice_op.hip.hpp
@@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
            src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster);

        constexpr auto thread_cluster_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths);
+            make_ConstantTensorDescriptor_packed(thread_cluster_lengths);

        // sanity check: data type
        static_assert(is_same<Float, float>::value, "wrong! only support float for now!\n");
@@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_packed(thread_tensor_lengths);

        return thread_tensor_desc.GetElementSpace();
    }
@@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};
@@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3
        constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths;

        constexpr auto thread_tensor_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths);
+            make_ConstantTensorDescriptor_packed(thread_tensor_lengths);

        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id_) {
            constexpr auto repeat_multi_id = decltype(repeat_multi_id_){};

--- a/src/include/conv_common.hip.hpp
+++ b/src/include/conv_common.hip.hpp
@@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc
    constexpr auto HO = HI + 1 - Y;
    constexpr auto WO = WI + 1 - X;

-    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
 }

 template <class InDesc, class WeiDesc, class LowerPads, class UpperPads>
@@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4
    constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y;
    constexpr auto WO = WI + WPadLow + WPadUp + 1 - X;

-    return make_ConstantTensorDescriptor_default_rank_packed(Sequence<N, K, HO, WO>{});
+    return make_ConstantTensorDescriptor_packed(Sequence<N, K, HO, WO>{});
 }

 template <class InDesc, class WeiDesc, class OutDesc>

--- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp
@@ -45,23 +45,23 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
        constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2);
        constexpr index_t X = wei_kcyx_global_desc.GetLength(I3);

-        constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_packed(
            Sequence<K, C * Y * X>{}); // 2d view of wei for blockwise copy

        constexpr index_t HiPerBlock = HoPerBlock + Y - 1;
        constexpr index_t WiPerBlock = WoPerBlock + X - 1;

-        constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<NPerBlock, CPerBlock, HiPerBlock, WiPerBlock>{},
            Number<InBlockCopyDataPerRead>{});

-        constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<KPerBlock, CPerBlock * Y * X>{},
            Number<WeiBlockCopyDataPerRead>{}); // 2d view of wei for blockwise copy

-        constexpr auto wei_kcyx_block_desc = make_ConstantTensorDescriptor_default_rank(
-            Sequence<KPerBlock, CPerBlock, Y, X>{},
-            Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});
+        constexpr auto wei_kcyx_block_desc =
+            make_ConstantTensorDescriptor(Sequence<KPerBlock, CPerBlock, Y, X>{},
+                                          Sequence<wei_ke_block_desc.GetStride(I0), Y * X, X, 1>{});

        // shared mem
        constexpr index_t in_block_element_size =
@@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw
        constexpr index_t HiPerThread = HoPerThread + Y - 1;
        constexpr index_t WiPerThread = WoPerThread + X - 1;

-        constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor_default_rank(
+        constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor(
            Sequence<NPerThread, CPerThread, HiPerThread, WiPerThread>{},
            in_nchw_block_desc.GetStrides());

-        constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor_default_rank(
+        constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor(
            Sequence<KPerThread, CPerThread, Y, X>{}, wei_kcyx_block_desc.GetStrides());

        constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor(

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
@@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);
        constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock);

-        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KBlockWork, HBlockWork, WBlockWork, NBlockWork>{});

        const auto block_work_multi_id =
@@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockCopyDataPerRead_N>{});

@@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

 // blockwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
@@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn

        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});

        // LDS tensor view
        //   be careful of alignment
@@ -111,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

@@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy
@@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
            constexpr index_t K1 = KPerBlock / KPerThread;

 #if 0
-            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed(
                Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});

-            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
+            constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed(
                Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
 #else
            constexpr auto out_10d_global_desc =

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
@@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

@@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
@@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
        constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock);
        constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock);

-        constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed(
            Sequence<NBlockWork, KBlockWork, HBlockWork, WBlockWork>{});

        const auto block_work_multi_id =
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw

        // global tensor view
        constexpr auto wei_c_k_global_desc =
-            make_ConstantTensorDescriptor_default_rank(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});
+            make_ConstantTensorDescriptor(Sequence<C, K>{}, Sequence<Y * X * K, 1>{});

        // LDS tensor view
        //   be careful of alignment
@@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
                                                    GemmDataPerReadA,
                                                    GemmDataPerReadB);

-        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, HoPerBlock, WoPerBlock, NPerBlock>{},
            Number<InBlockReorderDataPerWrite_N>{});

@@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
        static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                      "GemmDataPerReadB alignment requirement is not meet");

-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerRead_K, GemmDataPerReadA)>{});

        // tensor view of threadwise output in register
-        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed(
+        constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
            Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

        // blockwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
@@ -88,7 +88,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
+            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -111,9 +111,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw

        //     memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto in_c_n1_b_n2_block_mem_desc =
-            make_ConstantTensorDescriptor_default_rank_aligned(
-                Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
+        constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //     this check is ad-hoc
        //     TODO: need to properly implement tensor descriptor with alignment
@@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

@@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
            // define tensor descriptor for threadwise copy
            //     output memory layout descriptor in register
            constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
-                make_ConstantTensorDescriptor_default_rank_packed(
+                make_ConstantTensorDescriptor_packed(
                    Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});

            //     output tensor descriptor in register, src of threadwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
@@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
+            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -114,9 +114,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw

        //     memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto in_c_n1_b_n2_block_mem_desc =
-            make_ConstantTensorDescriptor_default_rank_aligned(
-                Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});
+        constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned(
+            Sequence<CPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //     this check is ad-hoc
        //     TODO: need to properly implement tensor descriptor with alignment
@@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<CPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDataPerAccess_K, GemmDataPerReadA)>{});

@@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
            // define tensor descriptor for threadwise copy
            //     output memory layout descriptor in register
            constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
-                make_ConstantTensorDescriptor_default_rank_packed(
+                make_ConstantTensorDescriptor_packed(
                    Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});

            //     output tensor descriptor in register, src of threadwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp
@@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
+            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
            Sequence<3, 6, 7>{},
            Sequence<5>{});

-#if 0
-        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
-        {
-            print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc,
-                                           "in_n0_n1_n2_h_w_global_desc: ");
-            print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: ");
-            print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc,
-                                                 "in_e_n1_b_n2_global_merged_desc: ");
-        }
-#endif
-
        //     memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //     this check is ad-hoc
@@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

@@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
            // define tensor descriptor for threadwise copy
            //     output memory layout descriptor in register
            constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
-                make_ConstantTensorDescriptor_default_rank_packed(
+                make_ConstantTensorDescriptor_packed(
                    Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});

            //     output tensor descriptor in register, src of threadwise copy

--- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
+++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp
@@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_default_rank_packed(Sequence<KBlockWork, BBlockWork>{});
+            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
@@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw

        //     memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, N1, BPerBlock, N2>{}, Number<InBlockCopyDstDataPerWrite_N2>{});

        //     this check is ad-hoc
@@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
-        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned(
+        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, KPerBlock>{},
            Number<mod_conv::max(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

@@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
            // define tensor descriptor for threadwise copy
            //     output memory layout descriptor in register
            constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc =
-                make_ConstantTensorDescriptor_default_rank_packed(
+                make_ConstantTensorDescriptor_packed(
                    Sequence<KPerBlock / (K1 * K2), 1, K2, N1, 1, 1, 1, N2>{});

            //     output tensor descriptor in register, src of threadwise copy