Commit 5514dc0b authored by Chao Liu's avatar Chao Liu
Browse files

prototype dynamic tensor descriptor

parent 0b7fcca6
......@@ -8,6 +8,76 @@
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
// Map an implicit-GEMM convolution onto a GEMM problem by rewriting the
// input tensor descriptor [N, C, Hi, Wi] as a 2D [GemmK, GemmN] matrix:
//   1) pad H and W (left pad first, then right pad)    -> [N, C, Hip, Wip]
//   2) embed padded H/W into (Y, Ho) and (X, Wo), where the filter-tap
//      index advances by ConvDilation and the output-pixel index advances
//      by ConvStride                                   -> [N, C, Y, Ho, X, Wo]
//   3) merge (C, Y, X) -> GemmK and (N, Ho, Wo) -> GemmN -> [GemmK, GemmN]
//
// wei_k_c_y_x_global_desc  : weight descriptor; only the Y/X lengths are read
// in_n_c_hi_wi_global_desc : input descriptor; the tensor being transformed
// out_n_k_ho_wo_global_desc: output descriptor; only the Ho/Wo lengths are read
// conv_strides, conv_dilations, in_left_pads, in_right_pads: [H, W] pairs
//
// Returns a tuple holding only the transformed input (GemmK x GemmN)
// descriptor; weight/output GEMM descriptors are presumably added in a
// later revision of this prototype -- TODO confirm.
__host__ __device__ constexpr auto
map_convolution_into_gemm(const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
                          const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
                          const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
                          const Array<index_t, 2> conv_strides,
                          const Array<index_t, 2> conv_dilations,
                          const Array<index_t, 2> in_left_pads,
                          const Array<index_t, 2> in_right_pads)
{
    const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
    const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);

    const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
    const index_t X = wei_k_c_y_x_global_desc.GetLength(3);

    const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
    const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);

    const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
    const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);

    const index_t ConvStrideH = conv_strides[0];
    const index_t ConvStrideW = conv_strides[1];

    const index_t ConvDilationH = conv_dilations[0];
    const index_t ConvDilationW = conv_dilations[1];

    const index_t InLeftPadH = in_left_pads[0];
    const index_t InLeftPadW = in_left_pads[1];

    const index_t InRightPadH = in_right_pads[0];
    const index_t InRightPadW = in_right_pads[1];

    // input tensor: apply left pad, then right pad. The right pad acts on
    // the already-left-padded lengths, hence Hi + InLeftPadH / Wi + InLeftPadW.
    const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
        transform_dynamic_tensor_descriptor(
            in_n_c_hi_wi_global_desc,
            make_tuple(DynamicPassThrough{N},
                       DynamicPassThrough{C},
                       DynamicLeftPad{Hi, InLeftPadH},
                       DynamicLeftPad{Wi, InLeftPadW}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicRightPad{Hi + InLeftPadH, InRightPadH},
                   DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // split padded H into (Y, Ho) and padded W into (X, Wo):
    // low_idx = ConvDilation * filter_idx + ConvStride * output_idx + 0
    const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_hip_wip_global_desc,
        make_tuple(DynamicPassThrough{N},
                   DynamicPassThrough{C},
                   DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
                   DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

    // collapse to the GEMM view: GemmK = C*Y*X, GemmN = N*Ho*Wo
    const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
        in_n_c_y_ho_x_wo_global_desc,
        make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
        make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return make_tuple(in_gemmk_gemmn_global_desc);
}
template <index_t BlockSize>
struct DummyDynamicTransform
......@@ -462,64 +532,15 @@ struct DummyDynamicTransform
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads) const
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
const auto transformed_tensor_descs = map_convolution_into_gemm(wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{{Y, Ho}, {ConvDilationH, ConvStrideH, 0}},
DynamicEmbed<2>{{X, Wo}, {ConvDilationW, ConvStrideW, 0}}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{{C, Y, X}}, DynamicMerge<3>{{N, Ho, Wo}}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
#pragma unroll 1
for(index_t iter = 0; iter < 100; ++iter)
......
......@@ -186,7 +186,8 @@ struct DynamicRightPad
}
};
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +
// coefficients[nDimUp]
template <index_t NDimUp>
struct DynamicEmbed
{
......@@ -277,7 +278,7 @@ struct DynamicMerge
__host__ __device__ explicit constexpr DynamicMerge(const LowerIndex& low_lengths)
: low_lengths_{low_lengths},
low_lengths_scan_{reverse_inclusive_scan_on_array(
low_lengths_scan_{reverse_exclusive_scan_on_array(
low_lengths, math::multiplies<index_t>{}, index_t{1})},
up_length_{reduce_on_array(low_lengths, math::multiplies<index_t>(), 1)}
{
......@@ -303,16 +304,16 @@ struct DynamicMerge
static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
index_t itmp = idx_up[0];
index_t tmp = idx_up[0];
#pragma unroll
for(index_t i; i < NDimLow - 1; ++i)
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / low_lengths_scan_[i];
itmp -= idx_low[i] * low_lengths_scan_[i];
idx_low(i) = tmp / low_lengths_scan_[i];
tmp -= idx_low[i] * low_lengths_scan_[i];
}
idx_low(NDimLow - 1) = itmp;
idx_low(NDimLow - 1) = tmp;
}
// idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
......@@ -334,10 +335,14 @@ struct DynamicMerge
// 1) If idx_up_diff is known at compile-time, then idx_low_diff_const
// can be calculated at compile-time.
// 2) If idx_up_diff is not known at compile-time, but its value
// doesn't change during the whole kernel execution, then idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler generated ISA should
only calculate idx_low_diff_const once and save it during the whole kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be computed at
// doesn't change during the whole kernel execution, then
// idx_low_diff_const also
// doesn't change during the whole kernel execution. Compiler generated
// ISA should
only calculate idx_low_diff_const once and save it during the whole
// kernel execution
// If neither 1) nor 2) is satisfied, then the calculation will also be
// computed at
// run-time each time this function is called, and can be very expensive.
LowerIndex idx_low_diff_const = CalculateLowerIndex(idx_up_diff);
......@@ -346,7 +351,7 @@ struct DynamicMerge
index_t carry = 0;
#pragma unroll
for(index_t i = NDimLow - 1; i > 1; --i)
for(index_t i = NDimLow - 1; i > 0; --i)
{
// this should be saved as well
index_t idx_low_length_minus_idx_low_diff_const =
......
......@@ -168,10 +168,13 @@ struct DynamicTransformedTensorDescriptor
is_valid_sequence_map<sorted_low_dimension_ids>{},
"wrong! LowDimensionIds is not configured correctly");
// TODO: sanity check: while an up-dimension could be associated with multiple
// transformation, a low-dimension should be associated with only one transformation
// TODO: sanity check: while an up-dimension could be associated with
// multiple
// transformation, a low-dimension should be associated with only one
// transformation
// TODO: sanity-check: GetLowerLengths of each transform should be consistent with lengths
// TODO: sanity-check: GetLowerLengths of each transform should be
// consistent with lengths
// of lower-tensor-descriptor
}
......@@ -222,13 +225,15 @@ struct DynamicTransformedTensorDescriptor
return low_tensor_desc_.GetElementSpace();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const { return low_tensor_desc_; }
template <typename LowIdx, typename UpIdx>
__host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
{
static_for<0, NTransform, 1>{}([&](auto itran) constexpr {
auto tran = transforms_.At(itran);
const auto tran = transforms_.At(itran);
auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
tran.CalculateLowerIndex(idx_low_part, idx_up_part);
......
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "sequence.hpp"
#include "functional2.hpp"
#include "sequence.hpp"
namespace ck {
......
......@@ -288,7 +288,7 @@ reverse_exclusive_scan_on_array(const Array<TData, NSize>& x, Reduce f, TData in
r = f(r, x[i]);
}
y(NSize - 1) = r;
y(0) = r;
return y;
}
......
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#include "config.hpp"
#include "utility.hpp"
#include "integral_constant.hpp"
#include "number.hpp"
#include "float_type.hpp"
#include "type.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "math.hpp"
#include "sequence.hpp"
#include "array.hpp"
#include "array_helper.hpp"
#include "config.hpp"
#include "float_type.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "functional3.hpp"
#include "functional4.hpp"
#include "in_memory_operation.hpp"
#include "integral_constant.hpp"
#include "math.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "utility.hpp"
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
......
......@@ -2,7 +2,6 @@
#define CK_FUNCTIONAL_HPP
#include "integral_constant.hpp"
#include "sequence.hpp"
#include "type.hpp"
namespace ck {
......@@ -56,8 +55,10 @@ struct static_if<true>
__host__ __device__ constexpr auto operator()(F f) const
{
// This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
// this will make "f" a generic lambda, so that "f" won't be compiled until being
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here
f(forwarder{});
return Type{};
......@@ -84,8 +85,10 @@ struct static_if<false>
__host__ __device__ static void Else(F f)
{
// This is a trick for compiler:
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will use it,
// this will make "f" a generic lambda, so that "f" won't be compiled until being
// Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will
// use it,
// this will make "f" a generic lambda, so that "f" won't be compiled
// until being
// instantiated here
f(forwarder{});
}
......
#ifndef CK_FUNCTIONAL3_HPP
#define CK_FUNCTIONAL3_HPP
#include "array.hpp"
#include "functional.hpp"
#include "functional2.hpp"
#include "sequence.hpp"
#include "array.hpp"
namespace ck {
......@@ -83,8 +83,10 @@ struct ford_impl<Sequence<>, Orders>
} // namespace detail
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford will loop over each
// Lengths is Sequence<...>, it is the length of each dimension for
// N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which static_ford
// will loop over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
......@@ -106,8 +108,10 @@ struct static_ford
}
};
// Lengths is Sequence<...>, it is the length of each dimension for N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop over each
// Lengths is Sequence<...>, it is the length of each dimension for
// N-dimensional loop
// Orders is Sequence<...>, it is the order of dimension in which ford will loop
// over each
// dimension
template <class Lengths,
class Orders = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::type>
......
......@@ -165,6 +165,6 @@ struct less
};
} // namespace math
} // namspace ck
} // namespace ck
#endif
#ifndef CK_PRINT_ARRAY_HPP
#define CK_PRINT_ARRAY_HPP
#include "array.hpp"
namespace ck {
// Debug helper: prints label `s`, the element count, and every element of a
// uint32_t Array via printf ("%u" per element). Usable from host and device
// code; device-side printf is serialized and for debugging only.
// Each size needs its own fixed format string, and an out-of-range a[k]
// would not compile, so static_if instantiates only the branch matching
// the compile-time size.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
{
// compile-time number of elements
constexpr index_t nsize = a.GetSize();
// only sizes 1..10 have a printf branch below
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
// Debug helper: int32_t overload of print_array. Prints label `s`, the
// element count, and every element via printf ("%d" per element). Usable
// from host and device code; device-side printf is for debugging only.
// static_if instantiates only the branch matching the compile-time size,
// since an out-of-range a[k] would not compile.
template <index_t NSize>
__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
{
// compile-time number of elements
constexpr index_t nsize = a.GetSize();
// only sizes 1..10 have a printf branch below
static_assert(nsize > 0 && nsize <= 10, "wrong!");
static_if<nsize == 1>{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); });
static_if<nsize == 3>{}(
[&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); });
static_if<nsize == 4>{}(
[&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); });
static_if<nsize == 5>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]);
});
static_if<nsize == 6>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]);
});
static_if<nsize == 7>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6]);
});
static_if<nsize == 8>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7]);
});
static_if<nsize == 9>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8]);
});
static_if<nsize == 10>{}([&](auto) {
printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n",
s,
nsize,
a[0],
a[1],
a[2],
a[3],
a[4],
a[5],
a[6],
a[7],
a[8],
a[9]);
});
}
} // namespace ck
#endif
#ifndef CK_PRINT_SEQUENCE_HPP
#define CK_PRINT_SEQUENCE_HPP
#include "sequence.hpp"
namespace ck {
// Debug helper: prints label `s`, the size, and the compile-time integer
// pack Xs... of a Sequence via printf. Usable from host and device code.
// Each size needs its own fixed format string, so static_if instantiates
// only the branch whose format matches the pack length; sizes 0..10 are
// supported (enforced by the static_assert).
template <index_t... Xs>
__host__ __device__ void print_sequence(const char* s, Sequence<Xs...>)
{
// compile-time number of elements in the pack
constexpr index_t nsize = Sequence<Xs...>::Size();
static_assert(nsize <= 10, "wrong!");
static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
static_if<nsize == 1>{}([&](auto) { printf("%s size %u, {%u}\n", s, nsize, Xs...); });
static_if<nsize == 2>{}([&](auto) { printf("%s size %u, {%u %u}\n", s, nsize, Xs...); });
static_if<nsize == 3>{}([&](auto) { printf("%s size %u, {%u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 4>{}([&](auto) { printf("%s size %u, {%u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 5>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 6>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 7>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 8>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 9>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
......@@ -2,8 +2,8 @@
#define CK_TUPLE_HPP
#include "integral_constant.hpp"
#include "type.hpp"
#include "sequence.hpp"
#include "type.hpp"
namespace ck {
......
......@@ -9,6 +9,6 @@ __device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
__device__ index_t get_block_1d_id() { return blockIdx.x; }
} // namspace ck
} // namespace ck
#endif
......@@ -40,6 +40,31 @@ void device_dummy_dynamic_transform(InDesc,
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
{
const auto tensor_descs = map_convolution_into_gemm(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
print_array("cpu: in_gemmk_gemmn_global_desc:", in_gemmk_gemmn_global_desc.GetLengths());
const auto idx0 = MultiIndex<2>({2591, 36991});
const auto idx1 = in_gemmk_gemmn_global_desc.CalculateLowerIndex(idx0);
const auto idx2 =
in_gemmk_gemmn_global_desc.GetLowerTensorDescriptor().CalculateLowerIndex(idx1);
const index_t offset = in_gemmk_gemmn_global_desc.CalculateOffset(idx0);
print_array("idx0:", idx0);
print_array("idx1:", idx1);
print_array("idx2:", idx2);
printf("offset %d\n", offset);
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
......@@ -54,6 +79,7 @@ void device_dummy_dynamic_transform(InDesc,
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
#if 0
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
......@@ -92,6 +118,7 @@ void device_dummy_dynamic_transform(InDesc,
in_right_pads);
}
}
#endif
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -5,8 +5,7 @@
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print_array.hpp"
#include "print_sequence.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
......@@ -496,10 +495,10 @@ int main(int argc, char* argv[])
ostream_tensor_descriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
ostream_tensor_descriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
ostream_tensor_descriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
print_sequence("LeftPads", LeftPads{});
print_sequence("RightPads", RightPads{});
print_sequence("ConvStrides", ConvStrides{});
print_sequence("ConvDilations", ConvDilations{});
print_array("LeftPads", LeftPads{});
print_array("RightPads", RightPads{});
print_array("ConvStrides", ConvStrides{});
print_array("ConvDilations", ConvDilations{});
#if 1
using in_data_t = float;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment