Commit d3405258 authored by Chao Liu

prototype dynamic descriptor

parent 834eb24c
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
template <index_t BlockSize>
struct DummyDynamicTransform
{
__device__ void Run_(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t k_block_num,
index_t c_block_num,
index_t y_block_num,
index_t x_block_num) const
{
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
p_wei_global[0] = wei_k_c_y_x_global_desc.GetElementSize();
p_wei_global[1] = wei_k_c_y_x_global_desc.GetElementSpace();
const index_t k_block_num_stride = c_block_num * y_block_num * x_block_num;
const index_t c_block_num_stride = y_block_num * x_block_num;
const index_t y_block_num_stride = x_block_num;
index_t tmp = get_block_1d_id();
#if 0
const index_t k_block = tmp / k_block_num_stride;
tmp -= k_block * k_block_num_stride;
const index_t c_block = tmp / c_block_num_stride;
tmp -= c_block * c_block_num_stride;
const index_t y_block = tmp / y_block_num_stride;
tmp -= y_block * y_block_num_stride;
const index_t x_block = tmp;
#else
const index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);
tmp -= k_block * k_block_num_stride;
const index_t c_block = __llvm_amdgcn_readfirstlane_i32(tmp / c_block_num_stride);
tmp -= c_block * c_block_num_stride;
const index_t y_block = __llvm_amdgcn_readfirstlane_i32(tmp / y_block_num_stride);
tmp -= y_block * y_block_num_stride;
const index_t x_block = __llvm_amdgcn_readfirstlane_i32(tmp);
#endif
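// readfirstlane presumably makes these block coordinates wave-uniform, so the compiler can
// keep them in scalar registers instead of replicating them in one VGPR per lane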
const index_t k_thread = p_in_global[get_thread_local_1d_id()];
const index_t c_thread = p_in_global[get_thread_local_1d_id() + 1];
const index_t y_thread = p_in_global[get_thread_local_1d_id() + 2];
const index_t x_thread = p_in_global[get_thread_local_1d_id() + 3];
p_wei_global[3] = wei_k_c_y_x_global_desc.CalculateOffset(
{k_block + k_thread, c_block + c_thread, y_block + y_thread, x_block + x_thread});
}
__device__ void Run(index_t* const __restrict__ p_wei_global,
index_t* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
const Array<index_t, 2> conv_strides,
const Array<index_t, 2> conv_dilations,
const Array<index_t, 2> in_left_pads,
const Array<index_t, 2> in_right_pads,
index_t,
index_t,
index_t,
index_t) const
{
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(1);
const index_t Y = 3;
const index_t X = 3;
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
const index_t ConvStrideH = conv_strides[0];
const index_t ConvStrideW = conv_strides[1];
const index_t ConvDilationH = conv_dilations[0];
const index_t ConvDilationW = conv_dilations[1];
const index_t InLeftPadH = in_left_pads[0];
const index_t InLeftPadW = in_left_pads[1];
const index_t InRightPadH = in_right_pads[0];
const index_t InRightPadW = in_right_pads[1];
#endif
// define transform
// pass through
auto f_lower_idx_diff_passthrough = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// pad
auto f_lower_idx_diff_pad = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// embed
auto f_lower_idx_diff_embed = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t coeff0,
const index_t coeff1) {
idx_low_diff = coeff0 * idx_up_diff_0 + coeff1 * idx_up_diff_1;
};
// unmerge
auto f_lower_idx_diff_unmerge = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t up_length_1) {
idx_low_diff = up_length_1 * idx_up_diff_0 + idx_up_diff_1;
};
// merge
auto f_lower_idx_diff_merge_v1 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
#if 0 // positive
bool do_carry = idx_low_tmp >= idx_low_bound;
index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < 0;
index_t idx_low_new = do_borrow ? idx_low_tmp + idx_low_bound : idx_low_tmp;
carry = do_borrow ? -1 : 0;
#endif
idx_low_diff = idx_low_new - idx_low_old;
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v2 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry;
index_t idx_low_bound_minus_idx_low_diff_const = idx_low_bound - idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound_minus_idx_low_diff_const;
idx_low_diff =
do_carry ? -idx_low_bound_minus_idx_low_diff_const : idx_low_diff_const;
idx_low_diff += carry;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < -idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= carry;
carry = do_borrow ? 1 : carry;
#endif
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v3 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& negative_carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t neg_idx_low_tmp = negative_carry - idx_low_old;
index_t idx_low_diff_const_minus_idx_low_bound = idx_low_diff_const - idx_low_bound;
#if 1
bool do_carry = neg_idx_low_tmp <= idx_low_diff_const_minus_idx_low_bound;
idx_low_diff =
do_carry ? idx_low_diff_const_minus_idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_carry ? -1 : 0;
#else
bool do_borrow = neg_idx_low_tmp > idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
index_t negative_carry = 0;
f_carry_arithmetic(idx_low_diff_2,
negative_carry,
idx_low_old_2,
idx_low_diff_const_2,
idx_low_bound_2);
f_carry_arithmetic(idx_low_diff_1,
negative_carry,
idx_low_old_1,
idx_low_diff_const_1,
idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 - negative_carry;
};
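// Worked example (illustrative only, values assumed): merging (C, Y, X) = (8, 3, 3) into
// GemmKTotal and stepping GemmKTotal by a constant +8 gives per-dimension constants
// (idx_low_diff_const_0/1/2) = (0, 2, 2), since 8 = 0*9 + 2*3 + 2. Starting from
// (c, y, x) = (0, 1, 2):
//   x: 2 + 2 = 4 >= 3      -> x becomes 1, carry = 1
//   y: 1 + 2 + 1 = 4 >= 3  -> y becomes 1, carry = 1
//   c: 0 + 0 + 1 = 1       -> c becomes 1
// so the lower index diff is (+1, 0, -1); v1/v2/v3 compute the same result with different
// arithmetic to help the compiler generate better code.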
index_t idx[20];
index_t idx_diff[20];
index_t const_tmp[6];
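// Apparent layout of idx[]/idx_diff[] (inferred from the transform calls below):
//   idx[0]      : linear offset into the input tensor
//   idx[1..4]   : N, C, Hi, Wi
//   idx[5..8]   : N, C, Hip, Wip (padded input)
//   idx[9..14]  : N, C, Y, Ho, X, Wo
//   idx[15..16] : GemmKTotal = merge(C, Y, X), GemmN = merge(N, Ho, Wo)
//   idx[17..19] : GemmK, GemmKPack, GemmN
// const_tmp[0..2] / const_tmp[3..5] appear to hold the precomputed per-dimension lower-index
// diffs fed to the two merge transforms.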
// populate const
const index_t GemmKPack = p_wei_global[0];
#if 1
for(index_t i = 0; i < 6; ++i)
{
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
for(index_t i = 0; i < 20; ++i)
{
idx[i] = p_wei_global[get_thread_local_1d_id() + i];
}
// offset
idx[0] = idx[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx[4] * in_n_c_hi_wi_global_desc.GetStride(3);
// start lowering diff
#pragma unroll 1
for(index_t i = 0; i < 100; ++i)
{
for(index_t j = 0; j < 20; ++j)
{
idx_diff[j] = 0;
}
idx_diff[17] = 8;
// stage 4
// Unmerge(GemmKTotal) => GemmK, GemmKPack
f_lower_idx_diff_unmerge(idx_diff[15], idx_diff[17], idx_diff[18], GemmKPack);
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp[0],
const_tmp[1],
const_tmp[2],
C,
Y,
X);
#if 0
// Merge(N, Ho, Wo) => GemmN
f_lower_idx_diff_merge(idx_diff[9],
idx_diff[12],
idx_diff[14],
idx_diff[16],
idx[9],
idx[12],
idx[14],
const_tmp[3],
const_tmp[4],
const_tmp[5],
N,
Ho,
Wo);
#endif
// stage 2
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[6], idx_diff[10]);
// Embed(Hip) => Y, Ho
f_lower_idx_diff_embed(
idx_diff[7], idx_diff[11], idx_diff[12], ConvDilationH, ConvStrideH);
// Embed(Wip) => X, Wo
f_lower_idx_diff_embed(
idx_diff[8], idx_diff[13], idx_diff[14], ConvDilationW, ConvStrideW);
// stage 1
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[1], idx_diff[5]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[2], idx_diff[6]);
// Pad(Hi) => Hip
f_lower_idx_diff_pad(idx_diff[3], idx_diff[7]);
// Pad(Wi) => Wip
f_lower_idx_diff_pad(idx_diff[4], idx_diff[8]);
// stage 0
// offset_diff
idx_diff[0] = idx_diff[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx_diff[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx_diff[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx_diff[4] * in_n_c_hi_wi_global_desc.GetStride(3);
#if 0
// update idx
for(index_t i = 0; i < 20; ++ i)
{
idx[i] += idx_diff[i];
}
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#elif 0 // no pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// Hi, Wi
idx[3] += idx_diff[3];
idx[4] += idx_diff[4];
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#endif
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
idx[0],
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
};
} // namespace ck
#endif
@@ -96,13 +96,10 @@ struct DummyStaticTransform
 auto coord = typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);
-if(get_block_1d_id() < coord.GetOffset())
+#pragma unroll 1
+for(index_t k = 0; k < 100; ++k)
 {
-for(index_t k = 0; k < 1; ++k)
-{
-for(index_t n = 0; n < 4; ++n)
-{
-auto tmp = coord + Array<index_t, 2>{k, n};
+coord += Array<index_t, 2>{8, 0};
 Float value = 1;
 transfer_data<Float,
@@ -116,13 +113,11 @@ struct DummyStaticTransform
 true,
 1,
 p_in_global,
-tmp.GetOffset(),
-tmp.IsOffsetValidAssumingUpperIndexIsValid(),
+coord.GetOffset(),
+coord.IsOffsetValidAssumingUpperIndexIsValid(),
 in_gemmk_gemmn_global_desc.GetElementSpace());
 }
 }
-}
-}
 };
 } // namespace ck
#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
namespace ck {
struct DynamicPassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
index_t low_length_;
__host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
{
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ constexpr auto GetUpperLengths() const { return Array<index_t, 1>{low_length_}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
template <index_t NDimLow>
struct DynamicMerge
{
static constexpr index_t ndim_low_ = NDimLow;
static constexpr index_t ndim_up_  = 1;
using LowerIndex = MultiIndex<ndim_low_>;
using UpperIndex = MultiIndex<ndim_up_>;
Array<index_t, NDimLow> low_lengths_;
index_t up_length_;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
return Number<ndim_low_>{};
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
return Number<ndim_up_>{};
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
return Array<index_t, 1>{up_length_};
}
__host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
// pseudo strides of the lower dimensions, computed at run-time from low_lengths_
Array<index_t, NDimLow> pseudo_low_strides;
pseudo_low_strides(NDimLow - 1) = 1;
for(index_t i = NDimLow - 1; i > 0; --i)
{
pseudo_low_strides(i - 1) = pseudo_low_strides[i] * low_lengths_[i];
}
index_t itmp = idx_up[0];
for(index_t i = 0; i < NDimLow - 1; ++i)
{
idx_low(i) = itmp / pseudo_low_strides[i];
itmp -= idx_low[i] * pseudo_low_strides[i];
}
idx_low(NDimLow - 1) = itmp;
return idx_low;
}
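// Example (illustrative): with low_lengths_ = {2, 3, 4} the pseudo strides are {12, 4, 1},
// so an upper index of 17 maps to the lower index (17/12, (17-12)/4, 17-12-4) = (1, 1, 1).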
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler
// This function assumes idx_low_old is not out-of-bound
__host__ __device__ constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old) const
{
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, NDimLow>();
}
else
{
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);
// find out the last low dimension that changed
index_t last_changed_low_dim = 0;
static_for<0, NDimLow, 1>{}([&](auto i) {
if(idx_low_diff_tmp[i] != 0)
{
last_changed_low_dim = i;
}
});
LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;
if(idx_up_diff[0] > 0)
{
// do carry check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool carry = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(carry)
{
++idx_low_new(i);
}
carry = false;
if(idx_low_new[i] >= low_lengths_[i])
{
idx_low_new(i) -= low_lengths_[i];
carry = true;
}
}
});
// highest dimension, no out-of-bound check
if(carry)
{
++idx_low_new(0);
}
}
else
{
// do borrow check on each low dimension in reversed order
// starting from the first digit that changed
// don't check the highest dimension
bool borrow = false;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
if(i <= last_changed_low_dim)
{
if(borrow)
{
--idx_low_new(i);
}
borrow = false;
if(idx_low_new[i] < 0)
{
idx_low_new(i) += low_lengths_[i];
borrow = true;
}
}
});
// highest dimension, no out-of-bound check
if(borrow)
{
--idx_low_new(0);
}
}
return idx_low_new - idx_low_old;
}
}
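// Example (illustrative): with low_lengths_ = {2, 3, 4}, idx_low_old = (0, 2, 3) and an upper
// diff of +1, CalculateLowerIndex(+1) gives (0, 0, 1); adding it makes the last dimension 4,
// which overflows its length, so the carry loop turns the result into (1, 0, 0) and the
// returned diff is (+1, -2, -3).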
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
{
return true;
}
};
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#include "common_header.hpp"
namespace ck {
template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
using Index = MultiIndex<NDim>;
Array<index_t, NDim> lengths_;
Array<index_t, NDim> strides_;
index_t element_size_;
index_t element_space_;
template <typename Lengths, typename Strides>
__host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
const Strides& strides)
: lengths_(lengths), strides_(strides)
{
element_size_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_size_ *= lengths_[i];
}
element_space_ = 1;
for(index_t i = 0; i < NDim; ++i)
{
element_space_ += (lengths_[i] - 1) * strides_[i];
}
}
__host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }
__host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }
__host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }
__host__ __device__ constexpr auto GetLengths() const { return lengths_; }
__host__ __device__ constexpr auto GetStrides() const { return strides_; }
__host__ __device__ constexpr auto GetElementSize() const { return element_size_; }
__host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }
__host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
{
index_t offset = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset += idx[i] * strides_[i];
}
return offset;
}
__host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
{
index_t offset_diff = 0;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
offset_diff += idx_diff[i] * strides_[i];
}
return offset_diff;
}
__host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
{
bool flag = true;
#pragma unroll
for(index_t i = 0; i < NDim; ++i)
{
flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
}
return flag;
}
};
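// Usage sketch (illustrative values): an NCHW descriptor with lengths {N, C, H, W} and packed
// strides {C*H*W, H*W, W, 1}, e.g.
//
//   const auto desc = DynamicNativeTensorDescriptor<4>(Array<index_t, 4>{1, 2, 3, 3},
//                                                      Array<index_t, 4>{18, 9, 3, 1});
//   // desc.GetElementSize() == 18, desc.GetElementSpace() == 18
//   // desc.CalculateOffset({0, 1, 2, 1}) == 0*18 + 1*9 + 2*3 + 1*1 == 16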
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
typename Transforms, // Tuple<DynamicMultIndexTransforms,...>
typename LowDimensions, // Tuple<Sequence<...>,...>
typename UpDimensions> // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
using Type = DynamicTransformedTensorDescriptor;
__host__ __device__ static constexpr auto GetNumOfLowerDimension()
{
// Here, we assume all lower-dimensions are active
// TODO: sanity-check all lower-dimension are indeed active
using duplicated_low_active_dims =
decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));
using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return low_active_dims::Size();
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension()
{
using duplicated_up_active_dims =
decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));
using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
math::less<index_t>,
math::equal<index_t>>::type;
return up_active_dims::Size();
}
static constexpr index_t ndim_up_ = GetNumOfUpperDimension();
static constexpr index_t ndim_low_ = GetNumOfLowerDimension();
static constexpr index_t num_transform_ = Transforms::Size();
using UpperIndex = MultiIndex<ndim_up_>;
using LowerIndex = MultiIndex<ndim_low_>;
const LowTensorDescriptor low_tensor_desc_;
const Transforms transforms_;
const LowDimensions low_dims_;
const UpDimensions up_dims_;
__host__ __device__ constexpr DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms)
: low_tensor_desc_(low_tensor_desc),
transforms_(transforms)
{
}
__host__ __device__ static constexpr auto GetNumOfDimension()
{
return GetNumOfUpperDimension();
}
__host__ __device__ constexpr auto GetLowerTensorDescriptor() const
{
return low_tensor_desc_;
}
__host__ __device__ constexpr auto GetUpperLengths() const
{
}
__host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }
__host__ __device__ constexpr auto GetLength(index_t i) const
{
return GetLengths()[i];
}
__host__ __device__ constexpr auto GetElementSize() const
{
index_t element_size = 1;
for(index_t i = 0; i < ndim_up_; ++i)
{
element_size *= GetLength(i);
}
return element_size;
}
__host__ __device__ constexpr auto GetElementSpace() const
{
return low_tensor_desc_.GetElementSpace();
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
{
LowerIndex idx_low;
static_for<0, num_transform_, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
auto idx_low_part = pick_array_element(idx_low, LowDimensions{}.At(itran));
// this assumes each lower (single) index is only associated with one transformation,
// which is required for index transformation, and has been checked during construction
// of DynamicTransformedTensorDescriptor
idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
});
return idx_low;
}
// TODO: right now return value is not constexpr because use of non-constexpr lambda
__host__ __device__ constexpr LowerIndex CalculateLowerIndexDiff(
const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old) const
{
LowerIndex idx_low_diff;
static_for<0, num_transform_, 1>{}([&](auto itran) {
const auto tran = transforms_.At(itran);
const auto idx_up_diff_part =
pick_array_element(idx_up_diff, UpDimensions{}.At(itran));
const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));
const auto idx_low_old_part =
pick_array_element(idx_low_old, LowDimensions{}.At(itran));
auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));
// this assumes each lower (single) index is associated with only one transformation,
// which is required for index transformation, and has been checked during construction
// of DynamicTransformedTensorDescriptor
idx_low_diff_part = tran.CalculateLowerIndexDiff(
to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
});
return idx_low_diff;
}
__host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
{
return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
}
};
#endif
} // namespace ck
#endif
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
namespace ck {
template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const Lengths& lengths,
const Strides& strides)
{
static_assert(Lengths::GetSize() == Strides::GetSize(), "wrong! Size not the same");
return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}
template <typename LowTensorDescriptor,
typename Transforms,
typename LowDimensions,
typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
const Transforms& transforms,
LowDimensions,
UpDimensions)
{
return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
Transforms,
LowDimensions,
UpDimensions>(low_tensor_desc, transforms);
}
} // namespace ck
#endif
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#include "float_type.hpp"
namespace ck {
__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");
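// presumably returns the value held by the first active lane of the wavefront, so the result
// is wave-uniform and can be kept in a scalar register, e.g.
//   index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);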
} // namespace ck
#endif
@@ -20,6 +20,7 @@
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
+#include "amd_llvm_intrinsic.hpp"
 #endif
 #if CK_USE_AMD_XDLOPS
@@ -172,7 +172,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 0
+#elif 1
 // cdata = 64, BlockSize = 256, 128x128x16
 constexpr index_t BlockSize = 256;
@@ -290,7 +290,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
 constexpr index_t WeiBlockCopySrcDataPerRead_E = 2;
 constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 1
+#elif 0
 // cdata = 64, BlockSize = 128, 64x128x8
 constexpr index_t BlockSize = 128;
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_native_tensor_descriptor(to_array(InDesc::GetLengths()),
to_array(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor(
to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor(
to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));
const auto conv_strides = to_array(ConvStrides{});
const auto conv_dilations = to_array(ConvDilations{});
const auto in_left_pads = to_array(InLeftPads{});
const auto in_right_pads = to_array(InRightPads{});
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyDynamicTransform<BlockSize>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
index_t* const,
float* const,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const DynamicNativeTensorDescriptor<4>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
const Array<index_t, 2>,
index_t,
index_t,
index_t,
index_t>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
10,
10,
10,
10);
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
@@ -12,7 +12,7 @@ template <class T,
 class ConvDilations,
 class InLeftPads,
 class InRightPads>
-void device_dummy_transform(InDesc,
+void device_dummy_static_transform(InDesc,
 const Tensor<T>& in_nchw,
 WeiDesc,
 const Tensor<T>& wei_kcyx,
@@ -14,7 +14,8 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_dummy_transform.hpp"
+#include "device_dummy_static_transform.hpp"
+#include "device_dummy_dynamic_transform.hpp"
 int main(int argc, char* argv[])
 {
@@ -200,7 +201,7 @@ int main(int argc, char* argv[])
 using LeftPads = Sequence<0, 0>;
 using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
 // 3x3, 35x35, stride 2
 constexpr index_t N = 128;
 constexpr index_t C = 288;
@@ -572,8 +573,20 @@ int main(int argc, char* argv[])
 LeftPads{},
 RightPads{},
 nrepeat);
+#elif 0
+device_dummy_static_transform(in_nchw_desc,
+in_nchw,
+wei_kcyx_desc,
+wei_kcyx,
+out_nkhw_desc,
+out_nkhw_device,
+ConvStrides{},
+ConvDilations{},
+LeftPads{},
+RightPads{},
+nrepeat);
 #elif 1
-device_dummy_transform(in_nchw_desc,
+device_dummy_dynamic_transform(in_nchw_desc,
 in_nchw,
 wei_kcyx_desc,
 wei_kcyx,