adding dynamic tensor descriptor

2ffa2708 · Chao Liu · c1eaba27 · 2ffa2708 · 2ffa2708 · 2ffa2708
Commit 2ffa2708 authored Sep 06, 2020 by Chao Liu
6 changed files
--- a/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+++ b/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
@@ -553,7 +553,7 @@ struct DummyDynamicTransform

        auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);

-        for(index_t iter = 0; iter < 100; ++iter)
+        for(index_t iter = 0; iter < niter; ++iter)
        {
            constexpr auto gemmk1_gemmn0 = MultiIndex<2>{1, 0};

@@ -574,7 +574,11 @@ struct DummyDynamicTransform
                             1,
                             p_out_global,
                             in_gemmk_gemmn_coord.GetOffset(),
+#if 0
                             in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
+#else
+                             true,
+#endif
                             in_gemmk_gemmn_global_desc.GetElementSpace());
        }
    }

--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -35,8 +35,8 @@ struct DynamicPassThrough
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                            const UpIdxDiff& idx_up_diff,
+    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                            const UpIdxDiff& idx_diff_up,
                                                            const LowIdx& /* idx_low_old */,
                                                            const UpIdx& /* idx_up_old */)
    {
@@ -44,7 +44,7 @@ struct DynamicPassThrough
                          UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

-        idx_low_diff(0) = idx_up_diff[0];
+        idx_diff_low(0) = idx_diff_up[0];
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -86,7 +86,8 @@ struct DynamicLeftPad
    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

    template <typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
    {
        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");
@@ -95,8 +96,8 @@ struct DynamicLeftPad
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                            const UpIdxDiff& idx_up_diff,
+    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                            const UpIdxDiff& idx_diff_up,
                                                            const LowIdx& /* idx_low_old */,
                                                            const UpIdx& /* idx_up_old */)
    {
@@ -104,7 +105,7 @@ struct DynamicLeftPad
                          UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

-        idx_low_diff(0) = idx_up_diff[0];
+        idx_diff_low(0) = idx_diff_up[0];
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -159,8 +160,8 @@ struct DynamicRightPad
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                            const UpIdxDiff& idx_up_diff,
+    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                            const UpIdxDiff& idx_diff_up,
                                                            const LowIdx& /* idx_low_old */,
                                                            const UpIdx& /* idx_up_old */)
    {
@@ -168,7 +169,7 @@ struct DynamicRightPad
                          UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

-        idx_low_diff(0) = idx_up_diff[0];
+        idx_diff_low(0) = idx_diff_up[0];
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -218,7 +219,8 @@ struct DynamicEmbed
    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

    template <typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
    {
        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
                      "wrong! inconsistent # of dimension");
@@ -233,8 +235,8 @@ struct DynamicEmbed
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                     const UpIdxDiff& idx_up_diff,
+    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                               const UpIdxDiff& idx_diff_up,
                                                               const LowIdx& /* idx_low_old */,
                                                               const UpIdx& /* idx_up_old */) const
    {
@@ -242,12 +244,12 @@ struct DynamicEmbed
                          LowIdx::Size() == 1 && UpIdx::Size() == NDimUp,
                      "wrong! inconsistent # of dimension");

-        idx_low_diff(0) = 0;
+        idx_diff_low(0) = 0;

 #pragma unroll
        for(index_t i = 0; i < NDimUp; ++i)
        {
-            idx_low_diff(0) += idx_up_diff[i] * coefficients_[i];
+            idx_diff_low(0) += idx_diff_up[i] * coefficients_[i];
        }
    }

@@ -299,7 +301,8 @@ struct DynamicMerge
    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{up_length_}; }

    template <typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
    {
        static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");
@@ -316,13 +319,13 @@ struct DynamicMerge
        idx_low(NDimLow - 1) = tmp;
    }

-    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_up_diff is known at compile-time, many calculations can be optimized
+    // idx_diff_low depends on idx_low_old, so idx_low need to be up-to-date
+    // If idx_diff_up is known at compile-time, many calculations can be optimized
    // away by compiler
    // This function assume idx_low_old is not out-of-bound
    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                     const UpIdxDiff& idx_up_diff,
+    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                               const UpIdxDiff& idx_diff_up,
                                                               const LowIdx& idx_low_old,
                                                               const UpIdx& /* idx_up_old */) const
    {
@@ -330,22 +333,35 @@ struct DynamicMerge
                          LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");

-        // CalculateLowerIndex(idx_low_diff_const) has multiple integer divisions.
+#if 1
+        // I only want to do this check, if idx_diff_up is know at compile-time
+        if(idx_diff_up[0] == 0)
+        {
+#pragma unroll
+            for(index_t i = 0; i < NDimLow; ++i)
+            {
+                idx_diff_low(i) = 0;
+            }
+
+            return;
+        }
+#endif
+        // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
        // However,
-        //   1) If idx_up_diff is known at compile-time, then idx_low_diff_const
+        //   1) If idx_diff_up is known at compile-time, then idx_diff_low_const
        //   can be calculated at compile-time.
-        //   2) If idx_up_diff is not known at compile-time, but its value
+        //   2) If idx_diff_up is not known at compile-time, but its value
        //   doesn't change during the whole kernel execution, then
-        //   idx_low_diff_const also
+        //   idx_diff_low_const also
        //   doesn't change during the whole kernel execution. Compiler generated
        //   ISA should
-        //   only caclculate idx_low_diff_const once and save it durinng the whole
+        //   only caclculate idx_diff_low_const once and save it durinng the whole
        //   kernel execution
        // If neither 1) nor 2) is satisfied, then the calculation will also be
        // computed at
        //   run-time each time this function is called, and can be very expensive.
-        LowerIndex idx_low_diff_const;
-        CalculateLowerIndex(idx_low_diff_const, idx_up_diff);
+        LowerIndex idx_diff_low_const;
+        CalculateLowerIndex(idx_diff_low_const, idx_diff_up);

        // do carry check on each low dimension in reversed order
        // do not need to check the first dimension
@@ -354,29 +370,29 @@ struct DynamicMerge
 #pragma unroll
        for(index_t i = NDimLow - 1; i > 0; --i)
        {
-            // this should be saved as well
-            index_t idx_low_length_minus_idx_low_diff_const =
-                low_lengths_[i] - idx_low_diff_const[i];
+            // this should be saved in SGPR as well
+            index_t idx_low_length_minus_idx_diff_low_const =
+                low_lengths_[i] - idx_diff_low_const[i];
 #if 0
-            index_t idx_low_length_plus_idx_low_diff_const =
-                low_lengths_[i] + idx_low_diff_const[i];
+            index_t idx_low_length_plus_idx_diff_low_const =
+                low_lengths_[i] + idx_diff_low_const[i];
 #endif

            index_t idx_low_tmp = idx_low_old[i] + carry;

-            bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_low_diff_const;
+            bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
 #if 0
-            bool do_borrow = idx_low_tmp < -idx_low_diff_const[i];
+            bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
 #endif

-            idx_low_diff(i) =
-                do_carry ? -idx_low_length_minus_idx_low_diff_const : idx_low_diff_const[i];
+            idx_diff_low(i) =
+                do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
 #if 0
-            idx_low_diff(i) =
-                do_borrow ? idx_low_length_plus_idx_low_diff_const : idx_low_diff[i];
+            idx_diff_low(i) =
+                do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
 #endif

-            idx_low_diff(i) += carry;
+            idx_diff_low(i) += carry;

            carry = do_carry ? 1 : 0;
 #if 0
@@ -384,7 +400,7 @@ struct DynamicMerge
 #endif
        }

-        idx_low_diff(0) = idx_low_diff_const[0] + carry;
+        idx_diff_low(0) = idx_diff_low_const[0] + carry;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
@@ -431,7 +447,8 @@ struct DynamicUnMerge
    __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; }

    template <typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
    {
        idx_low(0) = idx_up[NDimUp];

@@ -443,12 +460,12 @@ struct DynamicUnMerge
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                     const UpIdxDiff& idx_up_diff,
+    __host__ __device__ constexpr void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                               const UpIdxDiff& idx_diff_up,
                                                               const LowIdx& /* idx_low_old */,
                                                               const UpIdx& /* idx_up_old */) const
    {
-        CalculateLowerIndex(idx_low_diff, idx_up_diff);
+        CalculateLowerIndex(idx_diff_low, idx_diff_up);
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
@@ -486,7 +503,8 @@ struct DynamicFreeze
    __host__ __device__ constexpr auto GetUpperLengths() const { return UpperIndex{}; }

    template <typename LowIdx, typename UpIdx>
-    __host__ __device__ void CalculateLowerIndex(LowIdx& idx_low, const UpIdx& idx_up) const
+    __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low,
+                                                           const UpIdx& idx_up) const
    {
        static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1,
                      "wrong! inconsistent # of dimension");
@@ -495,12 +513,12 @@ struct DynamicFreeze
    }

    template <typename LowIdxDiff, typename UpIdxDiff, typename LowIdx, typename UpIdx>
-    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_low_diff,
-                                                            const UpIdxDiff& idx_up_diff,
+    __host__ __device__ static void CalculateLowerIndexDiff(LowIdxDiff& idx_diff_low,
+                                                            const UpIdxDiff& idx_diff_up,
                                                            const LowIdx& /* idx_low_old */,
                                                            const UpIdx& /* idx_up_old */)
    {
-        idx_low_diff(0) = index_t{0};
+        idx_diff_low(0) = index_t{0};
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper_v2.hpp
+#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V2_HPP
+#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_V2_HPP
+
+#include "common_header.hpp"
+#include "dynamic_tensor_descriptor_v2.hpp"
+
+namespace ck {
+
+template <typename LowerTensorDescriptor,
+          typename Transforms,
+          typename LowerVisibleDimensionLowerVisibleIdss,
+          typename UpperVisibleDimensionUpperVisibleIdss>
+__host__ __device__ constexpr auto
+transform_dynamic_tensor_descriptor_v2(const LowerTensorDescriptor& low_tensor_desc,
+                                       const Transforms& transforms,
+                                       LowerVisibleDimensionLowerVisibleIdss,
+                                       UpperVisibleDimensionUpperVisibleIdss)
+{
+    // convert lower visible dimension idss (tuple of sequences) to hidden dimension idss (tuple of sequences)
+    constexpr auto low_visible_dimension_hidden_idss = transform_tuples(
+        // convert lower visible dimension ids (a sequence) to hidden dimension ids (a sequence)
+        [](auto low_visible_dim_ids) {
+            return transform_sequences(
+                // convert lower visible dimension id to hidden dimension id
+                [](auto low_visible_dim_id) {
+                    return low_tensor_desc.GetVisibleDimensionIds()[low_visible_dim_id];
+                },
+                low_visible_dim_ids);
+        },
+        LowerVisibleDimensionLowerVisibleIdss{});
+
+    constexpr auto up_visible_dims_
+
+    const auto all_transforms = merge_tuples(old_tensor_desc.GetTransforms(), new_transforms);
+    constexpr auto all_low_dim_idss =
+        merge_tuples(old_tensor_desc.GetLowerDimensionIdss(), new_low_dim_idss);
+    constexpr auto all_up_dim_idss =
+        merge_tuples(old_tensor_desc.GetUpperDimensionIdss(), new_up_dim_idss);
+
+    constexpr auto new_visible_dim_ids = new_up_dim_idss
+
+        return DynamicTensorDescriptor_v2<decltype(all_transforms),
+                                          decltype(all_low_dim_idss),
+                                          decltype(all_up_dim_idss),
+                                          decltype(new_visible_dim_ids)>{
+            all_transforms, old_tensor_desc.GetElementSpaceSize()};
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_v2.hpp
+#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_V2_HPP
+#define CK_DYNAMIC_TENSOR_DESCRIPTOR_V2_HPP
+
+#include "common_header.hpp"
+#include "dynamic_multi_index_transform.hpp"
+
+namespace ck {
+
+template <index_t NDimHidden, typename VisibleDimensionIds>
+struct DynamicTensorCoordinate_v2;
+
+template <index_t NTransform, index_t NDimVisible>
+struct DynamicTensorCoordinateStep_v2;
+
+// Transforms: Tuple<transforms...>
+// LowerDimensionIdss : Tuple<Sequence<...>, ...>
+// UpperDimensionIdss : Tuple<Sequence<...>, ...>
+// VisibleDimensionIds> : Sequence<...>
+template <typename Transforms,
+          typename LowerDimensionIdss,
+          typename UpperDimensionIdss,
+          typename VisibleDimensionIds>
+struct DynamicTensorDescriptor_v2
+{
+    constexpr static index_t ntransform_   = GetNumOfTransform();
+    constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension();
+    constexpr static index_t ndim_hidden_  = GetNumOfHiddenDimension();
+
+    using VisibleIndex = MultiIndex<ndim_visible_>;
+    using HiddenIndex  = MultiIndex<ndim_hidden_>;
+
+    __host__ __device__ explicit constexpr DynamicTensorDescriptor_v2(const Transforms& transforms,
+                                                                      index_t element_space_size)
+        : transforms_{transforms},
+          hidden_lengths_{InitializeHiddenLengths(transforms_, element_space_size)},
+          visble_lengths_{hidden_lengths_}
+    {
+        static_assert(Transforms::Size() == ntransforms_ &&
+                          LowerDimensionIdss::Size() == ntransforms_ &&
+                          UpperDimensionIdss::Size() == ntransforms_,
+                      "wrong! inconsistent # of transformations");
+
+        // TODO check dependency of dimensions is valid
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfDimension() const
+    {
+        return GetNumOfVisibleDimension();
+    }
+
+    __host__ __device__ constexpr index_t GetLength(index_t idim) const
+    {
+        return visible_lengths_[idim];
+    }
+
+    __host__ __device__ constexpr const auto& GetLengths() const { return visible_lengths_; }
+
+    // maybe this result should be saved as a member variable
+    __host__ __device__ constexpr index_t GetElementSize() const
+    {
+        return reduce_on_array(GetLengths(), math::multiplies<index_t>{}, index_t{1});
+    }
+
+    __host__ __device__ constexpr index_t GetElementSpaceSize() const { return hidden_lengths_[0]; }
+
+    template <typename Idx>
+    __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const
+    {
+        static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension");
+
+        return make_tensor_coordinate_v2(*this, idx).GetOffset();
+    }
+
+    private:
+    __host__ __device__ static constexpr index_t GetNumOfVisibleDimension()
+    {
+        return VisibleDimensionIds::Size();
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfHiddenDimension()
+    {
+        constexpr auto all_low_dim_ids =
+            unpack([](auto&&... xs) constexpr { return merge_sequences(xs...); },
+                   LowerDimsionIdss{});
+
+        constexpr auto all_up_dim_ids =
+            unpack([](auto&&... xs) constexpr { return merge_sequences(xs...); },
+                   UpperDimsionIdss{});
+
+        constexpr auto all_dim_ids = merge_sequenses(all_low_dim_ids, all_up_dim_ids);
+
+        using unique_sort_all_dim_ids = sequence_unique_sort<decltype(all_dim_ids),
+                                                             math::less<index_t>,
+                                                             math::equal<index_t>>::type;
+
+        return uniqie_sort_all_dim_ids::type::Size();
+    }
+
+    __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); }
+
+    __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; }
+
+    __host__ __device__ static constexpr auto GetLowerDimensionIdss()
+    {
+        return LowerDimensionIdss{};
+    }
+
+    __host__ __device__ static constexpr auto GetUpperDimensionIdss()
+    {
+        return UpperDimensionIdss{};
+    }
+
+    __host__ __device__ static constexpr index_t GetVisibleDimensionIds()
+    {
+        return VisibleDimensionIds{};
+    }
+
+    __host__ __device__ static constexpr auto InitializeHiddenLengths(const Transforms& transforms,
+                                                                      index_t element_space_size)
+    {
+        HiddenIndex lengths_hidden = make_zero_multi_index<ndim_hidden_>();
+
+        // this is the orignal tensor element space size
+        lengths_hidden(0) = element_space_size;
+
+        // lengths for all other hidden dimensions
+        static_for<0, ntransform_, 1>{}([&](auto itran) {
+            const auto& tran = transforms.At(itran);
+
+            constexpr auto up_dim_ids = UpperDimensionIdss::At(itran);
+
+            const auto lengths_up_pick = pick_array_element(lengths_hidden, up_dim_ids);
+
+#pragma unroll
+            for(index_t i = 0; i < lengths_low.Size(); ++i)
+            {
+                lengths_low_pick(i) = tran.GetUpperLengths()[i];
+            }
+        });
+
+        return lengths_hidden;
+    }
+
+    // private member variables
+    const Transforms transforms_;
+    // TODO maybe hidden_lengths_ should use reference_wrapper to save space on stack?
+    const HiddenIndex hidden_lengths_;
+    // visible_lenths_ contains a reference to hidden_lengths_
+    const ArrayElementPicker<HiddenIndex, VisibleDimensionIds> visible_lengths_;
+
+    // friend functions for making and updating tensor coordinate
+    __host__
+        __device__ friend constexpr DynamicTensorCoordinate_v2<ndim_hidden_, VisibleDimensionIds>
+        make_tensor_coordinate_v2(const DynamicTensorDescriptor_v2& /* tensor_desc */,
+                                  const VisibleIndex& /* idx_visible */);
+
+    __host__ __device__ friend constexpr DynamicTensorCoordinateStep_v2<ntransform_, ndim_visible_>
+    make_tensor_coordinate_step_v2(const DynamicTensorDescriptor_v2& /* tensor_desc */,
+                                   const VisibleIndex& /* idx_diff_visible */);
+
+    __host__ __device__ friend void move_tensor_coordinate_v2(
+        const DynamicTensorDescriptor_v2& /* tensor_desc */,
+        DynamicTensorCoordinate_v2<ndim_hidden_, VisibleDimensionIds>& /* coord */,
+        const DynamicTensorCoordinateStep_v2<ntransform_, ndim_visible_>& /* coord_step */);
+};
+
+template <index_t NDimHidden, typename VisibleDimensionIds>
+struct DynamicTensorCoordinate_v2
+{
+    constexpr index_t ndim_visible_ = VisbleDimension::Size();
+
+    using HiddenIndex  = MultiIndex<NDimHidden>;
+    using VisibleIndex = MultiIndex<ndim_visible_>;
+
+    __host__ __device__ explicit constexpr DynamicTensorCoordinate_v2(const HiddenIndex& idx_hidden)
+        : idx_hidden_{idx_hidden}, idx_visible_{idx_hidden_}
+    {
+    }
+
+    __host__ __device__ constexpr const auto& GetIndex() const { GetVisibleIndex(); }
+
+    __host__ __device__ constexpr index_t GetOffset() const { return idx_hidden_[0]; }
+
+    private:
+    __host__ __device__ constexpr const auto& GetHiddenIndex() const { return idx_hidden_; }
+
+    __host__ __device__ auto& GetHiddenIndex() { return idx_hidden_; }
+
+    __host__ __device__ constexpr const auto& GetVisibleIndex() const { return idx_visible_; }
+
+    __host__ __device__ auto& GetVisibleIndex() { return idx_visible_; }
+
+    // private member variables
+    HiddenIndex idx_hidden_;
+    // idx_visible_ contains a reference to idx_hidden_
+    ArrayElementPicker<HiddenIndex, VisibleDimensionIds> idx_visible_;
+
+    // friend functions for making and updating tensor coordinate
+    template <typename TensorDesc>
+    __host__ __device__ friend constexpr DynamicTensorCoordinate_v2
+    make_tensor_coordinate_v2(const TensorDesc& /* tensor_desc */,
+                              const VisibleIndex& /* idx_visible */);
+
+    template <typename TensorDesc>
+    __host__ __device__ friend void move_tensor_coordinate_v2(
+        const TensorDesc& /* tensor_desc */,
+        DynamicTensorCoordinate_v2& /* coord */,
+        const DynamicTensorCoordinateStep_v2<TensorDesc::GetNumOfTransform(),
+                                             ndim_visible_>& /* coord_step */);
+};
+
+template <index_t NTransform, index_t NDimVisible>
+struct DynamicTensorCoordinateStep_v2
+{
+    using VisibleIndex = MultiIndex<NDimVisible>;
+
+    __host__ __device__ explicit constexpr DynamicTensorCoordinateStep_v2(
+        const VisibleIndex& idx_diff_visible, const Array<bool, NTransform>& do_transforms)
+        : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms}
+    {
+    }
+
+    private:
+    const VisibleIndex idx_diff_visible_;
+    const Array<bool, NTransform> do_transforms_;
+
+    // friend functions for updating tensor coordinate
+    template <typename TensorDesc>
+    __host__ __device__ friend constexpr DynamicTensorCoordinateStep_v2
+    make_tensor_coordinate_step_v2(const TensorDesc& /* tensor_desc */,
+                                   const VisibleIndex& /* idx_visible */);
+
+    template <typename TensorDesc, index_t NDimHidden, typename VisibleDimensionIds>
+    __host__ __device__ friend void move_tensor_coordinate_v2(
+        const TensorDesc& /* tensor_desc */,
+        DynamicTensorCoordinate_v2<NDimHidden, VisibleDimensionIds>& /* coord */,
+        const DynamicTensorCoordinateStep_v2& /* coord_step */);
+};
+
+template <typename TensorDesc, typename VisibleIndex>
+__host__ __device__ constexpr auto make_tensor_coordinate_v2(const TensorDesc& tensor_desc,
+                                                             const VisibleIndex& idx_visible)
+{
+    static_assert(tensor_desc.GetNumOfDimension() == idx_visible.Size(),
+                  "wrong! # of dimension inconsistent");
+
+    constexpr index_t ntransform   = tensor_desc.GetNumOfTransformation();
+    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
+    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();
+
+    MultiIndex<ndim_hidden> idx_hidden;
+
+    auto idx_visible_pick = pick_array_element(idx_hidden, tensor_desc.GetVisibleDimensionIds());
+
+    // initialize visible index
+#pragma unroll
+    for(index_t i < ndim_visible; i < ndim_visible, ++i)
+    {
+        idx_visible_pick(i) = idx_visible[i];
+    }
+
+    // calculate hidden index
+    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
+        const auto& tran        = transforms_.At(itran);
+        constexpr auto dims_low = LowerDimensionIdss::At(itran);
+        constexpr auto dims_up  = UpperDimensionIdss::At(itran);
+
+        const auto idx_up = pick_array_element(idx_hidden_, dim_up);
+        auto idx_low      = pick_array_element(idx_hidden_, dim_low);
+
+        tran.CalculateLowerIndex(idx_up, idx_low);
+    });
+
+    // better to use std::move?
+    return DynamicTensorCoordinate_v2{idx_hidden};
+}
+
+template <typename TensorDesc, typename VisibleIndex>
+__host__ __device__ constexpr auto
+make_tensor_coordinate_step_v2(const TensorDesc& tensor_desc, const VisibleIndex& idx_diff_visible)
+{
+    static_assert(tensor_desc.GetNumOfDimension() == idx_visible.Size(),
+                  "wrong! # of dimension inconsistent");
+
+    constexpr index_t ntransform   = tensor_desc.GetNumOfTransformation();
+    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
+    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();
+
+    Array<bool, ntransform> do_transforms = {false};
+
+    Array<bool, ndim_hidden> non_zero_diff = {false};
+
+    auto non_zero_diff_pick_visible =
+        pick_array_element(non_zero_diff, tensor_desc.GetVisibleDimensionIds());
+
+#pragma unroll
+    for(index_t i < ndim_visible; i < ndim_visible, ++i)
+    {
+        non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0);
+    }
+
+    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
+        const auto& tran        = tensor_desec.GetTransforms().At(itran);
+        constexpr auto dims_low = tensor_desc.GetLowerDimensionIdss().At(itran);
+        constexpr auto dims_up  = tensor_Desc.GetUpperDimensionIdss().At(itran);
+
+        const auto non_zero_diff_pick_up = pick_array_element(non_zero_diff, dims_up);
+        auto non_zero_diff_pick_low      = pick_array_element(non_zero_diff, dims_low);
+
+        // if any of upper index diff components is non-zero, then
+        //   1) Need to do this transform
+        //   2) all components of lower index diff will assume to be non-zero and need to be
+        //   computed
+        const bool idx_diff_up_has_non_zero =
+            reduce_on_array(non_zero_diff_pick_up, [](auto a, auto b) { return a or b; }, false);
+
+        do_transforms(itran) = idx_diff_up_has_non_zero;
+
+#pragma unroll
+        for(index_t i = 0; i < dims_low.Size(); ++i)
+        {
+            non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero;
+        }
+    });
+
+    return do_transforms;
+}
+
+template <typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
+__host__ __device__ void move_tensor_coordinate_v2(const TensorDesc& tensor_desc,
+                                                   TensorCoord& coord,
+                                                   const TensorCoordStep& coord_step)
+{
+    constexpr index_t ndim_hidden  = tensor_desc.GetNumOfHiddenDimension();
+    constexpr index_t ndim_visible = tensor_desc.GetNumOfVisibleDimension();
+    constexpr index_t ntransform   = tensor_desc.GetNumOfTransform();
+
+    // this is what needs to be calculated
+    auto idx_diff_hidden = make_zero_multi_index<ndim_hidden>();
+    const auto idx_diff_visible_pick =
+        pick_array_element(idx_diff_hidden, tensor_desc.GetVisibleDimensionIds());
+
+    // initialize visible index diff
+#pragma unroll
+    for(index_t i = 0; i < ndim_visible_; ++i)
+    {
+        idx_diff_visible_pick(i) = coord_step.GetVisibleIndexDiff()[i];
+    }
+
+    // this is what needs to be updated
+    auto& idx_hidden = coord.GetHiddenIndex();
+
+    // update hidden index
+    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
+        const auto& tran        = tensor_desc.GetTransformations().At(itran);
+        constexpr auto dims_low = tensor_desc.GetLowerDimensionIdss().At(itran);
+        constexpr auto dims_up  = tensor_desc.GetUpperDimensionIdss().At(itran);
+
+        // this const is for ArrayElementPicker, Array itself may not be const
+        const auto idx_up  = pick_array_element(idx_hidden, dim_up);
+        const auto idx_low = pick_array_element(idx_hidden, dim_low);
+
+        const auto idx_diff_up  = pick_array_element(idx_diff_hidden, dim_up);
+        const auto idx_diff_low = pick_array_element(idx_diff_hidden, dim_low);
+
+        tran.CalculateLowerIndexDiff(idx_diff_low, idx_diff_up, idx_low, idx_up);
+
+        // update idx_low
+        idx_low += idx_diff_low;
+    });
+}
+
+template <typename TensorDesc, typename TensorCoord>
+__host__ __device__ bool constexpr coordinate_has_valid_offset_assuming_visible_index_is_valid(
+    const TensorDesc& tensor_desc, const TensorCoord& coord)
+{
+    bool valid = true;
+
+    constexpr index_t ntransform = tensor_desc.GetNumOfTransform();
+
+    const auto& idx_hidden = coord.GetHiddenIndex();
+
+    static_for<ntransform - 1, -1, -1>{}([&](auto itran) {
+        const auto tran = tensor_desc.GetTransforms().At(itran);
+
+        // check validity, only if current transformation does not always has a valid mapping
+        if constexpr(!decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex())
+        {
+            const auto idx_up =
+                pick_array_element(idx_hidden, tensor_desc.GetUpperDimensionIdss().At(itran));
+
+            valid = valid && tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up);
+        }
+    });
+
+    return valid;
+}
+
+template <typename TensorDesc, typename TensorCoord>
+__host__ __device__ bool constexpr coordinate_has_valid_offset(const TensorDesc& tensor_desc,
+                                                               const TensorCoord& coord)
+{
+    // check visible index
+    const auto& idx_visible = coord.GetVisibleIndex();
+
+    bool is_visible_index_valid = true;
+
+#pragma unroll
+    for(index_t i = 0; i < tensor_desc.GetNumOfDimension(); ++i)
+    {
+        is_visible_index_valid = is_visible_index_valid &&
+                                 (idx_visible[i] >= 0 && idx_visible[i] < tensor_desc.GetLength(i));
+    }
+
+    // check other hidden index
+    return is_visible_index_valid &&
+           coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord);
+}
+
+} // namespace ck
+#endif
--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -14,6 +14,12 @@ __host__ __device__ constexpr auto make_multi_index(Xs... xs)
    return MultiIndex<sizeof...(Xs)>(xs...);
 }

+template <index_t NSize>
+__host__ __device__ constexpr auto make_zero_multi_index()
+{
+    make_zero_array<index_t, NSize>();
+}
+
 template <index_t Length>
 struct PassThrough
 {

--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -133,7 +133,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 0
+#elif 1
    // cdata = 64, BlockSize = 256, 128x128x8
    constexpr index_t BlockSize = 256;