Commit 52423948 authored by Jehandad Khan

Merge branch 'master' into jd_redux

parents b97af4ec 98a2cfcc
@@ -6,7 +6,7 @@
namespace ck {
template <class Lengths>
-__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
+__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths)
{
return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
@@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
}
template <class Lengths, index_t Align>
-__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
+__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
-    return calculate_tensor_strides_packed(
+    return calculate_tensor_strides_packed_old(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
}
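For intuition: the packed strides are a reverse inclusive scan over the trailing lengths, so Lengths = Sequence<2, 3, 4> yields Strides = Sequence<12, 4, 1>, and the aligned variant first rounds the last length up to a multiple of Align (Lengths = Sequence<2, 3, 5> with Align = 4 scans over <2, 3, 8> and yields <24, 8, 1>). A minimal host-side sketch of the same scan, using a hypothetical standalone helper rather than the Sequence machinery above:

#include <array>
#include <cstddef>

// stride[i] = product of lengths[i+1 .. N-1]; the last stride is 1
template <std::size_t N>
constexpr std::array<std::size_t, N> packed_strides(const std::array<std::size_t, N>& lengths)
{
    std::array<std::size_t, N> strides{};
    std::size_t running = 1;
    for(std::size_t r = 0; r < N; ++r)
    {
        const std::size_t i = N - 1 - r; // walk from the last dimension backward
        strides[i]          = running;
        running *= lengths[i];
    }
    return strides;
}

static_assert(packed_strides<3>({2, 3, 4})[0] == 12, "");
static_assert(packed_strides<3>({2, 3, 4})[1] == 4, "");
static_assert(packed_strides<3>({2, 3, 4})[2] == 1, "");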
@@ -96,13 +96,12 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto GetElementSize()
{
-    return Number<accumulate_on_sequence(
-        Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
+    return Number<reduce_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
}
__host__ __device__ static constexpr auto GetElementSpace()
{
-    constexpr index_t element_space_unaligned = accumulate_on_sequence(
+    constexpr index_t element_space_unaligned = reduce_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
return Number<element_space_unaligned>{};
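Worked example: for Lengths = Sequence<2, 3, 4> with packed Strides = Sequence<12, 4, 1>, the unaligned element space is (2-1)*12 + (3-1)*4 + (4-1)*1 + 1 = 24, which for a packed tensor equals GetElementSize().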
@@ -155,7 +154,7 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
-    return Number<accumulate_on_sequence(
+    return Number<reduce_on_sequence(
multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{})>{};
}
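The offset is simply the dot product of the multi-index with the strides: with Strides = Sequence<12, 4, 1>, multi_id (1, 0, 1) maps to 1*12 + 0*4 + 1*1 = 13.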
@@ -178,7 +177,7 @@ struct ConstantTensorDescriptor
{
constexpr auto IDim = IDim_{};
constexpr index_t stride = PackedStrides::Get(IDim);
-    multi_id.Set(IDim, id / stride);
+    multi_id(IDim) = id / stride;
id -= multi_id[IDim] * stride;
}
};
@@ -187,12 +186,12 @@ struct ConstantTensorDescriptor
{
Array<index_t, nDim> multi_id;
-    using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths()));
+    using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths()));
// calculate the index in each dimension, from the highest dimension down
static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex<PackedStrides>(id, multi_id));
-    multi_id.Set(Number<nDim - 1>{}, id / PackedStrides::Get(Number<nDim - 1>{}));
+    multi_id(Number<nDim - 1>{}) = id / PackedStrides::Get(Number<nDim - 1>{});
return multi_id;
}
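For a packed tensor this inverts GetOffsetFromMultiIndex: with packed strides (12, 4, 1), id = 13 decomposes as 13/12 = 1 (remainder 1), then 1/4 = 0 (remainder 1), then 1/1 = 1, recovering multi_id (1, 0, 1).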
@@ -204,7 +203,7 @@ struct ConstantTensorDescriptor
}
// This function doesn't do carry check on the highest dimension for positive stepping (or
-// borrow check on the lowest dimension for negative stepping) , for performance reason. It is
+// borrow check on the highest dimension for negative stepping) , for performance reason. It is
// the user's responsibility to make sure the result "new_multi_id" is not out-of-bound on the
// highest dimension for positive stepping (or on the lowest dimension for negative stepping)
template <bool PositiveDirection>
@@ -304,14 +303,73 @@ struct ConstantTensorDescriptor
GetStrides().PushBack(leaf_tensor::GetStrides()))>{};
}
+    template <index_t IDimVector, index_t DataPerVector>
+    struct lambda_IsVectorizationAllowed
+    {
+        bool& is_allowed;
+        __host__ __device__ constexpr lambda_IsVectorizationAllowed(bool& is_allowed_)
+            : is_allowed(is_allowed_)
+        {
+        }
+        template <index_t IDim_>
+        __host__ __device__ constexpr void operator()(Number<IDim_>) const
+        {
+            constexpr auto IDim = Number<IDim_>{};
+            if(IDimVector != IDim && Strides::Get(IDim) % DataPerVector != 0)
+            {
+                is_allowed = false;
+            }
+        }
+    };
+    template <index_t IDimVector, index_t DataPerVector>
+    __host__ __device__ static constexpr bool IsVectorizationAllowed(Number<IDimVector>,
+                                                                     Number<DataPerVector>)
+    {
+        bool is_allowed = (Strides{}[IDimVector] == 1 || DataPerVector == 1) &&
+                          Lengths{}[IDimVector] % DataPerVector == 0;
+        static_for<0, nDim, 1>{}(
+            lambda_IsVectorizationAllowed<IDimVector, DataPerVector>{is_allowed});
+        return is_allowed;
+    }
+    template <index_t IDim, index_t DataPerVector>
+    __host__ __device__ static constexpr auto Vectorize(Number<IDim>, Number<DataPerVector>)
+    {
+        constexpr auto idim = Number<IDim>{};
+        constexpr auto data_per_vector = Number<DataPerVector>{};
+        static_assert(IsVectorizationAllowed(idim, data_per_vector), "wrong!");
+        using vectorized_lengths =
+            decltype(Lengths::Modify(Number<IDim>{}, Number<Lengths{}[IDim] / DataPerVector>{}));
+        using vectorized_strides =
+            decltype((Strides{} / Number<DataPerVector>{}).Modify(Number<IDim>{}, Number<1>{}));
+        return ConstantTensorDescriptor<vectorized_lengths, vectorized_strides>{};
+    }
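A hedged usage sketch of the new vectorization support (the shape and the Desc alias are illustrative, not from this commit). Dimension 1 can be vectorized by 4 because its stride is 1, its length 8 is divisible by 4, and every other stride (here 8) is also divisible by 4:

using Desc = ConstantTensorDescriptor<Sequence<4, 8>, Sequence<8, 1>>;
constexpr auto vectorized_desc = Desc::Vectorize(Number<1>{}, Number<4>{});
// Lengths become Sequence<4, 2> and Strides become Sequence<2, 1>:
// the vectorized dimension shrinks 4x, and all strides are re-expressed in units of vectors.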
template <index_t IDim, index_t SliceLen>
__host__ __device__ static constexpr auto Slice(Number<IDim>, Number<SliceLen>)
{
-        using slice_lengths = decltype(Lengths{}.Modify(Number<IDim>{}, Number<SliceLen>{}));
+        using slice_lengths = decltype(Lengths::Modify(Number<IDim>{}, Number<SliceLen>{}));
return ConstantTensorDescriptor<slice_lengths, Strides>{};
}
+    template <index_t... Is>
+    __host__ __device__ static constexpr auto Slice(Sequence<Is...> slice_lengths)
+    {
+        static_assert(slice_lengths.GetSize() == nDim, "wrong!");
+        return ConstantTensorDescriptor<decltype(slice_lengths), Strides>{};
+    }
template <index_t IDim, index_t SliceLength, index_t SliceStride>
__host__ __device__ static constexpr auto
StridedSlice(Number<IDim>, Number<SliceLength>, Number<SliceStride>)
@@ -330,7 +388,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
-        accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
+        reduce_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -388,7 +446,7 @@ struct ConstantTensorDescriptor
static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable");
// unfolded length, stride
-    constexpr index_t unfold_length = accumulate_on_sequence(
+    constexpr index_t unfold_length = reduce_on_sequence(
GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -409,7 +467,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr auto Pack()
{
-    using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+    using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, packed_strides>{};
}
@@ -431,7 +489,7 @@ struct ConstantTensorDescriptor
template <class Lengths>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths)
{
-    using Strides = decltype(calculate_tensor_strides_packed(Lengths{}));
+    using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
@@ -444,7 +502,7 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
template <class Lengths, index_t Align>
__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number<Align>)
{
-    using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number<Align>{}));
+    using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number<Align>{}));
return ConstantTensorDescriptor<Lengths, Strides>{};
}
#ifndef CK_DIMENSION_HPP
#define CK_DIMENSION_HPP
#include "common_header.hpp"
namespace ck {
template <index_t Length>
struct Dimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
};
template <index_t Length, index_t Stride>
struct NativeDimension
{
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
__host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
__host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
__host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
{
return i_diff * Stride;
}
};
} // namespace ck
#endif
#ifndef CK_MULTI_INDEX_TRANSFORM_HPP
#define CK_MULTI_INDEX_TRANSFORM_HPP
#include "common_header.hpp"
namespace ck {
template <index_t N>
using MultiIndex = Array<index_t, N>;
template <typename... Xs>
__host__ __device__ constexpr auto make_multi_index(Xs... xs)
{
return MultiIndex<sizeof...(Xs)>(xs...);
}
template <index_t Length>
struct PassThrough
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
// LowerLengths: Sequence<...>
template <typename LowerLengths, typename LeftPads, typename RightPads>
struct Pad
{
static constexpr index_t nDim = LowerLengths::Size();
using LowerIndex = MultiIndex<nDim>;
using UpperIndex = MultiIndex<nDim>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDim>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDim>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return LowerLengths{} + LeftPads{} + RightPads{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return idx_up - LeftPads{};
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
{
#if 0
struct lambda_no_pad
{
__host__ __device__ constexpr bool operator()(index_t x) const { return x == 0; }
};
if(sequence_all_of(LeftPads{}, lambda_no_pad{}) &&
sequence_all_of(RightPads{}, lambda_no_pad{}))
{
return true;
}
else
#endif
{
bool flag = true;
static_for<0, nDim, 1>{}([&](auto idim) {
// only check if there is left-padding
static_if<(LeftPads::At(idim) != 0)>{}(
[&](auto) { flag = flag && idx_up[idim] >= LeftPads::At(idim); });
// only check if there is right-padding
static_if<(RightPads::At(idim) != 0)>{}([&](auto) {
flag = flag && (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim));
});
});
return flag;
}
}
};
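A worked example of the padding arithmetic (parameters are illustrative): Pad<Sequence<8>, Sequence<2>, Sequence<1>> exposes an upper length of 8 + 2 + 1 = 11; upper index {4} maps to lower index {4 - 2} = {2}, while upper index {1} falls inside the left padding (1 < 2), so IsUpperIndexMappedToValidLowerIndex returns false for it.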
// LowerLengths: Sequence<...>
template <typename LowerLengths>
struct Merge
{
static constexpr index_t nDimLow = LowerLengths::Size();
static constexpr index_t nDimUp = 1;
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<reduce_on_sequence(
LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
}
// emulate constexpr lambda
template <typename PseudoLowStrides>
struct lambda_CalculateLowerIndex
{
index_t& itmp;
LowerIndex& idx_low;
__host__ __device__ explicit constexpr lambda_CalculateLowerIndex(index_t& itmp_,
LowerIndex& idx_low_)
: itmp(itmp_), idx_low(idx_low_)
{
}
template <typename IDim>
__host__ __device__ constexpr void operator()(IDim idim) const
{
constexpr index_t stride = PseudoLowStrides::At(idim);
idx_low(idim) = itmp / stride;
itmp -= idx_low[idim] * stride;
}
};
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low;
index_t itmp = idx_up[0];
constexpr auto pseudo_low_strides =
reverse_inclusive_scan_sequence(
LowerLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
static_for<0, nDimLow - 1, 1>{}(
lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1];
return idx_low;
}
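Worked example (shape is illustrative): Merge<Sequence<2, 3, 4>> folds a 2x3x4 index space into a single dimension of length 24. The pseudo lower strides are (12, 4, 1), so the upper index {13} decomposes as 13 = 1*12 + 0*4 + 1*1 and CalculateLowerIndex returns (1, 0, 1).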
// idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
// If idx_up_diff is known at compile-time, many calculations can be optimized
// away by the compiler.
// This function assumes idx_low_old is not out-of-bound
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& idx_low_old)
{
// do nothing if idx_up_diff == 0
if(idx_up_diff[0] == 0)
{
return make_zero_array<index_t, nDimLow>();
}
// CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
// If idx_up_diff is known at compile-time, the calculation can
// be done at compile-time. However, if idx_up_diff is only known
// at run-time, then the calculation will also be computed at
// run-time, and can be very expensive.
LowerIndex idx_low_new = idx_low_old + CalculateLowerIndex(idx_up_diff);
if(idx_up_diff[0] > 0)
{
bool carry = false;
// do carry check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
constexpr index_t i = nDimLow - 1 - ireverse;
if(carry)
{
++idx_low_new(i);
}
carry = false;
if(idx_low_new[i] >= LowerLengths::At(i))
{
idx_low_new(i) -= LowerLengths::At(i);
carry = true;
}
});
// highest dimension, no out-of-bound check
if(carry)
{
++idx_low_new(0);
}
}
else if(idx_up_diff[0] < 0)
{
bool borrow = false;
// do borrow check in reversed order, starting from lowest dimension
// don't check the highest dimension
static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
constexpr index_t i = nDimLow - 1 - ireverse;
if(borrow)
{
--idx_low_new(i);
}
borrow = false;
if(idx_low_new[i] < 0)
{
idx_low_new(i) += LowerLengths::At(i);
borrow = true;
}
});
// highest dimension, no out-of-bound check
if(borrow)
{
--idx_low_new(0);
}
}
return idx_low_new - idx_low_old;
}
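Worked example of the carry chain (shape is illustrative): with LowerLengths = Sequence<2, 3, 4>, stepping idx_up_diff = {1} from idx_low_old = (0, 2, 3) first gives (0, 2, 4); dimension 2 overflows (4 >= 4) and wraps to 0 with a carry, dimension 1 then overflows (3 >= 3) and wraps with a carry, so idx_low_new = (1, 0, 0) and the returned diff is (1, -2, -3).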
__host__ __device__ static constexpr bool IsLinearTransform() { return false; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
// UpperLengths: Sequence<...>
template <typename UpperLengths>
struct UnMerge
{
static constexpr index_t nDimLow = 1;
static constexpr index_t nDimUp = UpperLengths::Size();
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low{0};
constexpr auto pseudo_up_strides =
reverse_inclusive_scan_sequence(
UpperLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low(0) += idx_up[idim] * pseudo_up_strides[idim]; });
return idx_low;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return CalculateLowerIndex(idx_up_diff);
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
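UnMerge is the inverse mapping of Merge above: with UpperLengths = Sequence<2, 3, 4>, the pseudo upper strides are again (12, 4, 1), so upper index (1, 0, 1) maps to the single lower index 1*12 + 0*4 + 1*1 = 13.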
// UpperLengths: Sequence<...>
// Coefficients: Sequence<...>
// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
template <typename UpperLengths, typename Coefficients>
struct Embed
{
static constexpr index_t nDimLow = 1;
static constexpr index_t nDimUp = UpperLengths::Size();
using LowerIndex = MultiIndex<nDimLow>;
using UpperIndex = MultiIndex<nDimUp>;
__host__ __device__ explicit constexpr Embed()
{
static_assert(UpperLengths::GetSize() == nDimUp && Coefficients::GetSize() == nDimUp + 1,
"wrong! # of dimensions not consistent");
}
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDimUp>{}; }
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDimLow>{}; }
__host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; }
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
LowerIndex idx_low(Coefficients{}[nDimUp]);
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low(0) += idx_up[idim] * Coefficients{}[idim]; });
return idx_low;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
LowerIndex idx_low_diff{0};
static_for<0, nDimUp, 1>{}(
[&](auto idim) { idx_low_diff(0) += idx_up_diff[idim] * Coefficients{}[idim]; });
return idx_low_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
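A worked instance of the affine formula in the comment above (coefficients are illustrative): Embed<Sequence<3, 4>, Sequence<4, 1, 2>> computes idx_low = 4*i0 + 1*i1 + 2, so upper index (2, 3) maps to 4*2 + 1*3 + 2 = 13, and an index diff of (1, 0) always moves the lower index by exactly 4.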
template <index_t LowerLength, index_t VectorSize>
struct Vectorize
{
using LowerIndex = MultiIndex<1>;
using UpperIndex = MultiIndex<1>;
__host__ __device__ constexpr Vectorize()
{
static_assert(VectorSize > 0 && LowerLength % VectorSize == 0,
"wrong! cannot evenly divide");
}
__host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
__host__ __device__ static constexpr auto GetUpperLengths()
{
return Sequence<LowerLength / VectorSize>{};
}
__host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
{
return VectorSize * idx_up;
}
__host__ __device__ static constexpr auto
CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
const UpperIndex& /* idx_up_old */,
const LowerIndex& /* idx_low_old */)
{
return VectorSize * idx_up_diff;
}
__host__ __device__ static constexpr bool IsLinearTransform() { return true; }
__host__ __device__ static constexpr bool
IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
{
return true;
}
};
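For example, Vectorize<8, 4> exposes an upper length of 8 / 4 = 2 vectors, and upper index {1} maps to lower scalar index {4}; index diffs scale by the same factor.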
} // namespace ck
#endif