tweaking

f0eec07c · Chao Liu · 89140d16 · f0eec07c · f0eec07c · f0eec07c
Commit f0eec07c authored Dec 19, 2019 by Chao Liu
7 changed files
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp
@@ -123,7 +123,7 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
            make_tuple(Sequence<0>{}, Sequence<1>{}));

 // output tensor
-#if 0 // debug
+#if 1 // debug
        constexpr auto out_n_k_hop_wop_global_desc = transform_tensor_descriptor(
            out_n_k_ho_wo_global_desc,
            make_tuple(

--- a/composable_kernel/include/tensor_description/multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp
@@ -41,18 +41,10 @@ struct PassThrough

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

-#if 0
-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
-    {
-        return true;
-    }
-#else
    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
-#endif
 };

 // LowerLengths: Sequence<...>
@@ -64,6 +56,13 @@ struct Pad
    using LowerIndex = MultiIndex<nDim>;
    using UpperIndex = MultiIndex<nDim>;

+    __host__ __device__ explicit constexpr Pad()
+    {
+        static_assert(LowerLengths::GetSize() == nDim && LeftPads::GetSize() == nDim &&
+                          RightPads::GetSize() == nDim,
+                      "wrong! # of dimensions not consistent");
+    }
+
    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<nDim>{}; }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<nDim>{}; }
@@ -88,15 +87,14 @@ struct Pad

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

-    __host__ __device__ constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up) const
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        bool flag = true;

-        static_for<0, nDim, 1>{}([&](auto idim) {
-            flag = flag && (idx_up[idim] >= LeftPads::At(idim)) &&
-                   (idx_up[idim] < LeftPads::At(idim) + LowerLengths::At(idim));
-        });
+        for(index_t i = 0; i < nDim; ++i)
+        {
+            flag = flag && LeftPads::At(i) == 0 && RightPads::At(i) == 0;
+        }

        return flag;
    }
@@ -163,91 +161,6 @@ struct Merge
        return idx_low;
    }

-#if 0
-    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_up_diff is known at compile-time, many calculations can be optimized
-    // away by compiler
-    // This function assume idx_low_old is not out-of-bound
-    __host__ __device__ static constexpr auto
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-                            const UpperIndex& /* idx_up_old */,
-                            const LowerIndex& idx_low_old)
-    {
-        // do nothing if idx_up_diff == 0
-        if(idx_up_diff[0] == 0)
-        {
-            return make_zero_array<index_t, nDimLow>();
-        }
-
-        // CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
-        //   If idx_up_diff is known at compile-time, the calculation can
-        //   be done at compile-time. However, if idx_up_diff is only known
-        //   at run-time, then the calculation will also be computed at
-        //   run-time, and can be very expensive.
-        LowerIndex idx_low_new = idx_low_old + CalculateLowerIndex(idx_up_diff);
-
-        if(idx_up_diff[0] > 0)
-        {
-            bool carry = false;
-
-            // do carry check in reversed order, starting from lowest dimension
-            // don't check the highest dimension
-            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
-                constexpr index_t i = nDimLow - 1 - ireverse;
-
-                if(carry)
-                {
-                    ++idx_low_new(i);
-                }
-
-                carry = false;
-
-                if(idx_low_new[i] >= LowerLengths::At(i))
-                {
-                    idx_low_new(i) -= LowerLengths::At(i);
-                    carry = true;
-                }
-            });
-
-            // highest dimension, no out-of-bound check
-            if(carry)
-            {
-                ++idx_low_new(0);
-            }
-        }
-        else if(idx_up_diff[0] < 0)
-        {
-            bool borrow = false;
-
-            // do borrow check in reversed order, starting from lowest dimension
-            // don't check the highest dimension
-            static_for<0, nDimLow - 1, 1>{}([&](auto ireverse) {
-                constexpr index_t i = nDimLow - 1 - ireverse;
-
-                if(borrow)
-                {
-                    --idx_low_new(i);
-                }
-
-                borrow = false;
-
-                if(idx_low_new[i] < 0)
-                {
-                    idx_low_new(i) += LowerLengths::At(i);
-                    borrow = true;
-                }
-            });
-
-            // highest dimension, no out-of-bound check
-            if(borrow)
-            {
-                --idx_low_new(0);
-            }
-        }
-
-        return idx_low_new - idx_low_old;
-    }
-#else
    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
    // If idx_up_diff is known at compile-time, many calculations can be optimized
    // away by compiler
@@ -348,12 +261,10 @@ struct Merge
            return idx_low_new - idx_low_old;
        }
    }
-#endif

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
@@ -400,8 +311,7 @@ struct UnMerge

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
@@ -410,7 +320,7 @@ struct UnMerge
 // UpperLengths: Sequence<...>
 // Coefficients: Sequence<...>
 // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp]
-template <typename UpperLengths, typename Coefficients>
+template <typename UpperLengths, typename Coefficients, bool IsAlwaysValidMapping = true>
 struct Embed
 {
    static constexpr index_t nDimLow = 1;
@@ -456,10 +366,9 @@ struct Embed

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
-        return true;
+        return IsAlwaysValidMapping;
    }
 };

@@ -499,8 +408,7 @@ struct Vectorize

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& /* idx_up */)
+    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }

--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -53,6 +53,8 @@ struct NativeTensorCoordinate

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

+    __host__ __device__ constexpr const Index& GetUpperIndex() const { return mIndex; }
+
    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }
@@ -98,9 +100,6 @@ struct NativeTensorCoordinate
        return tensor_desc_type::CalculateOffsetDiff(idx_diff);
    }

-#if 0 // debug
-    __host__ __device__ static constexpr bool HasValidOffset() { return true; }
-#else
    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
@@ -117,10 +116,8 @@ struct NativeTensorCoordinate
    // evaluated at compile-time
    __host__ __device__ static constexpr bool IsOffsetValidAssumingUpperIndexIsValid()
    {
-        // For native tensor, offset is valid if upper-index is valid
        return true;
    }
-#endif

    private:
    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
@@ -165,8 +162,6 @@ struct TransformedTensorCoordinate

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

-    __host__ __device__ constexpr const LowerIndex& GetLowerIndex() const { return mIndexLow.GetIndex(); }
-
    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

    __host__ __device__ constexpr const index_t& GetOffset() const
@@ -230,13 +225,6 @@ struct TransformedTensorCoordinate
        return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
    }

-#if 0 // debug
-    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
-    {
-        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
-               mCoordLow.IsUpperIndexMappedToValidOffset();
-    }
-#else
    // evaluated at run-time
    __host__ __device__ constexpr bool IsUpperIndexValid() const
    {
@@ -252,7 +240,8 @@ struct TransformedTensorCoordinate
    // most evaluatation is done at comile-time
    __host__ __device__ constexpr bool IsLowerIndexValidAssumingUpperIndexIsValid() const
    {
-        return tensor_desc_type::IsLowerIndexValidAssumingUpperIndexIsValid(GetLowerIndex());
+        return tensor_desc_type::IsLowerIndexValidAssumingUpperIndexIsValid(
+            GetLowerCoordinate().GetIndex());
    }

    // most evaluatation is done at comile-time
@@ -261,7 +250,6 @@ struct TransformedTensorCoordinate
        return IsLowerIndexValidAssumingUpperIndexIsValid() &&
               GetLowerCoordinate().IsOffsetValidAssumingUpperIndexIsValid();
    }
-#endif

    private:
    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries

--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -120,30 +120,18 @@ struct NativeTensorDescriptor
        return Tuple<>{};
    }

-#if 0 // debug
-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidOffset(const Index& /* idx */)
-    {
-        return true;
-    }
-#else
+    // a multi-index is valid if there is a corresponding point for it in the tensor
    __host__ __device__ static constexpr bool IsUpperIndexValid(const Index& idx)
    {
        bool flag = true;

        for(index_t i = 0; i < nDim; ++i)
+        {
            flag = flag && idx[i] >= 0 && idx[i] < GetLengths()[i];
-    });
+        }

        return flag;
-}
-
-__host__ __device__ static constexpr bool
-IsUpperIndexMappedToValidOffset(const Index& idx)
-{
-    return IsUpperIndexValid(idx) && IsValidUpperIndexAlwaysMappedToValidOffset();
-}
-#endif
+    }
 };

 // Tensor descriptor for "transformed tensor"
@@ -486,41 +474,12 @@ struct TransformedTensorDescriptor
    }
 #endif

-#if 0
-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidLowerIndex(const UpperIndex& idx_up)
-    {
-        bool flag = true;
-
-        static_for<0, nTransform, 1>{}([&](auto itran) {
-            constexpr auto tran = Transforms{}.At(itran);
-
-            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
-
-            flag = flag && tran.IsUpperIndexMappedToValidLowerIndex(to_array(idx_up_part));
-        });
-
-        return flag;
-    }
-
-    // Whenever this function is called, it will call CalculateLowerIndex() recursively.
-    // If you have created a tensor coordinate already, instead of calling this function,
-    //   you should call TensorCoordinate::IsUpperIndexMappedToValidOffset() which would
-    //   be less expensive.
-    __host__ __device__ static constexpr bool
-    IsUpperIndexMappedToValidOffset(const UpperIndex& idx_up)
-    {
-        return IsUpperIndexMappedToValidLowerIndex(idx_up) &&
-               GetLowerTensorDescriptor().IsUpperIndexMappedToValidOffset(
-                   CalculateLowerIndex(idx_up));
-    }
-#else
-    //
+    // a multi-index is valid if there is a corresponding point for it in the tensor
    __host__ __device__ constexpr bool IsUpperIndexValid(const UpperIndex& idx_up) const
    {
        bool flag = true;

-        for(index_t i = 0; i < nDim; ++i)
+        for(index_t i = 0; i < nDimUp; ++i)
        {
            flag = flag && idx_up[i] >= 0 && idx_up[i] < GetLengths()[i];
        }
@@ -528,9 +487,10 @@ struct TransformedTensorDescriptor
        return flag;
    }

-    // this function tells you: Is lower-index valid, assuming upper index is valid?
-    __host__ __device__ constexpr bool
-    IsLowerIndexValidAssumingUpperIndexIsValid(const LowerIndex& idx_low) const
+    // this function is for optimization purpose, it's called by tensor coordinate
+    // this function tells you: If a lower-index is valid or not, assuming upper index is valid
+    __host__ __device__ static constexpr bool
+    IsLowerIndexValidAssumingUpperIndexIsValid(const LowerIndex& idx_low)
    {
        bool flag = true;

@@ -540,22 +500,20 @@ struct TransformedTensorDescriptor
            // check a indtransformation if it does not always has a valid mapping
            if(!tran.IsValidUpperIndexAlwaysMappedToValidLowerIndex())
            {
-                const auto idx_low_part =
-                    to_array(pick_array_element(idx_low, LowerDimensionIds{}.At(itran)));
+                constexpr auto low_dims_part = LowDimensionIds{}.At(itran);
+                constexpr auto low_lengths_part =
+                    GetLowerTensorDescriptor().GetLengths(low_dims_part);
+                const auto idx_low_part = to_array(pick_array_element(idx_low, low_dims_part));

-                constexpr auto lengths_low_part =
-                    GetLowerTenosrDescriptor().GetLengths()(LowerDimensionIds{});
-
-                for(index_t i = 0; i < LowerDimensionIds::Size(); ++i)
+                for(index_t i = 0; i < low_dims_part.Size(); ++i)
                {
-                    flag = flag && idx_low_part[i] >= 0 && idx_low_part[i] < lengths_low_part[i];
+                    flag = flag && idx_low_part[i] >= 0 && idx_low_part[i] < low_lengths_part[i];
                }
            }
        });

        return flag;
    }
-#endif
 };

 } // namespace ck

--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -110,7 +110,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                // Check src data's valid mapping situation, only check the first data in this src
                //   vector. It's user's responsiblity to make sure all data in the src vector
                //   has the valid/invalid mapping situation
-                if(src_coord.HasValidOffset())
+                if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                {
                    move_data<SrcData,
                              SrcDataPerRead,
@@ -142,7 +142,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                // Check dst data's valid mapping situation, only check the first data in this dst
                //   vector. It's user's responsiblity to make sure all data in the dst vector
                //   has the valid/invalid mapping situation
-                if(dst_coord.HasValidOffset())
+                if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
                {
                    move_data<DstData,
                              DstDataPerWrite,
@@ -260,7 +260,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    // src
                    //   vector. It's user's responsiblity to make sure all data in the src vector
                    //   has the valid/invalid mapping situation
-                    if(src_coord.HasValidOffset())
+                    if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                    {
                        move_data<SrcData,
                                  SrcDataPerRead,
@@ -299,7 +299,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    // dst
                    //   vector. It's user's responsiblity to make sure all data in the dst vector
                    //   has the valid/invalid mapping situation
-                    if(dst_coord.HasValidOffset())
+                    if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
                    {
                        move_data<DstData,
                                  DstDataPerWrite,
@@ -399,7 +399,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    // src
                    //   vector. It's user's responsiblity to make sure all data in the src vector
                    //   has the valid/invalid mapping situation
-                    if(src_coord.HasValidOffset())
+                    if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
                    {
                        move_data<SrcData,
                                  SrcDataPerRead,
@@ -444,7 +444,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    // dst
                    //   vector. It's user's responsiblity to make sure all data in the dst vector
                    //   has the valid/invalid mapping situation
-                    if(dst_coord.HasValidOffset())
+                    if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid())
                    {
                        move_data<DstData,
                                  DstDataPerWrite,

--- a/driver/src/conv_bwd_data_driver.cpp
+++ b/driver/src/conv_bwd_data_driver.cpp
@@ -21,7 +21,7 @@ int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 1
+#if 0
    // 1x1
    constexpr index_t N  = 256;
    constexpr index_t C  = 1024;
@@ -36,7 +36,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 0>;
    using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
    // 1x7
    constexpr index_t N  = 128;
    constexpr index_t C  = 1024;
@@ -291,7 +291,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<3, 0>;
    using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;

--- a/driver/src/conv_driver.cpp
+++ b/driver/src/conv_driver.cpp
@@ -59,7 +59,7 @@ int main(int argc, char* argv[])

    using LeftPads  = Sequence<0, 3>;
    using RightPads = Sequence<0, 3>;
-#elif 1
+#elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
    constexpr index_t C  = 256;