prototype dynamic descriptor

b808713a · Chao Liu · d3405258 · b808713a · b808713a · b808713a
Commit b808713a authored Aug 06, 2020 by Chao Liu
4 changed files
--- a/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
+++ b/composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
@@ -12,73 +12,6 @@ namespace ck {
 template <index_t BlockSize>
 struct DummyDynamicTransform
 {
-    __device__ void Run_(index_t* const __restrict__ p_wei_global,
-                         index_t* const __restrict__ p_in_global,
-                         float* const __restrict__ p_out_global,
-                         const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
-                         const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
-                         const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
-                         const Array<index_t, 2> conv_strides,
-                         const Array<index_t, 2> conv_dilations,
-                         const Array<index_t, 2> in_left_pads,
-                         const Array<index_t, 2> in_right_pads,
-                         index_t k_block_num,
-                         index_t c_block_num,
-                         index_t y_block_num,
-                         index_t x_block_num) const
-    {
-        const index_t N  = in_n_c_hi_wi_global_desc.GetLength(0);
-        const index_t C  = in_n_c_hi_wi_global_desc.GetLength(1);
-        const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(2);
-        const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(3);
-        const index_t K  = out_n_k_ho_wo_global_desc.GetLength(1);
-        const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(2);
-        const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(3);
-        const index_t Y = wei_k_c_y_x_global_desc.GetLength(2);
-        const index_t X = wei_k_c_y_x_global_desc.GetLength(3);
-        const index_t ConvStrideH = conv_strides[0];
-        const index_t ConvStrideW = conv_strides[1];
-        const index_t ConvDilationH = conv_dilations[0];
-        const index_t ConvDilationW = conv_dilations[1];
-        p_wei_global[0] = wei_k_c_y_x_global_desc.GetElementSize();
-        p_wei_global[1] = wei_k_c_y_x_global_desc.GetElementSpace();
-        const index_t k_block_num_stride = c_block_num * y_block_num * x_block_num;
-        const index_t c_block_num_stride = y_block_num * x_block_num;
-        const index_t y_block_num_stride = x_block_num;
-        index_t tmp = get_block_1d_id();
-#if 0
-        const index_t k_block = tmp / k_block_num_stride;
-        tmp -= k_block * k_block_num_stride;
-        const index_t c_block = tmp / c_block_num_stride;
-        tmp -= c_block * c_block_num_stride;
-        const index_t y_block = tmp / y_block_num_stride;
-        tmp -= y_block * y_block_num_stride;
-        const index_t x_block = tmp;
-#else
-        const index_t k_block = __llvm_amdgcn_readfirstlane_i32(tmp / k_block_num_stride);
-        tmp -= k_block * k_block_num_stride;
-        const index_t c_block = __llvm_amdgcn_readfirstlane_i32(tmp / c_block_num_stride);
-        tmp -= c_block * c_block_num_stride;
-        const index_t y_block = __llvm_amdgcn_readfirstlane_i32(tmp / y_block_num_stride);
-        tmp -= y_block * y_block_num_stride;
-        const index_t x_block = __llvm_amdgcn_readfirstlane_i32(tmp);
-#endif
-        const index_t k_thread = p_in_global[get_thread_local_1d_id()];
-        const index_t c_thread = p_in_global[get_thread_local_1d_id() + 1];
-        const index_t y_thread = p_in_global[get_thread_local_1d_id() + 2];
-        const index_t x_thread = p_in_global[get_thread_local_1d_id() + 3];
-        p_wei_global[3] = wei_k_c_y_x_global_desc.CalculateOffset(
-            {k_block + k_thread, c_block + c_thread, y_block + y_thread, x_block + x_thread});
-    }
    __device__ void Run(index_t* const __restrict__ p_wei_global,
                        index_t* const __restrict__ p_in_global,
                        float* const __restrict__ p_out_global,
@@ -88,11 +21,7 @@ struct DummyDynamicTransform
                        const Array<index_t, 2> conv_strides,
                        const Array<index_t, 2> conv_dilations,
                        const Array<index_t, 2> in_left_pads,
-                        const Array<index_t, 2> in_right_pads,
+                        const Array<index_t, 2> in_right_pads) const
-                        index_t,
-                        index_t,
-                        index_t,
-                        index_t) const
    {
 #if 1
        const index_t N = in_n_c_hi_wi_global_desc.GetLength(0);
@@ -189,7 +118,7 @@ struct DummyDynamicTransform
                                         const index_t& idx_low_bound) {
                index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
-#if 0 // positive
+#if 1 // positive
                bool do_carry = idx_low_tmp >= idx_low_bound;
                index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
@@ -389,23 +318,6 @@ struct DummyDynamicTransform
                                      Y,
                                      X);
-#if 0
-            // Merge(N, Ho, Wo) => GemmN
-            f_lower_idx_diff_merge(idx_diff[9],
-                                   idx_diff[12],
-                                   idx_diff[14],
-                                   idx_diff[16],
-                                   idx[9],
-                                   idx[12],
-                                   idx[14],
-                                   const_tmp[3],
-                                   const_tmp[4],
-                                   const_tmp[5],
-                                   N,
-                                   Ho,
-                                   Wo);
-#endif
            // stage 2
            // PassThrough(N) => N
            f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
@@ -496,6 +408,19 @@ struct DummyDynamicTransform
                             out_n_k_ho_wo_global_desc.GetElementSpace());
        }
    }
+    __device__ void Run_(index_t* const __restrict__ p_wei_global,
+                        index_t* const __restrict__ p_in_global,
+                        float* const __restrict__ p_out_global,
+                        const DynamicNativeTensorDescriptor<4> wei_k_c_y_x_global_desc,
+                        const DynamicNativeTensorDescriptor<4> in_n_c_hi_wi_global_desc,
+                        const DynamicNativeTensorDescriptor<4> out_n_k_ho_wo_global_desc,
+                        const Array<index_t, 2> conv_strides,
+                        const Array<index_t, 2> conv_dilations,
+                        const Array<index_t, 2> in_left_pads,
+                        const Array<index_t, 2> in_right_pads) const
+    {
+    }
 };
 } // namespace ck

--- a/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -5,217 +5,18 @@
 namespace ck {
-struct DynamicPassThrough
+class DynamicTransformation
 {
-    using LowerIndex = MultiIndex<1>;
-    using UpperIndex = MultiIndex<1>;
-    index_t low_length_;
-    __host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
-    {
-    }
-    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }
-    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }
-    __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence<Length>{}; }
-    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
-    {
-        return idx_up;
-    }
-    __host__ __device__ static constexpr auto
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-                            const UpperIndex& /* idx_up_old */,
-                            const LowerIndex& /* idx_low_old */)
-    {
-        return idx_up_diff;
-    }
-    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }
-    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
-    {
-        return true;
-    }
 };
-template <index_t NDimLow>
+template<index_t N>
-struct DynamicMerge
+class DynamicEmbed : public DynamicTransformation
 {
-    static constexpr index_t ndim_low_ = NDimLow static constexpr index_t ndim_up_ = 1;
+    const array<idnex_t, N+1> coefficients_;
-    using LowerIndex = MultiIndex<ndim_low_>;
-    using UpperIndex = MultiIndex<ndum_up_>;
-    Array<index_t, NDimLow> low_lengths_;
-    index_t up_length_;
-    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
-    {
-        return Number<ndim_low_>{};
-    }
-    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
-    {
-        return Number<ndim_up_>{};
-    }
-    __host__ __device__ static constexpr auto GetUpperLengths()
-    {
-        return Array<index_t, 1> up_length_;
-    }
-    // emulate constexpr lambda
-    template <typename PseudoLowStrides>
-    struct lambda_CalculateLowerIndex
-    {
-        index_t& itmp;
-        LowerIndex& idx_low;
-        __host__ __device__ explicit constexpr lambda_CalculateLowerIndex(index_t& itmp_,
-                                                                          LowerIndex& idx_low_)
-            : itmp(itmp_), idx_low(idx_low_)
-        {
-        }
-        template <typename IDim>
-        __host__ __device__ constexpr void operator()(IDim idim) const
-        {
-            constexpr index_t stride = PseudoLowStrides::At(idim);
-            idx_low(idim)            = itmp / stride;
-            itmp -= idx_low[idim] * stride;
-        }
-    };
-    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
-    {
-        LowerIndex idx_low;
-        index_t itmp = idx_up[0];
-        constexpr auto pseudo_low_strides =
-            reverse_inclusive_scan_sequence(
-                LowerLengths::PopFront(), math::multiplies<index_t>{}, Number<1>{})
-                .PushBack(Number<1>{});
-        static_for<0, nDimLow - 1, 1>{}(
-            lambda_CalculateLowerIndex<decltype(pseudo_low_strides)>(itmp, idx_low));
-        idx_low(nDimLow - 1) = itmp / pseudo_low_strides[nDimLow - 1];
-        return idx_low;
-    }
-    // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date
-    // If idx_up_diff is known at compile-time, many calculations can be optimized
-    // away by compiler
-    // This function assume idx_low_old is not out-of-bound
-    __host__ __device__ static constexpr auto
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
-                            const UpperIndex& /* idx_up_old */,
-                            const LowerIndex& idx_low_old)
-    {
-        if(idx_up_diff[0] == 0)
-        {
-            return make_zero_array<index_t, nDimLow>();
-        }
-        else
-        {
-            // CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
-            //   If idx_up_diff is known at compile-time, the calculation can
-            //   be done at compile-time. However, if idx_up_diff is only known
-            //   at run-time, then the calculation will also be computed at
-            //   run-time, and can be very expensive.
-            LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);
-            // find out the last low dimension that changed
-            index_t last_changed_low_dim = 0;
-            static_for<0, nDimLow, 1>{}([&](auto i) {
-                if(idx_low_diff_tmp[i] != 0)
-                {
-                    last_changed_low_dim = i;
-                }
-            });
-            LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;
-            if(idx_up_diff[0] > 0)
-            {
-                // do carry check on each low dimension in reversed order
-                // starting from the first digit that changed
-                // don't check the highest dimension
-                bool carry = false;
-                static_for<nDimLow - 1, 0, -1>{}([&](auto i) {
-                    if(i <= last_changed_low_dim)
-                    {
-                        if(carry)
-                        {
-                            ++idx_low_new(i);
-                        }
-                        carry = false;
-                        if(idx_low_new[i] >= LowerLengths::At(i))
-                        {
-                            idx_low_new(i) -= LowerLengths::At(i);
-                            carry = true;
-                        }
-                    }
-                });
-                // highest dimension, no out-of-bound check
-                if(carry)
-                {
-                    ++idx_low_new(0);
-                }
-            }
-            else
-            {
-                // do borrow check on each low dimension in reversed order
-                // starting from the first digit that changed
-                // don't check the highest dimension
-                bool borrow = false;
-                static_for<nDimLow - 1, 0, -1>{}([&](auto i) {
-                    if(i <= last_changed_low_dim)
-                    {
-                        if(borrow)
-                        {
-                            --idx_low_new(i);
-                        }
-                        borrow = false;
-                        if(idx_low_new[i] < 0)
-                        {
-                            idx_low_new(i) += LowerLengths::At(i);
-                            borrow = true;
-                        }
-                    }
-                });
-                // highest dimension, no out-of-bound check
-                if(borrow)
-                {
-                    --idx_low_new(0);
-                }
-            }
-            return idx_low_new - idx_low_old;
-        }
-    }
-    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }
-    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
+    __host__ __device__ constexpr DynamicEmbed(coefficients)
+        : coefficients_(coefficients)
    {
-        return true;
    }
 };

--- a/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
+++ b/composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
@@ -15,21 +15,6 @@ __host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const L
    return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
 }
-template <typename LowTensorDescriptor,
-          typename Transforms,
-          typename LowDimensions,
-          typename UpDimensions>
-__host__ __device__ constexpr auto
-transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
-                                    const Transforms& transforms,
-                                    LowDimensions,
-                                    UpDimensions)
-{
-    return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
-                                              Transforms,
-                                              LowDimensions,
-                                              UpDimensions>(low_tensor_desc, transforms);
-}
 } // namespace ck
 #endif
--- a/driver/include/device_dummy_dynamic_transform.hpp
+++ b/driver/include/device_dummy_dynamic_transform.hpp
@@ -75,11 +75,7 @@ void device_dummy_dynamic_transform(InDesc,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
-                                                 const Array<index_t, 2>,
+                                                 const Array<index_t, 2>>,
-                                                 index_t,
-                                                 index_t,
-                                                 index_t,
-                                                 index_t>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
@@ -93,11 +89,7 @@ void device_dummy_dynamic_transform(InDesc,
                          conv_strides,
                          conv_dilations,
                          in_left_pads,
-                          in_right_pads,
+                          in_right_pads);
-                          10,
-                          10,
-                          10,
-                          10);
        }
    }