adding im2col

dfe6dedb · Chao Liu · fe2ea9b6 · dfe6dedb · dfe6dedb · dfe6dedb
Commit dfe6dedb authored Feb 21, 2023 by Chao Liu
8 changed files
--- a/example/91_tile_program/hello_world.cpp
+++ b/example/91_tile_program/hello_world.cpp
@@ -14,10 +14,10 @@
 // program
 struct HelloWorld
 {
-    __host__ __device__ void operator()(TileProgram& tp, int x, int y, int* res)
+    __host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
    {
-        auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
+        auto desc0 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
-        auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
+        auto desc1 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
        // only for testing purpose
        // cpu should not do work here
@@ -33,7 +33,13 @@ int main()
    DeviceMem res_dev_buf(2 * sizeof(int));
-    launch(HelloWorld{}, 1, 1, x, y, static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
+    launch(ProgramServer{},
+           HelloWorld{},
+           1,
+           1,
+           x,
+           y,
+           static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
    int res_host[2];

--- a/example/91_tile_program/im2col.cpp
+++ b/example/91_tile_program/im2col.cpp
 #include "tile_program.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/thread_group.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_description/cluster_descriptor.hpp"
 #include "ck/tensor/tensor.hpp"
+#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -12,6 +18,182 @@
 #include "ck/library/utility/device_memory.hpp"
+namespace ck {
+template <typename ThreadGroup,
+          typename SrcElementwiseOperation,
+          typename DstElementwiseOperation,
+          InMemoryDataOperationEnum DstInMemOp,
+          typename BlockSliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcTensor,
+          typename DstTensor,
+          typename SrcDimAccessOrder,
+          typename DstDimAccessOrder,
+          index_t SrcVectorDim,
+          index_t DstVectorDim,
+          index_t SrcScalarPerVector,
+          index_t DstScalarPerVector,
+          index_t SrcScalarStrideInVector,
+          index_t DstScalarStrideInVector,
+          bool ThreadTransferSrcResetCoordinateAfterRun,
+          bool ThreadTransferDstResetCoordinateAfterRun>
+struct Copier
+{
+    using SrcDesc = typename SrcTensor::TensorDescriptor;
+    using DstDesc = typename DstTensor::TensorDescriptor;
+    static constexpr ck::index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
+    using Index = MultiIndex<nDim>;
+    __host__ __device__ constexpr Copier() : block_copy_{}, src_tensor_{}, dst_tensor_{} {}
+    __device__ constexpr Copier(const SrcTensor& src_tensor,
+                                const Index& src_block_slice_origin,
+                                const SrcElementwiseOperation& src_element_op,
+                                DstTensor& dst_tensor,
+                                const Index& dst_block_slice_origin,
+                                const DstElementwiseOperation& dst_element_op)
+        : block_copy_{src_tensor.desc_,
+                      src_block_slice_origin,
+                      src_element_op,
+                      dst_tensor.desc_,
+                      dst_block_slice_origin,
+                      dst_element_op},
+          src_tensor_{src_tensor.buf_.p_data_, src_tensor.desc_},
+          dst_tensor_{dst_tensor.buf_.p_data_, dst_tensor.desc_}
+    {
+    }
+    __host__ void operator()() {}
+    __device__ void operator()()
+    {
+        block_copy_.Run(
+            src_tensor_.desc_, src_tensor_.buf_, dst_tensor_.desc_, dst_tensor_.buf_, Number<0>{});
+    }
+    __host__ void move_src_window(const Index&) {}
+    __device__ void move_src_window(const Index& step)
+    {
+        block_copy_.MoveSrcSliceWindow(src_tensor_.desc_, step);
+    }
+    __host__ void move_dst_window(const Index&) {}
+    __device__ void move_dst_window(const Index& step)
+    {
+        block_copy_.MoveDstSliceWindow(dst_tensor_.desc_, step);
+    }
+    // member
+    ThreadGroupTensorSliceTransfer_v4r1<ThreadGroup,
+                                        SrcElementwiseOperation,
+                                        DstElementwiseOperation,
+                                        DstInMemOp,
+                                        BlockSliceLengths,
+                                        ThreadClusterLengths,
+                                        ThreadClusterArrangeOrder,
+                                        typename SrcTensor::DataType,
+                                        typename SrcTensor::DataType,
+                                        SrcDesc,
+                                        DstDesc,
+                                        SrcDimAccessOrder,
+                                        DstDimAccessOrder,
+                                        SrcVectorDim,
+                                        DstVectorDim,
+                                        SrcScalarPerVector,
+                                        DstScalarPerVector,
+                                        SrcScalarStrideInVector,
+                                        DstScalarStrideInVector,
+                                        ThreadTransferSrcResetCoordinateAfterRun,
+                                        ThreadTransferDstResetCoordinateAfterRun>
+        block_copy_;
+    SrcTensor src_tensor_;
+    DstTensor dst_tensor_;
+};
+} // namespace ck
+struct CopierStrategy
+{
+};
+template <ck::index_t BlockSize>
+struct MyProgramServer : public ProgramServer
+{
+    template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
+    __host__ auto make_copier(const SrcTensor& src_tensor,
+                              const Index& src_window_origin,
+                              DstTensor& dst_tensor,
+                              const Index& dst_window_origin,
+                              const Index& window_lengths,
+                              const Strategy& strategy)
+    {
+        using namespace ck;
+        return Copier<ThisThreadBlock<BlockSize>,
+                      tensor_operation::element_wise::PassThrough,
+                      tensor_operation::element_wise::PassThrough,
+                      InMemoryDataOperationEnum::Set,
+                      Sequence<128, 16>, // BlockSliceLengths,
+                      Sequence<16, 16>,
+                      Sequence<0, 1>,
+                      SrcTensor,
+                      DstTensor,
+                      Sequence<0, 1>,
+                      Sequence<0, 1>,
+                      1,
+                      1,
+                      1,
+                      1,
+                      1,
+                      1,
+                      true,
+                      true>{};
+    }
+    template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
+    __device__ auto make_copier(const SrcTensor& src_tensor,
+                                const Index& src_window_origin,
+                                DstTensor& dst_tensor,
+                                const Index& dst_window_origin,
+                                const Index& window_lengths,
+                                const Strategy& strategy)
+    {
+        using namespace ck;
+        return Copier<ThisThreadBlock<BlockSize>,
+                      tensor_operation::element_wise::PassThrough,
+                      tensor_operation::element_wise::PassThrough,
+                      InMemoryDataOperationEnum::Set,
+                      Sequence<128, 16>, // BlockSliceLengths,
+                      Sequence<16, 16>,
+                      Sequence<0, 1>,
+                      SrcTensor,
+                      DstTensor,
+                      Sequence<0, 1>,
+                      Sequence<0, 1>,
+                      1,
+                      1,
+                      1,
+                      1,
+                      1,
+                      1,
+                      true,
+                      true>{src_tensor,
+                            src_window_origin,
+                            tensor_operation::element_wise::PassThrough{},
+                            dst_tensor,
+                            dst_window_origin,
+                            tensor_operation::element_wise::PassThrough{}};
+    }
+};
 // program
 template <ck::index_t NDimSpatial,
          typename ALayout,
@@ -21,8 +203,9 @@ template <ck::index_t NDimSpatial,
          ck::index_t kKPerTile>
 struct Im2Col
 {
+    template <typename Server, typename CopierStrategy>
    __host__ __device__ void
-    operator()(TileProgram& tp,
+    operator()(Server& ps,
               const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
               const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
               const std::array<ck::index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
@@ -34,17 +217,19 @@ struct Im2Col
               const std::array<ck::index_t, NDimSpatial>& input_left_pads,
               const std::array<ck::index_t, NDimSpatial>& input_right_pads,
               //
-               const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_lengths,
+               const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
-               const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_strides,
+               const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
               //
               const T* p_a_img,
-               T* p_a_mtx)
+               T* p_a_mtx,
+               // strategy
+               const CopierStrategy& copier_strategy)
    {
        using namespace ck;
        constexpr auto I0 = Number<0>{};
-        constexpr auto I1 = Number<1>{};
+#if 0 // debug
        const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
            NDimSpatial,
            tensor_operation::device::ConvolutionForwardSpecialization::Default>::
@@ -58,36 +243,55 @@ struct Im2Col
                                                  conv_filter_dilations,
                                                  input_left_pads,
                                                  input_right_pads);
+#else
+        const auto a_src_desc =
+            ps(tensor_operation::TransformConvFwdToGemm<
+                NDimSpatial,
+                tensor_operation::device::ConvolutionForwardSpecialization::Default>::
+                   template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
+                                                         a_g_n_c_wis_strides,
+                                                         b_g_k_c_xs_lengths,
+                                                         b_g_k_c_xs_strides,
+                                                         c_g_n_k_wos_lengths,
+                                                         c_g_n_k_wos_strides,
+                                                         conv_filter_strides,
+                                                         conv_filter_dilations,
+                                                         input_left_pads,
+                                                         input_right_pads));
+#endif
-        const auto a_dst_desc =
+#if 1 // debug
-            make_naive_tensor_descriptor(make_tuple(a_gemmg_gemmm_gemmk_lengths[0],
+        const auto a_dst_desc = ps(make_naive_tensor_descriptor(
-                                                    a_gemmg_gemmm_gemmk_lengths[1],
+            make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
-                                                    a_gemmg_gemmm_gemmk_lengths[2]),
+            make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1])));
-                                         make_tuple(a_gemmg_gemmm_gemmk_strides[0],
+#else
-                                                    a_gemmg_gemmm_gemmk_strides[1],
+        const auto a_dst_desc = make_naive_tensor_descriptor(
-                                                    a_gemmg_gemmm_gemmk_strides[2]));
+            make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
+            make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1]));
+#endif
-        const auto a_src = tp(make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img));
+        const auto a_src = make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img);
-        auto a_dst = tp(make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx));
+        auto a_dst = make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx);
-        const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
+        const auto num_gemmm = a_gemmm_gemmk_lengths[0];
-        const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
+        const auto num_gemmk = a_gemmm_gemmk_lengths[1];
-        const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];
-        const auto id_block = tp.get_block_1d_id();
+        const auto id_block = ps.get_block_1d_id();
-        const auto num_tile_m = num_gemmm / kMPerTile;
+        const auto num_tile_m = ps.read_first_lane(num_gemmm / kMPerTile);
-        const auto num_tile_k = num_gemmk / kKPerTile;
-        const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));
+#if 1 // debug
+        const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
+#else
+        const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m));
+#endif
        const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
-        const auto id_tile_m = id_tile[I0];
+        const auto id_tile_m = ps.read_first_lane(id_tile[I0]);
-        const auto id_tile_k = id_tile[I1];
-#if 1
+#if 0
        // data-based syntax: per-data solution strategy
        auto window_a_src = make_window(a_src,
                                        make_tuple(1, MPerTile, KPerTile),
@@ -107,6 +311,25 @@ struct Im2Col
            window_a_dst += make_tuple(1, 0, 0);
        }
 #else
+        // operator-based syntax
+        auto copier = ps.make_copier(a_src,
+                                     make_tuple(id_tile_m * kMPerTile, 0),
+                                     a_dst,
+                                     make_tuple(id_tile_m * kMPerTile, 0),
+                                     make_tuple(kMPerTile, kKPerTile),
+                                     copier_strategy);
+        ck::index_t id_gemmk = 0;
+        do
+        {
+            copier();
+            copier.move_src_window(make_tuple(0, kKPerTile));
+            copier.move_dst_window(make_tuple(0, kKPerTile));
+            id_gemmk += kKPerTile;
+        } while(id_gemmk < num_gemmk - kKPerTile);
 #endif
    }
 };
@@ -117,7 +340,7 @@ int main()
    constexpr ck::index_t NumDimSpatial = 2;
-    ck::index_t G  = 32;
+    ck::index_t G  = 1;
    ck::index_t N  = 256;
    ck::index_t K  = 192;
    ck::index_t C  = 192;
@@ -170,8 +393,8 @@ int main()
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
    // matrix
-    std::array<ck::index_t, 3> in_mtx_lengths{G, G * Ho * Wo, C * Y * X};
+    std::array<ck::index_t, 2> in_mtx_lengths{N * Ho * Wo, C * Y * X};
-    std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};
+    std::array<ck::index_t, 2> in_mtx_strides{0, 1};
    std::partial_sum(rbegin(in_mtx_lengths),
                     std::prev(rend(in_mtx_lengths)),
@@ -181,9 +404,10 @@ int main()
    DeviceMem in(sizeof(DataType) * G * N * Hi * Wi * C);
    DeviceMem in_mtx(sizeof(DataType) * G * N * Ho * Wo * C * Y * X);
-    launch(Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 128>{},
+    launch(MyProgramServer<256>{},
+           Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 16>{},
+           1,
           1,
-           256,
           in_lengths,
           in_strides,
           wei_lengths,
@@ -199,7 +423,8 @@ int main()
           in_mtx_strides,
           //
           static_cast<DataType*>(in.GetDeviceBuffer()),
-           static_cast<DataType*>(in_mtx.GetDeviceBuffer()));
+           static_cast<DataType*>(in_mtx.GetDeviceBuffer()),
+           CopierStrategy{});
    return 0;
 }
--- a/example/91_tile_program/tile_program.hpp
+++ b/example/91_tile_program/tile_program.hpp
@@ -6,10 +6,13 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-// hidden intermediate argument
+// Meta data for GPU
-struct Arg
+// TODO: do we need to take care of data alignment in code or it's done by compiler?
+template <ck::index_t kSize>
+struct MetaData
 {
-    char data_[128];
+    char p_data_[kSize];
    ck::index_t size_ = 0;
    ck::index_t pos_  = 0;
@@ -21,49 +24,59 @@ struct Arg
    __device__ void reset_pos() { pos_ = 0; }
-    // push arg on host
+    // push meta data on host
+    // TODO: correct forwarding?
    template <typename T>
-    __host__ T push(const T& a)
+    __host__ auto push(T&& a)
    {
-        *reinterpret_cast<T*>(data_ + size_) = a;
+        assert(size_ + sizeof(Type) <= kSize);
-        size_ += sizeof(T);
+        using Type = ck::remove_cvref_t<T>;
-        return a;
+        *reinterpret_cast<Type*>(p_data_ + size_) = a;
+        size_ += sizeof(Type);
+        return ck::forwarder{}(a);
    }
-    // pull arg on device
+    // pull meta data on device
+    // TODO: correct forwarding?
    template <typename T>
-    __device__ T pull()
+    __device__ auto pull()
    {
-        T a = *reinterpret_cast<T*>(data_ + pos_);
+        using Type = ck::remove_cvref_t<T>;
-        pos_ += sizeof(T);
+        Type a = *reinterpret_cast<Type*>(p_data_ + pos_);
+        pos_ += sizeof(Type);
        return a;
    }
 };
 // namespace tp (for tile programming)
-struct TileProgram
+struct ProgramServer
 {
-    // arg on device
+    // meta data on device
-    Arg arg_;
+    MetaData<1024> meta_data_;
+    __host__ void cpu_init() { meta_data_.reset(); }
-    __device__ void gpu_init() { arg_.reset_pos(); }
+    __device__ void gpu_init() { meta_data_.reset_pos(); }
-    // push arg on host
+    // push meta data on host
    template <typename T>
-    __host__ T operator()(const T& a)
+    __host__ auto operator()(T&& a)
    {
-        return arg_.push(a);
+        return ck::forwarder{}(meta_data_.push(a));
    }
-    // push arg on host
+    // push meta data on host
    template <typename T>
-    __device__ T operator()(const T&)
+    __device__ auto operator()(T&&)
    {
-        return arg_.pull<T>();
+        return ck::forwarder{}(meta_data_.pull<T>());
    }
    __host__ static ck::index_t get_block_1d_id() { return -1; }
@@ -73,23 +86,36 @@ struct TileProgram
    __device__ static ck::index_t get_block_1d_id() { return ck::get_block_1d_id(); }
    __device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
+    // TODO: correct forwarding?
+    template <typename T>
+    __host__ static constexpr auto read_first_lane(T&& a)
+    {
+        return ck::forwarder{}(a);
+    }
+    template <typename T>
+    __device__ static constexpr auto read_first_lane(T&& a)
+    {
+        return __builtin_amdgcn_readfirstlane(a);
+    }
 };
-template <typename Program, typename... Xs>
+template <typename Server, typename Program, typename... Xs>
-__global__ void gpu_program_wrapper(Program f, TileProgram tp, Xs... xs)
+__global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
 {
-    tp.gpu_init();
+    server.gpu_init();
-    f(tp, xs...);
+    f(server, xs...);
 }
-template <typename Program, typename... Xs>
+template <typename Server, typename Program, typename... Xs>
-void launch(Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
+void launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
 {
-    TileProgram tp;
+    server.cpu_init();
-    f(tp, xs...);
+    f(server, xs...);
-    printf("cpu arg size %d\n", tp.arg_.size_);
+    printf("meta data size %d\n", server.meta_data_.size_);
-    gpu_program_wrapper<Program><<<grid_dim, block_dim, 0, nullptr>>>(f, tp, xs...);
+    gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
 }
--- a/include/ck/tensor/static_tensor.hpp
+++ b/include/ck/tensor/static_tensor.hpp
@@ -78,7 +78,9 @@ struct StaticTensor
    StaticBuffer<AddressSpace, T, element_space_size_, true> data_;
    static constexpr T zero_scalar_value_ = T{0};
+    // for read access of invalid element
    const T invalid_element_scalar_value_;
+    // for write access of invalid element
    T ignored_element_scalar_;
 };
@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
    using V = vector_type<S, ScalarPerVector>;
    __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer()
-        : invalid_element_scalar_value_{0}
+        : invalid_element_scalar_value_{0}, ignored_element_scalar_{0}
    {
    }
    __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
-        : invalid_element_scalar_value_{invalid_element_value}
+        : invalid_element_scalar_value_{invalid_element_value}, ignored_element_scalar_{0}
    {
    }
@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
    StaticBufferTupleOfVector<AddressSpace, S, num_of_vector_, ScalarPerVector, true> data_;
    static constexpr S zero_scalar_value_ = S{0};
+    // for read access of invalid element
    const S invalid_element_scalar_value_ = S{0};
+    // for write access of invalid element
    S ignored_element_scalar_;
 };

--- a/include/ck/tensor/tensor.hpp
+++ b/include/ck/tensor/tensor.hpp
@@ -8,32 +8,38 @@ namespace ck {
 template <AddressSpaceEnum AddressSpace,
          bool InvalidElementUseNumericalZeroValue,
          typename T,
-          typename TensorDesc>
+          typename TensorDescTmp>
 struct Tensor
 {
+    using TensorDescriptor = remove_cvref_t<TensorDescTmp>;
+    using DataType         = remove_reference_t<T>;
    static constexpr AddressSpaceEnum kAdressSpace_ = AddressSpace;
    static constexpr bool kInvalidElementUseNumericalZeroValue_ =
        InvalidElementUseNumericalZeroValue;
    __host__ __device__ constexpr Tensor() : buf_{nullptr, 0}, desc_{} {}
-    __host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc)
+    __host__ __device__ constexpr Tensor(DataType* p_data, TensorDescriptor desc)
        : buf_{p_data, desc.GetElementSpaceSize()}, desc_{desc}
    {
    }
-    __host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc, T invalid_element_value)
+    __host__ __device__ constexpr Tensor(DataType* p_data,
+                                         TensorDescriptor desc,
+                                         DataType invalid_element_value)
        : buf_{p_data, desc.GetElementSpaceSize(), invalid_element_value}, desc_{desc}
    {
    }
+    // member
    DynamicBuffer<AddressSpace,
-                  T,
+                  DataType,
-                  typename TensorDesc::ElementSpaceSizeType,
+                  typename TensorDescriptor::ElementSpaceSizeType,
                  InvalidElementUseNumericalZeroValue>
        buf_;
-    TensorDesc desc_;
+    TensorDescriptor desc_;
 };
 template <AddressSpaceEnum AddressSpace,

--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -45,6 +45,10 @@ struct ThreadGroupTensorSliceTransfer_v4r1
    using Index = MultiIndex<nDim>;
+#if 1 // debug
+    __host__ __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1() : threadwise_transfer_{} {}
+#endif
    __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
        const SrcDesc& src_desc,
        const Index& src_block_slice_origin,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -52,8 +52,8 @@ template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
          InMemoryDataOperationEnum DstInMemOp,
-          typename SrcData,
+          typename SrcDataTmp,
-          typename DstData,
+          typename DstDataTmp,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
@@ -76,6 +76,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr index_t nDim = SliceLengths::Size();
    using Index                   = MultiIndex<nDim>;
+    using SrcData = remove_cvref_t<SrcDataTmp>;
+    using DstData = remove_cvref_t<DstDataTmp>;
    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
@@ -84,6 +87,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr auto I0 = Number<0>{};
+    __host__ __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1()
+        : src_coord_{}, dst_coord_{}, src_element_op_{}, dst_element_op_{}
+    {
+    }
    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(
        const SrcDesc& src_desc,
        const Index& src_slice_origin,

--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -24,12 +24,17 @@ struct DynamicBuffer
 {
    using type = T;
-    T* p_data_;
+    T* p_data_ = nullptr;
    ElementSpaceSize element_space_size_;
    remove_cvref_t<T> invalid_element_value_ = T{0};
+    __host__ __device__ constexpr DynamicBuffer()
+        : p_data_{}, element_space_size_{}, invalid_element_value_{}
+    {
+    }
    __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
-        : p_data_{p_data}, element_space_size_{element_space_size}
+        : p_data_{p_data}, element_space_size_{element_space_size}, invalid_element_value_{0}
    {
    }
@@ -42,20 +47,17 @@ struct DynamicBuffer
    {
    }
-    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
+    __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return BufferAddressSpace; }
-    {
-        return BufferAddressSpace;
-    }
-    __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+    __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
-    __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+    __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-    __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
+    __device__ constexpr auto Get(index_t i, bool is_valid_element) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -120,7 +122,7 @@ struct DynamicBuffer
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-    __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
+    __device__ void Update(index_t i, bool is_valid_element, const X& x)
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set)
        {
@@ -147,7 +149,7 @@ struct DynamicBuffer
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-    __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
+    __device__ void Set(index_t i, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -290,7 +292,7 @@ struct DynamicBuffer
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-    __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
+    __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
    {
        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
@@ -339,7 +341,7 @@ struct DynamicBuffer
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-    __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
+    __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -371,9 +373,9 @@ struct DynamicBuffer
        }
    }
-    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
+    __device__ static constexpr bool IsStaticBuffer() { return false; }
-    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
+    __device__ static constexpr bool IsDynamicBuffer() { return true; }
 };
 template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>