Commit dfe6dedb authored by Chao Liu

adding im2col

parent fe2ea9b6
@@ -14,10 +14,10 @@
// program
struct HelloWorld
{
-   __host__ __device__ void operator()(TileProgram& tp, int x, int y, int* res)
+   __host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
    {
-       auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
-       auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
+       auto desc0 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
+       auto desc1 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));

        // only for testing purpose
        // cpu should not do work here
@@ -33,7 +33,13 @@ int main()
    DeviceMem res_dev_buf(2 * sizeof(int));

-   launch(HelloWorld{}, 1, 1, x, y, static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
+   launch(ProgramServer{},
+          HelloWorld{},
+          1,
+          1,
+          x,
+          y,
+          static_cast<int*>(res_dev_buf.GetDeviceBuffer()));

    int res_host[2];
...
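The change above shows the new calling convention: launch() now takes the
server object as its first argument, and the program functor receives it (as
"ps") ahead of its own arguments. A minimal sketch of the intended two-pass
flow; the AddOne example below is hypothetical and only illustrates that
values routed through ps(...) are recorded on the host pass and replayed on
the device pass:

    // hypothetical illustration (not part of the commit)
    struct AddOne
    {
        __host__ __device__ void operator()(ProgramServer& ps, int n, int* res)
        {
            // host pass: evaluates n + 1 and serializes it into ps.meta_data_
            // device pass: ignores the expression and deserializes the value
            //              recorded by the host pass
            int m = ps(n + 1);

    #ifdef __HIP_DEVICE_COMPILE__
            res[0] = m; // only the GPU writes the result
    #endif
        }
    };

    // usage, mirroring the hello-world launch above:
    //   launch(ProgramServer{}, AddOne{}, 1, 1, 41, p_res);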
#include "tile_program.hpp" #include "tile_program.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/thread_group.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp" #include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor.hpp" #include "ck/tensor/tensor.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...@@ -12,6 +18,182 @@ ...@@ -12,6 +18,182 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
namespace ck {

template <typename ThreadGroup,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
          InMemoryDataOperationEnum DstInMemOp,
          typename BlockSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
          typename SrcTensor,
          typename DstTensor,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector,
          index_t SrcScalarStrideInVector,
          index_t DstScalarStrideInVector,
          bool ThreadTransferSrcResetCoordinateAfterRun,
          bool ThreadTransferDstResetCoordinateAfterRun>
struct Copier
{
    using SrcDesc = typename SrcTensor::TensorDescriptor;
    using DstDesc = typename DstTensor::TensorDescriptor;

    static constexpr ck::index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr Copier() : block_copy_{}, src_tensor_{}, dst_tensor_{} {}

    __device__ constexpr Copier(const SrcTensor& src_tensor,
                                const Index& src_block_slice_origin,
                                const SrcElementwiseOperation& src_element_op,
                                DstTensor& dst_tensor,
                                const Index& dst_block_slice_origin,
                                const DstElementwiseOperation& dst_element_op)
        : block_copy_{src_tensor.desc_,
                      src_block_slice_origin,
                      src_element_op,
                      dst_tensor.desc_,
                      dst_block_slice_origin,
                      dst_element_op},
          src_tensor_{src_tensor.buf_.p_data_, src_tensor.desc_},
          dst_tensor_{dst_tensor.buf_.p_data_, dst_tensor.desc_}
    {
    }

    __host__ void operator()() {}

    __device__ void operator()()
    {
        block_copy_.Run(
            src_tensor_.desc_, src_tensor_.buf_, dst_tensor_.desc_, dst_tensor_.buf_, Number<0>{});
    }

    __host__ void move_src_window(const Index&) {}

    __device__ void move_src_window(const Index& step)
    {
        block_copy_.MoveSrcSliceWindow(src_tensor_.desc_, step);
    }

    __host__ void move_dst_window(const Index&) {}

    __device__ void move_dst_window(const Index& step)
    {
        block_copy_.MoveDstSliceWindow(dst_tensor_.desc_, step);
    }

    // member
    ThreadGroupTensorSliceTransfer_v4r1<ThreadGroup,
                                        SrcElementwiseOperation,
                                        DstElementwiseOperation,
                                        DstInMemOp,
                                        BlockSliceLengths,
                                        ThreadClusterLengths,
                                        ThreadClusterArrangeOrder,
                                        typename SrcTensor::DataType,
                                        typename SrcTensor::DataType,
                                        SrcDesc,
                                        DstDesc,
                                        SrcDimAccessOrder,
                                        DstDimAccessOrder,
                                        SrcVectorDim,
                                        DstVectorDim,
                                        SrcScalarPerVector,
                                        DstScalarPerVector,
                                        SrcScalarStrideInVector,
                                        DstScalarStrideInVector,
                                        ThreadTransferSrcResetCoordinateAfterRun,
                                        ThreadTransferDstResetCoordinateAfterRun>
        block_copy_;

    SrcTensor src_tensor_;
    DstTensor dst_tensor_;
};

} // namespace ck
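For orientation, a hedged sketch of how a Copier is meant to be driven; this
mirrors the loop in Im2Col further down, with made-up sizes (the host
overloads of operator() and move_*_window are deliberate no-ops, so the same
source compiles for both passes):

    // hypothetical driver, assuming a 2D [M, K] window of 128 x 16 elements
    const ck::index_t num_gemmk = 64; // made-up K extent

    auto copier = ps.make_copier(src_tensor, ck::make_tuple(0, 0),
                                 dst_tensor, ck::make_tuple(0, 0),
                                 ck::make_tuple(128, 16), CopierStrategy{});

    for(ck::index_t k = 0; k < num_gemmk; k += 16)
    {
        copier();                                      // copy current window
        copier.move_src_window(ck::make_tuple(0, 16)); // slide source in K
        copier.move_dst_window(ck::make_tuple(0, 16)); // slide destination in K
    }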
struct CopierStrategy
{
};

template <ck::index_t BlockSize>
struct MyProgramServer : public ProgramServer
{
    template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
    __host__ auto make_copier(const SrcTensor& src_tensor,
                              const Index& src_window_origin,
                              DstTensor& dst_tensor,
                              const Index& dst_window_origin,
                              const Index& window_lengths,
                              const Strategy& strategy)
    {
        using namespace ck;

        return Copier<ThisThreadBlock<BlockSize>,
                      tensor_operation::element_wise::PassThrough,
                      tensor_operation::element_wise::PassThrough,
                      InMemoryDataOperationEnum::Set,
                      Sequence<128, 16>, // BlockSliceLengths
                      Sequence<16, 16>,  // ThreadClusterLengths
                      Sequence<0, 1>,    // ThreadClusterArrangeOrder
                      SrcTensor,
                      DstTensor,
                      Sequence<0, 1>,    // SrcDimAccessOrder
                      Sequence<0, 1>,    // DstDimAccessOrder
                      1,                 // SrcVectorDim
                      1,                 // DstVectorDim
                      1,                 // SrcScalarPerVector
                      1,                 // DstScalarPerVector
                      1,                 // SrcScalarStrideInVector
                      1,                 // DstScalarStrideInVector
                      true,
                      true>{};
    }

    template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
    __device__ auto make_copier(const SrcTensor& src_tensor,
                                const Index& src_window_origin,
                                DstTensor& dst_tensor,
                                const Index& dst_window_origin,
                                const Index& window_lengths,
                                const Strategy& strategy)
    {
        using namespace ck;

        return Copier<ThisThreadBlock<BlockSize>,
                      tensor_operation::element_wise::PassThrough,
                      tensor_operation::element_wise::PassThrough,
                      InMemoryDataOperationEnum::Set,
                      Sequence<128, 16>, // BlockSliceLengths
                      Sequence<16, 16>,  // ThreadClusterLengths
                      Sequence<0, 1>,    // ThreadClusterArrangeOrder
                      SrcTensor,
                      DstTensor,
                      Sequence<0, 1>,    // SrcDimAccessOrder
                      Sequence<0, 1>,    // DstDimAccessOrder
                      1,                 // SrcVectorDim
                      1,                 // DstVectorDim
                      1,                 // SrcScalarPerVector
                      1,                 // DstScalarPerVector
                      1,                 // SrcScalarStrideInVector
                      1,                 // DstScalarStrideInVector
                      true,
                      true>{src_tensor,
                            src_window_origin,
                            tensor_operation::element_wise::PassThrough{},
                            dst_tensor,
                            dst_window_origin,
                            tensor_operation::element_wise::PassThrough{}};
    }
};
// program
template <ck::index_t NDimSpatial,
          typename ALayout,
@@ -21,8 +203,9 @@ template <ck::index_t NDimSpatial,
          ck::index_t kKPerTile>
struct Im2Col
{
    template <typename Server, typename CopierStrategy>
    __host__ __device__ void
-   operator()(TileProgram& tp,
+   operator()(Server& ps,
               const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
               const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
               const std::array<ck::index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
@@ -34,17 +217,19 @@ struct Im2Col
               const std::array<ck::index_t, NDimSpatial>& input_left_pads,
               const std::array<ck::index_t, NDimSpatial>& input_right_pads,
               //
-              const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_lengths,
-              const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_strides,
+              const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
+              const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
               //
               const T* p_a_img,
-              T* p_a_mtx)
+              T* p_a_mtx,
+              // strategy
+              const CopierStrategy& copier_strategy)
    {
        using namespace ck;

        constexpr auto I0 = Number<0>{};
-       constexpr auto I1 = Number<1>{};

#if 0 // debug
        const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
            NDimSpatial,
            tensor_operation::device::ConvolutionForwardSpecialization::Default>::
@@ -58,36 +243,55 @@ struct Im2Col
                conv_filter_dilations,
                input_left_pads,
                input_right_pads);
#else
        const auto a_src_desc =
            ps(tensor_operation::TransformConvFwdToGemm<
               NDimSpatial,
               tensor_operation::device::ConvolutionForwardSpecialization::Default>::
                   template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
                                                         a_g_n_c_wis_strides,
                                                         b_g_k_c_xs_lengths,
                                                         b_g_k_c_xs_strides,
                                                         c_g_n_k_wos_lengths,
                                                         c_g_n_k_wos_strides,
                                                         conv_filter_strides,
                                                         conv_filter_dilations,
                                                         input_left_pads,
                                                         input_right_pads));
#endif

-       const auto a_dst_desc =
-           make_naive_tensor_descriptor(make_tuple(a_gemmg_gemmm_gemmk_lengths[0],
-                                                   a_gemmg_gemmm_gemmk_lengths[1],
-                                                   a_gemmg_gemmm_gemmk_lengths[2]),
-                                        make_tuple(a_gemmg_gemmm_gemmk_strides[0],
-                                                   a_gemmg_gemmm_gemmk_strides[1],
-                                                   a_gemmg_gemmm_gemmk_strides[2]));
+#if 1 // debug
+       const auto a_dst_desc = ps(make_naive_tensor_descriptor(
+           make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
+           make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1])));
+#else
+       const auto a_dst_desc = make_naive_tensor_descriptor(
+           make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
+           make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1]));
+#endif

-       const auto a_src = tp(make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img));
-       auto a_dst = tp(make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx));
+       const auto a_src = make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img);
+       auto a_dst = make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx);

-       const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
-       const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
-       const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];
+       const auto num_gemmm = a_gemmm_gemmk_lengths[0];
+       const auto num_gemmk = a_gemmm_gemmk_lengths[1];

-       const auto id_block = tp.get_block_1d_id();
+       const auto id_block = ps.get_block_1d_id();

-       const auto num_tile_m = num_gemmm / kMPerTile;
-       const auto num_tile_k = num_gemmk / kKPerTile;
+       const auto num_tile_m = ps.read_first_lane(num_gemmm / kMPerTile);

-       const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));
+#if 1 // debug
+       const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
+#else
+       const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m));
+#endif

        const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));

-       const auto id_tile_m = id_tile[I0];
-       const auto id_tile_k = id_tile[I1];
+       const auto id_tile_m = ps.read_first_lane(id_tile[I0]);

-#if 1
+#if 0
        // data-based syntax: per-data solution strategy
        auto window_a_src = make_window(a_src,
                                        make_tuple(1, MPerTile, KPerTile),
@@ -107,6 +311,25 @@ struct Im2Col
            window_a_dst += make_tuple(1, 0, 0);
        }
#else
        // operator-based syntax
        auto copier = ps.make_copier(a_src,
                                     make_tuple(id_tile_m * kMPerTile, 0),
                                     a_dst,
                                     make_tuple(id_tile_m * kMPerTile, 0),
                                     make_tuple(kMPerTile, kKPerTile),
                                     copier_strategy);

        ck::index_t id_gemmk = 0;

        do
        {
            copier();

            copier.move_src_window(make_tuple(0, kKPerTile));
            copier.move_dst_window(make_tuple(0, kKPerTile));

            id_gemmk += kKPerTile;
        } while(id_gemmk < num_gemmk); // visit every K-tile, including the last
#endif
    }
};
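A quick trip-count check of the sliding-window loop above, with made-up sizes
(num_gemmk is assumed to be a multiple of kKPerTile; there is no tail handling
for a partial K-tile):

    // num_gemmk = 64, kKPerTile = 16 -> K-tiles start at k = 0, 16, 32, 48
    //   iter 1: copy k = 0,  id_gemmk -> 16, 16 < 64 -> continue
    //   iter 2: copy k = 16, id_gemmk -> 32, 32 < 64 -> continue
    //   iter 3: copy k = 32, id_gemmk -> 48, 48 < 64 -> continue
    //   iter 4: copy k = 48, id_gemmk -> 64, 64 < 64 -> stop
    //
    // note: stopping at num_gemmk - kKPerTile instead would exit after three
    // iterations and skip the last K-tile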
@@ -117,7 +340,7 @@ int main()
    constexpr ck::index_t NumDimSpatial = 2;

-   ck::index_t G = 32;
+   ck::index_t G = 1;
    ck::index_t N = 256;
    ck::index_t K = 192;
    ck::index_t C = 192;
@@ -170,8 +393,8 @@ int main()
    std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};

    // matrix
-   std::array<ck::index_t, 3> in_mtx_lengths{G, G * Ho * Wo, C * Y * X};
-   std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};
+   std::array<ck::index_t, 2> in_mtx_lengths{N * Ho * Wo, C * Y * X};
+   std::array<ck::index_t, 2> in_mtx_strides{0, 1};

    std::partial_sum(rbegin(in_mtx_lengths),
                     std::prev(rend(in_mtx_lengths)),
@@ -181,9 +404,10 @@ int main()
    DeviceMem in(sizeof(DataType) * G * N * Hi * Wi * C);
    DeviceMem in_mtx(sizeof(DataType) * G * N * Ho * Wo * C * Y * X);

-   launch(Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 128>{},
-          1,
-          256,
+   launch(MyProgramServer<256>{},
+          Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 16>{},
+          1,
+          1,
           in_lengths,
           in_strides,
           wei_lengths,
@@ -199,7 +423,8 @@ int main()
           in_mtx_strides,
           //
           static_cast<DataType*>(in.GetDeviceBuffer()),
-          static_cast<DataType*>(in_mtx.GetDeviceBuffer()));
+          static_cast<DataType*>(in_mtx.GetDeviceBuffer()),
+          CopierStrategy{});

    return 0;
}
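For reference, the implicit-GEMM shapes this example materializes. The output
spatial sizes follow the standard convolution formula (the commit elides the
lines where Hi/Wi/Ho/Wo/Y/X are set, so the formulas below are the usual
definitions rather than quoted code):

    // Ho = (Hi + pad_top + pad_bottom - dilation_h * (Y - 1) - 1) / stride_h + 1
    // Wo = (Wi + pad_left + pad_right - dilation_w * (X - 1) - 1) / stride_w + 1
    //
    // im2col matrix dimensions (per group; G = 1 here):
    //   GemmM = N * Ho * Wo   // one row per output pixel
    //   GemmK = C * Y * X     // one column per (channel, filter-tap) pair
    //
    // matching in_mtx_lengths{N * Ho * Wo, C * Y * X}; the row-major strides
    // {C * Y * X, 1} are presumably what the std::partial_sum over the
    // reversed lengths produces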
@@ -6,10 +6,13 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"

-// hidden intermediate argument
-struct Arg
+// Meta data for GPU
+// TODO: do we need to take care of data alignment in code, or is it done by the compiler?
+template <ck::index_t kSize>
+struct MetaData
{
-   char data_[128];
+   char p_data_[kSize];

    ck::index_t size_ = 0;
    ck::index_t pos_ = 0;
@@ -21,49 +24,59 @@ struct Arg
    __device__ void reset_pos() { pos_ = 0; }

-   // push arg on host
+   // push meta data on host
+   // TODO: correct forwarding?
    template <typename T>
-   __host__ T push(const T& a)
+   __host__ auto push(T&& a)
    {
-       *reinterpret_cast<T*>(data_ + size_) = a;
-       size_ += sizeof(T);
-       return a;
+       using Type = ck::remove_cvref_t<T>;
+
+       assert(size_ + sizeof(Type) <= kSize);
+
+       *reinterpret_cast<Type*>(p_data_ + size_) = a;
+       size_ += sizeof(Type);
+
+       return ck::forwarder{}(a);
    }

-   // pull arg on device
+   // pull meta data on device
+   // TODO: correct forwarding?
    template <typename T>
-   __device__ T pull()
+   __device__ auto pull()
    {
-       T a = *reinterpret_cast<T*>(data_ + pos_);
-       pos_ += sizeof(T);
+       using Type = ck::remove_cvref_t<T>;
+
+       Type a = *reinterpret_cast<Type*>(p_data_ + pos_);
+       pos_ += sizeof(Type);

        return a;
    }
};
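A minimal round-trip sketch of the byte-offset bookkeeping (hypothetical
values; like the reinterpret_cast store itself, it assumes trivially copyable
payloads and, per the TODO above, ignores alignment):

    // hypothetical host-side usage
    void record_example()
    {
        MetaData<1024> md;
        md.push(ck::index_t{42}); // writes bytes [0, 4), size_ -> 4
        md.push(3.0f);            // writes bytes [4, 8), size_ -> 8
    }

    // device side (pull is __device__, so shown as comments): replay in the
    // same order the host pushed
    //   md.reset_pos();
    //   auto i = md.pull<ck::index_t>(); // reads bytes [0, 4), pos_ -> 4
    //   auto f = md.pull<float>();       // reads bytes [4, 8), pos_ -> 8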
// namespace tp (for tile programming)
-struct TileProgram
+struct ProgramServer
{
-   // arg on device
-   Arg arg_;
+   // meta data on device
+   MetaData<1024> meta_data_;

+   __host__ void cpu_init() { meta_data_.reset(); }

-   __device__ void gpu_init() { arg_.reset_pos(); }
+   __device__ void gpu_init() { meta_data_.reset_pos(); }

-   // push arg on host
+   // push meta data on host
    template <typename T>
-   __host__ T operator()(const T& a)
+   __host__ auto operator()(T&& a)
    {
-       return arg_.push(a);
+       return ck::forwarder{}(meta_data_.push(a));
    }

-   // push arg on host
+   // pull meta data on device
    template <typename T>
-   __device__ T operator()(const T&)
+   __device__ auto operator()(T&&)
    {
-       return arg_.pull<T>();
+       return ck::forwarder{}(meta_data_.pull<T>());
    }

    __host__ static ck::index_t get_block_1d_id() { return -1; }
@@ -73,23 +86,36 @@ struct TileProgram
    __device__ static ck::index_t get_block_1d_id() { return ck::get_block_1d_id(); }

    __device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }

    // TODO: correct forwarding?
    template <typename T>
    __host__ static constexpr auto read_first_lane(T&& a)
    {
        return ck::forwarder{}(a);
    }

    template <typename T>
    __device__ static constexpr auto read_first_lane(T&& a)
    {
        return __builtin_amdgcn_readfirstlane(a);
    }
};
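A note on read_first_lane: __builtin_amdgcn_readfirstlane is the AMD GCN
intrinsic that broadcasts the value held by the first active lane of a
wavefront to all lanes, which lets the compiler keep the result in scalar
registers. The intrinsic operates on 32-bit integers, so forwarding an
arbitrary T through it (as above) assumes the argument fits in one. A
device-side sketch:

    // hypothetical: mark a block-uniform quantity as wavefront-uniform so
    // downstream tile-index math can stay in scalar registers
    __device__ void example(ProgramServer& ps, ck::index_t num_gemmm)
    {
        // 128 is a made-up tile size; every lane computed the same quotient,
        // readfirstlane just tells the compiler so
        const auto num_tile_m = ps.read_first_lane(num_gemmm / 128);
        (void)num_tile_m;
    }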
-template <typename Program, typename... Xs>
-__global__ void gpu_program_wrapper(Program f, TileProgram tp, Xs... xs)
+template <typename Server, typename Program, typename... Xs>
+__global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
{
-   tp.gpu_init();
-   f(tp, xs...);
+   server.gpu_init();
+   f(server, xs...);
}

-template <typename Program, typename... Xs>
-void launch(Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
+template <typename Server, typename Program, typename... Xs>
+void launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
{
-   TileProgram tp;
-   f(tp, xs...);
-   printf("cpu arg size %d\n", tp.arg_.size_);
-   gpu_program_wrapper<Program><<<grid_dim, block_dim, 0, nullptr>>>(f, tp, xs...);
+   server.cpu_init();
+   f(server, xs...);
+   printf("meta data size %d\n", server.meta_data_.size_);
+   gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
}
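Note that launch() passes the server by value, so the whole MetaData buffer
travels to the GPU as part of the kernel arguments. Kernel-argument space is
limited (4 KB is a common ceiling, but the exact number is an assumption to
check against the target runtime), which bounds how large kSize can grow. A
hypothetical guard:

    // hypothetical compile-time check next to the launch wrapper; the
    // 4096-byte limit is an assumption, not a documented constant
    static_assert(sizeof(ProgramServer) <= 4096,
                  "server meta data may not fit in kernel arguments");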
@@ -78,7 +78,9 @@ struct StaticTensor
    StaticBuffer<AddressSpace, T, element_space_size_, true> data_;

    static constexpr T zero_scalar_value_ = T{0};

    // for read access of invalid element
    const T invalid_element_scalar_value_;

    // for write access of invalid element
    T ignored_element_scalar_;
};
@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
    using V = vector_type<S, ScalarPerVector>;

    __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer()
-       : invalid_element_scalar_value_{0}
+       : invalid_element_scalar_value_{0}, ignored_element_scalar_{0}
    {
    }

    __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
-       : invalid_element_scalar_value_{invalid_element_value}
+       : invalid_element_scalar_value_{invalid_element_value}, ignored_element_scalar_{0}
    {
    }
@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
    StaticBufferTupleOfVector<AddressSpace, S, num_of_vector_, ScalarPerVector, true> data_;

    static constexpr S zero_scalar_value_ = S{0};

    // for read access of invalid element
    const S invalid_element_scalar_value_ = S{0};

    // for write access of invalid element
    S ignored_element_scalar_;
};
...
@@ -8,32 +8,38 @@ namespace ck {
template <AddressSpaceEnum AddressSpace,
          bool InvalidElementUseNumericalZeroValue,
          typename T,
-         typename TensorDesc>
+         typename TensorDescTmp>
struct Tensor
{
    using TensorDescriptor = remove_cvref_t<TensorDescTmp>;
    using DataType = remove_reference_t<T>;

    static constexpr AddressSpaceEnum kAdressSpace_ = AddressSpace;
    static constexpr bool kInvalidElementUseNumericalZeroValue_ =
        InvalidElementUseNumericalZeroValue;

    __host__ __device__ constexpr Tensor() : buf_{nullptr, 0}, desc_{} {}

-   __host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc)
+   __host__ __device__ constexpr Tensor(DataType* p_data, TensorDescriptor desc)
        : buf_{p_data, desc.GetElementSpaceSize()}, desc_{desc}
    {
    }

-   __host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc, T invalid_element_value)
+   __host__ __device__ constexpr Tensor(DataType* p_data,
+                                        TensorDescriptor desc,
+                                        DataType invalid_element_value)
        : buf_{p_data, desc.GetElementSpaceSize(), invalid_element_value}, desc_{desc}
    {
    }

    // member
    DynamicBuffer<AddressSpace,
-                 T,
-                 typename TensorDesc::ElementSpaceSizeType,
+                 DataType,
+                 typename TensorDescriptor::ElementSpaceSizeType,
                  InvalidElementUseNumericalZeroValue>
        buf_;

-   TensorDesc desc_;
+   TensorDescriptor desc_;
};
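A small construction sketch showing what the remove_cvref_t /
remove_reference_t normalization buys: the descriptor type deduced at a call
site may be a reference, but the stored desc_ member is always a value (the
pointer and sizes below are invented for illustration):

    // hypothetical: build a 2D global-memory tensor over a packed descriptor
    void tensor_example(float* p_data) // p_data: assumed 128 * 16 floats on the GPU
    {
        using namespace ck;

        const auto desc = make_naive_tensor_descriptor_packed(make_tuple(128, 16));
        auto tensor     = make_tensor<AddressSpaceEnum::Global, true>(desc, p_data);

        // tensor.desc_ holds a copy of desc: a deduced reference type is
        // normalized away by remove_cvref_t, so no dangling reference is stored
        (void)tensor;
    }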
template <AddressSpaceEnum AddressSpace,
...
@@ -45,6 +45,10 @@ struct ThreadGroupTensorSliceTransfer_v4r1
    using Index = MultiIndex<nDim>;

#if 1 // debug
    __host__ __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1() : threadwise_transfer_{} {}
#endif

    __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
        const SrcDesc& src_desc,
        const Index& src_block_slice_origin,
...
@@ -52,8 +52,8 @@ template <typename SliceLengths,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
          InMemoryDataOperationEnum DstInMemOp,
-         typename SrcData,
-         typename DstData,
+         typename SrcDataTmp,
+         typename DstDataTmp,
          typename SrcDesc,
          typename DstDesc,
          typename SrcDimAccessOrder,
@@ -76,6 +76,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr index_t nDim = SliceLengths::Size();
    using Index = MultiIndex<nDim>;

    using SrcData = remove_cvref_t<SrcDataTmp>;
    using DstData = remove_cvref_t<DstDataTmp>;

    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
@@ -84,6 +87,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
    static constexpr auto I0 = Number<0>{};

    __host__ __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1()
        : src_coord_{}, dst_coord_{}, src_element_op_{}, dst_element_op_{}
    {
    }

    __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(
        const SrcDesc& src_desc,
        const Index& src_slice_origin,
...
@@ -24,12 +24,17 @@ struct DynamicBuffer
{
    using type = T;

-   T* p_data_;
+   T* p_data_ = nullptr;
    ElementSpaceSize element_space_size_;
    remove_cvref_t<T> invalid_element_value_ = T{0};

    __host__ __device__ constexpr DynamicBuffer()
        : p_data_{}, element_space_size_{}, invalid_element_value_{}
    {
    }

    __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
-       : p_data_{p_data}, element_space_size_{element_space_size}
+       : p_data_{p_data}, element_space_size_{element_space_size}, invalid_element_value_{0}
    {
    }
@@ -42,20 +47,17 @@ struct DynamicBuffer
    {
    }

-   __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
-   {
-       return BufferAddressSpace;
-   }
+   __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return BufferAddressSpace; }

-   __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
+   __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }

-   __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
+   __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }

    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-   __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
+   __device__ constexpr auto Get(index_t i, bool is_valid_element) const
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -120,7 +122,7 @@ struct DynamicBuffer
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-   __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
+   __device__ void Update(index_t i, bool is_valid_element, const X& x)
    {
        if constexpr(Op == InMemoryDataOperationEnum::Set)
        {
@@ -147,7 +149,7 @@ struct DynamicBuffer
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-   __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
+   __device__ void Set(index_t i, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -290,7 +292,7 @@ struct DynamicBuffer
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-   __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
+   __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
    {
        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
@@ -339,7 +341,7 @@ struct DynamicBuffer
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
                                 bool>::type = false>
-   __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
+   __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
    {
        // X contains multiple T
        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
@@ -371,9 +373,9 @@ struct DynamicBuffer
        }
    }

-   __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
+   __device__ static constexpr bool IsStaticBuffer() { return false; }

-   __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
+   __device__ static constexpr bool IsDynamicBuffer() { return true; }
};

template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
...