Commit dfe6dedb authored by Chao Liu

adding im2col

parent fe2ea9b6
......@@ -14,10 +14,10 @@
// program
struct HelloWorld
{
__host__ __device__ void operator()(TileProgram& tp, int x, int y, int* res)
__host__ __device__ void operator()(ProgramServer& ps, int x, int y, int* res)
{
auto desc0 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
auto desc1 = tp(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
auto desc0 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(x)));
auto desc1 = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(y)));
        // for testing purposes only
        // the cpu should not do real work here
......@@ -33,7 +33,13 @@ int main()
DeviceMem res_dev_buf(2 * sizeof(int));
launch(HelloWorld{}, 1, 1, x, y, static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
launch(ProgramServer{},
HelloWorld{},
1,
1,
x,
y,
static_cast<int*>(res_dev_buf.GetDeviceBuffer()));
int res_host[2];
......
#include "tile_program.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/thread_group.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -12,6 +18,182 @@
#include "ck/library/utility/device_memory.hpp"
namespace ck {
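// Copier: a thin wrapper around ThreadGroupTensorSliceTransfer_v4r1 that owns the
// source/destination Tensor views and exposes a simple copy / move-window interface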
template <typename ThreadGroup,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum DstInMemOp,
typename BlockSliceLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcTensor,
typename DstTensor,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorDim,
index_t DstVectorDim,
index_t SrcScalarPerVector,
index_t DstScalarPerVector,
index_t SrcScalarStrideInVector,
index_t DstScalarStrideInVector,
bool ThreadTransferSrcResetCoordinateAfterRun,
bool ThreadTransferDstResetCoordinateAfterRun>
struct Copier
{
using SrcDesc = typename SrcTensor::TensorDescriptor;
using DstDesc = typename DstTensor::TensorDescriptor;
static constexpr ck::index_t nDim = remove_reference_t<SrcDesc>::GetNumOfDimension();
using Index = MultiIndex<nDim>;
__host__ __device__ constexpr Copier() : block_copy_{}, src_tensor_{}, dst_tensor_{} {}
__device__ constexpr Copier(const SrcTensor& src_tensor,
const Index& src_block_slice_origin,
const SrcElementwiseOperation& src_element_op,
DstTensor& dst_tensor,
const Index& dst_block_slice_origin,
const DstElementwiseOperation& dst_element_op)
: block_copy_{src_tensor.desc_,
src_block_slice_origin,
src_element_op,
dst_tensor.desc_,
dst_block_slice_origin,
dst_element_op},
src_tensor_{src_tensor.buf_.p_data_, src_tensor.desc_},
dst_tensor_{dst_tensor.buf_.p_data_, dst_tensor.desc_}
{
}
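    // the host overload is a no-op; the device overload performs the block-wide copy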
__host__ void operator()() {}
__device__ void operator()()
{
block_copy_.Run(
src_tensor_.desc_, src_tensor_.buf_, dst_tensor_.desc_, dst_tensor_.buf_, Number<0>{});
}
__host__ void move_src_window(const Index&) {}
__device__ void move_src_window(const Index& step)
{
block_copy_.MoveSrcSliceWindow(src_tensor_.desc_, step);
}
__host__ void move_dst_window(const Index&) {}
__device__ void move_dst_window(const Index& step)
{
block_copy_.MoveDstSliceWindow(dst_tensor_.desc_, step);
}
// member
ThreadGroupTensorSliceTransfer_v4r1<ThreadGroup,
SrcElementwiseOperation,
DstElementwiseOperation,
DstInMemOp,
BlockSliceLengths,
ThreadClusterLengths,
ThreadClusterArrangeOrder,
typename SrcTensor::DataType,
typename SrcTensor::DataType,
SrcDesc,
DstDesc,
SrcDimAccessOrder,
DstDimAccessOrder,
SrcVectorDim,
DstVectorDim,
SrcScalarPerVector,
DstScalarPerVector,
SrcScalarStrideInVector,
DstScalarStrideInVector,
ThreadTransferSrcResetCoordinateAfterRun,
ThreadTransferDstResetCoordinateAfterRun>
block_copy_;
SrcTensor src_tensor_;
DstTensor dst_tensor_;
};
} // namespace ck
struct CopierStrategy
{
};
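// ProgramServer specialization that knows how to build a Copier for a block of
// BlockSize threads; the slice/cluster sizes below are hard-coded for this example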
template <ck::index_t BlockSize>
struct MyProgramServer : public ProgramServer
{
template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
__host__ auto make_copier(const SrcTensor& src_tensor,
const Index& src_window_origin,
DstTensor& dst_tensor,
const Index& dst_window_origin,
const Index& window_lengths,
const Strategy& strategy)
{
using namespace ck;
return Copier<ThisThreadBlock<BlockSize>,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<128, 16>, // BlockSliceLengths,
Sequence<16, 16>,
Sequence<0, 1>,
SrcTensor,
DstTensor,
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
true,
true>{};
}
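    // the host overload above only needs to produce the Copier type (it returns a
    // default-constructed object); the device overload below wires up the real transfer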
template <typename SrcTensor, typename DstTensor, typename Index, typename Strategy>
__device__ auto make_copier(const SrcTensor& src_tensor,
const Index& src_window_origin,
DstTensor& dst_tensor,
const Index& dst_window_origin,
const Index& window_lengths,
const Strategy& strategy)
{
using namespace ck;
return Copier<ThisThreadBlock<BlockSize>,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<128, 16>, // BlockSliceLengths,
Sequence<16, 16>,
Sequence<0, 1>,
SrcTensor,
DstTensor,
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
true,
true>{src_tensor,
src_window_origin,
tensor_operation::element_wise::PassThrough{},
dst_tensor,
dst_window_origin,
tensor_operation::element_wise::PassThrough{}};
}
};
// program
template <ck::index_t NDimSpatial,
typename ALayout,
......@@ -21,8 +203,9 @@ template <ck::index_t NDimSpatial,
ck::index_t kKPerTile>
struct Im2Col
{
template <typename Server, typename CopierStrategy>
__host__ __device__ void
operator()(TileProgram& tp,
operator()(Server& ps,
const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_lengths,
const std::array<ck::index_t, NDimSpatial + 3>& a_g_n_c_wis_strides,
const std::array<ck::index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
......@@ -34,17 +217,19 @@ struct Im2Col
const std::array<ck::index_t, NDimSpatial>& input_left_pads,
const std::array<ck::index_t, NDimSpatial>& input_right_pads,
//
const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_lengths,
const std::array<ck::index_t, 3> a_gemmg_gemmm_gemmk_strides,
const std::array<ck::index_t, 2> a_gemmm_gemmk_lengths,
const std::array<ck::index_t, 2> a_gemmm_gemmk_strides,
//
const T* p_a_img,
T* p_a_mtx)
T* p_a_mtx,
// strategy
const CopierStrategy& copier_strategy)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
#if 0 // debug
const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
NDimSpatial,
tensor_operation::device::ConvolutionForwardSpecialization::Default>::
......@@ -58,36 +243,55 @@ struct Im2Col
conv_filter_dilations,
input_left_pads,
input_right_pads);
#else
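        // wrapping the conv->gemm transform in ps(...) lets the host pass compute and
        // record the descriptor, so the device pass only reads it back from meta data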
const auto a_src_desc =
ps(tensor_operation::TransformConvFwdToGemm<
NDimSpatial,
tensor_operation::device::ConvolutionForwardSpecialization::Default>::
template MakeADescriptor_M_K<ALayout>(a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
c_g_n_k_wos_lengths,
c_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads));
#endif
const auto a_dst_desc =
make_naive_tensor_descriptor(make_tuple(a_gemmg_gemmm_gemmk_lengths[0],
a_gemmg_gemmm_gemmk_lengths[1],
a_gemmg_gemmm_gemmk_lengths[2]),
make_tuple(a_gemmg_gemmm_gemmk_strides[0],
a_gemmg_gemmm_gemmk_strides[1],
a_gemmg_gemmm_gemmk_strides[2]));
#if 1 // debug
const auto a_dst_desc = ps(make_naive_tensor_descriptor(
make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1])));
#else
const auto a_dst_desc = make_naive_tensor_descriptor(
make_tuple(a_gemmm_gemmk_lengths[0], a_gemmm_gemmk_lengths[1]),
make_tuple(a_gemmm_gemmk_strides[0], a_gemmm_gemmk_strides[1]));
#endif
const auto a_src = tp(make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img));
const auto a_src = make_tensor<AddressSpaceEnum::Global, true>(a_src_desc, p_a_img);
auto a_dst = tp(make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx));
auto a_dst = make_tensor<AddressSpaceEnum::Global, true>(a_dst_desc, p_a_mtx);
const auto num_gemmg = a_gemmg_gemmm_gemmk_lengths[0];
const auto num_gemmm = a_gemmg_gemmm_gemmk_lengths[1];
const auto num_gemmk = a_gemmg_gemmm_gemmk_lengths[2];
const auto num_gemmm = a_gemmm_gemmk_lengths[0];
const auto num_gemmk = a_gemmm_gemmk_lengths[1];
const auto id_block = tp.get_block_1d_id();
const auto id_block = ps.get_block_1d_id();
const auto num_tile_m = num_gemmm / kMPerTile;
const auto num_tile_k = num_gemmk / kKPerTile;
const auto num_tile_m = ps.read_first_lane(num_gemmm / kMPerTile);
const auto block2tile = tp(make_cluster_descriptor(make_tuple(num_tile_m, num_tile_k)));
#if 1 // debug
const auto block2tile = ps(make_cluster_descriptor(make_tuple(num_tile_m)));
#else
const auto block2tile = make_cluster_descriptor(make_tuple(num_tile_m));
#endif
const auto id_tile = block2tile.CalculateBottomIndex(make_tuple(id_block));
const auto id_tile_m = id_tile[I0];
const auto id_tile_k = id_tile[I1];
const auto id_tile_m = ps.read_first_lane(id_tile[I0]);
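        // the block-to-tile map is 1-D over GemmM tiles; each block owns one row of
        // tiles and walks along GemmK inside the copy loop below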
#if 1
#if 0
// data-based syntax: per-data solution strategy
auto window_a_src = make_window(a_src,
make_tuple(1, MPerTile, KPerTile),
......@@ -107,6 +311,25 @@ struct Im2Col
window_a_dst += make_tuple(1, 0, 0);
}
#else
// operator-based syntax
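        // each iteration copies one kMPerTile x kKPerTile window, then slides both
        // windows by kKPerTile along the GemmK dimension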
auto copier = ps.make_copier(a_src,
make_tuple(id_tile_m * kMPerTile, 0),
a_dst,
make_tuple(id_tile_m * kMPerTile, 0),
make_tuple(kMPerTile, kKPerTile),
copier_strategy);
ck::index_t id_gemmk = 0;
do
{
copier();
copier.move_src_window(make_tuple(0, kKPerTile));
copier.move_dst_window(make_tuple(0, kKPerTile));
id_gemmk += kKPerTile;
        } while(id_gemmk < num_gemmk); // cover every kKPerTile-wide window along GemmK
#endif
}
};
......@@ -117,7 +340,7 @@ int main()
constexpr ck::index_t NumDimSpatial = 2;
ck::index_t G = 32;
ck::index_t G = 1;
ck::index_t N = 256;
ck::index_t K = 192;
ck::index_t C = 192;
......@@ -170,8 +393,8 @@ int main()
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
// matrix
std::array<ck::index_t, 3> in_mtx_lengths{G, G * Ho * Wo, C * Y * X};
std::array<ck::index_t, 3> in_mtx_strides{0, 0, 1};
std::array<ck::index_t, 2> in_mtx_lengths{N * Ho * Wo, C * Y * X};
std::array<ck::index_t, 2> in_mtx_strides{0, 1};
std::partial_sum(rbegin(in_mtx_lengths),
std::prev(rend(in_mtx_lengths)),
......@@ -181,9 +404,10 @@ int main()
DeviceMem in(sizeof(DataType) * G * N * Hi * Wi * C);
DeviceMem in_mtx(sizeof(DataType) * G * N * Ho * Wo * C * Y * X);
launch(Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 128>{},
launch(MyProgramServer<256>{},
Im2Col<2, ck::tensor_layout::convolution::GNHWC, float, 128, 16>{},
1,
1,
256,
in_lengths,
in_strides,
wei_lengths,
......@@ -199,7 +423,8 @@ int main()
in_mtx_strides,
//
static_cast<DataType*>(in.GetDeviceBuffer()),
static_cast<DataType*>(in_mtx.GetDeviceBuffer()));
static_cast<DataType*>(in_mtx.GetDeviceBuffer()),
CopierStrategy{});
return 0;
}
......@@ -6,10 +6,13 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
// hidden intermediate argument
struct Arg
// Meta data for GPU
// TODO: do we need to take care of data alignment in code, or is it handled by the compiler?
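// plain byte buffer: the host writes objects sequentially with push(), and the
// device reads them back in the same order with pull()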
template <ck::index_t kSize>
struct MetaData
{
char data_[128];
char p_data_[kSize];
ck::index_t size_ = 0;
ck::index_t pos_ = 0;
......@@ -21,49 +24,59 @@ struct Arg
__device__ void reset_pos() { pos_ = 0; }
// push arg on host
// push meta data on host
// TODO: correct forwarding?
template <typename T>
__host__ T push(const T& a)
__host__ auto push(T&& a)
{
*reinterpret_cast<T*>(data_ + size_) = a;
        using Type = ck::remove_cvref_t<T>;
        size_ += sizeof(T);
        assert(size_ + sizeof(Type) <= kSize);
return a;
*reinterpret_cast<Type*>(p_data_ + size_) = a;
size_ += sizeof(Type);
return ck::forwarder{}(a);
}
// pull arg on device
// pull meta data on device
// TODO: correct forwarding?
template <typename T>
__device__ T pull()
__device__ auto pull()
{
T a = *reinterpret_cast<T*>(data_ + pos_);
using Type = ck::remove_cvref_t<T>;
pos_ += sizeof(T);
Type a = *reinterpret_cast<Type*>(p_data_ + pos_);
pos_ += sizeof(Type);
return a;
}
};
// namespace tp (for tile programming)
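// ProgramServer: the same program functor runs once on the host (recording every
// ps(...)-wrapped object into meta data) and again on the device (replaying it),
// so descriptor construction happens on the CPU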
struct TileProgram
struct ProgramServer
{
// arg on device
Arg arg_;
// meta data on device
MetaData<1024> meta_data_;
__host__ void cpu_init() { meta_data_.reset(); }
__device__ void gpu_init() { arg_.reset_pos(); }
__device__ void gpu_init() { meta_data_.reset_pos(); }
// push arg on host
// push meta data on host
template <typename T>
__host__ T operator()(const T& a)
__host__ auto operator()(T&& a)
{
return arg_.push(a);
return ck::forwarder{}(meta_data_.push(a));
}
    // pull arg on device
    // pull meta data on device
template <typename T>
__device__ T operator()(const T&)
__device__ auto operator()(T&&)
{
return arg_.pull<T>();
return ck::forwarder{}(meta_data_.pull<T>());
}
__host__ static ck::index_t get_block_1d_id() { return -1; }
......@@ -73,23 +86,36 @@ struct TileProgram
__device__ static ck::index_t get_block_1d_id() { return ck::get_block_1d_id(); }
__device__ static ck::index_t get_grid_size() { return ck::get_grid_size(); }
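    // read_first_lane broadcasts a value from lane 0 of the wavefront so it can live
    // in a scalar register on the GPU; the host version is a pass-through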
// TODO: correct forwarding?
template <typename T>
__host__ static constexpr auto read_first_lane(T&& a)
{
return ck::forwarder{}(a);
}
template <typename T>
__device__ static constexpr auto read_first_lane(T&& a)
{
return __builtin_amdgcn_readfirstlane(a);
}
};
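// Illustrative usage sketch (hypothetical MyProgram, modeled on the HelloWorld
// example at the top of this commit; not new API):
//
//   struct MyProgram
//   {
//       template <typename Server>
//       __host__ __device__ void operator()(Server& ps, int n, int* p_res)
//       {
//           // host pass: computes and records the descriptor; device pass: reads it back
//           auto desc = ps(make_naive_tensor_descriptor_packed(ck::make_tuple(n)));
//           ...
//       }
//   };
//
//   launch(ProgramServer{}, MyProgram{}, grid_dim, block_dim, n, p_res);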
template <typename Program, typename... Xs>
__global__ void gpu_program_wrapper(Program f, TileProgram tp, Xs... xs)
template <typename Server, typename Program, typename... Xs>
__global__ void gpu_program_wrapper(Server server, Program f, Xs... xs)
{
tp.gpu_init();
f(tp, xs...);
server.gpu_init();
f(server, xs...);
}
template <typename Program, typename... Xs>
void launch(Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
template <typename Server, typename Program, typename... Xs>
void launch(Server server, Program f, dim3 grid_dim, dim3 block_dim, Xs... xs)
{
TileProgram tp;
server.cpu_init();
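    // host pass: run the program once on the CPU so every ps(...)-wrapped object is
    // pushed into server.meta_data_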
f(tp, xs...);
f(server, xs...);
printf("cpu arg size %d\n", tp.arg_.size_);
printf("meta data size %d\n", server.meta_data_.size_);
gpu_program_wrapper<Program><<<grid_dim, block_dim, 0, nullptr>>>(f, tp, xs...);
gpu_program_wrapper<Server, Program><<<grid_dim, block_dim, 0, nullptr>>>(server, f, xs...);
}
......@@ -78,7 +78,9 @@ struct StaticTensor
StaticBuffer<AddressSpace, T, element_space_size_, true> data_;
static constexpr T zero_scalar_value_ = T{0};
// for read access of invalid element
const T invalid_element_scalar_value_;
// for write access of invalid element
T ignored_element_scalar_;
};
......@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
using V = vector_type<S, ScalarPerVector>;
__host__ __device__ constexpr StaticTensorTupleOfVectorBuffer()
: invalid_element_scalar_value_{0}
: invalid_element_scalar_value_{0}, ignored_element_scalar_{0}
{
}
__host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value)
: invalid_element_scalar_value_{invalid_element_value}
: invalid_element_scalar_value_{invalid_element_value}, ignored_element_scalar_{0}
{
}
......@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
StaticBufferTupleOfVector<AddressSpace, S, num_of_vector_, ScalarPerVector, true> data_;
static constexpr S zero_scalar_value_ = S{0};
// for read access of invalid element
const S invalid_element_scalar_value_ = S{0};
// for write access of invalid element
S ignored_element_scalar_;
};
......
......@@ -8,32 +8,38 @@ namespace ck {
template <AddressSpaceEnum AddressSpace,
bool InvalidElementUseNumericalZeroValue,
typename T,
typename TensorDesc>
typename TensorDescTmp>
struct Tensor
{
using TensorDescriptor = remove_cvref_t<TensorDescTmp>;
using DataType = remove_reference_t<T>;
static constexpr AddressSpaceEnum kAdressSpace_ = AddressSpace;
static constexpr bool kInvalidElementUseNumericalZeroValue_ =
InvalidElementUseNumericalZeroValue;
__host__ __device__ constexpr Tensor() : buf_{nullptr, 0}, desc_{} {}
__host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc)
__host__ __device__ constexpr Tensor(DataType* p_data, TensorDescriptor desc)
: buf_{p_data, desc.GetElementSpaceSize()}, desc_{desc}
{
}
__host__ __device__ constexpr Tensor(T* p_data, TensorDesc desc, T invalid_element_value)
__host__ __device__ constexpr Tensor(DataType* p_data,
TensorDescriptor desc,
DataType invalid_element_value)
: buf_{p_data, desc.GetElementSpaceSize(), invalid_element_value}, desc_{desc}
{
}
// member
DynamicBuffer<AddressSpace,
T,
typename TensorDesc::ElementSpaceSizeType,
DataType,
typename TensorDescriptor::ElementSpaceSizeType,
InvalidElementUseNumericalZeroValue>
buf_;
TensorDesc desc_;
TensorDescriptor desc_;
};
template <AddressSpaceEnum AddressSpace,
......
......@@ -45,6 +45,10 @@ struct ThreadGroupTensorSliceTransfer_v4r1
using Index = MultiIndex<nDim>;
#if 1 // debug
__host__ __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1() : threadwise_transfer_{} {}
#endif
__device__ constexpr ThreadGroupTensorSliceTransfer_v4r1(
const SrcDesc& src_desc,
const Index& src_block_slice_origin,
......
......@@ -52,8 +52,8 @@ template <typename SliceLengths,
typename SrcElementwiseOperation,
typename DstElementwiseOperation,
InMemoryDataOperationEnum DstInMemOp,
typename SrcData,
typename DstData,
typename SrcDataTmp,
typename DstDataTmp,
typename SrcDesc,
typename DstDesc,
typename SrcDimAccessOrder,
......@@ -76,6 +76,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcData = remove_cvref_t<SrcDataTmp>;
using DstData = remove_cvref_t<DstDataTmp>;
using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
......@@ -84,6 +87,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static constexpr auto I0 = Number<0>{};
__host__ __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1()
: src_coord_{}, dst_coord_{}, src_element_op_{}, dst_element_op_{}
{
}
__device__ constexpr ThreadwiseTensorSliceTransfer_v3r1(
const SrcDesc& src_desc,
const Index& src_slice_origin,
......
......@@ -24,12 +24,17 @@ struct DynamicBuffer
{
using type = T;
T* p_data_;
T* p_data_ = nullptr;
ElementSpaceSize element_space_size_;
remove_cvref_t<T> invalid_element_value_ = T{0};
__host__ __device__ constexpr DynamicBuffer()
: p_data_{}, element_space_size_{}, invalid_element_value_{}
{
}
__host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size)
: p_data_{p_data}, element_space_size_{element_space_size}
: p_data_{p_data}, element_space_size_{element_space_size}, invalid_element_value_{0}
{
}
......@@ -42,20 +47,17 @@ struct DynamicBuffer
{
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return BufferAddressSpace;
}
__device__ static constexpr AddressSpaceEnum GetAddressSpace() { return BufferAddressSpace; }
__host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
__device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; }
__host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
__device__ constexpr T& operator()(index_t i) { return p_data_[i]; }
template <typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const
__device__ constexpr auto Get(index_t i, bool is_valid_element) const
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
......@@ -120,7 +122,7 @@ struct DynamicBuffer
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
__device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
......@@ -147,7 +149,7 @@ struct DynamicBuffer
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Set(index_t i, bool is_valid_element, const X& x)
__device__ void Set(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
......@@ -290,7 +292,7 @@ struct DynamicBuffer
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
__device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
{
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
......@@ -339,7 +341,7 @@ struct DynamicBuffer
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
__device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
{
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
......@@ -371,9 +373,9 @@ struct DynamicBuffer
}
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
__device__ static constexpr bool IsStaticBuffer() { return false; }
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
__device__ static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
......