Commit 7e44fd84 authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed

parent 15965dfc
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t; using F16 = ck::half_t;
using F32 = float;
using ADataType = F16; using ADataType = F16;
using BDataType = F16; using BDataType = F16;
...@@ -21,13 +20,12 @@ using DeviceElementwisePermuteInstance = ...@@ -21,13 +20,12 @@ using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwise<ck::Tuple<ADataType>,
ck::Tuple<BDataType>, ck::Tuple<BDataType>,
PassThrough, PassThrough,
4, 3, // NumDim_M
3, 1, // NumDim_N
1,
8, 8,
8, 8,
ck::Sequence<1>, ck::Sequence<8>,
ck::Sequence<1>>; ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc, void host_elementwise4D(HostTensorB& B_nhwc,
...@@ -48,10 +46,16 @@ void host_elementwise4D(HostTensorB& B_nhwc, ...@@ -48,10 +46,16 @@ void host_elementwise4D(HostTensorB& B_nhwc,
int main() int main()
{ {
bool do_verification = true; bool do_verification = true;
bool time_kernel = true; bool time_kernel = false;
std::vector<std::size_t> nchw = {4, 8, 4, 8}; const int N = 16;
std::vector<std::size_t> nhwc = {4, 4, 8, 8}; const int H = 32;
const int W = 64;
const int C = 128;
std::vector<std::size_t> nchw = {N, C, H, W};
std::vector<std::size_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw); Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc); Tensor<BDataType> b(nhwc);
...@@ -62,28 +66,16 @@ int main() ...@@ -62,28 +66,16 @@ int main()
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data()); a_device_buf.ToDevice(a.mData.data());
LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()}; std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()}; std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
std::array<ck::index_t, 4> ab_lengths; std::array<ck::index_t, 4> ab_lengths{N, H, W, C};
std::array<ck::index_t, 4> a_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]), // std::copy(nhwc.begin(), nhwc.end(), ab_lengths.begin());
static_cast<int>(nhwc[2]),
1,
static_cast<int>(nhwc[1] * nhwc[2])};
std::array<ck::index_t, 4> b_strides = {static_cast<int>(nhwc[1] * nhwc[2] * nhwc[3]),
static_cast<int>(nhwc[2]*nhwc[3]),
static_cast<int>(nhwc[3]),
1};
// std::cout << "Length: " << ab_lengths << std::endl;
// std::cout << "A stride: " << a_strides << std::endl;
// std::cout << "B stride: " << b_strides << std::endl;
std::copy(nhwc.begin(), nhwc.end(), ab_lengths.begin()); std::array<ck::index_t, 4> a_strides = {C * H * W, W, 1, H * W};
// std::copy(a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end(), a_strides.begin()); std::array<ck::index_t, 4> b_strides = {H * W * C, W * C, C, 1};
// std::copy(b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end(), b_strides.begin());
auto broadcastPermute = DeviceElementwisePermuteInstance{}; auto broadcastPermute = DeviceElementwisePermuteInstance{};
auto argument = broadcastPermute.MakeArgumentPointer( auto argument = broadcastPermute.MakeArgumentPointer(
...@@ -94,6 +86,7 @@ int main() ...@@ -94,6 +86,7 @@ int main()
throw std::runtime_error( throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!"); "The runtime parameters seems not supported by the device instance, exiting!");
}; };
auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
float ave_time = float ave_time =
broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
...@@ -105,12 +98,13 @@ int main() ...@@ -105,12 +98,13 @@ int main()
if(do_verification) if(do_verification)
{ {
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nhwc); Tensor<BDataType> host_b(nhwc);
host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>( host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
host_b, a, nchw, PassThrough{}); host_b, a, nchw, PassThrough{});
LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
} }
......
...@@ -20,20 +20,24 @@ namespace device { ...@@ -20,20 +20,24 @@ namespace device {
template <typename InDataTypeTuple, template <typename InDataTypeTuple,
typename OutDataTypeTuple, typename OutDataTypeTuple,
typename ElementwiseOperation, typename ElementwiseOperation,
index_t NumDim,
index_t NumDim_m, index_t NumDim_m,
index_t NumDim_n, index_t NumDim_n,
index_t MPerThread, index_t MPerThread,
index_t NPerThread, index_t NPerThread,
typename InScalarPerVectorSeq, typename InScalarPerVectorSeq,
typename OutScalarPerVectorSeq> typename OutScalarPerVectorSeq>
struct DeviceElementwise struct DeviceElementwise : public DeviceElementwiseBase<InDataTypeTuple,
: public DeviceElementwiseBase<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim> OutDataTypeTuple,
ElementwiseOperation,
NumDim_m + NumDim_n>
{ {
static constexpr index_t NumDim = NumDim_m + NumDim_n;
static constexpr int NumInput = InDataTypeTuple::Size(); static constexpr int NumInput = InDataTypeTuple::Size();
static constexpr int NumOutput = OutDataTypeTuple::Size(); static constexpr int NumOutput = OutDataTypeTuple::Size();
// const index_t NumDim = NumDim_m + NumDim_n; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static_assert(NumInput == InScalarPerVectorSeq::Size() && static_assert(NumInput == InScalarPerVectorSeq::Size() &&
NumOutput == OutScalarPerVectorSeq::Size(), NumOutput == OutScalarPerVectorSeq::Size(),
...@@ -67,16 +71,17 @@ struct DeviceElementwise ...@@ -67,16 +71,17 @@ struct DeviceElementwise
template <typename Desc_MN> template <typename Desc_MN>
static auto PadDescriptor_MN_2d(Desc_MN desc_mn, index_t gridSize, index_t blockSize) static auto PadDescriptor_MN_2d(Desc_MN desc_mn, index_t gridSize, index_t blockSize)
{ {
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
const auto m = desc_mn.GetLength(I0); const auto m = desc_mn.GetLength(I0);
const auto n = desc_mn.GetLength(I1); const auto n = desc_mn.GetLength(I1);
const index_t loop_step_m = gridSize * blockSize * MPerThread; const index_t loop_step_m = MPerThread;
const index_t loop_step_n = gridSize * blockSize * NPerThread; const index_t loop_step_n = gridSize * blockSize * NPerThread;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
const auto desc_mn_pad = transform_tensor_descriptor( std::cout << NumDim_m << " m: " << m << " loop_step_m: " << loop_step_m
<< " pad_m: " << pad_m << std::endl;
std::cout << NumDim_n << " n: " << n << " loop_step_n: " << loop_step_n
<< " pad_n: " << pad_n << std::endl;
const auto desc_mn_pad = transform_tensor_descriptor(
desc_mn, desc_mn,
make_tuple(make_right_pad_transform(m, pad_m), make_right_pad_transform(n, pad_n)), make_tuple(make_right_pad_transform(m, pad_m), make_right_pad_transform(n, pad_n)),
make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{}),
...@@ -137,7 +142,6 @@ struct DeviceElementwise ...@@ -137,7 +142,6 @@ struct DeviceElementwise
using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{})); using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{}));
using InGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumInput>{})); using InGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumInput>{}));
// using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number<NumOutput>{}));
using GridwiseElementwise = GridwiseElementwise_2D<InGrid2dDescTuple, using GridwiseElementwise = GridwiseElementwise_2D<InGrid2dDescTuple,
OutGrid2dDescTuple, OutGrid2dDescTuple,
...@@ -165,6 +169,9 @@ struct DeviceElementwise ...@@ -165,6 +169,9 @@ struct DeviceElementwise
blockSize_(256), blockSize_(256),
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
{ {
static_assert(NumDim_m > 0, "");
static_assert(NumDim_n > 0, "");
in_dev_buffers_ = generate_tuple( in_dev_buffers_ = generate_tuple(
[&](auto I) { [&](auto I) {
using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>; using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
...@@ -256,30 +263,32 @@ struct DeviceElementwise ...@@ -256,30 +263,32 @@ struct DeviceElementwise
auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths, auto IsScalarPerVectorValid = [&](const std::array<index_t, NumDim>& lengths,
const std::array<index_t, NumDim>& strides, const std::array<index_t, NumDim>& strides,
index_t scalarPerVector, index_t scalarPerVector,
index_t vectorDim) { index_t vectorDim) {
std::cout << "scalarPerVector: " << scalarPerVector << std::endl; // std::cout << "scalarPerVector: " << scalarPerVector << std::endl;
std::cout << "stride back: " << strides.back() << std::endl; // std::cout << "stride back: " << strides.back() << std::endl;
std::cout << "len back: " << lengths.back() << std::endl; // std::cout << "len back: " << lengths.back() << std::endl;
std::cout << "NumDim-1: " << NumDim - 1 << std::endl; // std::cout << "NumDim-1: " << NumDim - 1 << std::endl;
std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl; // std::cout << "stride[nd-1]: " << strides[NumDim - 1] << std::endl;
std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl; // std::cout << "NumDim_m-1: " << NumDim_m - 1 << std::endl;
std::cout << std::endl; // std::cout << std::endl;
std::cout << "ISPVV Check 1 starting" << std::endl; // std::cout << "ISPVV Check 1 starting" << std::endl;
if(strides[vectorDim] == 1 && (lengths[vectorDim] % scalarPerVector == 0 || lengths[vectorDim]%scalarPerVector == lengths[vectorDim])) if(strides[vectorDim] == 1 &&
(lengths[vectorDim] % scalarPerVector == 0 ||
lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
{ {
std::cout << "Check 1 passed" << std::endl; // std::cout << "Check 1 passed" << std::endl;
return true; return true;
} }
std::cout << "Check 1 failed " << std::endl; // std::cout << "Check 1 failed " << std::endl;
std::cout << "ISPVV Check 2 starting" << std::endl; // std::cout << "ISPVV Check 2 starting" << std::endl;
std::cout << "strides[vectorDim]: " << strides[vectorDim] << std::endl; // std::cout << "strides[vectorDim]: " << strides[vectorDim] << std::endl;
if(strides[vectorDim] != 1 && scalarPerVector == strides[vectorDim]) if(strides[vectorDim] != 1 && scalarPerVector == strides[vectorDim])
{ {
std::cout << "Check 2 passed " << std::endl; // std::cout << "Check 2 passed " << std::endl;
return true; return true;
} }
std::cout << "Check 2 failed" << std::endl; // std::cout << "Check 2 failed" << std::endl;
return false; return false;
}; };
...@@ -300,19 +309,23 @@ struct DeviceElementwise ...@@ -300,19 +309,23 @@ struct DeviceElementwise
bool valid = true; bool valid = true;
static_for<0, NumInput, 1>{}([&](auto I) { static_for<0, NumInput, 1>{}([&](auto I) {
std::cout << "running: " << I << std::endl; std::cout << "running: " << I << std::endl;
if(!IsScalarPerVectorValid( if(!IsScalarPerVectorValid(pArg->lengths_,
pArg->lengths_, pArg->inStridesArray_[I.value], InScalarPerVectorSeq::At(I), NumDim_m - 1)) pArg->inStridesArray_[I.value],
InScalarPerVectorSeq::At(I),
NumDim_m - 1))
valid = false; valid = false;
}); });
std::cout << "valid after loop through input: " << valid << std::endl; std::cout << "valid after loop through input: " << valid << std::endl;
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
std::cout << "running 2: " << I << std::endl; std::cout << "running 2: " << I << std::endl;
if(!IsScalarPerVectorValid( if(!IsScalarPerVectorValid(pArg->lengths_,
pArg->lengths_, pArg->outStridesArray_[I.value], OutScalarPerVectorSeq::At(I), NumDim - 1)) pArg->outStridesArray_[I.value],
OutScalarPerVectorSeq::At(I),
NumDim - 1))
valid = false; valid = false;
}); });
std::cout << "valid after loop through output: " << valid << std::endl; std::cout << "valid after loop through output: " << valid << std::endl;
return valid; return valid;
}; };
......
...@@ -102,26 +102,20 @@ struct GridwiseElementwise_2D ...@@ -102,26 +102,20 @@ struct GridwiseElementwise_2D
Number<NumOutput>{}); Number<NumOutput>{});
const index_t blockSize = get_block_size(); const index_t blockSize = get_block_size();
const index_t blockPerGrid_m = get_grid_size(); const index_t blockPerGrid = get_grid_size();
//const index_t blockPerGrid_n = gridDim.y; const index_t totalNumThread = blockSize * blockPerGrid;
// const index_t block_1d = get_block_1d_id();
const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0); const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0);
const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1); const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1);
const index_t loop_step_m = blockPerGrid_m * blockSize * MPerThread; const index_t loop_step_m = MPerThread;
const index_t loop_step_n = blockPerGrid_m * blockSize * NPerThread; const index_t loop_step_n = totalNumThread * NPerThread;
const auto loop_step_index_m = make_multi_index(loop_step_m, 0);
const auto loop_step_index_n = make_multi_index(0, loop_step_n);
const index_t thread_1d_id = get_thread_global_1d_id(); const index_t thread_1d_id = get_thread_global_1d_id();
index_t tid_m = thread_1d_id/(N/NPerThread); // index_t tid_m = thread_1d_id / (N / NPerThread);
index_t tid_n = thread_1d_id%(N/NPerThread); // index_t tid_n = thread_1d_id % (N / NPerThread);
//index_t tid_m = thread_1d_id;
//index_t tid_n = blockDim.y * blockIdx.y + threadIdx.y; const auto thread_global_offset = make_multi_index(0, thread_1d_id * NPerThread);
const auto thread_global_offset = make_multi_index(tid_m* MPerThread, tid_n* NPerThread);
// make_multi_index(thread_global_id_2d[I0] * MPerThread, thread_global_id_2d[I1] *
// NPerThread);
auto in_global_load_tuple = generate_tuple( auto in_global_load_tuple = generate_tuple(
[&](auto I) { [&](auto I) {
...@@ -135,10 +129,10 @@ struct GridwiseElementwise_2D ...@@ -135,10 +129,10 @@ struct GridwiseElementwise_2D
decltype(thread_buffer_desc_mn), decltype(thread_buffer_desc_mn),
Sequence<MPerThread, NPerThread>, // SliceLengths Sequence<MPerThread, NPerThread>, // SliceLengths
Sequence<0, 1>, // DimAccessOrder Sequence<0, 1>, // DimAccessOrder
1, // SrcVectorDim 0, // SrcVectorDim
InScalarPerVectorSeq::At(I), // ScalarPerVector InScalarPerVectorSeq::At(I), // ScalarPerVector
1, // SrcScalarStrideInVector 1, // SrcScalarStrideInVector
false>{in_grid_2d_desc_tuple[I], thread_global_offset}; true>{in_grid_2d_desc_tuple[I], thread_global_offset};
}, },
Number<NumInput>{}); Number<NumInput>{});
...@@ -154,21 +148,21 @@ struct GridwiseElementwise_2D ...@@ -154,21 +148,21 @@ struct GridwiseElementwise_2D
decltype(out_grid_2d_desc_tuple[I]), decltype(out_grid_2d_desc_tuple[I]),
PassThroughOp, PassThroughOp,
Sequence<MPerThread, NPerThread>, // SliceLengths Sequence<MPerThread, NPerThread>, // SliceLengths
Sequence<1, 0>, // DimAccessOrder Sequence<0, 1>, // DimAccessOrder
0, // SrcVectorDim 1, // SrcVectorDim
OutScalarPerVectorSeq::At(I), 1, // OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
false>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{}); true>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{});
}, },
Number<NumOutput>{}); Number<NumOutput>{});
index_t num_iter_m = M / (loop_step_m); index_t num_iter_m = M / (loop_step_m);
index_t num_iter_n = N / (loop_step_n);
do do
{ {
index_t num_iter_n = N / (loop_step_n);
do do
{ {
static_for<0, NumInput, 1>{}([&](auto I) { static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I],
in_global_buf_tuple[I], in_global_buf_tuple[I],
...@@ -177,7 +171,7 @@ struct GridwiseElementwise_2D ...@@ -177,7 +171,7 @@ struct GridwiseElementwise_2D
in_thread_buf_tuple(I)); in_thread_buf_tuple(I));
in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I],
loop_step_index_n); make_multi_index(0, loop_step_n));
}); });
static_for<0, MPerThread, 1>{}([&](auto iM) { static_for<0, MPerThread, 1>{}([&](auto iM) {
...@@ -203,6 +197,9 @@ struct GridwiseElementwise_2D ...@@ -203,6 +197,9 @@ struct GridwiseElementwise_2D
}); });
}); });
// static_for<0, MPerThread * NPerThread, 1>{}(
//[&](auto i) { out_thread_buf_tuple(I0)(i) = 1; });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).Run(thread_buffer_desc_mn, out_global_store_tuple(I).Run(thread_buffer_desc_mn,
make_tuple(I0, I0), make_tuple(I0, I0),
...@@ -211,15 +208,22 @@ struct GridwiseElementwise_2D ...@@ -211,15 +208,22 @@ struct GridwiseElementwise_2D
out_global_buf_tuple(I)); out_global_buf_tuple(I));
out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I], out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I],
loop_step_index_n); make_multi_index(0, loop_step_n));
}); });
} while(--num_iter_n); } while(--num_iter_n);
static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], loop_step_index_m); static_for<0, NumInput, 1>{}([&](auto I) {
}); in_global_load_tuple(I).MoveSrcSliceWindow(
static_for<0, NumOutput, 1>{}([&](auto I){ in_grid_2d_desc_tuple[I],
out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I], loop_step_index_m); make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n));
}); });
static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_2d_desc_tuple[I],
make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n));
});
} while(--num_iter_m); } while(--num_iter_m);
} }
}; };
......
...@@ -149,6 +149,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3 ...@@ -149,6 +149,12 @@ struct ThreadwiseTensorSliceTransfer_v1r3
const bool is_dst_valid = const bool is_dst_valid =
coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
// if(get_thread_global_1d_id() == 0)
//{
// const index_t dst_off = dst_coord_.GetOffset();
// printf("dst_off: %d\n", dst_off);
//}
// copy data from dst_vector into dst_buf // copy data from dst_vector into dst_buf
dst_buf.template Update<DstInMemOp, dst_vector_t>( dst_buf.template Update<DstInMemOp, dst_vector_t>(
dst_coord_.GetOffset(), dst_coord_.GetOffset(),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment