Commit bccc6d8b authored by wangshaojie6

merge develop and resolve conflict

parents c6b52884 91d8b7d6
# Instructions for ```example_pool2d_fwd``` Example
# Instructions for ```example_pool2d_fwd``` Examples
## Run ```example_pool2d_fwd```
## Run ```example_pool2d_fwd_fp16```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scoped integer value, 3=decimal value)
#arg3: run kernel # of times (>1)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd 1 1 10
./bin/example_pool2d_fwd_fp16 1 1 1
```
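The 15-argument form overrides the default problem size. For example, passing the defaults that are hard-coded in the example source:
```bash
./bin/example_pool2d_fwd_fp16 1 1 1 128 192 3 3 71 71 2 2 1 1 1 1
```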
Result
......@@ -14,9 +14,28 @@ Result
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up
Warm up 1 time
Start running 10 times...
Perf: 0.415453 ms, 1.37996 TFlops, 749.726 GB/s
error: 0
max_diff: 0, 1, 1
Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s
```
## Run ```example_pool2d_fwd_fp32```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: time kernel (0=no, 1=yes)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd_fp32 1 1 1
```
Result
```
./bin/example_pool2d_fwd_fp32 1 1 1
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192}
launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1}
Warm up 1 time
Start running 10 times...
Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s
```
#pragma once
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
......@@ -13,48 +11,19 @@
#include "host_reduce_util.hpp"
#include "device_tensor.hpp"
#include "tensor_layout.hpp"
#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "device_pool2d_fwd_nhwc_nhwc.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool NeedIndices = false;
static constexpr bool PropagateNan = false;
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
ReduceOpId,
NeedIndices,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
4>; // InSrcOutDstVectorSize
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool NeedIndices>
bool OutputIndex>
static void pool_host_verify(const Tensor<InDataType>& in,
Tensor<OutDataType>& out,
Tensor<int>& out_indices,
Tensor<IndexDataType>& out_indices,
const std::array<ck::index_t, 2>& window_spatial_lengths,
const std::array<ck::index_t, 2>& window_strides,
const std::array<ck::index_t, 2>& in_left_pads,
......@@ -62,26 +31,26 @@ static void pool_host_verify(const Tensor<InDataType>& in,
{
using namespace ck::host_reduce;
const int divider = window_spatial_lengths[0] * window_spatial_lengths[1];
const int32_t divider = window_spatial_lengths[0] * window_spatial_lengths[1];
const auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
const auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
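// PreUnaryOp/PosUnaryOp specialize per reduction: for AVG the accumulated window sum
// is divided by divider (the window area, e.g. 3 * 3 = 9); for MAX both are pass-through.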
if constexpr(!NeedIndices)
if constexpr(!OutputIndex)
{
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(int y = 0; y < window_spatial_lengths[0]; ++y)
for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
{
int hi = ho * window_strides[0] + y - in_left_pads[0];
for(int x = 0; x < window_spatial_lengths[1]; ++x)
ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{
int wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < ck::type_convert<int>(in.mDesc.GetLengths()[2]) && wi >= 0 &&
wi < ck::type_convert<int>(in.mDesc.GetLengths()[3]))
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < static_cast<ck::index_t>(in.mDesc.GetLengths()[2]) &&
wi >= 0 && wi < static_cast<ck::index_t>(in.mDesc.GetLengths()[3]))
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
......@@ -108,24 +77,24 @@ static void pool_host_verify(const Tensor<InDataType>& in,
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
auto f_nchw = [&](auto n, auto c, auto ho, auto wo) {
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
for(int y = 0; y < window_spatial_lengths[0]; ++y)
for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y)
{
int hi = ho * window_strides[0] + y - in_left_pads[0];
for(int x = 0; x < window_spatial_lengths[1]; ++x)
ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0];
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{
int wi = wo * window_strides[1] + x - in_left_pads[1];
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
int currIndex = y * window_spatial_lengths[1] + x;
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
IndexDataType currIndex = y * window_spatial_lengths[1] + x;
PreUnaryOp(currVal);
binop_with_nan_check2<AccDataType, PropagateNan>(
binop_with_index_and_nan_check<AccDataType, IndexDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
}
}
......@@ -145,62 +114,46 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
}
int main(int argc, char* argv[])
template <typename InDataType,
typename OutDataType,
typename AccDataType,
typename IndexDataType,
typename InLayout,
typename OutLayout,
ck::ReduceTensorOp ReduceOpId,
bool PropagateNan,
bool OutputIndex>
bool pool_test(bool do_verification,
int init_method,
bool time_kernel,
ck::index_t N,
ck::index_t C,
ck::index_t Y,
ck::index_t X,
ck::index_t Hi,
ck::index_t Wi,
ck::index_t window_stride_h,
ck::index_t window_stride_w,
ck::index_t in_left_pad_h,
ck::index_t in_left_pad_w,
ck::index_t in_right_pad_h,
ck::index_t in_right_pad_w)
{
using namespace ck::host_reduce;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
in_left_pad_h = std::stoi(argv[12]);
in_left_pad_w = std::stoi(argv[13]);
in_right_pad_h = std::stoi(argv[14]);
in_right_pad_w = std::stoi(argv[15]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
using DevicePoolFwdInstance =
ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<
InDataType, // InDataType
OutDataType, // OutDataType
AccDataType, // AccDataType
ReduceOpId,
OutputIndex,
64, // BlockSize
64, // ReduceMThreadClusterSize
1, // ReduceKThreadClusterSize
4, // ReduceMThreadSliceSize
1, // ReduceKThreadSliceSize
4>; // InSrcOutDstVectorSize
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1;
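// e.g., with the default 71x71 input, 3x3 window, stride 2 and 1/1 padding:
// Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36, matching the {128, 192, 36, 36} output lengths above.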
......@@ -228,9 +181,11 @@ int main(int argc, char* argv[])
Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<int> out_indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<IndexDataType> out_indices_n_c_ho_wo_host(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<int> out_indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
......@@ -245,25 +200,25 @@ int main(int argc, char* argv[])
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * out_n_c_ho_wo_device.mDesc.GetElementSpace());
DeviceMem out_indices_device_buf(sizeof(int) *
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpace());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr =
pool.MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<int*>(out_indices_device_buf.GetDeviceBuffer()),
N,
C,
std::array<ck::index_t, 2>{{Hi, Wi}},
std::array<ck::index_t, 2>{{Y, X}},
std::array<ck::index_t, 2>{{Ho, Wo}},
window_strides,
input_left_pads,
input_right_pads);
auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr = pool.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
N,
C,
std::array<ck::index_t, 2>{{Hi, Wi}},
std::array<ck::index_t, 2>{{Y, X}},
std::array<ck::index_t, 2>{{Ho, Wo}},
window_strides,
input_left_pads,
input_right_pads);
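// The three std::array arguments describe the problem in (H, W) order: input spatial
// lengths {Hi, Wi}, window lengths {Y, X}, and output spatial lengths {Ho, Wo}.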
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
......@@ -286,14 +241,16 @@ int main(int argc, char* argv[])
<< std::endl;
bool pass = true;
if(do_verification)
{
pool_host_verify<InDataType,
OutDataType,
AccDataType,
IndexDataType,
ReduceOpId,
PropagateNan,
NeedIndices>(in_n_c_hi_wi,
OutputIndex>(in_n_c_hi_wi,
out_n_c_ho_wo_host,
out_indices_n_c_ho_wo_host,
window_spatial_lengths,
......@@ -303,15 +260,16 @@ int main(int argc, char* argv[])
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
pass &= ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
if constexpr(NeedIndices)
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
pass &= ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
out_indices_n_c_ho_wo_host.mData);
pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
out_indices_n_c_ho_wo_host.mData);
};
}
return pass ? 0 : 1;
}
return (pass);
};
#include <iostream>
#include <cstdlib>
#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"
#include "pool2d_fwd_common.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using AccDataType = float;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
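// Note: setting OutputIndex = true (together with ReduceTensorOp::MAX) also produces
// argmax indices; pool_host_verify checks that path in its OutputIndex branch.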
int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
bool do_verification;
int init_method;
bool time_kernel;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 1)
{
do_verification = true;
init_method = 1;
time_kernel = true;
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
in_left_pad_h = std::stoi(argv[12]);
in_left_pad_w = std::stoi(argv[13]);
in_right_pad_h = std::stoi(argv[14]);
in_right_pad_w = std::stoi(argv[15]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
bool pass = pool_test<InDataType,
OutDataType,
AccDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
PropagateNan,
OutputIndex>(do_verification,
init_method,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
#include <iostream>
#include <cstdlib>
#include "config.hpp"
#include "tensor_layout.hpp"
#include "reduction_enums.hpp"
#include "pool2d_fwd_common.hpp"
using InDataType = float;
using OutDataType = float;
using AccDataType = float;
using IndexDataType = int32_t;
using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif
static constexpr bool OutputIndex = false;
static constexpr bool PropagateNan = false;
int main(int argc, char* argv[])
{
using namespace ck::host_reduce;
bool do_verification;
int init_method;
bool time_kernel;
// Pool shape
ck::index_t N = 128;
ck::index_t C = 192;
ck::index_t Y = 3;
ck::index_t X = 3;
ck::index_t Hi = 71;
ck::index_t Wi = 71;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
if(argc == 1)
{
do_verification = true;
init_method = 1;
time_kernel = true;
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
}
else if(argc == 16)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = static_cast<bool>(std::stoi(argv[3]));
N = std::stoi(argv[4]);
C = std::stoi(argv[5]);
Y = std::stoi(argv[6]);
X = std::stoi(argv[7]);
Hi = std::stoi(argv[8]);
Wi = std::stoi(argv[9]);
window_stride_h = std::stoi(argv[10]);
window_stride_w = std::stoi(argv[11]);
in_left_pad_h = std::stoi(argv[12]);
in_left_pad_w = std::stoi(argv[13]);
in_right_pad_h = std::stoi(argv[14]);
in_right_pad_w = std::stoi(argv[15]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
exit(0);
}
bool pass = pool_test<InDataType,
OutDataType,
AccDataType,
IndexDataType,
InLayout,
OutLayout,
ReduceOpId,
PropagateNan,
OutputIndex>(do_verification,
init_method,
time_kernel,
N,
C,
Y,
X,
Hi,
Wi,
window_stride_h,
window_stride_w,
in_left_pad_h,
in_left_pad_w,
in_right_pad_h,
in_right_pad_w);
return (pass ? 0 : 1);
}
......@@ -100,8 +100,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock>
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, PassThrough, PassThrough, RequantReluRequant>;
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
float,
PassThrough,
PassThrough,
RequantReluRequant>;
int main(int argc, char* argv[])
{
......
......@@ -56,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
int main(int argc, char* argv[])
{
......
......@@ -32,6 +32,7 @@ using CDataType = F16;
using ReduceAccDataType = F32;
using DDataType = F64;
using DPtrsGlobal = ck::Tuple<DDataType*>;
using AccDataType = F32;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
......@@ -59,7 +60,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
int main(int argc, char* argv[])
{
......
......@@ -32,6 +32,7 @@ using CDataType = F16;
using ReduceAccDataType = F32;
using DDataType = F32;
using DPtrsGlobal = ck::Tuple<DDataType*, DDataType*>;
using AccDataType = F32;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
......@@ -68,7 +69,7 @@ using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
int main(int argc, char* argv[])
{
......
......@@ -332,18 +332,15 @@ int main(int argc, char* argv[])
{
case 3: {
auto ref_conv = ReferenceConvBwdDataInstance<3>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
case 2: {
auto ref_conv = ReferenceConvBwdDataInstance<2>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
case 1: {
auto ref_conv = ReferenceConvBwdDataInstance<1>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
......
add_example_executable(example_broadcast_add_2d broadcast_add_2d.cpp)
add_example_executable(example_broadcast_add_2d_amn_bn broadcast_add_2d_amn_bn.cpp)
add_example_executable(example_broadcast_add_3d_am_bmnk broadcast_add_3d_am_bmnk.cpp)
add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp)
add_example_executable(example_elementwise_add_4d elementwise_add_4d.cpp)
\ No newline at end of file
......@@ -19,8 +19,17 @@ using EltwiseComputeDataType = F32;
using Add = ck::tensor_operation::binary_element_wise::Add;
using DeviceElementwiseAddInstance = ck::tensor_operation::device::
DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 2, 8>;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
2,
8,
8,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
......@@ -100,7 +109,7 @@ int main()
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise_2D instance, exiting!");
"DeviceBinaryElementwise instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -123,7 +132,7 @@ int main()
0>(host_c_m_n, a_m_n, b_n, M, N, Add{});
pass &= ck::utils::check_err(
c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
c_m_n.mData, host_c_m_n.mData, "Error: Incorrect results c", 1e-3, 1e-3);
}
return pass ? 0 : 1;
......
#include <iostream>
#include <cstdlib>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "binary_element_wise_operation.hpp"
#include "device_binary_elementwise.hpp"
using F16 = ck::half_t;
using F32 = float;
using ABDataType = F16;
using CDataType = F16;
using EltwiseComputeDataType = F32;
using Add = ck::tensor_operation::binary_element_wise::Add;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
3,
8,
1,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
typename HostTensorC,
typename ComputeDataType,
typename Functor>
void host_broadcast3D_am_bmnk(HostTensorC& C,
const HostTensorA& A,
const HostTensorB& B,
const std::vector<std::size_t>& shape,
Functor functor)
{
using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
for(std::size_t m = 0; m < shape[0]; ++m)
for(std::size_t n = 0; n < shape[1]; ++n)
for(std::size_t k = 0; k < shape[2]; ++k)
{
ComputeDataType a_val = static_cast<ComputeDataType>(A(m));
ComputeDataType b_val = static_cast<ComputeDataType>(B(m, n, k));
ComputeDataType c_val = 0;
functor(c_val, a_val, b_val);
C(m, n, k) = static_cast<ctype>(c_val);
}
}
int main()
{
bool do_verification = true;
bool time_kernel = false;
std::vector<std::size_t> mnk = {4, 16, 32};
ck::index_t M = mnk[0];
Tensor<ABDataType> a_m({M});
Tensor<ABDataType> b_m_n_k(mnk);
Tensor<CDataType> c_m_n_k(mnk);
a_m.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0});
b_m_n_k.GenerateTensorValue(GeneratorTensor_3<ABDataType>{0.0, 1.0});
DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpace());
DeviceMem b_m_n_k_device_buf(sizeof(ABDataType) * b_m_n_k.mDesc.GetElementSpace());
DeviceMem c_m_n_k_device_buf(sizeof(CDataType) * c_m_n_k.mDesc.GetElementSpace());
a_m_device_buf.ToDevice(a_m.mData.data());
b_m_n_k_device_buf.ToDevice(b_m_n_k.mData.data());
auto broadcastAdd = DeviceElementwiseAddInstance{};
auto argument = broadcastAdd.MakeArgumentPointer(
a_m_device_buf.GetDeviceBuffer(),
b_m_n_k_device_buf.GetDeviceBuffer(),
c_m_n_k_device_buf.GetDeviceBuffer(),
std::vector<ck::index_t>{mnk.begin(), mnk.end()},
{1, 0, 0}, // broadcast A on second and third dimension
std::vector<ck::index_t>{b_m_n_k.mDesc.GetStrides().begin(),
b_m_n_k.mDesc.GetStrides().end()},
std::vector<ck::index_t>{c_m_n_k.mDesc.GetStrides().begin(),
c_m_n_k.mDesc.GetStrides().end()},
Add{});
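// a_m stores only M elements; the zero n/k strides passed above make every (m, n, k)
// read of A resolve to a_m(m), which is how the broadcast is expressed to the device op.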
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
float ave_time =
broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
std::cout << "Perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
c_m_n_k_device_buf.FromDevice(c_m_n_k.mData.data());
Tensor<CDataType> host_c_m_n_k(mnk);
host_broadcast3D_am_bmnk<Tensor<ABDataType>,
Tensor<ABDataType>,
Tensor<CDataType>,
EltwiseComputeDataType,
Add>(host_c_m_n_k, a_m, b_m_n_k, mnk, Add{});
pass &= ck::utils::check_err(
c_m_n_k.mData, host_c_m_n_k.mData, "Error: Incorrect results c", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
......@@ -19,8 +19,17 @@ using EltwiseComputeDataType = F32;
using Add = ck::tensor_operation::binary_element_wise::Add;
using DeviceElementwiseAddInstance = ck::tensor_operation::device::
DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 1, 8>;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
1,
8,
8,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
......@@ -81,7 +90,7 @@ int main()
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise_2D instance, exiting!");
"DeviceBinaryElementwise instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -103,7 +112,7 @@ int main()
Add>(host_c_m, a_m, b_m, M, Add{});
pass &= ck::utils::check_err(
c_m.mData, host_c_m.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
c_m.mData, host_c_m.mData, "Error: Incorrect results c", 1e-3, 1e-3);
}
return pass ? 0 : 1;
......
......@@ -19,8 +19,17 @@ using EltwiseComputeDataType = F32;
using Add = ck::tensor_operation::binary_element_wise::Add;
using DeviceElementwiseAddInstance = ck::tensor_operation::device::
DeviceBinaryElementwise<ABDataType, ABDataType, CDataType, EltwiseComputeDataType, Add, 4, 8>;
using DeviceElementwiseAddInstance =
ck::tensor_operation::device::DeviceBinaryElementwise<ABDataType,
ABDataType,
CDataType,
EltwiseComputeDataType,
Add,
4,
8,
8,
8,
8>;
template <typename HostTensorA,
typename HostTensorB,
......@@ -83,7 +92,7 @@ int main()
if(!broadcastAdd.IsSupportedArgument(argument.get()))
{
throw std::runtime_error("The runtime parameters seems not supported by the "
"DeviceBinaryElementwise_2D instance, exiting!");
"DeviceBinaryElementwise instance, exiting!");
};
auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer();
......@@ -105,7 +114,7 @@ int main()
Add>(host_c, a, b, nchw, Add{});
pass &=
ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results d1", 1e-3, 1e-3);
ck::utils::check_err(c.mData, host_c.mData, "Error: Incorrect results c", 1e-3, 1e-3);
}
return pass ? 0 : 1;
......
......@@ -366,18 +366,15 @@ int main(int argc, char* argv[])
{
case 3: {
auto ref_conv = ReferenceConvBwdWeightInstance<3>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
case 2: {
auto ref_conv = ReferenceConvBwdWeightInstance<2>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
case 1: {
auto ref_conv = ReferenceConvBwdWeightInstance<1>();
verify_f(ref_conv);
break;
return verify_f(ref_conv);
}
default: {
throw std::runtime_error("Unsupported number of spatial dimensions provided!");
......
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include/ck
${PROJECT_SOURCE_DIR}/include/ck/utility
${PROJECT_SOURCE_DIR}/include/ck/host_utility
${PROJECT_SOURCE_DIR}/include/ck/tensor_description
${PROJECT_SOURCE_DIR}/include/ck/tensor
${PROJECT_SOURCE_DIR}/include/ck/problem_transform
......
#pragma once
#include <string>
#include <map>
namespace ck {
inline std::string get_device_name()
{
hipDeviceProp_t props{};
int device;
auto status = hipGetDevice(&device);
if(status != hipSuccess)
{
return std::string();
}
status = hipGetDeviceProperties(&props, device);
if(status != hipSuccess)
{
return std::string();
}
const std::string raw_name(props.gcnArchName);
// https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
static std::map<std::string, std::string> device_name_map = {
{"Ellesmere", "gfx803"},
{"Baffin", "gfx803"},
{"RacerX", "gfx803"},
{"Polaris10", "gfx803"},
{"Polaris11", "gfx803"},
{"Tonga", "gfx803"},
{"Fiji", "gfx803"},
{"gfx800", "gfx803"},
{"gfx802", "gfx803"},
{"gfx804", "gfx803"},
{"Vega10", "gfx900"},
{"gfx901", "gfx900"},
{"10.3.0 Sienna_Cichlid 18", "gfx1030"},
};
const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str.
auto match = device_name_map.find(name);
if(match != device_name_map.end())
return match->second;
return name;
}
} // namespace ck
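// Minimal usage sketch (hypothetical caller, not part of this commit): dispatch on the
// normalized arch name, e.g.
//   if(ck::get_device_name() == "gfx1030") { /* pick a gfx1030-tuned code path */ }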
#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#define CK_BLOCKWISE_GEMM_DLOPS_V2R3_HPP
#pragma once
#include "common_header.hpp"
#include "tensor_adaptor.hpp"
#include "threadwise_tensor_slice_transfer_v2.hpp"
#include "threadwise_contraction_dlops.hpp"
#include "threadwise_tensor_slice_transfer_v4r1.hpp"
#include "threadwise_contraction_dl.hpp"
namespace ck {
......@@ -41,7 +39,7 @@ template <index_t BlockSize,
typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
bool>::type = false>
struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
struct BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
{
using AIndex = MultiIndex<3>;
using BIndex = MultiIndex<3>;
......@@ -148,7 +146,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});
public:
__device__ BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
__device__ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2()
: c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
get_thread_local_1d_id())},
a_thread_copy_{
......@@ -175,6 +173,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
"wrong!");
// TODO: remove this restriction
static_assert(BM0 == 2, "wrong");
static_assert(BM0 == 2 && BN0 == 2, "wrong");
}
......@@ -226,7 +225,7 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());
constexpr auto threadwise_contraction =
ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
FloatA,
FloatB,
FloatC,
......@@ -407,4 +406,3 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
};
} // namespace ck
#endif
......@@ -75,14 +75,13 @@ struct BlockwiseTensorSliceTransfer_v5r1
}
}
template <typename SrcBuffer, typename SrcStepHacks>
__device__ void
RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
template <typename SrcBuffer>
__device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
{
if(BlockSize == thread_cluster_desc_.GetElementSize() or
get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
{
threadwise_transfer_.RunRead(src_desc, src_buf, src_step_hacks);
threadwise_transfer_.RunRead(src_desc, src_buf);
}
}
......
......@@ -15,91 +15,107 @@ template <typename ADataType,
typename CDataType,
typename ComputeDataType,
typename ElementwiseFunctor,
index_t Dim,
index_t ScalarPerVector>
index_t NDim,
index_t MPerThread,
index_t AScalarPerVector,
index_t BScalarPerVector,
index_t CScalarPerVector>
struct DeviceBinaryElementwise : public BaseOperator
{
static constexpr auto I0 = Number<0>{};
template <typename Desc_M0>
static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
template <typename Desc_M>
static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize)
{
const auto m0 = desc_m0.GetLength(I0);
const index_t loop_step = gridSize * blockSize * ScalarPerVector;
const auto pad = math::integer_least_multiple(m0, loop_step) - m0;
const auto desc_m0_pad =
transform_tensor_descriptor(desc_m0,
make_tuple(make_right_pad_transform(m0, pad)),
const auto M = desc_m.GetLength(I0);
const index_t loop_step = gridSize * blockSize * MPerThread;
const auto pad = math::integer_least_multiple(M, loop_step) - M;
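// e.g., with the launch defaults set in Argument below (gridSize_ = 120, blockSize_ = 256)
// and MPerThread = 8, loop_step = 245760, so an M of 1000000 would be padded to 1228800.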
const auto desc_m_pad =
transform_tensor_descriptor(desc_m,
make_tuple(make_right_pad_transform(M, pad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
return desc_m0_pad;
return desc_m_pad;
}
static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
const std::vector<index_t>& stride,
index_t gridSize,
index_t blockSize)
static auto MakeDescriptor_M(const std::vector<index_t>& lengths,
const std::vector<index_t>& strides,
index_t gridSize,
index_t blockSize)
{
auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});
auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<NDim>{});
auto tupleOfStride = generate_tuple([&](auto I) { return strides[I]; }, Number<NDim>{});
// nd desc - [s0, s1, s2, ...]
const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);
// merge nd to 1d desc - [s0 * s1 * ...]
if constexpr(Dim > 1)
if constexpr(NDim > 1)
{
const auto desc_m0 = transform_tensor_descriptor(
const auto desc_m = transform_tensor_descriptor(
desc,
make_tuple(make_merge_transform(tupleOfShape)),
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NDim>{})),
make_tuple(Sequence<0>{}));
return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
return PadDescriptor_M_1d(desc_m, gridSize, blockSize);
}
else
return PadDescriptor_M0_1d(desc, gridSize, blockSize);
return PadDescriptor_M_1d(desc, gridSize, blockSize);
}
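// e.g., lengths {4, 16, 32} with packed strides (as for the b/c tensors in the 3-d
// broadcast example above) merge into one 1-d descriptor of length 4 * 16 * 32 = 2048
// before padding.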
using GridDesc_M0 = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
using AGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
using BGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1));
using GridwiseBinEltwise = GridwiseBinaryElementwise_1D<ADataType,
BDataType,
CDataType,
ComputeDataType,
GridDesc_M0,
AGridDesc_M,
BGridDesc_M,
CGridDesc_M,
ElementwiseFunctor,
ScalarPerVector>;
MPerThread,
AScalarPerVector,
BScalarPerVector,
CScalarPerVector>;
struct Argument : public BaseArgument
{
Argument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
const std::vector<index_t>& shape,
const std::vector<index_t>& stride_a,
const std::vector<index_t>& stride_b,
const std::vector<index_t>& stride_c,
const std::vector<index_t>& lengths,
const std::vector<index_t>& a_strides,
const std::vector<index_t>& b_strides,
const std::vector<index_t>& c_strides,
ElementwiseFunctor functor)
: p_a_(p_a),
p_b_(p_b),
p_c_(p_c),
shape_(shape),
lengths_(lengths),
a_strides_(a_strides),
b_strides_(b_strides),
c_strides_(c_strides),
functor_(functor),
blockSize_(256),
gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future
{
a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
c_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_c, gridSize_, blockSize_);
a_grid_desc_m_ = MakeDescriptor_M(lengths, a_strides, gridSize_, blockSize_);
b_grid_desc_m_ = MakeDescriptor_M(lengths, b_strides, gridSize_, blockSize_);
c_grid_desc_m_ = MakeDescriptor_M(lengths, c_strides, gridSize_, blockSize_);
}
const ADataType* p_a_;
const BDataType* p_b_;
CDataType* p_c_;
std::vector<int> shape_;
GridDesc_M0 a_grid_desc_m0_;
GridDesc_M0 b_grid_desc_m0_;
GridDesc_M0 c_grid_desc_m0_;
std::vector<int> lengths_;
AGridDesc_M a_grid_desc_m_;
BGridDesc_M b_grid_desc_m_;
CGridDesc_M c_grid_desc_m_;
std::vector<index_t> a_strides_;
std::vector<index_t> b_strides_;
std::vector<index_t> c_strides_;
ElementwiseFunctor functor_;
index_t blockSize_;
index_t gridSize_;
......@@ -113,7 +129,9 @@ struct DeviceBinaryElementwise : public BaseOperator
ADataType,
BDataType,
CDataType,
GridDesc_M0,
AGridDesc_M,
BGridDesc_M,
CGridDesc_M,
ElementwiseFunctor>;
float elapsed_time = launch_and_time_kernel(stream_config,
......@@ -124,9 +142,9 @@ struct DeviceBinaryElementwise : public BaseOperator
arg.p_a_,
arg.p_b_,
arg.p_c_,
arg.a_grid_desc_m0_,
arg.b_grid_desc_m0_,
arg.c_grid_desc_m0_,
arg.a_grid_desc_m_,
arg.b_grid_desc_m_,
arg.c_grid_desc_m_,
arg.functor_);
return elapsed_time;
}
......@@ -146,7 +164,30 @@ struct DeviceBinaryElementwise : public BaseOperator
if(pArg == nullptr)
return false;
if(pArg->shape_.back() % ScalarPerVector != 0)
if(pArg->lengths_.size() != NDim)
return false;
if(pArg->lengths_.back() % MPerThread != 0)
return false;
auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
bool ret = true;
if(!isLastDimensionCoalesced)
ret = scalarPerVector == 1;
else
ret = MPerThread % scalarPerVector == 0;
return ret;
};
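// i.e., a strided (non-coalesced) last dimension forces scalar access
// (scalarPerVector == 1), while a coalesced one may vectorize up to MPerThread elements.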
if(!IsScalarPerVectorValid(pArg->a_strides_.back() == 1, AScalarPerVector))
return false;
if(!IsScalarPerVectorValid(pArg->b_strides_.back() == 1, BScalarPerVector))
return false;
if(!IsScalarPerVectorValid(pArg->c_strides_.back() == 1, CScalarPerVector))
return false;
return true;
......@@ -155,19 +196,19 @@ struct DeviceBinaryElementwise : public BaseOperator
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
std::vector<index_t> shape,
std::vector<index_t> stride_a,
std::vector<index_t> stride_b,
std::vector<index_t> stride_c,
std::vector<index_t> lengths,
std::vector<index_t> a_strides,
std::vector<index_t> b_strides,
std::vector<index_t> c_strides,
ElementwiseFunctor functor)
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
shape,
stride_a,
stride_b,
stride_c,
lengths,
a_strides,
b_strides,
c_strides,
functor);
}
......@@ -180,7 +221,7 @@ struct DeviceBinaryElementwise : public BaseOperator
// clang-format off
str << "DeviceBinaryElementwise"
<< "<"
<< "ScalarPerVector = " << ScalarPerVector
<< "MPerThread = " << MPerThread
<< ">";
// clang-format on
......