"...git@developer.sourcefind.cn:OpenDAS/torch-spline-conv.git" did not exist on "c4b33b497f7a7b12cdf02529271cb94f7d99ff74"
Commit 4899c20f authored by Chao Liu's avatar Chao Liu
Browse files

update profiler

parent 5729c23c
......@@ -17,17 +17,25 @@ namespace tensor_operation {
namespace device {
namespace instance {
void add_device_layernorm_f16_rank2_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
// FP16
void add_device_layernorm_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
void add_device_layernorm_f16_rank4_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 4, 3>>&);
void add_device_layernorm_rank_4_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
void add_device_layernorm_f32_rank2_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
void add_device_layernorm_rank_5_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
void add_device_layernorm_f32_rank4_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 4, 3>>&);
// FP32
void add_device_layernorm_rank_2_1_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
void add_device_layernorm_rank_4_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
void add_device_layernorm_rank_5_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
template <typename XDataType,
typename GammaDataType,
......@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory<
is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
add_device_layernorm_f16_rank2_instances(op_ptrs);
{
add_device_layernorm_rank_2_1_f16_instances(op_ptrs);
}
else if constexpr(Rank == 4 && NumReduceDim == 3)
add_device_layernorm_f16_rank4_instances(op_ptrs);
{
add_device_layernorm_rank_4_3_f16_instances(op_ptrs);
}
else if constexpr(Rank == 5 && NumReduceDim == 3)
{
add_device_layernorm_rank_5_3_f16_instances(op_ptrs);
}
}
else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
{
if constexpr(Rank == 2 && NumReduceDim == 1)
add_device_layernorm_f32_rank2_instances(op_ptrs);
{
add_device_layernorm_rank_2_1_f32_instances(op_ptrs);
}
else if constexpr(Rank == 4 && NumReduceDim == 3)
add_device_layernorm_f32_rank4_instances(op_ptrs);
{
add_device_layernorm_rank_4_3_f32_instances(op_ptrs);
}
else if constexpr(Rank == 5 && NumReduceDim == 3)
{
add_device_layernorm_rank_5_3_f32_instances(op_ptrs);
}
}
return op_ptrs;
......
......@@ -16,7 +16,6 @@ using F16 = ck::half_t;
using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f16_instances = std::tuple<
......@@ -36,22 +35,22 @@ using device_layernorm_f16_instances = std::tuple<
// clang-format on
>;
void add_device_layernorm_f16_rank2_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances)
void add_device_layernorm_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{});
}
void add_device_layernorm_f16_rank4_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances)
void add_device_layernorm_rank_4_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{});
}
void add_device_groupnorm_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Sigmoid, 5, 3>>& instances)
void add_device_layernorm_rank_5_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Sigmoid, 5, 3>{});
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 5, 3>{});
}
} // namespace instance
......
......@@ -15,7 +15,6 @@ namespace instance {
using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f32_instances = std::tuple<
......@@ -34,22 +33,22 @@ using device_layernorm_f32_instances = std::tuple<
// clang-format on
>;
void add_device_layernorm_f32_rank2_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances)
void add_device_layernorm_rank_2_1_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
}
void add_device_layernorm_f32_rank4_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances)
void add_device_layernorm_rank_4_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
}
void add_device_groupnorm_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Sigmoid, 5, 3>>& instances)
void add_device_layernorm_rank_5_3_f32_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f32_instances<Sigmoid, 5, 3>{});
add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
}
} // namespace instance
......
......@@ -56,4 +56,3 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
......@@ -6,8 +6,8 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -15,35 +15,9 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
void add_device_groupnorm_f16_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Sigmoid, 5, 3>>&);
void add_device_groupnorm_f32_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Sigmoid, 5, 3>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
enum struct ElementwiseOpEnum
{
ePassthrough = 0,
eSigmoid = 1
};
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
......@@ -53,12 +27,9 @@ bool profile_groupnorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length,
ElementwiseOpEnum OutelementwiseOp)
std::vector<index_t> length)
{
using F16 = ck::half_t;
using F32 = float;
using Sigmoid = ck::tensor_operation::element_wise::Sigmoid;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() != 5)
return false;
......@@ -104,61 +75,44 @@ bool profile_groupnorm_impl(int do_verification,
beta_dev.ToDevice(beta.mData.data());
// add device normalization instances
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
Sigmoid,
PassThrough,
5,
3>>
instances;
3>;
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
is_same<AccDataType, F32>::value)
{
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid)
tensor_operation::device::instance::add_device_groupnorm_f16_instances(instances);
}
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
is_same<AccDataType, F32>::value)
{
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid)
tensor_operation::device::instance::add_device_groupnorm_f32_instances(instances);
}
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
if(instances.size() <= 0)
{
throw std::runtime_error("wrong! no device normalization instance found");
}
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
if(OutelementwiseOp == ElementwiseOpEnum::eSigmoid)
{
using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
Sigmoid>;
PassThrough>;
ReferenceInstance ref;
auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, Sigmoid{}, length, 1e-6);
auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
}
int num_kernel = 0;
for(auto& inst_ptr : instances)
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
length,
......@@ -172,7 +126,7 @@ bool profile_groupnorm_impl(int do_verification,
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
Sigmoid{});
PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
......
......@@ -6,8 +6,8 @@
#include <iomanip>
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
......@@ -15,26 +15,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
void add_device_layernorm_f16_rank2_instances(
std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
void add_device_layernorm_f32_rank2_instances(
std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck {
namespace profiler {
......@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
std::vector<index_t> strideGamma,
std::vector<index_t> strideBeta)
{
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() < 2)
......@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
// add device normalization instances
constexpr int NumReduceDim = Rank - 1;
std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
PassThrough,
Rank,
NumReduceDim>>
instances;
NumReduceDim>;
if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances);
}
else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
is_same<AccDataType, F32>::value)
{
if(length.size() == 2)
tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances);
}
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
if(instances.size() <= 0)
{
throw std::runtime_error("wrong! no device normalization instance found");
}
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
......@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
ref_invoker.Run(ref_argument);
}
for(auto& inst_ptr : instances)
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
strideXY,
......@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
LogRange(std::cout << "input lengths = [", length, "], ") << std::endl;
LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
return;
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
......
......@@ -5,10 +5,10 @@
#include <vector>
#include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp"
using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
struct GroupnormArgParser
{
......@@ -50,8 +50,7 @@ void print_help_groupnorm()
<< "arg3: verification (0: no; 1: yes)\n"
<< "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg5: print tensor value (0: no; 1: yes)\n"
<< "arg6: time kernel (0=n0, 1=yes)\n"
<< "arg7: out elementwise op (0=passthrough, 1=sigmoid)\n"
<< "arg6: time kernel (0=no, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1 16 16 32 40) \n"
<< std::endl;
}
......@@ -63,10 +62,9 @@ int profile_groupnorm(int argc, char* argv[])
int init_method = 0;
bool do_log = 0;
bool time_kernel = 1;
ElementwiseOpEnum outElementwiseOp = ElementwiseOpEnum::eSigmoid;
std::vector<index_t> length = {1, 16, 16, 32, 40};
std::vector<index_t> length = {64, 16, 16, 32, 40};
if(argc != 1 && argc != 14)
if(argc != 1 && argc != 13)
{
print_help_groupnorm();
return 0;
......@@ -79,7 +77,6 @@ int profile_groupnorm(int argc, char* argv[])
init_method = std::stoi(argv[4]);
do_log = std::stoi(argv[5]);
time_kernel = std::stoi(argv[6]);
outElementwiseOp = static_cast<ElementwiseOpEnum>(std::stoi(argv[7]));
// parse the long options
GroupnormArgParser arg_parser;
......@@ -90,10 +87,15 @@ int profile_groupnorm(int argc, char* argv[])
using F16 = ck::half_t;
using F32 = float;
if(data_type == ck::DataTypeEnum::Half && outElementwiseOp == ElementwiseOpEnum::eSigmoid)
if(data_type == ck::DataTypeEnum::Float)
{
ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32>(
do_verification, init_method, do_log, time_kernel, length);
}
else if(data_type == ck::DataTypeEnum::Half)
{
ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>(
do_verification, init_method, do_log, time_kernel, length, outElementwiseOp);
do_verification, init_method, do_log, time_kernel, length);
}
else
{
......@@ -102,10 +104,3 @@ int profile_groupnorm(int argc, char* argv[])
return 0;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_groupnorm(argc, argv);
// return 0;
// }
......@@ -5,6 +5,7 @@
#include <vector>
#include <unordered_map>
#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_layernorm_impl.hpp"
using ck::index_t;
......@@ -49,7 +50,7 @@ void print_help_layernorm()
<< "arg2: verification (0: no; 1: yes)\n"
<< "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg4: print tensor value (0: no; 1: yes)\n"
<< "arg5: time kernel (0=n0, 1=yes)\n"
<< "arg5: time kernel (0=no, 1=yes)\n"
<< "--length: tensor extents (e.g, --length 1024 1024) \n"
<< "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
<< "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
......@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[])
return 0;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_layernorm(argc, argv);
// return 0;
// }
......@@ -3,26 +3,27 @@
#include <cstring>
int profile_gemm(int, char*[]);
int profile_gemm_splitk(int, char*[]);
int profile_gemm_bilinear(int, char*[]);
int profile_gemm_add_add_fastgelu(int, char*[]);
int profile_gemm_reduce(int, char*[]);
int profile_gemm_bias_add_reduce(int, char*[]);
int profile_batched_gemm(int, char*[]);
int profile_batched_gemm_gemm(int, char*[]);
int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
int profile_batched_gemm_reduce(int, char*[]);
int profile_grouped_gemm(int, char*[]);
int profile_conv_fwd(int, char*[]);
int profile_conv_fwd_bias_relu(int, char*[]);
int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_conv_bwd_data(int, char*[]);
int profile_conv_bwd_weight(int, char*[]);
int profile_grouped_conv_fwd(int, char*[]);
int profile_normalization(int, char*[]);
// int profile_gemm(int, char*[]);
// int profile_gemm_splitk(int, char*[]);
// int profile_gemm_bilinear(int, char*[]);
// int profile_gemm_add_add_fastgelu(int, char*[]);
// int profile_gemm_reduce(int, char*[]);
// int profile_gemm_bias_add_reduce(int, char*[]);
// int profile_batched_gemm(int, char*[]);
// int profile_batched_gemm_gemm(int, char*[]);
// int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
// int profile_batched_gemm_reduce(int, char*[]);
// int profile_grouped_gemm(int, char*[]);
// int profile_conv_fwd(int, char*[]);
// int profile_conv_fwd_bias_relu(int, char*[]);
// int profile_conv_fwd_bias_relu_add(int, char*[]);
// int profile_conv_bwd_data(int, char*[]);
// int profile_conv_bwd_weight(int, char*[]);
// int profile_grouped_conv_fwd(int, char*[]);
// int profile_normalization(int, char*[]);
int profile_layernorm(int, char*[]);
int profile_reduce(int, char*[]);
int profile_groupnorm(int, char*[]);
// int profile_reduce(int, char*[]);
static void print_helper_message()
{
......@@ -56,6 +57,7 @@ int main(int argc, char* argv[])
return 0;
}
#if 0
else if(strcmp(argv[1], "gemm") == 0)
{
return profile_gemm(argc, argv);
......@@ -132,6 +134,7 @@ int main(int argc, char* argv[])
{
return profile_normalization(argc, argv);
}
#endif
else if(strcmp(argv[1], "layernorm") == 0)
{
return profile_layernorm(argc, argv);
......
......@@ -7,7 +7,6 @@
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
template <typename Tuple>
class TestGroupnorm : public ::testing::Test
......@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for(auto length : lengths)
{
bool success = ck::profiler::profile_groupnorm_impl<XDataType,
bool success =
ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType>(
true, 2, false, false, length, ElementwiseOpEnum::eSigmoid);
YDataType>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
......
......@@ -7,7 +7,6 @@
using F16 = ck::half_t;
using F32 = float;
using ck::index_t;
using ck::profiler::ElementwiseOpEnum;
template <typename Tuple>
class TestGroupnorm : public ::testing::Test
......@@ -31,12 +30,12 @@ class TestGroupnorm : public ::testing::Test
for(auto length : lengths)
{
bool success = ck::profiler::profile_groupnorm_impl<XDataType,
bool success =
ck::profiler::profile_groupnorm_impl<XDataType,
GammaDataType,
BetaDataType,
AccDataType,
YDataType>(
true, 2, false, false, length, ElementwiseOpEnum::eSigmoid);
YDataType>(true, 2, false, false, length);
EXPECT_TRUE(success);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment