Unverified Commit 53ea4713 authored by Qianfeng, committed by GitHub

Batchnorm-forward and Batchnorm-infer Implemented using generic kernels (#320)

* Implement multiple-reduction in one kernel (kernels, device ops, examples)

* Add generic elementwise kernel and device interface

* Add generator for normal-distributed data initialization

* Add host reference implementation of batchnorm-forward and batchnorm-infer

* Add examples for implementing batchnorm-forward and batchnorm-infer using generic kernels

* Remove unneeded include in batchnorm example

* Renaming generic_elementwise to elementwise in kernel and device classes/functions

* Change in gemm_layernorm examples to use DeviceElementwise instead of Device5AryElementwise

* Change in example 19_binary_elementwise to use DeviceElementwise instead of DeviceBinaryElementwise

* Change in device_cgemm_4gemm_xdl_cshuffle.hpp to use kernel_elementwise instead of kernel_binary_elementwise

* Add DeviceElementwiseBase and use it in device_normalize_instance.cpp

* Removing and renaming files

* Update to synchronize the gemm_layernorm client example with the generic element-wise device op API

* Update to synchronize with the latest headers directory and HostTensorDescriptor interface renaming

* Merge two static member functions in device_elementwise.hpp

* Remove unary_elementwise_1d kernel and device
parent 5ee30459
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <typename Grid1dBufferDescTuple,
index_t NumBuffer,
index_t BlockSize,
typename DataTypePointerTuple,
typename DataTypeTuple>
__global__ void
kernel_multiple_buffer_set_value(const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple,
DataTypePointerTuple p_global_tuple,
DataTypeTuple value_tuple)
{
static_assert(NumBuffer == DataTypePointerTuple::Size() && NumBuffer == DataTypeTuple::Size(),
"The tuple size should be same as NumBuffer!");
static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataTypePointer = remove_cvref_t<decltype(DataTypePointerTuple{}[iB])>;
using DataTypeFromPointer = remove_pointer_t<DataTypePointer>;
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
static_assert(is_same<DataType, DataTypeFromPointer>::value,
"Types in tuples does not match!");
});
constexpr auto I0 = Number<0>{};
const index_t thread_global_id = get_thread_global_1d_id();
auto value_buf_tuple = generate_tuple(
[&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
return StaticBuffer<AddressSpaceEnum::Vgpr, DataType, 1, true>{};
},
Number<NumBuffer>{});
static_for<0, NumBuffer, 1>{}([&](auto iB) {
static_for<0, 1, 1>{}([&](auto J) { value_buf_tuple(iB)(J) = value_tuple[iB]; });
});
auto global_buf_tuple = generate_tuple(
[&](auto iB) {
return make_dynamic_buffer<AddressSpaceEnum::Global>(
p_global_tuple(iB), grid_1d_buffer_desc_tuple[iB].GetElementSpaceSize());
},
Number<NumBuffer>{});
constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
static_for<0, NumBuffer, 1>{}([&](auto iB) {
using DataType = remove_cvref_t<decltype(DataTypeTuple{}[iB])>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
auto threadwise_store =
ThreadwiseTensorSliceTransfer_v1r3<DataType,
DataType,
decltype(val_buff_desc),
decltype(Grid1dBufferDescTuple{}[iB]),
PassThroughOp,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum::Set,
1,
true>(
grid_1d_buffer_desc_tuple[iB], make_multi_index(thread_global_id), PassThroughOp{});
threadwise_store.Run(val_buff_desc,
make_tuple(I0),
value_buf_tuple(iB),
grid_1d_buffer_desc_tuple[iB],
global_buf_tuple(iB));
});
};
} // namespace ck
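
The kernel above fills an arbitrary number of global buffers in a single launch, with the buffer pointers, element types, and fill values carried as tuples and expanded at compile time. As a rough host-side analogue (plain standard C++; set_buffers and fill_one are hypothetical helpers, not part of the CK API), the same tuple-driven expansion can be sketched as below. One plausible use, assumed here rather than shown in this diff, is zero-initializing per-channel workspace buffers before a multi-output reduction.

// Illustrative sketch only -- not part of this commit. A host-side analogue of
// the tuple-driven fill that kernel_multiple_buffer_set_value performs on the
// GPU: each buffer in the tuple is written with its own value, expanded at
// compile time over the parameter pack (mirroring static_for over NumBuffer).
#include <cstdio>
#include <tuple>
#include <utility>
#include <vector>

template <typename T>
void fill_one(T* p, T value, std::size_t n)
{
    for(std::size_t i = 0; i < n; ++i)
        p[i] = value;
}

template <typename PtrTuple, typename ValTuple, std::size_t... Is>
void set_buffers_impl(PtrTuple ptrs, ValTuple vals, std::size_t n, std::index_sequence<Is...>)
{
    // comma-fold: one fill per tuple element
    (fill_one(std::get<Is>(ptrs), std::get<Is>(vals), n), ...);
}

template <typename... Ptrs, typename... Vals>
void set_buffers(std::tuple<Ptrs...> ptrs, std::tuple<Vals...> vals, std::size_t n)
{
    static_assert(sizeof...(Ptrs) == sizeof...(Vals), "The tuple sizes must match!");
    set_buffers_impl(ptrs, vals, n, std::index_sequence_for<Ptrs...>{});
}

int main()
{
    std::vector<float> mean(8), meansquare(8);
    std::vector<double> count(8);
    // Hypothetical usage: zero two float workspaces and one double workspace in one call.
    set_buffers(std::make_tuple(mean.data(), meansquare.data(), count.data()),
                std::make_tuple(0.0f, 0.0f, 0.0),
                8);
    std::printf("%f %f %f\n", mean[0], meansquare[0], count[0]);
    return 0;
}
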
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename GridwiseUEltwise,
typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor>
__global__ void kernel_unary_elementwise_1d(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
GridwiseUEltwise::Run(p_a_global, p_b_global, a_grid_desc_m0, b_grid_desc_m0, functor);
}
template <typename ADataType,
typename BDataType,
typename GridDesc_M0,
typename ElementwiseFunctor,
index_t ScalarPerVector>
struct GridwiseUnaryElementwise_1D
{
static constexpr auto I0 = Number<0>{};
static constexpr auto thread_desc_m0 =
make_naive_tensor_descriptor_packed(make_tuple(Number<ScalarPerVector>{}));
using PassThrough = tensor_operation::element_wise::PassThrough;
static __device__ auto CalculateElementwiseIndex()
{
const index_t global_thread_id = get_thread_global_1d_id();
return make_multi_index(global_thread_id * ScalarPerVector);
}
__host__ __device__ static constexpr bool CheckValidity(const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0)
{
return a_grid_desc_m0.GetLength(I0) == b_grid_desc_m0.GetLength(I0);
}
__host__ __device__ static constexpr index_t CalculateGridSize(const index_t tensor_size)
{
const index_t grid_size = math::integer_divide_ceil(tensor_size, 256 * ScalarPerVector);
return grid_size;
}
__device__ static void Run(const ADataType* __restrict__ p_a_global,
BDataType* __restrict__ p_b_global,
const GridDesc_M0 a_grid_desc_m0,
const GridDesc_M0 b_grid_desc_m0,
const ElementwiseFunctor functor)
{
const auto a_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_a_global, a_grid_desc_m0.GetElementSpaceSize());
auto b_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_b_global, b_grid_desc_m0.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum::Vgpr, ADataType, ScalarPerVector, true> a_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, BDataType, ScalarPerVector, true> b_thread_buf;
const auto thread_store_global_offset = CalculateElementwiseIndex();
auto a_global_load =
ThreadwiseTensorSliceTransfer_v2<ADataType,
ADataType,
GridDesc_M0,
decltype(thread_desc_m0),
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // SrcVectorDim
ScalarPerVector,
1, // SrcScalarStrideInVector
false>{a_grid_desc_m0, thread_store_global_offset};
auto b_global_write =
ThreadwiseTensorSliceTransfer_v1r3<BDataType,
BDataType,
decltype(thread_desc_m0),
GridDesc_M0,
PassThrough,
Sequence<ScalarPerVector>, // SliceLengths
Sequence<0>, // DimAccessOrder
0, // DstVectorDim
ScalarPerVector,
InMemoryDataOperationEnum::Set,
1, // DstScalarStrideInVector
false>{
b_grid_desc_m0, thread_store_global_offset, PassThrough{}};
const index_t blockSize = get_block_size();
const index_t blockPerGrid = get_grid_size();
const auto m0 = b_grid_desc_m0.GetLength(I0);
const index_t loop_step = blockPerGrid * blockSize * ScalarPerVector;
const auto loop_step_index = make_multi_index(loop_step);
index_t num_iter = m0 / (loop_step);
do
{
// read and process ScalarPerVector elements
a_global_load.Run(
a_grid_desc_m0, a_global_buf, thread_desc_m0, make_tuple(I0), a_thread_buf);
static_for<0, ScalarPerVector, 1>{}([&](auto m) {
constexpr auto offset = thread_desc_m0.CalculateOffset(make_tuple(m));
functor(b_thread_buf(Number<offset>{}), a_thread_buf(Number<offset>{}));
});
b_global_write.Run(thread_desc_m0,
make_tuple(I0), // SrcSliceOriginIdx
b_thread_buf,
b_grid_desc_m0,
b_global_buf);
a_global_load.MoveSrcSliceWindow(a_grid_desc_m0, loop_step_index);
b_global_write.MoveDstSliceWindow(b_grid_desc_m0, loop_step_index);
} while(--num_iter);
}
};
} // namespace ck
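
GridwiseUnaryElementwise_1D partitions a 1-D tensor so that each thread owns ScalarPerVector contiguous elements; CalculateGridSize assumes a block size of 256, and the do/while loop in Run lets a grid smaller than that still cover the whole tensor by striding forward blockPerGrid * blockSize * ScalarPerVector elements per iteration. The standalone sketch below (plain C++, illustrative only, assuming the tensor length is an exact multiple of the loop step) replays that indexing on the host and checks that every element is visited exactly once.

// Illustrative sketch only -- not part of this commit. Replays the indexing of
// GridwiseUnaryElementwise_1D on the host: the grid size from CalculateGridSize
// (which assumes a block size of 256), the per-thread start offset from
// CalculateElementwiseIndex, and the grid-wide stride used by the slice-window
// moves, checking that every element is processed exactly once.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
    constexpr int ScalarPerVector = 4;
    constexpr int BlockSize       = 256;
    const int tensor_size         = 2 * 256 * 1024; // assumed multiple of the loop step

    const int full_grid = (tensor_size + BlockSize * ScalarPerVector - 1) /
                          (BlockSize * ScalarPerVector); // CalculateGridSize
    const int grid_size = std::min(full_grid, 128);      // a smaller grid still covers the tensor
                                                         // because of the do/while stride loop
    const int loop_step = grid_size * BlockSize * ScalarPerVector; // blockPerGrid * blockSize * ScalarPerVector

    std::vector<int> visits(tensor_size, 0);
    for(int block = 0; block < grid_size; ++block)
        for(int tid = 0; tid < BlockSize; ++tid)
        {
            // CalculateElementwiseIndex: global thread id times ScalarPerVector
            for(int offset = (block * BlockSize + tid) * ScalarPerVector; offset < tensor_size;
                offset += loop_step)
                for(int v = 0; v < ScalarPerVector; ++v)
                    ++visits[offset + v]; // one functor application per element
        }

    for(int v : visits)
        assert(v == 1); // each element is read, transformed, and written exactly once

    std::printf("grid_size = %d, loop_step = %d\n", grid_size, loop_step);
    return 0;
}
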
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include <thread>
#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormFwd_Input_N_H_W_C_Output_C : public device::DeviceBatchNormFwd<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
InOutDataType* p_y,
double exponentialAverageFactor,
AccDataType* resultRunningMean,
AccDataType* resultRunningVariance,
double epsilon,
AccDataType* resultSaveMean,
AccDataType* resultSaveInvVariance)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
p_y_(p_y),
resultRunningMean_(resultRunningMean),
resultRunningVariance_(resultRunningVariance),
resultSaveMean_(resultSaveMean),
resultSaveInvVariance_(resultSaveInvVariance),
exponentialAverageFactor_(exponentialAverageFactor),
epsilon_(epsilon)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;
if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");
n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];
resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr);
resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr);
}
const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;
InOutDataType* p_y_;
AccDataType* resultRunningMean_;
AccDataType* resultRunningVariance_;
AccDataType* resultSaveMean_;
AccDataType* resultSaveInvVariance_;
bool resultSave, resultRunning;
index_t n, h, w, c;
double exponentialAverageFactor_;
double epsilon_;
};
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
AccDataType reduceSize = type_convert<AccDataType>(arg.n) *
type_convert<AccDataType>(arg.h) *
type_convert<AccDataType>(arg.w);
index_t offset_C = iC;
AccDataType mean = type_convert<AccDataType>(0.0f);
AccDataType meansquare = type_convert<AccDataType>(0.0f);
// compute mean, meansquare, variance, invVariance
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
mean += x;
meansquare += x * x;
};
}
};
mean = mean / reduceSize;
meansquare = meansquare / reduceSize;
AccDataType variance = meansquare - mean * mean;
AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
// save the mean/invVariance if required
if(arg.resultSave)
{
arg.resultSaveMean_[iC] = mean;
arg.resultSaveInvVariance_[iC] = invVariance;
};
// update the moving average if required
if(arg.resultRunning)
{
arg.resultRunningMean_[iC] =
arg.resultRunningMean_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
mean * arg.exponentialAverageFactor_;
arg.resultRunningVariance_[iC] =
arg.resultRunningVariance_[iC] *
type_convert<AccDataType>(1.0 - arg.exponentialAverageFactor_) +
variance * arg.exponentialAverageFactor_;
};
// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];
arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);
auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};
threads[it] = joinable_thread(f);
}
return (0.0f);
};
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;
return (true);
};
std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
void* p_y,
double exponentialAverageFactor,
void* resultRunningMean,
void* resultRunningVariance,
double epsilon,
void* resultSaveMean,
void* resultSaveInvVariance) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
static_cast<InOutDataType*>(p_y),
exponentialAverageFactor,
static_cast<AccDataType*>(resultRunningMean),
static_cast<AccDataType*>(resultRunningVariance),
epsilon,
static_cast<AccDataType*>(resultSaveMean),
static_cast<AccDataType*>(resultSaveInvVariance));
};
std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
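
For each channel, the reference above reduces x and x*x over N*H*W, takes variance = E[x^2] - E[x]^2, computes invVariance = 1/sqrt(epsilon + variance), updates the running statistics with exponentialAverageFactor, and then normalizes as y = bnScale * (x - mean) * invVariance + bnBias. The standalone sketch below (plain C++ on made-up data, illustrative only) condenses that math without the per-channel threading or the CK Argument/Invoker plumbing.

// Illustrative sketch only -- not part of this commit. Condenses the per-channel
// batchnorm-forward math from the reference above (NHWC layout, made-up data),
// omitting the per-channel threading and the CK Argument/Invoker plumbing.
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    const int N = 2, H = 2, W = 2, C = 3; // tiny NHWC tensor
    const float epsilon                  = 1e-5f;
    const float exponentialAverageFactor = 0.1f;

    std::vector<float> x(N * H * W * C), y(x.size());
    std::vector<float> bnScale(C, 1.0f), bnBias(C, 0.0f);
    std::vector<float> runningMean(C, 0.0f), runningVariance(C, 1.0f);
    for(std::size_t i = 0; i < x.size(); ++i)
        x[i] = static_cast<float>(i % 7); // arbitrary input values

    for(int c = 0; c < C; ++c)
    {
        const float reduceSize = static_cast<float>(N * H * W);

        // reduce x and x*x over N, H, W for this channel
        float mean = 0.0f, meansquare = 0.0f;
        for(int n = 0; n < N; ++n)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                {
                    const float v = x[((n * H + h) * W + w) * C + c];
                    mean += v;
                    meansquare += v * v;
                }
        mean /= reduceSize;
        meansquare /= reduceSize;

        const float variance    = meansquare - mean * mean; // E[x^2] - E[x]^2
        const float invVariance = 1.0f / std::sqrt(epsilon + variance);

        // moving-average update, as in the resultRunning branch of the reference
        runningMean[c] =
            runningMean[c] * (1.0f - exponentialAverageFactor) + mean * exponentialAverageFactor;
        runningVariance[c] = runningVariance[c] * (1.0f - exponentialAverageFactor) +
                             variance * exponentialAverageFactor;

        // normalization: y = scale * (x - mean) * invVariance + bias
        for(int n = 0; n < N; ++n)
            for(int h = 0; h < H; ++h)
                for(int w = 0; w < W; ++w)
                {
                    const std::size_t off = ((n * H + h) * W + w) * C + c;
                    y[off] = bnScale[c] * (x[off] - mean) * invVariance + bnBias[c];
                }

        std::printf("channel %d: mean = %f, variance = %f\n", c, mean, variance);
    }
    return 0;
}
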
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include <array>
#include <algorithm>
#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp"
namespace ck {
namespace tensor_operation {
namespace host {
template <typename InOutDataType, typename AccDataType>
struct ReferenceBatchNormInfer_Input_N_H_W_C_Output_C : public device::DeviceBatchNormInfer<4, 3>
{
struct Argument : public device::BaseArgument
{
Argument(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const InOutDataType* p_x,
const AccDataType* bnScale,
const AccDataType* bnBias,
double epsilon,
const AccDataType* estimatedMean,
const AccDataType* estimatedVariance,
InOutDataType* p_y)
: p_x_(p_x),
bnScale_(bnScale),
bnBias_(bnBias),
epsilon_(epsilon),
estimatedMean_(estimatedMean),
estimatedVariance_(estimatedVariance),
p_y_(p_y)
{
(void)xStrides;
(void)yStrides;
(void)bnScaleBiasMeanVarStrides;
if(xyLengths.size() != 4 || bnScaleBiasMeanVarLengths.size() != 1 ||
bnScaleBiasMeanVarLengths[0] != xyLengths[3])
throw std::runtime_error("Invalid tensor dimensions!");
n = xyLengths[0];
h = xyLengths[1];
w = xyLengths[2];
c = xyLengths[3];
}
const InOutDataType* p_x_;
const AccDataType* bnScale_;
const AccDataType* bnBias_;
double epsilon_;
const AccDataType* estimatedMean_;
const AccDataType* estimatedVariance_;
InOutDataType* p_y_;
index_t n, h, w, c;
};
struct Invoker : public device::BaseInvoker
{
float Run(const Argument& arg)
{
auto thread_reduce_func = [&](auto iC) {
index_t offset_C = iC;
AccDataType mean = arg.estimatedMean_[offset_C];
AccDataType variance = arg.estimatedVariance_[offset_C];
AccDataType invVariance =
type_convert<AccDataType>(1.0f) /
std::sqrt(type_convert<AccDataType>(arg.epsilon_) + variance);
// Normalization
for(index_t iN = 0; iN < arg.n; iN++)
{
index_t offset_N = iN * arg.h * arg.w * arg.c;
for(index_t iH = 0; iH < arg.h; iH++)
{
index_t offset_H = iH * arg.w * arg.c;
for(index_t iW = 0; iW < arg.w; iW++)
{
index_t offset_W = iW * arg.c;
auto offset = offset_N + offset_H + offset_W + offset_C;
AccDataType x = type_convert<AccDataType>(arg.p_x_[offset]);
AccDataType norm_x =
arg.bnScale_[iC] * (x - mean) * invVariance + arg.bnBias_[iC];
arg.p_y_[offset] = type_convert<InOutDataType>(norm_x);
};
}
};
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread = (arg.c + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t ic_begin = it * work_per_thread;
std::size_t ic_end = std::min(static_cast<int>((it + 1) * work_per_thread), arg.c);
auto f = [=] {
for(std::size_t ic = ic_begin; ic < ic_end; ++ic)
{
thread_reduce_func(ic);
}
};
threads[it] = joinable_thread(f);
}
return (0.0f);
};
float Run(const device::BaseArgument* p_arg,
const StreamConfig& /*stream_config*/ = StreamConfig{}) override
{
return Run(*dynamic_cast<const Argument*>(p_arg));
};
};
bool IsSupportedArgument(const device::BaseArgument* p_arg) override
{
(void)p_arg;
return (true);
};
std::unique_ptr<device::BaseArgument>
MakeArgumentPointer(const std::array<index_t, 4> xyLengths,
const std::array<index_t, 4> xStrides,
const std::array<index_t, 4> yStrides,
const std::array<index_t, 1> bnScaleBiasMeanVarLengths,
const std::array<index_t, 1> bnScaleBiasMeanVarStrides,
const void* p_x,
const void* bnScale,
const void* bnBias,
double epsilon,
const void* estimatedMean,
const void* estimatedVariance,
void* p_y) override
{
return std::make_unique<Argument>(xyLengths,
xStrides,
yStrides,
bnScaleBiasMeanVarLengths,
bnScaleBiasMeanVarStrides,
static_cast<const InOutDataType*>(p_x),
static_cast<const AccDataType*>(bnScale),
static_cast<const AccDataType*>(bnBias),
epsilon,
static_cast<const AccDataType*>(estimatedMean),
static_cast<const AccDataType*>(estimatedVariance),
static_cast<InOutDataType*>(p_y));
};
std::unique_ptr<device::BaseInvoker> MakeInvokerPointer() override
{
return std::make_unique<Invoker>();
};
std::string GetTypeString() const override
{
auto str = std::stringstream();
// clang-format off
str << "Reference_BatchNorm_Forward_NHWC_C<" << std::endl;
// clang-format on
return str.str();
}
};
} // namespace host
} // namespace tensor_operation
} // namespace ck
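
The inference reference applies the same normalization formula but performs no reduction: the per-channel mean and variance are read from the estimated (running) statistics supplied by the caller. A one-element sketch (plain C++, illustrative only):

// Illustrative sketch only -- not part of this commit. The batchnorm-infer path
// for a single NHWC element and channel: no reduction, the statistics are the
// estimated (running) mean and variance.
#include <cmath>
#include <cstdio>

int main()
{
    const float x = 2.0f, bnScale = 1.5f, bnBias = 0.25f;
    const float estimatedMean = 1.0f, estimatedVariance = 4.0f;
    const float epsilon = 1e-5f;

    const float invVariance = 1.0f / std::sqrt(epsilon + estimatedVariance);
    const float y           = bnScale * (x - estimatedMean) * invVariance + bnBias;

    std::printf("y = %f\n", y); // approximately 1.0 for these values
    return 0;
}
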
@@ -17,9 +17,12 @@ namespace tensor_operation {
namespace device {
namespace instance {
using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr =
ck::tensor_operation::device::DeviceElementwisePtr<5, 1, 2, Normalize>;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr<
Tuple<half_t, float, float, half_t, half_t>,
Tuple<half_t>,
Normalize,
2>;
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceNormalizeFromMeanMeanSquarePtr>& instances);
......
@@ -5,6 +5,7 @@
#include <cmath>
#include <numeric>
#include <random>
#include "ck/ck.hpp"
@@ -126,6 +127,23 @@ struct GeneratorTensor_3<ck::bhalf_t>
}
};
template <typename T>
struct GeneratorTensor_4
{
std::default_random_engine generator;
std::normal_distribution<float> distribution;
GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){};
template <typename... Is>
T operator()(Is...)
{
float tmp = distribution(generator);
return ck::type_convert<T>(tmp);
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
......
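
GeneratorTensor_4, added in the hunk above, draws every element from a normal distribution with the given mean and standard deviation and ignores the indices passed to operator(); presumably this is how the batchnorm examples obtain normal-distributed input data. A standalone analogue without the CK headers (illustrative only):

// Illustrative sketch only -- not part of this commit. A standalone analogue of
// GeneratorTensor_4: every call returns a fresh sample from N(mean, stddev),
// regardless of which element is being initialized, using a fixed seed.
#include <cstdio>
#include <random>
#include <vector>

int main()
{
    std::default_random_engine generator(1);                  // fixed seed, as in GeneratorTensor_4
    std::normal_distribution<float> distribution(0.0f, 1.0f); // mean, stddev

    std::vector<float> data(16);
    for(auto& v : data)
        v = distribution(generator); // index-independent, like operator()(Is...)

    std::printf("first two samples: %f %f\n", data[0], data[1]);
    return 0;
}
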
@@ -6,7 +6,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_5ary_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
@@ -27,19 +27,17 @@ using outputType = F16;
using Normalize = ck::tensor_operation::element_wise::Normalize;
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple<
// clang-format off
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
//###################|in | mean| square_mean| gamma| beta| out| ComputeDataType| functor| NDim| MPerThread| in, mean, square_mean, gamma, beta, out ScalarPerVector|
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 8, 8, 1, 1, 8, 8, 8 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 4, 4, 1, 1, 4, 4, 4 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 2, 2, 1, 1, 2, 2, 2 >,
Device5AryElementwise<F16, F32, F32, F16, F16, F16, F32, Normalize, 2, 1, 1, 1, 1, 1, 1, 1 >
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
DeviceElementwise<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
// clang-format on
>;
void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
std::vector<DeviceElementwisePtr<5, 1, 2, Normalize>>& instances)
std::vector<DeviceElementwiseBasePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
instances)
{
add_device_operation_instances(
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
......