Unverified commit 1b62bfaa authored by rocking5566, committed by GitHub

Fix bug of layernorm ckProfiler and refine code (#448)

* Fix bug of profiler for layernorm

* 1. Rename layernorm into normalization
2. Decouple softmax from normalization

* clang-format
parent a8236c19
@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

 using XDataType     = ck::half_t;
 using GammaDataType = ck::half_t;
@@ -51,14 +51,14 @@ int main(int argc, char* argv[])
     SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
     SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);

-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                    GammaDataType,
                                                                    BetaDataType,
                                                                    AccDataType,
                                                                    YDataType,
                                                                    PassThrough,
                                                                    Rank,
                                                                    NumReduceDim>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
...
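The client-side effect of the rename in the example above comes down to a new header path and a new operation name; the template parameter list is unchanged. Below is a minimal, self-contained sketch (assuming the fp16, Rank = 2, NumReduceDim = 1 configuration used in the examples); it only enumerates the registered instances and omits the argument/invoker plumbing of the full example.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

int main()
{
    using Pass = ck::tensor_operation::element_wise::PassThrough;

    // Formerly DeviceLayernorm; the template arguments are identical after the rename.
    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<ck::half_t, // XDataType
                                                                       ck::half_t, // GammaDataType
                                                                       ck::half_t, // BetaDataType
                                                                       float,      // AccDataType
                                                                       ck::half_t, // YDataType
                                                                       Pass,
                                                                       2,  // Rank
                                                                       1>; // NumReduceDim

    // Pull the pre-built tuning instances registered for this operation type.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "registered DeviceNormalization instances: " << op_ptrs.size() << std::endl;
    return 0;
}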
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"

-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"

 #include "ck/library/utility/check_err.hpp"
@@ -30,26 +30,26 @@ constexpr int Rank = 2;
 constexpr int NumReduceDim = 1;

 using DeviceInstance =
-    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
                                                       GammaDataType,
                                                       BetaDataType,
                                                       AccDataType,
                                                       YDataType,
                                                       PassThrough,
                                                       Rank,
                                                       NumReduceDim,
                                                       256, // BlockSize
                                                       8,   // ClusterM
                                                       32,  // ClusterK
                                                       1,   // SliceM
                                                       8,   // SliceK
                                                       1,   // SrcVecDim (0=M, 1=K)
                                                       8,   // SrcScalarPerVector
                                                       1,   // GammaVecDim (0=M, 1=K)
                                                       8,   // GammaScalarPerVector
                                                       1,   // BetaVecDim (0=M, 1=K)
                                                       8,   // BetaScalarPerVector
                                                       8>;  // OutScalarPerVector

 int main()
 {
...
@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"

-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"

 #include "ck/library/utility/fill.hpp"
@@ -47,26 +47,26 @@ struct YElementOp
 };

 using DeviceInstance =
-    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
                                                       GammaDataType,
                                                       BetaDataType,
                                                       AccDataType,
                                                       YDataType,
                                                       YElementOp,
                                                       Rank,
                                                       NumReduceDim,
                                                       1024, // BlockSize
                                                       1,    // ClusterM
                                                       1024, // ClusterK
                                                       1,    // SliceM
                                                       32,   // SliceK
                                                       1,    // SrcVecDim (0=M, 1=K)
                                                       2,    // SrcScalarPerVector
                                                       1,    // GammaVecDim (0=M, 1=K)
                                                       2,    // GammaScalarPerVector
                                                       1,    // BetaVecDim (0=M, 1=K)
                                                       2,    // BetaScalarPerVector
                                                       2>;   // OutScalarPerVector

 int main(int argc, char* argv[])
 {
...
@@ -11,33 +11,6 @@
 namespace ck {
 namespace tensor_operation {
 namespace device {

-struct DeviceNormalization : public BaseOperator
-{
-    // inLengths: input tensor extent(s) from high to low dimension
-    // inStrides: input tensor stride(s) from high to low dimension
-    // reduceDims: the dimension(s) the normalization operation is applied
-    // alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType
-    // beta: typeless pointer in host memory storing the beta scaling value of type AccDataType
-    // in_dev: typeless const pointer in device memory storing the input tensor
-    // out_dev: typeless pointer in device memory storing the output tensor
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
-                                                              const std::vector<index_t> inStrides,
-                                                              const std::vector<int> reduceDims,
-                                                              const void* alpha,
-                                                              const void* beta,
-                                                              const void* in_dev,
-                                                              void* out_dev) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-
-    virtual index_t GetRank() const = 0;
-
-    virtual index_t GetNumReduceDim() const = 0;
-};
-
-using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization>;
-
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,
@@ -46,7 +19,7 @@ template <typename XDataType,
           typename AccElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceLayernorm : public BaseOperator
+struct DeviceNormalization : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const std::vector<index_t> lengths,
@@ -73,14 +46,14 @@ template <typename XDataType,
           typename AccElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-using DeviceLayernormPtr = std::unique_ptr<DeviceLayernorm<XDataType,
+using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
                                                            GammaDataType,
                                                            BetaDataType,
                                                            AccDataType,
                                                            YDataType,
                                                            AccElementwiseOperation,
                                                            Rank,
                                                            NumReduceDim>>;

 } // namespace device
 } // namespace tensor_operation
...
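After this hunk, the templated interface formerly named DeviceLayernorm owns the DeviceNormalization name, the old non-templated DeviceNormalization (with its alpha/beta scaling arguments) is gone from this header, and the pointer alias becomes a template alias as well. A brief sketch of the renamed alias spelled out for the fp16, rank-2 case (MyNormPtr is an illustrative name, not part of the commit):

// Illustrative instantiation of the renamed alias; the argument order follows the declaration above.
using MyNormPtr =
    ck::tensor_operation::device::DeviceNormalizationPtr<ck::half_t, // XDataType
                                                         ck::half_t, // GammaDataType
                                                         ck::half_t, // BetaDataType
                                                         float,      // AccDataType
                                                         ck::half_t, // YDataType
                                                         ck::tensor_operation::element_wise::PassThrough,
                                                         2,  // Rank
                                                         1>; // NumReduceDim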
@@ -75,14 +75,14 @@ template <typename XDataType,
           index_t BetaSrcVectorDim,
           index_t BetaSrcVectorSize,
           index_t YDstVectorSize>
-struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
+struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
                                                     GammaDataType,
                                                     BetaDataType,
                                                     AccDataType,
                                                     YDataType,
                                                     AccElementwiseOperation,
                                                     Rank,
                                                     NumReduceDim>
 {
     static_assert(
         ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) ||
@@ -452,7 +452,7 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
         auto str = std::stringstream();
         // clang-format off
-        str << "DeviceLayernormImpl<" << BlockSize << ",";
+        str << "DeviceNormalizationImpl<" << BlockSize << ",";
         str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
         str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
         str << "XYSrcVectorDim_" << XYSrcVectorDim << ",";
...
@@ -18,24 +18,24 @@ namespace device {
 namespace instance {

 // FP16
-void add_device_layernorm_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
+void add_device_normalization_rank_2_1_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);

-void add_device_layernorm_rank_4_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
+void add_device_normalization_rank_4_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);

-void add_device_layernorm_rank_5_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
+void add_device_normalization_rank_5_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);

 // FP32
-void add_device_layernorm_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
+void add_device_normalization_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);

-void add_device_layernorm_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
+void add_device_normalization_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);

-void add_device_layernorm_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
+void add_device_normalization_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);

 template <typename XDataType,
           typename GammaDataType,
@@ -43,24 +43,24 @@ template <typename XDataType,
           typename YDataType,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                  GammaDataType,
-                                                  BetaDataType,
-                                                  F32,
-                                                  YDataType,
-                                                  ck::tensor_operation::element_wise::PassThrough,
-                                                  Rank,
-                                                  NumReduceDim>>
+struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceNormalization<
+    XDataType,
+    GammaDataType,
+    BetaDataType,
+    F32,
+    YDataType,
+    ck::tensor_operation::element_wise::PassThrough,
+    Rank,
+    NumReduceDim>>
 {
-    using DeviceOp = DeviceLayernorm<XDataType,
+    using DeviceOp = DeviceNormalization<XDataType,
                                      GammaDataType,
                                      BetaDataType,
                                      F32,
                                      YDataType,
                                      ck::tensor_operation::element_wise::PassThrough,
                                      Rank,
                                      NumReduceDim>;

     static auto GetInstances()
     {
@@ -71,15 +71,15 @@ struct DeviceOperationInstanceFactory<
        {
            if constexpr(Rank == 2 && NumReduceDim == 1)
            {
-                add_device_layernorm_rank_2_1_f16_instances(op_ptrs);
+                add_device_normalization_rank_2_1_f16_instances(op_ptrs);
            }
            else if constexpr(Rank == 4 && NumReduceDim == 3)
            {
-                add_device_layernorm_rank_4_3_f16_instances(op_ptrs);
+                add_device_normalization_rank_4_3_f16_instances(op_ptrs);
            }
            else if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_layernorm_rank_5_3_f16_instances(op_ptrs);
+                add_device_normalization_rank_5_3_f16_instances(op_ptrs);
            }
        }
        else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
@@ -87,15 +87,15 @@ struct DeviceOperationInstanceFactory<
        {
            if constexpr(Rank == 2 && NumReduceDim == 1)
            {
-                add_device_layernorm_rank_2_1_f32_instances(op_ptrs);
+                add_device_normalization_rank_2_1_f32_instances(op_ptrs);
            }
            else if constexpr(Rank == 4 && NumReduceDim == 3)
            {
-                add_device_layernorm_rank_4_3_f32_instances(op_ptrs);
+                add_device_normalization_rank_4_3_f32_instances(op_ptrs);
            }
            else if constexpr(Rank == 5 && NumReduceDim == 3)
            {
-                add_device_layernorm_rank_5_3_f32_instances(op_ptrs);
+                add_device_normalization_rank_5_3_f32_instances(op_ptrs);
            }
        }
...
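Note that GetInstances() only fills the vector for the combinations registered above, i.e. (Rank, NumReduceDim) of (2, 1), (4, 3) or (5, 3) with all-fp16 or all-fp32 tensor types; any other combination yields an empty vector. A small, assumed guard at the call site (continuing the enumeration sketch shown after the first hunk; not part of this commit):

// Assumed call-site check; op_ptrs comes from DeviceOperationInstanceFactory<DeviceOp>::GetInstances().
if(op_ptrs.empty())
{
    std::cerr << "no DeviceNormalization instance registered for this Rank/NumReduceDim/data type"
              << std::endl;
    return 1; // assumes this runs inside main(); needs <iostream>
}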
@@ -17,7 +17,6 @@ IF(IS_DIRECTORY "${subdir_path}")
     ENDIF()
 ENDFOREACH()

 add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
 add_library(composablekernels::device_operations ALIAS device_operations)
...
 add_instance_library(device_normalization_instance
-    device_layernorm_f16_instance.cpp
-    device_layernorm_f32_instance.cpp
-    device_softmax_f32_f32_instance.cpp
-    device_softmax_f16_f16_instance.cpp
+    device_normalization_f16_instance.cpp
+    device_normalization_f32_instance.cpp
 )
old file (removed): device_layernorm_f16_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f16_instances = std::tuple<
// clang-format off
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2>
// clang-format on
>;
void add_device_layernorm_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{});
}
void add_device_layernorm_rank_4_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{});
}
void add_device_layernorm_rank_5_3_f16_instances(
std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 5, 3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
new file: device_normalization_f16_instance.cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using F16 = ck::half_t;
using F32 = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;
template <typename OutElementwise, index_t Rank, index_t Reduce>
// clang-format off
using device_normalization_f16_instances =
std::tuple <
// XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2>
>;
// clang-format on
void add_device_normalization_rank_2_1_f16_instances(
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
instances)
{
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
}
void add_device_normalization_rank_4_3_f16_instances(
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
instances)
{
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
}
void add_device_normalization_rank_5_3_f16_instances(
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
instances)
{
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
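Each row of device_normalization_f16_instances is one pre-built tuning configuration of DeviceNormalizationImpl, so registering another configuration is a single extra line in the tuple. A hypothetical additional entry (illustration only, not added by this commit) would follow the parameter order documented in the comment inside the tuple:

// Hypothetical extra configuration: 256 threads as 4x64 clusters, 1x16 thread slices,
// K-dimension vector width 8 for X/gamma/beta/Y (not part of this commit).
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 16, 1, 8, 1, 8, 1, 8, 8>,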
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
@@ -20,33 +20,36 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_layernorm_f32_instances = std::tuple<
     // clang-format off
     // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>
     // clang-format on
     >;

-void add_device_layernorm_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
+void add_device_normalization_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
+        instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
 }

-void add_device_layernorm_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
+void add_device_normalization_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
+        instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
 }

-void add_device_layernorm_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
+void add_device_normalization_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
+        instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
 }
...
add_instance_library(device_softmax_instance
device_softmax_f16_f16_instance.cpp
device_softmax_f32_f32_instance.cpp
)
@@ -25,7 +25,7 @@ set(PROFILER_SOURCE
     src/profile_reduce.cpp
     src/profile_groupnorm.cpp
     src/profile_layernorm.cpp
-    src/profile_normalization.cpp
+    src/profile_softmax.cpp
 )

 add_executable(ckProfiler ${PROFILER_SOURCE})
@@ -55,4 +55,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
+target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"

-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -75,14 +75,14 @@ bool profile_groupnorm_impl(int do_verification,
     beta_dev.ToDevice(beta.mData.data());

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                    GammaDataType,
                                                                    BetaDataType,
                                                                    AccDataType,
                                                                    YDataType,
                                                                    PassThrough,
                                                                    5,
                                                                    3>;

     // get device op instances
     const auto instance_ptrs =
...
@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"

-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
@@ -28,27 +28,29 @@ void profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length,
-                            std::vector<index_t> strideXY,
-                            std::vector<index_t> strideGamma,
-                            std::vector<index_t> strideBeta)
+                            std::vector<index_t> length)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;

     if(length.size() < 2)
         return;

-    // Assume normalize dimension except for first dimension
+    // Assume normalize dimension except for batch (first) dimension
     std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
     std::vector<index_t> reduce_dim;
     for(int i = 1; i < Rank; ++i)
         reduce_dim.push_back(i);

     Tensor<XDataType> x(length);
-    Tensor<GammaDataType> gamma(reduce_length, strideGamma);
-    Tensor<BetaDataType> beta(reduce_length, strideBeta);
-    Tensor<YDataType> y(length, strideXY);
-    Tensor<YDataType> host_y(length, strideXY);
+    Tensor<GammaDataType> gamma(reduce_length);
+    Tensor<BetaDataType> beta(reduce_length);
+    Tensor<YDataType> y(length);
+    Tensor<YDataType> host_y(length);
+
+    std::vector<index_t> strideXY =
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<index_t> strideGammaBeta = strideXY;
+    strideGammaBeta[0]                   = 0;

     switch(init_method)
     {
@@ -84,14 +86,14 @@ void profile_layernorm_impl(int do_verification,
     constexpr int NumReduceDim = Rank - 1;

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
                                                                    GammaDataType,
                                                                    BetaDataType,
                                                                    AccDataType,
                                                                    YDataType,
                                                                    PassThrough,
                                                                    Rank,
                                                                    NumReduceDim>;

     // get device op instances
     const auto instance_ptrs =
@@ -126,8 +128,8 @@ void profile_layernorm_impl(int do_verification,
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                           strideXY,
-                                                          strideGamma,
-                                                          strideBeta,
+                                                          strideGammaBeta,
+                                                          strideGammaBeta,
                                                           strideXY,
                                                           reduce_dim,
                                                           1e-4,
...
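In place of the removed strideXY/strideGamma/strideBeta parameters, profile_layernorm_impl now takes the X/Y strides from the packed tensor descriptor and reuses them for gamma and beta with the leading stride forced to zero, which broadcasts the rank-(Rank-1) gamma/beta across the non-reduced batch dimension. A minimal sketch of what that derivation produces for a 2D case (M and N are illustrative sizes, not taken from the diff):

// Sketch only: the derived strides for a packed, row-major {M, N} input.
std::vector<ck::index_t> length{M, N};
std::vector<ck::index_t> strideXY{N, 1};            // matches x.mDesc.GetStrides() for a packed tensor
std::vector<ck::index_t> strideGammaBeta = strideXY;
strideGammaBeta[0] = 0;                             // zero batch stride broadcasts gamma/beta over rows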
@@ -69,16 +69,16 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on

 template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-void profile_normalization_impl(int do_verification,
-                                int init_method,
-                                bool do_log,
-                                bool time_kernel,
-                                std::vector<index_t> in_length,
-                                std::vector<index_t> in_strides,
-                                std::vector<index_t> reduce_dims,
-                                AccDataType alpha,
-                                AccDataType beta,
-                                NormType norm_type)
+void profile_softmax_impl(int do_verification,
+                          int init_method,
+                          bool do_log,
+                          bool time_kernel,
+                          std::vector<index_t> in_length,
+                          std::vector<index_t> in_strides,
+                          std::vector<index_t> reduce_dims,
+                          AccDataType alpha,
+                          AccDataType beta,
+                          NormType norm_type)
 {
     if(Rank != in_length.size())
     {
...
@@ -12,8 +12,7 @@ using ck::index_t;

 struct LayernormArgParser
 {
-    std::unordered_map<std::string, std::vector<int>> long_opts = {
-        {"length", {}}, {"strideXY", {}}, {"strideGamma", {}}, {"strideBeta", {}}};
+    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};

     bool parse_opt(int argc, char* argv[], const std::string& key, int i)
     {
@@ -52,9 +51,6 @@ void print_help_layernorm()
              << "arg4: print tensor value (0: no; 1: yes)\n"
              << "arg5: time kernel (0=no, 1=yes)\n"
              << "--length: tensor extents (e.g, --length 1024 1024) \n"
-             << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
-             << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"
-             << "--strideBeta: tensor strides (e.g, --strideBeta 1)\n"
              << std::endl;
 }
@@ -77,10 +73,7 @@ int profile_layernorm(int argc, char* argv[])
     // parse the long options
     arg_parser(argc, argv);
     const std::vector<index_t> length = arg_parser.long_opts["length"];
-    const std::vector<index_t> strideXY = arg_parser.long_opts["strideXY"];
-    const std::vector<index_t> strideGamma = arg_parser.long_opts["strideGamma"];
-    const std::vector<index_t> strideBeta = arg_parser.long_opts["strideBeta"];

     using F16 = ck::half_t;
     using F32 = float;
@@ -88,25 +81,13 @@ int profile_layernorm(int argc, char* argv[])
     if(data_type == ck::DataTypeEnum::Half)
     {
-        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(do_verification,
-                                                                            init_method,
-                                                                            do_log,
-                                                                            time_kernel,
-                                                                            length,
-                                                                            strideXY,
-                                                                            strideGamma,
-                                                                            strideBeta);
+        ck::profiler::profile_layernorm_impl<F16, F16, F16, F32, F16, rank>(
+            do_verification, init_method, do_log, time_kernel, length);
     }
     else if(data_type == ck::DataTypeEnum::Float)
     {
-        ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(do_verification,
-                                                                            init_method,
-                                                                            do_log,
-                                                                            time_kernel,
-                                                                            length,
-                                                                            strideXY,
-                                                                            strideGamma,
-                                                                            strideBeta);
+        ck::profiler::profile_layernorm_impl<F32, F32, F32, F32, F32, rank>(
+            do_verification, init_method, do_log, time_kernel, length);
     }
     else
     {
...
@@ -5,7 +5,7 @@
 #include <vector>
 #include <unordered_map>

-#include "profiler/include/profile_normalization_impl.hpp"
+#include "profiler/include/profile_softmax_impl.hpp"

 using ck::index_t;
 using ck::profiler::NormDataType;
@@ -95,30 +95,29 @@ int profile_normalization(int argc, char* argv[])
     {
         if(data_type == NormDataType::F16_F16)
         {
-            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 3>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                length,
-                stride,
-                reduce,
-                float(alpha),
-                float(beta),
-                norm_type);
+            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
+                                                                                 init_method,
+                                                                                 do_log,
+                                                                                 time_kernel,
+                                                                                 length,
+                                                                                 stride,
+                                                                                 reduce,
+                                                                                 float(alpha),
+                                                                                 float(beta),
+                                                                                 norm_type);
        }
        else if(data_type == NormDataType::F32_F32)
        {
-            ck::profiler::profile_normalization_impl<float, float, float, 3>(do_verification,
+            ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
                                                                        init_method,
                                                                        do_log,
                                                                        time_kernel,
                                                                        length,
                                                                        stride,
                                                                        reduce,
                                                                        float(alpha),
                                                                        float(beta),
                                                                        norm_type);
        }
        else
        {
@@ -129,30 +128,29 @@ int profile_normalization(int argc, char* argv[])
     {
         if(data_type == NormDataType::F16_F16)
         {
-            ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t, 4>(
-                do_verification,
-                init_method,
-                do_log,
-                time_kernel,
-                length,
-                stride,
-                reduce,
-                float(alpha),
-                float(beta),
-                norm_type);
+            ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
+                                                                                 init_method,
+                                                                                 do_log,
+                                                                                 time_kernel,
+                                                                                 length,
+                                                                                 stride,
+                                                                                 reduce,
+                                                                                 float(alpha),
+                                                                                 float(beta),
+                                                                                 norm_type);
        }
        else if(data_type == NormDataType::F32_F32)
        {
-            ck::profiler::profile_normalization_impl<float, float, float, 4>(do_verification,
+            ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
                                                                        init_method,
                                                                        do_log,
                                                                        time_kernel,
                                                                        length,
                                                                        stride,
                                                                        reduce,
                                                                        float(alpha),
                                                                        float(beta),
                                                                        norm_type);
        }
        else
        {
...