Commit 56532f77 authored by rocking

Add second kernel of normalization splitK

parent 28ebcfe7
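For orientation: kernel 1 writes one partial (mean, variance, count) triple per (M row, K block) into workspace, and kernel 2 first folds those partials together with a Welford/Chan-style merge (ThreadwiseWelfordMerge followed by BlockwiseWelford) and then normalizes x with the merged statistics. Below is a minimal standalone sketch of that merge on (count, mean, M2) triples; the identifiers and plain double types are illustrative only, not the CK API or its buffer layout.

// Standalone illustration of the parallel Welford (Chan et al.) merge that
// split-K normalization relies on; names here are hypothetical, not CK's.
struct Partial
{
    long long count; // number of elements covered by this partial result
    double mean;     // running mean of those elements
    double m2;       // sum of squared deviations from the mean
};

inline Partial merge(const Partial& a, const Partial& b)
{
    // empty partials merge trivially
    if(a.count == 0)
        return b;
    if(b.count == 0)
        return a;

    const long long count = a.count + b.count;
    const double delta    = b.mean - a.mean;
    const double mean     = a.mean + delta * b.count / count;
    const double m2       = a.m2 + b.m2 + delta * delta * a.count * b.count / count;
    return {count, mean, m2};
}

// biased variance = m2 / count; normalization then uses (x - mean) / sqrt(var + eps)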
...@@ -12,12 +12,13 @@
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp"
#include "ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace ck {
template <typename GridwiseWelford,
typename XDataType,
typename MeanVarDataType,
typename ComputeDataType,
...@@ -32,7 +33,7 @@ kernel_normalizationSplitK1st(const XGridDesc_M_K x_grid_desc_m_k,
MeanVarDataType* const __restrict__ p_welford_variance,
int32_t* const __restrict__ p_welford_count)
{
GridwiseWelford::Run(x_grid_desc_m_k,
mean_var_grid_desc_m_kblock,
num_k_block_tile_iteration,
p_x_global,
...@@ -40,6 +41,57 @@ kernel_normalizationSplitK1st(const XGridDesc_M_K x_grid_desc_m_k,
p_welford_variance,
p_welford_count);
};
template <typename GridwiseWelfordNormalization,
typename MeanVarDataType,
typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename YDataType,
typename ComputeDataType,
typename YElementwiseOperation,
typename MeanVarGridDesc_M_KBlock,
typename CountVarGridDesc_M_KBlock,
typename XYGammaBetaGridDesc_M_K>
__global__ void
kernel_normalizationSplitK2nd(const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock,
const CountVarGridDesc_M_KBlock count_grid_desc_m_kblock,
const XYGammaBetaGridDesc_M_K x_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K y_grid_desc_m_k,
index_t num_k_mean_var_count_iteration,
index_t num_k_block_tile_iteration,
index_t k_grid_size,
ComputeDataType epsilon,
const MeanVarDataType* const p_mean_global,
const MeanVarDataType* const p_variance_global,
const int32_t* const p_welford_count_global,
const XDataType* const __restrict__ p_x_global,
const GammaDataType* const __restrict__ p_gamma_global,
const BetaDataType* const __restrict__ p_beta_global,
YDataType* const __restrict__ p_y_global,
const YElementwiseOperation y_elementwise_op)
{
GridwiseWelfordNormalization::Run(mean_var_grid_desc_m_kblock,
count_grid_desc_m_kblock,
x_grid_desc_m_k,
gamma_grid_desc_m_k,
beta_grid_desc_m_k,
y_grid_desc_m_k,
num_k_mean_var_count_iteration,
num_k_block_tile_iteration,
k_grid_size,
epsilon,
p_mean_global,
p_variance_global,
p_welford_count_global,
p_x_global,
p_gamma_global,
p_beta_global,
p_y_global,
y_elementwise_op);
};
} // namespace ck
namespace ck {
...@@ -64,7 +116,7 @@ template <typename XDataType,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t XYVectorDim,
index_t XSrcVectorSize,
index_t GammaSrcVectorDim,
index_t GammaSrcVectorSize,
...@@ -184,22 +236,53 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
}
using SrcGridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
using Kernel1MeanVarGridDesc_M_KBlock =
decltype(MakeMeanVarDescriptor_M_K<Sequence<true, false>, 1, 1>(1, 1));
using Kernel2MeanVarGridDesc_M_KBlock =
decltype(MakeMeanVarDescriptor_M_K<Sequence<true, true>, 1, 1>(1, 1));
using Kernel2CountGridDesc_M_KBlock =
decltype(MakeCountDescriptor_M_K<Sequence<true, true>, 1, 1>(1, 1));
using GridwiseWelford = GridwiseNormalizationSplitK1st<XDataType,
ComputeDataType,
MeanVarDataType,
SrcGridDesc_M_K,
Kernel1MeanVarGridDesc_M_KBlock,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYVectorDim,
XSrcVectorSize>;
using GridwiseWelfordNormalization =
GridwiseNormalizationSplitK2nd<MeanVarDataType,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
ComputeDataType,
YElementwiseOperation,
Kernel2MeanVarGridDesc_M_KBlock,
Kernel2CountGridDesc_M_KBlock,
SrcGridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYVectorDim,
XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize,
XYVectorDim,
YDstVectorSize>;
struct Argument : public BaseArgument
{
Argument(const std::vector<index_t> lengths,
...@@ -236,19 +319,19 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
numBlockTileIteration_ = 1;
while(true)
{
int testKGridSize =
math::integer_divide_ceil(KRaw_, K_BlockTileSize * numBlockTileIteration_);
// we want kGridSize_ to be no more than 128
if(testKGridSize <= 128)
break;
++numBlockTileIteration_;
};
kGridSize_ = math::integer_divide_ceil(KRaw_, K_BlockTileSize * numBlockTileIteration_);
gridSize_ = math::integer_divide_ceil(MRaw_, M_BlockTileSize) * kGridSize_;
numMeanVarCountIteration_ = math::integer_divide_ceil(kGridSize_, KThreadClusterSize);
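// Illustrative arithmetic only (the numbers are hypothetical, not from this change):
// with KRaw_ = 65536 and K_BlockTileSize = 256, the loop above stops at
// numBlockTileIteration_ = 2, giving kGridSize_ = ceil(65536 / 512) = 128 partial
// results per row; with KThreadClusterSize = 8, the second kernel then needs
// numMeanVarCountIteration_ = ceil(128 / 8) = 16 merge iterations.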
x_grid_desc_m_k_ =
MakeSrc2dDescriptor(Lengths_, xStrides_, kGridSize_, numBlockTileIteration_);
...@@ -260,9 +343,17 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
MakeSrc2dDescriptor(Lengths_, yStrides_, kGridSize_, numBlockTileIteration_);
// We don't need to pad in K dimension for Welford1. Set KPerTile 1.
kernel1_mean_var_grid_desc_m_kblock_ =
MakeMeanVarDescriptor_M_K<Sequence<true, false>, M_BlockTileSize, 1>(MRaw_,
kGridSize_);
kernel2_mean_var_grid_desc_m_kblock_ =
MakeMeanVarDescriptor_M_K<Sequence<true, true>, M_BlockTileSize, K_BlockTileSize>(
MRaw_, kGridSize_);
kernel2_count_grid_desc_m_kblock_ =
MakeCountDescriptor_M_K<Sequence<true, true>, M_BlockTileSize, K_BlockTileSize>(
MRaw_, kGridSize_);
}
ComputeDataType epsilon_;
...@@ -284,6 +375,7 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
YElementwiseOperation y_elementwise_op_;
int kGridSize_;
int numMeanVarCountIteration_;
int numBlockTileIteration_;
size_t gridSize_;
...@@ -292,7 +384,9 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
SrcGridDesc_M_K beta_grid_desc_m_k_;
SrcGridDesc_M_K y_grid_desc_m_k_;
Kernel1MeanVarGridDesc_M_KBlock kernel1_mean_var_grid_desc_m_kblock_;
Kernel2MeanVarGridDesc_M_KBlock kernel2_mean_var_grid_desc_m_kblock_;
Kernel2CountGridDesc_M_KBlock kernel2_count_grid_desc_m_kblock_;
index_t MRaw_; // invariant length
index_t KRaw_; // reduce length
...@@ -306,12 +400,24 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
arg.p_workspace_count_ == nullptr)
throw std::runtime_error("wrong! WorkSpace pointer has not been set");
auto kernel1 = kernel_normalizationSplitK1st<GridwiseWelford,
XDataType,
MeanVarDataType,
ComputeDataType,
SrcGridDesc_M_K,
Kernel1MeanVarGridDesc_M_KBlock>;
auto kernel2 = kernel_normalizationSplitK2nd<GridwiseWelfordNormalization,
MeanVarDataType,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
ComputeDataType,
YElementwiseOperation,
Kernel2MeanVarGridDesc_M_KBlock,
Kernel2CountGridDesc_M_KBlock,
SrcGridDesc_M_K>;
float avg_time = 0;
avg_time += launch_and_time_kernel(stream_config,
...@@ -320,16 +426,38 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
dim3(BlockSize),
0,
arg.x_grid_desc_m_k_,
arg.kernel1_mean_var_grid_desc_m_kblock_,
arg.numBlockTileIteration_,
arg.p_x_,
static_cast<MeanVarDataType*>(arg.p_workspace_mean_),
static_cast<MeanVarDataType*>(arg.p_workspace_var_),
static_cast<int32_t*>(arg.p_workspace_count_));
avg_time += launch_and_time_kernel(stream_config,
kernel2,
dim3(arg.gridSize_),
dim3(BlockSize),
0,
arg.kernel2_mean_var_grid_desc_m_kblock_,
arg.kernel2_count_grid_desc_m_kblock_,
arg.x_grid_desc_m_k_,
arg.gamma_grid_desc_m_k_,
arg.beta_grid_desc_m_k_,
arg.y_grid_desc_m_k_,
arg.numMeanVarCountIteration_,
arg.numBlockTileIteration_,
arg.kGridSize_,
arg.epsilon_,
static_cast<MeanVarDataType*>(arg.p_workspace_mean_),
static_cast<MeanVarDataType*>(arg.p_workspace_var_),
static_cast<int32_t*>(arg.p_workspace_count_),
arg.p_x_,
arg.p_gamma_,
arg.p_beta_,
arg.p_y_,
arg.y_elementwise_op_);
return avg_time;
};
float Run(const BaseArgument* p_arg,
...@@ -390,7 +518,7 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
constexpr index_t NumInvariantDim = Rank - NumReduceDim;
if constexpr(XYVectorDim == 0)
{
if constexpr(NumInvariantDim == 0)
{
...@@ -423,38 +551,41 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
if constexpr(GammaSrcVectorDim == 0)
{
if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1)
return false;
if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
return false;
}
else // if fastest dim is reduced
{
if(p_arg_->gammaStrides_[Rank - 1] != 1)
return false;
if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
return false;
}
// if fastest dim is not reduced
if constexpr(BetaSrcVectorDim == 0)
{
if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1)
return false;
if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0)
return false;
}
else // if fastest dim is reduced
{
if(p_arg_->betaStrides_[Rank - 1] != 1)
return false;
if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0)
return false;
}
if(p_arg_->kGridSize_ <= 1)
return false;
return true;
};
...@@ -507,7 +638,7 @@ struct DeviceNormalizationSplitKImpl : public DeviceNormalization<XDataType,
str << "DeviceNormalizationImpl<" << BlockSize << ",";
str << "Cluster_MK_" << MThreadClusterSize << "_" << KThreadClusterSize << ",";
str << "Slice_MK_" << MThreadSliceSize << "_" << KThreadSliceSize << ",";
str << "XYSrcVectorDim_" << XYVectorDim << ",";
str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">";
// clang-format on
...

ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_1st.hpp
...@@ -203,7 +203,7 @@ struct GridwiseNormalizationSplitK1st
var_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
});
for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
{
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_x_load.Run(x_grid_desc_m_k,
...

ck/tensor_operation/gpu/grid/normalization/gridwise_normalization_splitk_2nd.hpp (new file)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {
template <typename MeanVarDataType,
typename XDataType,
typename GammaDataType,
typename BetaDataType,
typename YDataType,
typename ComputeDataType,
typename YElementwiseOperation,
typename MeanVarGridDesc_M_KBlock,
typename CountVarGridDesc_M_KBlock,
typename XYGammaBetaGridDesc_M_K,
index_t BlockSize,
index_t MThreadClusterSize,
index_t KThreadClusterSize,
index_t MThreadSliceSize,
index_t KThreadSliceSize,
index_t XSrcVectorDim,
index_t XSrcVectorSize,
index_t GammaSrcVectorDim,
index_t GammaSrcVectorSize,
index_t BetaSrcVectorDim,
index_t BetaSrcVectorSize,
index_t YDstVectorDim,
index_t YDstVectorSize>
struct GridwiseNormalizationSplitK2nd
{
static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
(YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
static_assert(XSrcVectorSize == YDstVectorSize);
static_assert(XSrcVectorSize == GammaSrcVectorSize);
static_assert(XSrcVectorSize == BetaSrcVectorSize);
static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);
static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{};
static constexpr auto I2 = Number<2>{};
using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;
using ThreadBufferDimAccessOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
using ThreadClusterArrangeOrder =
typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;
static constexpr auto thread_cluster_desc =
make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
using ThreadBufferLengths_M_K = Sequence<MThreadSliceSize, XSrcVectorSize>;
static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
static constexpr auto thread_buffer_desc_m_1 =
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
using ThreadWelfordSrcDesc_M_1 = decltype(thread_buffer_desc_m_1);
using ThreadWelfordDstDesc_M =
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));
using ThreadwiseWelford =
ThreadwiseWelfordMerge<ComputeDataType, ThreadWelfordSrcDesc_M_1, ThreadWelfordDstDesc_M>;
using BlockwiseWelford = BlockwiseWelford<ComputeDataType,
BlockSize,
ThreadClusterLengths_M_K,
ThreadClusterArrangeOrder>;
using PassThroughOp = tensor_operation::element_wise::PassThrough;
static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;
static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;
static constexpr auto ThreadBufferNumber = Number<KThreadSliceSize / XSrcVectorSize>{};
__device__ static void Run(const MeanVarGridDesc_M_KBlock& mean_var_grid_desc_m_kblock,
const CountVarGridDesc_M_KBlock& count_grid_desc_m_kblock,
const XYGammaBetaGridDesc_M_K& x_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K& gamma_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K& beta_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K& y_grid_desc_m_k,
index_t num_k_mean_var_count_iteration,
index_t num_k_block_tile_iteration,
index_t k_grid_size,
ComputeDataType epsilon,
const MeanVarDataType* const p_mean_global,
const MeanVarDataType* const p_variance_global,
const int32_t* const p_welford_count_global,
const XDataType* const __restrict__ p_x_global,
const GammaDataType* const __restrict__ p_gamma_global,
const BetaDataType* const __restrict__ p_beta_global,
YDataType* const __restrict__ p_y_global,
const YElementwiseOperation y_elementwise_op)
{
// Thread/Block id
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t block_m_cluster_id = block_global_id / k_grid_size;
const index_t block_k_cluster_id = block_global_id % k_grid_size;
const auto thread_cluster_idx =
thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));
const auto thread_m_cluster_id = thread_cluster_idx[I0];
const auto thread_k_cluster_id = thread_cluster_idx[I1];
// Global Memory
const auto mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_mean_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
const auto var_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_variance_global, mean_var_grid_desc_m_kblock.GetElementSpaceSize());
const auto welford_count_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_welford_count_global, count_grid_desc_m_kblock.GetElementSpaceSize());
const auto x_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_x_global, x_grid_desc_m_k.GetElementSpaceSize());
const auto gamma_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize());
const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize());
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
// VGPR
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
in_mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
in_var_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
in_welford_count_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
mean_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
var_thread_buf;
StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
welford_count_thread_buf;
auto x_thread_buf = generate_tuple(
[&](auto) {
return StaticBuffer<AddressSpaceEnum::Vgpr,
ComputeDataType,
MThreadSliceSize * XSrcVectorSize,
true>{};
},
Number<ThreadBufferNumber>{});
auto gamma_thread_buf = generate_tuple(
[&](auto) {
return StaticBuffer<AddressSpaceEnum::Vgpr,
ComputeDataType,
MThreadSliceSize * GammaSrcVectorSize,
true>{};
},
Number<ThreadBufferNumber>{});
auto& beta_thread_buf = gamma_thread_buf;
auto& y_thread_buf = x_thread_buf;
// IO
auto threadwise_mean_var_load_m_kblock =
ThreadwiseTensorSliceTransfer_v2<MeanVarDataType,
ComputeDataType,
MeanVarGridDesc_M_KBlock,
decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1,
Sequence<0, 1>,
1,
1,
1,
true>(
mean_var_grid_desc_m_kblock,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id));
auto threadwise_count_load_m_kblock =
ThreadwiseTensorSliceTransfer_v2<int32_t,
int32_t,
CountVarGridDesc_M_KBlock,
decltype(thread_buffer_desc_m_1),
ThreadBufferLengths_M_1,
Sequence<0, 1>,
1,
1,
1,
true>(
count_grid_desc_m_kblock,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
thread_k_cluster_id));
auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
ComputeDataType,
XYGammaBetaGridDesc_M_K,
decltype(thread_buffer_desc_m_k),
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
XSrcVectorDim,
XSrcVectorSize,
1,
true>(
x_grid_desc_m_k,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
thread_k_cluster_id * XSrcVectorSize));
auto threadwise_gamma_load =
ThreadwiseTensorSliceTransfer_v2<GammaDataType,
ComputeDataType,
XYGammaBetaGridDesc_M_K,
decltype(thread_buffer_desc_m_k),
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
GammaSrcVectorDim,
GammaSrcVectorSize,
1,
true>(
gamma_grid_desc_m_k,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
thread_k_cluster_id * GammaSrcVectorSize));
auto threadwise_beta_load =
ThreadwiseTensorSliceTransfer_v2<BetaDataType,
ComputeDataType,
XYGammaBetaGridDesc_M_K,
decltype(thread_buffer_desc_m_k),
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
BetaSrcVectorDim,
BetaSrcVectorSize,
1,
true>(
beta_grid_desc_m_k,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
thread_k_cluster_id * BetaSrcVectorSize));
auto threadwise_y_store =
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
YDataType,
decltype(thread_buffer_desc_m_k),
XYGammaBetaGridDesc_M_K,
YElementwiseOperation,
ThreadBufferLengths_M_K,
ThreadBufferDimAccessOrder,
YDstVectorDim,
YDstVectorSize,
InMemoryDataOperationEnum::Set,
1,
true>(
y_grid_desc_m_k,
make_multi_index(block_m_cluster_id * M_BlockTileSize +
thread_m_cluster_id * MThreadSliceSize,
block_k_cluster_id * K_BlockTileSize * num_k_block_tile_iteration +
thread_k_cluster_id * YDstVectorSize),
y_elementwise_op);
// step1: Merge mean and variance
constexpr auto mean_var_count_thread_copy_step_I0_k =
make_multi_index(I0, KThreadClusterSize);
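// Each merge iteration below reads one (mean, variance, count) partial per thread
// along the KBlock axis, then advances the window by KThreadClusterSize columns.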
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
mean_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
var_thread_buf(I) = type_convert<ComputeDataType>(0.0f);
welford_count_thread_buf(I) = 0;
});
for(index_t k = 0; k < num_k_mean_var_count_iteration; ++k)
{
threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
mean_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
in_mean_thread_buf);
threadwise_mean_var_load_m_kblock.Run(mean_var_grid_desc_m_kblock,
var_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
in_var_thread_buf);
threadwise_count_load_m_kblock.Run(count_grid_desc_m_kblock,
welford_count_global_val_buf,
thread_buffer_desc_m_1,
make_tuple(I0, I0),
in_welford_count_thread_buf);
ThreadwiseWelford::Run(in_mean_thread_buf,
in_var_thread_buf,
in_welford_count_thread_buf,
mean_thread_buf,
var_thread_buf,
welford_count_thread_buf);
threadwise_mean_var_load_m_kblock.MoveSrcSliceWindow(
mean_var_grid_desc_m_kblock, mean_var_count_thread_copy_step_I0_k);
threadwise_count_load_m_kblock.MoveSrcSliceWindow(count_grid_desc_m_kblock,
mean_var_count_thread_copy_step_I0_k);
}
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
if constexpr(I > 0)
block_sync_lds();
BlockwiseWelford::Run(
mean_thread_buf(I), var_thread_buf(I), welford_count_thread_buf(I));
});
// step2: normalization
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
{
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_x_load.Run(x_grid_desc_m_k,
x_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
x_thread_buf(i));
threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k);
});
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_gamma_load.Run(gamma_grid_desc_m_k,
gamma_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
gamma_thread_buf(i));
threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
thread_copy_fwd_step_m_k);
});
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
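// inverse standard deviation; the loop below computes
// y = (x - mean) / sqrt(var + epsilon) * gamma, with beta added afterwards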
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
constexpr auto offset_m_k =
thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
// normalize
y_thread_buf(iK0)(Number<offset_m_k>{}) =
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
divisor;
// gamma
y_thread_buf(iK0)(Number<offset_m_k>{}) =
y_thread_buf(iK0)(Number<offset_m_k>{}) *
gamma_thread_buf(iK0)(Number<offset_m_k>{});
});
});
});
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_beta_load.Run(beta_grid_desc_m_k,
beta_global_val_buf,
thread_buffer_desc_m_k,
make_tuple(I0, I0),
beta_thread_buf(i));
threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
thread_copy_fwd_step_m_k);
});
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
constexpr auto offset_m_k =
thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));
// beta
y_thread_buf(iK0)(Number<offset_m_k>{}) =
y_thread_buf(iK0)(Number<offset_m_k>{}) +
beta_thread_buf(iK0)(Number<offset_m_k>{});
});
});
});
static_for<0, ThreadBufferNumber, 1>{}([&](auto i) {
threadwise_y_store.Run(thread_buffer_desc_m_k,
make_tuple(I0, I0),
y_thread_buf(i),
y_grid_desc_m_k,
y_global_val_buf);
threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k);
});
} // end for (normalization)
}
};
} // namespace ck