"...composable_kernel-1.git" did not exist on "1c8126a4c2372530db822c28fe6d2a4eb8f3998b"
Unverified Commit 147b7db5 authored by who who who's avatar who who who Committed by GitHub
Browse files

add multi embeddings support (#542)

* add multi embeddings support

* fix format

* optimize sqrt

* add reduce operation

* change to elementwise op

* fix name

* rename

* run ci cd

* format example

* format code

* format code
parent 55236709
...@@ -9,7 +9,8 @@ ...@@ -9,7 +9,8 @@
#include <ctime> #include <ctime>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp" #include "ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
...@@ -18,53 +19,26 @@ ...@@ -18,53 +19,26 @@
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp"
// using EmbType = float; // clang-format off
// using IndexType = int64_t;
// using GammaDataType = float;
// using BetaDataType = float;
// using AccDataType = float;
// using OutType = float;
using EmbType = ck::half_t; using EmbType = ck::half_t;
using IndexType = int64_t; using IndexType = int64_t;
using GammaDataType = ck::half_t; using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t; using BetaDataType = ck::half_t;
using AccDataType = float; using AccDataType = float;
using OutType = ck::half_t; using OutType = ck::half_t;
using EmbElementwiseOperation = ck::tensor_operation::element_wise::AddAdd;
// clang-format off using DeviceInstance_fp16_e256 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 256, 1, 1, 3>;
// BlockSize, DimClusterSize, RowClusterSize, DimPerBlock, RowPerBlock, DimThreadSize, RowVectorSize using DeviceInstance_fp16_e512 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 512, 1, 2, 3>;
using DeviceInstance_fp32_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 256, 1, 1>; using DeviceInstance_fp16_e768 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 768, 1, 1, 3>;
using DeviceInstance_fp32_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 512, 1, 1>; using DeviceInstance_fp16_e1024 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 1024, 1, 2, 3>;
using DeviceInstance_fp32_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 768, 1, 1>; using DeviceInstance_fp16_e1536 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 1536, 1, 2, 3>;
using DeviceInstance_fp32_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1024, 1, 1>; using DeviceInstance_fp16_e2048 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 2048, 1, 2, 3>;
using DeviceInstance_fp32_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1536, 1, 1>; using DeviceInstance_fp16_e4096 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 4096, 1, 8, 3>;
using DeviceInstance_fp32_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 2048, 1, 4>; using DeviceInstance_fp16_e8192 = ck::tensor_operation::device::DeviceSparseEmbeddingsForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, EmbElementwiseOperation, 256, 1, 256, 1, 8192, 1, 8, 3>;
using DeviceInstance_fp32_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 4096, 1, 4>;
using DeviceInstance_fp32_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 8192, 1, 4>;
using DeviceInstance_fp32_e16384 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 16384, 1, 4>;
using DeviceInstance_fp16_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 256, 1, 1>;
using DeviceInstance_fp16_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 512, 1, 2>;
using DeviceInstance_fp16_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 768, 1, 1>;
using DeviceInstance_fp16_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1024, 1, 2>;
using DeviceInstance_fp16_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 1536, 1, 2>;
using DeviceInstance_fp16_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 2048, 1, 2>;
using DeviceInstance_fp16_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 4096, 1, 8>;
using DeviceInstance_fp16_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm<EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType, 256, 1, 256, 1, 8192, 1, 8>;
template<typename emb_type, ck::index_t dim> struct emb_kernel{}; template<typename emb_type, ck::index_t dim> struct emb_kernel{};
template<> struct emb_kernel<float, 256> { using kernel_type = DeviceInstance_fp32_e256; };
template<> struct emb_kernel<float, 512> { using kernel_type = DeviceInstance_fp32_e512; };
template<> struct emb_kernel<float, 768> { using kernel_type = DeviceInstance_fp32_e768; };
template<> struct emb_kernel<float, 1024> { using kernel_type = DeviceInstance_fp32_e1024;};
template<> struct emb_kernel<float, 1536> { using kernel_type = DeviceInstance_fp32_e1536;};
template<> struct emb_kernel<float, 2048> { using kernel_type = DeviceInstance_fp32_e2048;};
template<> struct emb_kernel<float, 4096> { using kernel_type = DeviceInstance_fp32_e4096;};
template<> struct emb_kernel<float, 8192> { using kernel_type = DeviceInstance_fp32_e8192;};
template<> struct emb_kernel<float, 16384>{ using kernel_type = DeviceInstance_fp32_e16384;};
template<> struct emb_kernel<ck::half_t, 256> { using kernel_type = DeviceInstance_fp16_e256; }; template<> struct emb_kernel<ck::half_t, 256> { using kernel_type = DeviceInstance_fp16_e256; };
template<> struct emb_kernel<ck::half_t, 512> { using kernel_type = DeviceInstance_fp16_e512; }; template<> struct emb_kernel<ck::half_t, 512> { using kernel_type = DeviceInstance_fp16_e512; };
template<> struct emb_kernel<ck::half_t, 768> { using kernel_type = DeviceInstance_fp16_e768; }; template<> struct emb_kernel<ck::half_t, 768> { using kernel_type = DeviceInstance_fp16_e768; };
...@@ -152,19 +126,20 @@ int main() ...@@ -152,19 +126,20 @@ int main()
beta_dev.ToDevice(beta.mData.data()); beta_dev.ToDevice(beta.mData.data());
auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{}; auto device_instance = typename emb_kernel<EmbType, current_dim>::kernel_type{};
auto argument_ptr = device_instance.MakeArgumentPointer(out_dev.GetDeviceBuffer(), auto argument_ptr = device_instance.MakeArgumentPointer(
emb_a_dev.GetDeviceBuffer(), out_dev.GetDeviceBuffer(),
emb_b_dev.GetDeviceBuffer(), {ck::type_convert<EmbType*>(emb_a_dev.GetDeviceBuffer()),
emb_c_dev.GetDeviceBuffer(), ck::type_convert<EmbType*>(emb_b_dev.GetDeviceBuffer()),
index_a_dev.GetDeviceBuffer(), ck::type_convert<EmbType*>(emb_c_dev.GetDeviceBuffer())},
index_b_dev.GetDeviceBuffer(), {ck::type_convert<IndexType*>(index_a_dev.GetDeviceBuffer()),
index_c_dev.GetDeviceBuffer(), ck::type_convert<IndexType*>(index_b_dev.GetDeviceBuffer()),
gamma_dev.GetDeviceBuffer(), ck::type_convert<IndexType*>(index_c_dev.GetDeviceBuffer())},
beta_dev.GetDeviceBuffer(), gamma_dev.GetDeviceBuffer(),
num_rows, beta_dev.GetDeviceBuffer(),
current_dim, current_dim,
index_length, index_length,
epsilon); epsilon,
EmbElementwiseOperation{});
std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString() std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString()
<< std::endl << std::endl
<< std::flush; << std::flush;
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include "ck/utility/common_header.hpp" #include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -24,16 +24,17 @@ template <typename EmbType, ...@@ -24,16 +24,17 @@ template <typename EmbType,
typename BetaDataType, typename BetaDataType,
typename AccDataType, typename AccDataType,
typename OutType, typename OutType,
typename EmbElementwiseOperation,
ck::index_t BlockSize, ck::index_t BlockSize,
ck::index_t DimClusterSize, ck::index_t DimClusterSize,
ck::index_t RowClusterSize, ck::index_t RowClusterSize,
ck::index_t DimPerBlock, ck::index_t DimPerBlock,
ck::index_t RowPerBlock, ck::index_t RowPerBlock,
ck::index_t DimThreadSize, ck::index_t DimThreadSize,
ck::index_t RowVectorSize> ck::index_t RowVectorSize,
struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ck::index_t NumEmbeddings>
struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
{ {
static auto MakeOutputDescriptor(const index_t index_length, const index_t rows) static auto MakeOutputDescriptor(const index_t index_length, const index_t rows)
{ {
return make_naive_tensor_descriptor_packed(make_tuple(index_length, rows)); return make_naive_tensor_descriptor_packed(make_tuple(index_length, rows));
...@@ -42,96 +43,79 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ...@@ -42,96 +43,79 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
struct Argument : public BaseArgument struct Argument : public BaseArgument
{ {
Argument(OutType* p_out, Argument(OutType* p_out,
const EmbType* p_emb_a, const ck::Array<EmbType*, NumEmbeddings>& p_embs,
const EmbType* p_emb_b, const ck::Array<IndexType*, NumEmbeddings>& p_indexs,
const EmbType* p_emb_c,
const IndexType* p_index_a,
const IndexType* p_index_b,
const IndexType* p_index_c,
const GammaDataType* p_gamma, const GammaDataType* p_gamma,
const BetaDataType* p_beta, const BetaDataType* p_beta,
const ck::index_t NumRows,
const ck::index_t EmbeddingDim, const ck::index_t EmbeddingDim,
const ck::index_t IndexLength, const ck::index_t IndexLength,
const AccDataType epsilon) const AccDataType epsilon,
const EmbElementwiseOperation emb_elementwise_op)
: p_out_(p_out), : p_out_(p_out),
p_emb_a_(p_emb_a), p_embs_(p_embs),
p_emb_b_(p_emb_b), p_indexs_(p_indexs),
p_emb_c_(p_emb_c),
p_index_a_(p_index_a),
p_index_b_(p_index_b),
p_index_c_(p_index_c),
p_gamma_(p_gamma), p_gamma_(p_gamma),
p_beta_(p_beta), p_beta_(p_beta),
NumRows_(NumRows),
EmbeddingDim_(EmbeddingDim), EmbeddingDim_(EmbeddingDim),
IndexLength_(IndexLength), IndexLength_(IndexLength),
epsilon_(epsilon) epsilon_(epsilon),
emb_elementwise_op_(emb_elementwise_op)
{ {
grid_size_ = (IndexLength + DimClusterSize - 1) / DimClusterSize; grid_size_ = (IndexLength + DimClusterSize - 1) / DimClusterSize;
} }
OutType* p_out_; OutType* p_out_;
const EmbType* p_emb_a_; ck::Array<EmbType*, NumEmbeddings> p_embs_;
const EmbType* p_emb_b_; ck::Array<IndexType*, NumEmbeddings> p_indexs_;
const EmbType* p_emb_c_;
const IndexType* p_index_a_;
const IndexType* p_index_b_;
const IndexType* p_index_c_;
const GammaDataType* p_gamma_; const GammaDataType* p_gamma_;
const BetaDataType* p_beta_; const BetaDataType* p_beta_;
ck::index_t NumRows_;
ck::index_t EmbeddingDim_; ck::index_t EmbeddingDim_;
ck::index_t IndexLength_; ck::index_t IndexLength_;
AccDataType epsilon_; AccDataType epsilon_;
EmbElementwiseOperation emb_elementwise_op_;
size_t grid_size_; size_t grid_size_;
}; };
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(void* p_out, std::unique_ptr<BaseArgument>
const void* p_emb_a, MakeArgumentPointer(void* p_out,
const void* p_emb_b, const ck::Array<EmbType*, NumEmbeddings>& p_embs,
const void* p_emb_c, const ck::Array<IndexType*, NumEmbeddings>& p_indexs,
const void* p_index_a, const void* p_gamma,
const void* p_index_b, const void* p_beta,
const void* p_index_c, ck::index_t EmbeddingDim,
const void* p_gamma, ck::index_t IndexLength,
const void* p_beta, const AccDataType epsilon,
ck::index_t NumRows, const EmbElementwiseOperation emb_elementwise_op)
ck::index_t EmbeddingDim,
ck::index_t IndexLength,
const AccDataType epsilon)
{ {
return std::make_unique<Argument>(reinterpret_cast<OutType*>(p_out), return std::make_unique<Argument>(reinterpret_cast<OutType*>(p_out),
reinterpret_cast<const EmbType*>(p_emb_a), p_embs,
reinterpret_cast<const EmbType*>(p_emb_b), p_indexs,
reinterpret_cast<const EmbType*>(p_emb_c),
reinterpret_cast<const IndexType*>(p_index_a),
reinterpret_cast<const IndexType*>(p_index_b),
reinterpret_cast<const IndexType*>(p_index_c),
reinterpret_cast<const GammaDataType*>(p_gamma), reinterpret_cast<const GammaDataType*>(p_gamma),
reinterpret_cast<const BetaDataType*>(p_beta), reinterpret_cast<const BetaDataType*>(p_beta),
NumRows,
EmbeddingDim, EmbeddingDim,
IndexLength, IndexLength,
epsilon); epsilon,
emb_elementwise_op);
} }
using GridwiseSparseEmbedding = using GridwiseSparseEmbedding =
GridwiseSparseEmbedding3ForwardLayernorm<EmbType, GridwiseSparseEmbeddingsForwardLayernorm<EmbType,
IndexType, IndexType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
AccDataType, AccDataType,
OutType, OutType,
decltype(MakeOutputDescriptor(1, 1)), decltype(MakeOutputDescriptor(1, 1)),
EmbElementwiseOperation,
BlockSize, BlockSize,
DimClusterSize, DimClusterSize,
RowClusterSize, RowClusterSize,
DimPerBlock, DimPerBlock,
RowPerBlock, RowPerBlock,
DimThreadSize, DimThreadSize,
RowVectorSize>; RowVectorSize,
NumEmbeddings>;
struct Invoker : public BaseInvoker struct Invoker : public BaseInvoker
{ {
...@@ -139,14 +123,16 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ...@@ -139,14 +123,16 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
{ {
auto out_desc = MakeOutputDescriptor(arg.IndexLength_, arg.EmbeddingDim_); auto out_desc = MakeOutputDescriptor(arg.IndexLength_, arg.EmbeddingDim_);
const auto kernel_main = const auto kernel_main =
kernel_sparse_embedding3_forward_layernorm<GridwiseSparseEmbedding, kernel_sparse_embeddings_forward_layernorm<GridwiseSparseEmbedding,
EmbType, EmbType,
IndexType, IndexType,
GammaDataType, GammaDataType,
BetaDataType, BetaDataType,
AccDataType, AccDataType,
OutType, OutType,
decltype(out_desc)>; decltype(out_desc),
EmbElementwiseOperation,
NumEmbeddings>;
float avg_time = 0; float avg_time = 0;
avg_time += launch_and_time_kernel(stream_config, avg_time += launch_and_time_kernel(stream_config,
kernel_main, kernel_main,
...@@ -154,16 +140,13 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ...@@ -154,16 +140,13 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
dim3(BlockSize), dim3(BlockSize),
0, 0,
arg.p_out_, arg.p_out_,
arg.p_emb_a_, arg.p_embs_,
arg.p_emb_b_, arg.p_indexs_,
arg.p_emb_c_,
arg.p_index_a_,
arg.p_index_b_,
arg.p_index_c_,
arg.p_gamma_, arg.p_gamma_,
arg.p_beta_, arg.p_beta_,
out_desc, out_desc,
arg.epsilon_); arg.epsilon_,
arg.emb_elementwise_op_);
return (avg_time); return (avg_time);
} }
...@@ -177,7 +160,7 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ...@@ -177,7 +160,7 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
static bool IsSupportedArgument(const Argument* p_arg) static bool IsSupportedArgument(const Argument* p_arg)
{ {
return (RowPerBlock == p_arg->EmbeddingDim_) && (p_arg->NumRows_ % DimPerBlock == 0); return (RowPerBlock == p_arg->EmbeddingDim_);
} }
bool IsSupportedArgument(const BaseArgument* p_arg) override bool IsSupportedArgument(const BaseArgument* p_arg) override
...@@ -195,7 +178,7 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator ...@@ -195,7 +178,7 @@ struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
auto str = std::stringstream(); auto str = std::stringstream();
// clang-format off // clang-format off
str << "DeviceSparseEmbedding3ForwardLayernorm_"<< BlockSize << "_" << str << "DeviceSparseEmbeddingsForwardLayernorm_"<< BlockSize << "_" <<
DimClusterSize << "x" << RowClusterSize << "_" << DimClusterSize << "x" << RowClusterSize << "_" <<
DimPerBlock << "x" << RowPerBlock << "_" << DimPerBlock << "x" << RowPerBlock << "_" <<
DimThreadSize << "x" << RowVectorSize; DimThreadSize << "x" << RowVectorSize;
......
...@@ -17,33 +17,24 @@ template <typename GridwiseSparseEmbedding, ...@@ -17,33 +17,24 @@ template <typename GridwiseSparseEmbedding,
typename BetaDataType, typename BetaDataType,
typename AccDataType, typename AccDataType,
typename OutType, typename OutType,
typename OutGridDesc> typename OutGridDesc,
typename EmbElementwiseOperation,
ck::index_t NumEmbeddings>
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif #endif
__global__ void kernel_sparse_embedding3_forward_layernorm(OutType* p_out, __global__ void kernel_sparse_embeddings_forward_layernorm(
const EmbType* p_emb_a, OutType* p_out,
const EmbType* p_emb_b, const ck::Array<EmbType*, NumEmbeddings> p_embs,
const EmbType* p_emb_c, const ck::Array<IndexType*, NumEmbeddings> p_indexes,
const IndexType* p_index_a, const GammaDataType* p_gamma,
const IndexType* p_index_b, const BetaDataType* p_beta,
const IndexType* p_index_c, const OutGridDesc out_grid_desc,
const GammaDataType* p_gamma, const AccDataType epsilon,
const BetaDataType* p_beta, const EmbElementwiseOperation emb_elementwise_op)
const OutGridDesc out_grid_desc,
const AccDataType epsilon)
{ {
GridwiseSparseEmbedding::Run(p_out, GridwiseSparseEmbedding::Run(
p_emb_a, p_out, p_embs, p_indexes, p_gamma, p_beta, out_grid_desc, epsilon, emb_elementwise_op);
p_emb_b,
p_emb_c,
p_index_a,
p_index_b,
p_index_c,
p_gamma,
p_beta,
out_grid_desc,
epsilon);
} }
template <typename EmbType, template <typename EmbType,
...@@ -53,14 +44,16 @@ template <typename EmbType, ...@@ -53,14 +44,16 @@ template <typename EmbType,
typename AccDataType, typename AccDataType,
typename OutType, typename OutType,
typename OutGridDesc, typename OutGridDesc,
typename EmbElementwiseOperation,
ck::index_t BlockSize, ck::index_t BlockSize,
ck::index_t DimClusterSize, ck::index_t DimClusterSize,
ck::index_t RowClusterSize, ck::index_t RowClusterSize,
ck::index_t DimPerBlock, // Row x Dim, along Dim ck::index_t DimPerBlock, // Row x Dim, along Dim
ck::index_t RowPerBlock, // Row x Dim, along Row ck::index_t RowPerBlock, // Row x Dim, along Row
ck::index_t DimThreadSize, // this is actually not vector, but number of registers ck::index_t DimThreadSize, // this is actually not vector, but number of registers
ck::index_t RowVectorSize> ck::index_t RowVectorSize,
struct GridwiseSparseEmbedding3ForwardLayernorm ck::index_t NumEmbeddings>
struct GridwiseSparseEmbeddingsForwardLayernorm
{ {
static constexpr auto I0 = Number<0>{}; static constexpr auto I0 = Number<0>{};
static constexpr auto I1 = Number<1>{}; static constexpr auto I1 = Number<1>{};
...@@ -97,23 +90,17 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -97,23 +90,17 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
BlockwiseWelford<AccDataType, BlockSize, ThreadClusterLength, Sequence<0, 1>>; BlockwiseWelford<AccDataType, BlockSize, ThreadClusterLength, Sequence<0, 1>>;
__device__ static void Run(OutType* p_out, __device__ static void Run(OutType* p_out,
const EmbType* p_emb_a, const ck::Array<EmbType*, NumEmbeddings> p_embs,
const EmbType* p_emb_b, const ck::Array<IndexType*, NumEmbeddings> p_indexes,
const EmbType* p_emb_c,
const IndexType* p_index_a,
const IndexType* p_index_b,
const IndexType* p_index_c,
const GammaDataType* p_gamma, const GammaDataType* p_gamma,
const BetaDataType* p_beta, const BetaDataType* p_beta,
const OutGridDesc, const OutGridDesc,
const AccDataType epsilon) const AccDataType epsilon,
const EmbElementwiseOperation emb_elementwise_op)
{ {
const index_t thread_local_id = get_thread_local_1d_id(); const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id(); const index_t block_global_id = get_block_1d_id();
// const auto index_length = out_grid_desc.GetLength(I0);
// const auto emb_dim = out_grid_desc.GetLength(I1);
constexpr auto thread_cluster_desc = constexpr auto thread_cluster_desc =
make_cluster_descriptor(Sequence<DimClusterSize, RowClusterSize>{}, Sequence<0, 1>{}); make_cluster_descriptor(Sequence<DimClusterSize, RowClusterSize>{}, Sequence<0, 1>{});
...@@ -141,13 +128,11 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -141,13 +128,11 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
constexpr auto gamma_beta_buf_desc = constexpr auto gamma_beta_buf_desc =
make_naive_tensor_descriptor_packed(make_tuple(RowSubBlocks, RowVectorSize)); make_naive_tensor_descriptor_packed(make_tuple(RowSubBlocks, RowVectorSize));
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_a; ck::Array<StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true>,
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_b; NumEmbeddings>
StaticBuffer<AddressSpaceEnum::Vgpr, EmbType, thread_buf_size, true> in_thread_buf_c; in_thread_bufs;
ck::Array<StaticBuffer<AddressSpaceEnum::Vgpr, IndexType, DimPerBlock, true>, NumEmbeddings>
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_a; index_bufs;
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_b;
StaticBuffer<AddressSpaceEnum::Sgpr, IndexType, DimPerBlock, true> index_buf_c;
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true> acc_thread_buf; StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, thread_buf_size, true> acc_thread_buf;
...@@ -160,42 +145,31 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -160,42 +145,31 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> var_thread_buf; StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, mean_var_buf_size, true> var_thread_buf;
auto load_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) { auto load_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) {
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_a; ck::Array<vector_type_maker_t<EmbType, RowVectorSize>, NumEmbeddings> emb_vectors;
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_b; auto emb_a = emb_vectors[0];
vector_type_maker_t<EmbType, RowVectorSize> emb_vector_c; using src_vector_t = typename decltype(emb_a)::type;
using src_vector_t = typename decltype(emb_vector_a)::type;
static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) { static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) {
constexpr auto current_dim = i_dim_sub_ * DimPerSubBlock + i_dim_vec_; constexpr auto current_dim = i_dim_sub_ * DimPerSubBlock + i_dim_vec_;
IndexType index_a = index_buf_a[Number<current_dim>{}];
IndexType index_b = index_buf_b[Number<current_dim>{}];
IndexType index_c = index_buf_c[Number<current_dim>{}];
auto thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) * auto thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) *
sizeof(EmbType) * RowVectorSize; sizeof(EmbType) * RowVectorSize;
static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
IndexType index = index_bufs[i_embedding_][Number<current_dim>{}];
int32x4_t emb_res_a = int32x4_t emb_res = make_wave_buffer_resource_with_default_range(
make_wave_buffer_resource_with_default_range(p_emb_a + index_a * RowPerBlock); p_embs[i_embedding_] + index * RowPerBlock);
int32x4_t emb_res_b = emb_vectors(i_embedding_).template AsType<src_vector_t>()(I0) =
make_wave_buffer_resource_with_default_range(p_emb_b + index_b * RowPerBlock); amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res, thread_offset, 0);
int32x4_t emb_res_c = });
make_wave_buffer_resource_with_default_range(p_emb_c + index_c * RowPerBlock);
emb_vector_a.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_a, thread_offset, 0);
emb_vector_b.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_b, thread_offset, 0);
emb_vector_c.template AsType<src_vector_t>()(I0) =
amd_buffer_load_impl<EmbType, RowVectorSize>(emb_res_c, thread_offset, 0);
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset( constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
in_thread_buf_a(Number<register_offset>{}) = static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
emb_vector_a.template AsType<EmbType>()[i_row_vec_]; in_thread_bufs(i_embedding_)(Number<register_offset>{}) =
in_thread_buf_b(Number<register_offset>{}) = ck::type_convert<AccDataType>(
emb_vector_b.template AsType<EmbType>()[i_row_vec_]; emb_vectors[i_embedding_].template AsType<EmbType>()[i_row_vec_]);
in_thread_buf_c(Number<register_offset>{}) = });
emb_vector_c.template AsType<EmbType>()[i_row_vec_];
}); });
}); });
}; };
...@@ -205,14 +179,17 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -205,14 +179,17 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset( constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
AccDataType va = auto in_data_refs = generate_tie(
ck::type_convert<AccDataType>(in_thread_buf_a(Number<register_offset>{})); [&](auto i_embedding_) -> const auto& {
AccDataType vb = return in_thread_bufs(i_embedding_)(Number<register_offset>{});
ck::type_convert<AccDataType>(in_thread_buf_b(Number<register_offset>{})); },
AccDataType vc = Number<NumEmbeddings>{});
ck::type_convert<AccDataType>(in_thread_buf_c(Number<register_offset>{})); auto out_data_refs = generate_tie(
[&](auto output_index_) -> auto& {
acc_thread_buf(Number<register_offset>{}) += va + vb + vc; return acc_thread_buf(Number<register_offset>{});
},
Number<1>{});
unpack2(emb_elementwise_op, out_data_refs, in_data_refs);
}); });
}); });
}; };
...@@ -242,7 +219,8 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -242,7 +219,8 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
constexpr auto mean_var_offset = constexpr auto mean_var_offset =
mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_)); mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_));
auto divisor =
1 / __builtin_amdgcn_sqrtf(var_thread_buf(Number<mean_var_offset>{}) + epsilon);
static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) {
constexpr auto register_offset = thread_buf_desc.CalculateOffset( constexpr auto register_offset = thread_buf_desc.CalculateOffset(
make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_));
...@@ -250,9 +228,8 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -250,9 +228,8 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_)); gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_));
auto acc_val = acc_thread_buf[Number<register_offset>{}]; auto acc_val = acc_thread_buf[Number<register_offset>{}];
acc_val = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) / acc_val = (acc_val - mean_thread_buf(Number<mean_var_offset>{})) * divisor;
sqrt(var_thread_buf(Number<mean_var_offset>{}) + epsilon); acc_val = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] +
acc_val = acc_val * gamma_thread_buf[Number<gamma_beta_offset>{}] +
beta_thread_buf[Number<gamma_beta_offset>{}]; beta_thread_buf[Number<gamma_beta_offset>{}];
out_vector.template AsType<OutType>()(Number<i_row_vec_>{}) = out_vector.template AsType<OutType>()(Number<i_row_vec_>{}) =
...@@ -273,9 +250,10 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -273,9 +250,10 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
// first load index // first load index
ck::static_for<0, DimPerBlock, 1>{}([&](auto i_idx_) { ck::static_for<0, DimPerBlock, 1>{}([&](auto i_idx_) {
// prefer use s_load // prefer use s_load
index_buf_a(i_idx_) = p_index_a[index_start + i_idx_.value]; ck::static_for<0, NumEmbeddings, 1>{}([&](auto i_embedding_) {
index_buf_b(i_idx_) = p_index_b[index_start + i_idx_.value]; index_bufs(i_embedding_)(i_idx_) =
index_buf_c(i_idx_) = p_index_c[index_start + i_idx_.value]; p_indexes[i_embedding_][index_start + i_idx_.value];
});
}); });
// load gamma/beta // load gamma/beta
...@@ -329,7 +307,6 @@ struct GridwiseSparseEmbedding3ForwardLayernorm ...@@ -329,7 +307,6 @@ struct GridwiseSparseEmbedding3ForwardLayernorm
static_for<0, mean_var_buf_size, 1>{}([&](auto I) { static_for<0, mean_var_buf_size, 1>{}([&](auto I) {
if constexpr(I > 0) if constexpr(I > 0)
block_sync_lds(); block_sync_lds();
BlockwiseWelford::Run( BlockwiseWelford::Run(
mean_thread_buf(I), var_thread_buf(I), threadwise_welford.cur_count_); mean_thread_buf(I), var_thread_buf(I), threadwise_welford.cur_count_);
}); });
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment