Commit 463e2aa1 authored by aska-0096
Browse files

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into wmma_op

parents 6e106c19 236bd148
...@@ -86,8 +86,8 @@ struct ReferenceSoftmax : public device::BaseOperator ...@@ -86,8 +86,8 @@ struct ReferenceSoftmax : public device::BaseOperator
}; };
arg.in_.ForEach([&](auto& self, auto idx) { arg.in_.ForEach([&](auto& self, auto idx) {
reduce_max(to_sm_scalar_idx(idx)) = std::max(reduce_max(to_sm_scalar_idx(idx)), reduce_max(to_sm_scalar_idx(idx)) = std::max(
static_cast<AccDataType>(self(idx))); reduce_max(to_sm_scalar_idx(idx)), ck::type_convert<AccDataType>(self(idx)));
}); });
// LogRangeAsType<float>(std::cout << "reduce_max: ", reduce_max.mData, ",") << // LogRangeAsType<float>(std::cout << "reduce_max: ", reduce_max.mData, ",") <<
...@@ -96,7 +96,7 @@ struct ReferenceSoftmax : public device::BaseOperator ...@@ -96,7 +96,7 @@ struct ReferenceSoftmax : public device::BaseOperator
Tensor<AccDataType> in_stable(arg.in_.mDesc); Tensor<AccDataType> in_stable(arg.in_.mDesc);
in_stable.ForEach([&](auto& self, auto idx) { in_stable.ForEach([&](auto& self, auto idx) {
// numerator = exp(x - max(x)) // numerator = exp(x - max(x))
self(idx) = std::exp(static_cast<AccDataType>(arg.in_(idx)) - self(idx) = std::exp(ck::type_convert<AccDataType>(arg.in_(idx)) -
reduce_max(to_sm_scalar_idx(idx))); reduce_max(to_sm_scalar_idx(idx)));
}); });
...@@ -111,8 +111,10 @@ struct ReferenceSoftmax : public device::BaseOperator ...@@ -111,8 +111,10 @@ struct ReferenceSoftmax : public device::BaseOperator
// std::endl; // std::endl;
arg.out_.ForEach([&](auto& self, auto idx) { arg.out_.ForEach([&](auto& self, auto idx) {
self(idx) = arg.alpha_ * in_stable(idx) / reduce_sum(to_sm_scalar_idx(idx)) + AccDataType temp_result =
arg.alpha_ * in_stable(idx) / reduce_sum(to_sm_scalar_idx(idx)) +
arg.beta_ * self(idx); arg.beta_ * self(idx);
self(idx) = ck::type_convert<OutDataType>(temp_result);
}); });
// LogRangeAsType<float>(std::cout << "out: ", arg.out_.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "out: ", arg.out_.mData, ",") << std::endl;
......
...@@ -87,6 +87,8 @@ using Relu = ck::tensor_operation::element_wise::Relu; ...@@ -87,6 +87,8 @@ using Relu = ck::tensor_operation::element_wise::Relu;
using Scale = ck::tensor_operation::element_wise::Scale; using Scale = ck::tensor_operation::element_wise::Scale;
using Bilinear = ck::tensor_operation::element_wise::Bilinear; using Bilinear = ck::tensor_operation::element_wise::Bilinear;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;
using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;
using FastGelu = ck::tensor_operation::element_wise::FastGelu;
template <typename Activation> template <typename Activation>
using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>; using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp<Activation>;
......
...@@ -59,6 +59,48 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_g ...@@ -59,6 +59,48 @@ void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_g
MaskingSpecialization::MaskDisabled>>>& MaskingSpecialization::MaskDisabled>>>&
instances); instances);
// Declaration only; the definition is not visible here.
// Takes, by mutable reference, a vector of owning pointers to
// DeviceBatchedGemmSoftmaxGemmPermute specialized with four BF16 data types, empty
// ck::Tuple<> bias lists, a Scale element-wise op in the third op slot, and
// MaskingSpecialization::MaskOutUpperTriangle.
// NOTE(review): judging by the name, this presumably appends the pre-built masked BF16
// instances to `instances` — confirm against the definition.
void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemmPermute<
        2, 1, 1, 1, 1,
        BF16, BF16, BF16, BF16,
        ck::Tuple<>, ck::Tuple<>,
        PassThrough, PassThrough, Scale, PassThrough, PassThrough,
        MaskingSpecialization::MaskOutUpperTriangle>>>& instances);
// Declaration only; the definition is not visible here.
// Takes, by mutable reference, a vector of owning pointers to
// DeviceBatchedGemmSoftmaxGemmPermute specialized with four BF16 data types, empty
// ck::Tuple<> bias lists, a Scale element-wise op in the third op slot, and
// MaskingSpecialization::MaskDisabled.
// NOTE(review): judging by the name, this presumably appends the pre-built unmasked BF16
// instances to `instances` — confirm against the definition.
void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
    std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemmPermute<
        2, 1, 1, 1, 1,
        BF16, BF16, BF16, BF16,
        ck::Tuple<>, ck::Tuple<>,
        PassThrough, PassThrough, Scale, PassThrough, PassThrough,
        MaskingSpecialization::MaskDisabled>>>& instances);
template <typename ADataType, template <typename ADataType,
typename B0DataType, typename B0DataType,
typename B1DataType, typename B1DataType,
...@@ -119,6 +161,20 @@ struct DeviceOperationInstanceFactory< ...@@ -119,6 +161,20 @@ struct DeviceOperationInstanceFactory<
op_ptrs); op_ptrs);
} }
} }
else if constexpr(is_same_v<ADataType, BF16> && is_same_v<B0DataType, BF16> &&
                  is_same_v<B1DataType, BF16> && is_same_v<CDataType, BF16>)
{
    // All four data types are BF16: choose the instance set that matches the
    // compile-time masking specialization.
    if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle)
    {
        add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
            op_ptrs);
    }
    // Use `if constexpr` here as well: MaskingSpec is a constant expression, and a
    // plain `if` would still instantiate the call below for every other MaskingSpec
    // value. The two add_* functions take vectors that differ in their
    // MaskingSpecialization argument, so a non-discarded call with a mismatched
    // `op_ptrs` type would presumably fail to compile.
    else if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled)
    {
        add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances(
            op_ptrs);
    }
}
return op_ptrs; return op_ptrs;
} }
}; };
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment