Unverified Commit f5de8b57 authored by Chao Liu's avatar Chao Liu Committed by GitHub
Browse files

Merge branch 'develop' into modified_grouped_gemm_addressing_method

parents e83c7061 fa9a0a5c
# device_normalization_instance
# Object library built from the softmax device-operation instance sources.
set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
    device_softmax_f32_f32_instance.cpp
    device_softmax_f16_f16_instance.cpp
)

add_library(device_normalization_instance OBJECT ${DEVICE_NORMALIZATION_INSTANCE_SOURCE})

# Build the objects as position-independent code.
set_property(TARGET device_normalization_instance PROPERTY POSITION_INDEPENDENT_CODE ON)

# Register the target with the project's clang-tidy helper.
clang_tidy_check(device_normalization_instance)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_normalization_instance {
// Short element-type aliases used by the instance list below.
using F16 = ck::half_t;
using F32 = float;
// Tuple of DeviceSoftmax configurations taking F16 input, accumulating in F32 and
// writing F16 output. The list is parameterized on the tensor rank (Rank) and on the
// number of reduced dimensions (Reduce), which are forwarded to every entry.
//
// All entries use BlockSize 256 with MThreadClusterSize * KThreadClusterSize == 256;
// they differ in cluster shape, KThreadSliceSize and vector widths. The first entry
// uses vector size 1 for both input and output and is marked as the fallback kernel
// (presumably for shapes where 8-wide vector access is not possible -- confirm
// against DeviceSoftmax::IsSupportedArgument).
template <index_t Rank, index_t Reduce>
using device_softmax_f16_f16_instances = std::tuple<
// clang-format off
// Column order of the template arguments:
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
// clang-format on
>;
void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
}
void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
}
} // namespace device_normalization_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/utility/data_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_normalization_instance {
// Short element-type alias used by the instance list below.
using F32 = float;
// Tuple of DeviceSoftmax configurations taking F32 input, accumulating in F32 and
// writing F32 output. The list is parameterized on the tensor rank (Rank) and on the
// number of reduced dimensions (Reduce), which are forwarded to every entry.
//
// All entries use BlockSize 256 with MThreadClusterSize * KThreadClusterSize == 256;
// they differ in cluster shape, KThreadSliceSize and vector widths. The first entry
// uses vector size 1 for both input and output and is marked as the fallback kernel
// (presumably for shapes where 4-wide vector access is not possible -- confirm
// against DeviceSoftmax::IsSupportedArgument).
template <index_t Rank, index_t Reduce>
using device_softmax_f32_f32_instances = std::tuple<
// clang-format off
// Column order of the template arguments:
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
// clang-format on
>;
void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
}
void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
}
} // namespace device_normalization_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -22,6 +22,7 @@ set(PROFILER_SOURCE ...@@ -22,6 +22,7 @@ set(PROFILER_SOURCE
src/profile_conv_bwd_weight.cpp src/profile_conv_bwd_weight.cpp
src/profile_batched_gemm_reduce.cpp src/profile_batched_gemm_reduce.cpp
src/profile_gemm_add_add_fastgelu.cpp src/profile_gemm_add_add_fastgelu.cpp
src/profile_normalization.cpp
) )
add_executable(ckProfiler ${PROFILER_SOURCE}) add_executable(ckProfiler ${PROFILER_SOURCE})
...@@ -46,4 +47,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) ...@@ -46,4 +47,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp" #include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
...@@ -21,32 +21,28 @@ namespace tensor_operation { ...@@ -21,32 +21,28 @@ namespace tensor_operation {
namespace device { namespace device {
namespace device_gemm_instance { namespace device_gemm_instance {
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using DPtrsGlobal = ck::Tuple<F32*, F32*>; using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
using Identity = ck::tensor_operation::element_wise::PassThrough; using Identity = ck::tensor_operation::element_wise::PassThrough;
using Square = ck::tensor_operation::element_wise::UnarySquare; using Square = ck::tensor_operation::element_wise::UnarySquare;
using DInElementOps = ck::Tuple<Identity, Square>; using ReduceInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Identity, Identity>; using ReduceOutElementOps = ck::Tuple<Identity, Identity>;
using DeviceBatchedGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceBatchedGemmReducePtr< using DeviceGemmReduceNoOpPtr =
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
DInElementOps,
DOutElementOps>;
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances(
std::vector<DeviceBatchedGemmReduceNoOpPtr>&); std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances(
std::vector<DeviceBatchedGemmReduceNoOpPtr>&); std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances(
std::vector<DeviceBatchedGemmReduceNoOpPtr>&); std::vector<DeviceGemmReduceNoOpPtr>&);
void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances(
std::vector<DeviceBatchedGemmReduceNoOpPtr>&); std::vector<DeviceGemmReduceNoOpPtr>&);
} // namespace device_gemm_instance } // namespace device_gemm_instance
} // namespace device } // namespace device
...@@ -59,7 +55,7 @@ namespace profiler { ...@@ -59,7 +55,7 @@ namespace profiler {
template <typename ADataType, template <typename ADataType,
typename BDataType, typename BDataType,
typename CDataType, typename CDataType,
typename DDataType, typename ReduceDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout>
...@@ -99,16 +95,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -99,16 +95,16 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
Tensor<CDataType> c_g_m_n_host_result( Tensor<CDataType> c_g_m_n_host_result(
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
Tensor<CDataType> c_g_m_n_device_result( Tensor<CDataType> c_g_m_n_device_result(
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); {static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
...@@ -135,20 +131,23 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -135,20 +131,23 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::reduce::Add; using ReduceOp0 = ck::reduce::Add;
using D1ReduceOp = ck::reduce::Add; using ReduceOp1 = ck::reduce::Add;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOps = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOps = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
const auto a_element_op = AElementOp{}; auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{}; auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
const auto dxs_in_element_op = DxsInElementOps{}; std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
const auto dxs_out_element_op = DxsOutElementOps{};
const auto d0_reduce_op = D0ReduceOp{}; const auto reduce0_op = ReduceOp0{};
const auto d1_reduce_op = D1ReduceOp{}; const auto reduce1_op = ReduceOp1{};
auto passthrough = UnaryIdenticElementOp{};
auto square = UnarySquareElementOp{};
std::array<void*, 2> reduce_in_element_ops = {&passthrough, &square};
std::array<void*, 2> reduce_out_element_ops = {&passthrough, &passthrough};
if(do_verification) if(do_verification)
{ {
...@@ -160,6 +159,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -160,6 +159,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
BElementOp, BElementOp,
CElementOp>; CElementOp>;
using ReduceAccDataType = ReduceDataType;
auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; auto ref_batched_gemm = ReferenceBatchedGemmInstance{};
auto ref_invoker = ref_batched_gemm.MakeInvoker(); auto ref_invoker = ref_batched_gemm.MakeInvoker();
...@@ -172,21 +173,22 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -172,21 +173,22 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
{ {
for(int m = 0; m < M; ++m) for(int m = 0; m < M; ++m)
{ {
float d0_acc = d0_reduce_op.GetIdentityValue<float>(); auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
float d1_acc = d1_reduce_op.GetIdentityValue<float>(); auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n) for(int n = 0; n < N; ++n)
{ {
float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n)); ReduceAccDataType d0_val =
float d1_val; ck::type_convert<ReduceAccDataType>(c_g_m_n_host_result(batch, m, n));
ReduceAccDataType d1_val;
UnarySquareElementOp{}(d1_val, d0_val); square(d1_val, d0_val);
d0_reduce_op(d0_acc, d0_val); reduce0_op(reduce0_acc, d0_val);
d1_reduce_op(d1_acc, d1_val); reduce1_op(reduce1_acc, d1_val);
} }
d0_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d0_acc); d0_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce0_acc);
d1_g_m_host_result(batch, m) = ck::type_convert<DDataType>(d1_acc); d1_g_m_host_result(batch, m) = ck::type_convert<ReduceDataType>(reduce1_acc);
} }
} }
} }
...@@ -194,17 +196,19 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -194,17 +196,19 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpace());
DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpace());
DeviceMem d0_device_buf(sizeof(DDataType) * d0_g_m_device_result.mDesc.GetElementSpace()); DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
DeviceMem d1_device_buf(sizeof(DDataType) * d1_g_m_device_result.mDesc.GetElementSpace()); d0_g_m_device_result.mDesc.GetElementSpace());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
d1_g_m_device_result.mDesc.GetElementSpace());
auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer())); reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_g_m_k.mData.data()); a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data()); b_device_buf.ToDevice(b_g_k_n.mData.data());
// add device GEMM instances // add device GEMM instances
std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceBatchedGemmReduceNoOpPtr> std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmReduceNoOpPtr>
gemm_ptrs; gemm_ptrs;
if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value && if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value &&
...@@ -257,31 +261,32 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -257,31 +261,32 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
{ {
auto argument_ptr = auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), b_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), nullptr,
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), {},
&dxs_global, c_device_buf.GetDeviceBuffer(),
M, p_reduces,
N, M,
K, N,
StrideA, K,
StrideB, StrideA,
StrideC, StrideB,
a_element_op, StrideC,
b_element_op, {},
c_element_op, gemm_element_ops,
dxs_in_element_op, {},
dxs_out_element_op, reduce_in_element_ops,
BatchCount); reduce_out_element_ops,
BatchCount);
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// init DO, D1 to 0 // init DO, D1 to 0
d0_device_buf.SetZero(); reduce0_device_buf.SetZero();
d1_device_buf.SetZero(); reduce1_device_buf.SetZero();
float ave_time = float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
...@@ -311,8 +316,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -311,8 +316,8 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
d0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
d1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
float c_error = check_error(c_g_m_n_host_result, c_g_m_n_device_result); float c_error = check_error(c_g_m_n_host_result, c_g_m_n_device_result);
float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result); float d0_error = check_error(d0_g_m_host_result, d0_g_m_device_result);
......
...@@ -21,33 +21,28 @@ namespace tensor_operation { ...@@ -21,33 +21,28 @@ namespace tensor_operation {
namespace device { namespace device {
namespace device_gemm_instance { namespace device_gemm_instance {
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using DPtrsGlobal = ck::Tuple<F32*, F32*>; using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
using Div = ck::tensor_operation::element_wise::UnaryDivide; using Div = ck::tensor_operation::element_wise::UnaryDivide;
using Identity = ck::tensor_operation::element_wise::PassThrough; using Identity = ck::tensor_operation::element_wise::PassThrough;
using Square = ck::tensor_operation::element_wise::UnarySquare; using Square = ck::tensor_operation::element_wise::UnarySquare;
using DInElementOps = ck::Tuple<Identity, Square>; using ReduceInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Div, Div>; using ReduceOutElementOps = ck::Tuple<Div, Div>;
using DeviceGemmBiasAddReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmBiasAddReducePtr< using DeviceGemmBiasAddReduceNoOpPtr =
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::device::DeviceGemmReducePtr<1, ReducePtrsGlobal::Size()>;
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough, void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
ck::tensor_operation::element_wise::PassThrough,
DInElementOps,
DOutElementOps>;
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
std::vector<DeviceGemmBiasAddReduceNoOpPtr>&); std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
std::vector<DeviceGemmBiasAddReduceNoOpPtr>&); std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
std::vector<DeviceGemmBiasAddReduceNoOpPtr>&); std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
void add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
std::vector<DeviceGemmBiasAddReduceNoOpPtr>&); std::vector<DeviceGemmBiasAddReduceNoOpPtr>&);
} // namespace device_gemm_instance } // namespace device_gemm_instance
...@@ -61,9 +56,9 @@ namespace profiler { ...@@ -61,9 +56,9 @@ namespace profiler {
template <typename ADataType, template <typename ADataType,
typename BDataType, typename BDataType,
typename CDataType, typename CDataType,
typename C0DataType, typename BiasDataType,
typename C1DataType, typename D0DataType,
typename DDataType, typename ReduceDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout>
...@@ -77,7 +72,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -77,7 +72,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
int StrideA, int StrideA,
int StrideB, int StrideB,
int StrideC, int StrideC,
int StrideC1) int StrideD0)
{ {
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}), return HostTensorDescriptor(std::vector<std::size_t>({len}),
...@@ -102,24 +97,24 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -102,24 +97,24 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<C0DataType> bias_n(f_host_tensor_descriptor1d(N, 1)); Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<C1DataType> c1_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_m_host_result( Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_m_device_result( Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl; std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl; std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
std::size_t num_thread = 1; std::size_t num_thread = 1;
switch(init_method) switch(init_method)
...@@ -130,50 +125,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -130,50 +125,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread); a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread); b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
bias_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread); bias_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
c1_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread); d0_m_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
break; break;
default: default:
std::srand(0); std::srand(0);
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread); a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread); b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
bias_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5}, num_thread); bias_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5}, num_thread);
c1_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread); d0_m_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
} }
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AElementOp = PassThrough; using AElementOp = PassThrough;
using BElementOp = PassThrough; using BElementOp = PassThrough;
using CElementOp = PassThrough; using CElementOp = PassThrough;
using C1ElementOp = PassThrough; using D0ElementOp = PassThrough;
using D0ReduceOp = ck::reduce::Add; using ReduceOp0 = ck::reduce::Add;
using D1ReduceOp = ck::reduce::Add; using ReduceOp1 = ck::reduce::Add;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOps = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOps = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
const auto a_element_op = AElementOp{}; auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{}; auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
const auto c1_element_op = C1ElementOp{}; std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
const auto d0_reduce_op = D0ReduceOp{};
const auto d1_reduce_op = D1ReduceOp{};
auto dxs_in_element_op = DxsInElementOps{}; auto d0_element_op = D0ElementOp{};
auto dxs_out_element_op = DxsOutElementOps{N, N}; const auto reduce0_op = ReduceOp0{};
const auto reduce1_op = ReduceOp1{};
auto passthrough = UnaryIdenticElementOp{};
auto square = UnarySquareElementOp{};
auto div = UnaryDivElementOp{N};
std::array<void*, 2> reduce_in_element_ops = {&passthrough, &square};
std::array<void*, 2> reduce_out_element_ops = {&div, &div};
if(do_verification) if(do_verification)
{ {
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
CDataType, CDataType,
DDataType, ReduceDataType,
AElementOp, AElementOp,
BElementOp, BElementOp,
CElementOp>; CElementOp>;
using ReduceAccDataType = DDataType; using ReduceAccDataType = ReduceDataType;
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_invoker = ref_gemm.MakeInvoker();
...@@ -189,53 +187,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -189,53 +187,53 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
ReduceAccDataType acc = static_cast<ReduceAccDataType>(c_m_n_host_result(m, n)) + ReduceAccDataType acc = static_cast<ReduceAccDataType>(c_m_n_host_result(m, n)) +
static_cast<ReduceAccDataType>(bias_n(n)); static_cast<ReduceAccDataType>(bias_n(n));
ReduceAccDataType c1 = static_cast<ReduceAccDataType>(c1_m_n(m, n)); ReduceAccDataType d0 = static_cast<ReduceAccDataType>(d0_m_n(m, n));
c_element_op(acc, acc); c_element_op(acc, acc);
c1_element_op(c1, c1); d0_element_op(d0, d0);
acc += c1; acc += d0;
c_m_n_host_result(m, n) = static_cast<CDataType>(acc); c_m_n_host_result(m, n) = static_cast<CDataType>(acc);
} }
for(int m = 0; m < M; ++m) for(int m = 0; m < M; ++m)
{ {
auto d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>(); auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
auto d1_acc = d1_reduce_op.GetIdentityValue<ReduceAccDataType>(); auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n) for(int n = 0; n < N; ++n)
{ {
ReduceAccDataType c_val = ReduceAccDataType d0_val =
ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n)); ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
ReduceAccDataType d0_val;
ReduceAccDataType d1_val; ReduceAccDataType d1_val;
dxs_in_element_op(ck::Number<0>{})(d0_val, c_val); square(d1_val, d0_val);
dxs_in_element_op(ck::Number<1>{})(d1_val, c_val); reduce0_op(reduce0_acc, d0_val);
d0_reduce_op(d0_acc, d0_val); reduce1_op(reduce1_acc, d1_val);
d1_reduce_op(d1_acc, d1_val);
} }
dxs_out_element_op(ck::Number<0>{})(d0_acc, d0_acc); div(reduce0_acc, reduce0_acc);
dxs_out_element_op(ck::Number<1>{})(d1_acc, d1_acc); div(reduce1_acc, reduce1_acc);
d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc); reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc); reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
DeviceMem bias_device_buf(sizeof(C0DataType) * bias_n.mDesc.GetElementSpace()); DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace());
DeviceMem c1_device_buf(sizeof(C1DataType) * c1_m_n.mDesc.GetElementSpace()); DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace());
DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace()); DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace()); reduce0_m_device_result.mDesc.GetElementSpace());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpace());
auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer())); reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
bias_device_buf.ToDevice(bias_n.mData.data()); bias_device_buf.ToDevice(bias_n.mData.data());
c1_device_buf.ToDevice(c1_m_n.mData.data()); d0_device_buf.ToDevice(d0_m_n.mData.data());
// add device GEMM instances // add device GEMM instances
std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmBiasAddReduceNoOpPtr> std::vector<ck::tensor_operation::device::device_gemm_instance::DeviceGemmBiasAddReduceNoOpPtr>
...@@ -249,7 +247,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -249,7 +247,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances(
gemm_ptrs); gemm_ptrs);
} }
else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value && else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
...@@ -257,7 +255,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -257,7 +255,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances(
gemm_ptrs); gemm_ptrs);
} }
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value && else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
...@@ -265,7 +263,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -265,7 +263,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances(
gemm_ptrs); gemm_ptrs);
} }
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value && else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
...@@ -273,7 +271,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -273,7 +271,7 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{ {
ck::tensor_operation::device::device_gemm_instance:: ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_bias_add_reduce_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances(
gemm_ptrs); gemm_ptrs);
} }
} }
...@@ -291,34 +289,31 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -291,34 +289,31 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
{ {
auto argument_ptr = gemm_ptr->MakeArgumentPointer( auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), b_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), bias_device_buf.GetDeviceBuffer(),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), {d0_device_buf.GetDeviceBuffer()},
static_cast<C0DataType*>(bias_device_buf.GetDeviceBuffer()), c_device_buf.GetDeviceBuffer(),
static_cast<C1DataType*>(c1_device_buf.GetDeviceBuffer()), p_reduces,
&dxs_global, M,
M, N,
N, K,
K, StrideA,
StrideA, StrideB,
StrideB, StrideC,
StrideC, {StrideD0},
StrideC1, gemm_element_ops,
a_element_op, {&d0_element_op},
b_element_op, reduce_in_element_ops,
c_element_op, reduce_out_element_ops);
c1_element_op,
dxs_in_element_op,
dxs_out_element_op);
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// init DO, D1 to 0 // init DO, D1 to 0
d0_device_buf.SetZero(); reduce0_device_buf.SetZero();
d1_device_buf.SetZero(); reduce1_device_buf.SetZero();
float ave_time = float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
...@@ -328,9 +323,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -328,9 +323,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N; std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N;
std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(CDataType) * M * N + sizeof(C0DataType) * M * N + sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N +
sizeof(C1DataType) * M * N + sizeof(DDataType) * M + sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M +
sizeof(DDataType) * M; sizeof(ReduceDataType) * M;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
...@@ -350,12 +345,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -350,12 +345,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.mData.data());
d0_device_buf.FromDevice(d0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
d1_device_buf.FromDevice(d1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(d0_m_device_result.mData, d0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
ck::utils::check_err(d1_m_device_result.mData, d1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
if(do_log) if(do_log)
{ {
...@@ -365,13 +360,17 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -365,13 +360,17 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d0_host: ", d0_m_host_result.mData, ",") LogRangeAsType<float>(
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d0_device: ", d0_m_device_result.mData, ",") LogRangeAsType<float>(
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d1_host: ", d1_m_host_result.mData, ",") LogRangeAsType<float>(
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d1_device: ", d1_m_device_result.mData, ",") LogRangeAsType<float>(
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -21,21 +21,17 @@ namespace tensor_operation { ...@@ -21,21 +21,17 @@ namespace tensor_operation {
namespace device { namespace device {
namespace device_gemm_instance { namespace device_gemm_instance {
using F32 = float; using F32 = float;
using F16 = ck::half_t; using F16 = ck::half_t;
using DPtrsGlobal = ck::Tuple<F32*, F32*>; using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
using Div = ck::tensor_operation::element_wise::UnaryDivide; using Div = ck::tensor_operation::element_wise::UnaryDivide;
using Identity = ck::tensor_operation::element_wise::PassThrough; using Identity = ck::tensor_operation::element_wise::PassThrough;
using Square = ck::tensor_operation::element_wise::UnarySquare; using Square = ck::tensor_operation::element_wise::UnarySquare;
using DInElementOps = ck::Tuple<Identity, Square>; using ReduceInElementOps = ck::Tuple<Identity, Square>;
using DOutElementOps = ck::Tuple<Div, Div>; using ReduceOutElementOps = ck::Tuple<Div, Div>;
using DeviceGemmReduceNoOpPtr = ck::tensor_operation::device::DeviceGemmReducePtr< using DeviceGemmReduceNoOpPtr =
ck::tensor_operation::element_wise::PassThrough, ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>;
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
DInElementOps,
DOutElementOps>;
void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances(
std::vector<DeviceGemmReduceNoOpPtr>&); std::vector<DeviceGemmReduceNoOpPtr>&);
...@@ -60,7 +56,7 @@ namespace profiler { ...@@ -60,7 +56,7 @@ namespace profiler {
template <typename ADataType, template <typename ADataType,
typename BDataType, typename BDataType,
typename CDataType, typename CDataType,
typename DDataType, typename ReduceDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout>
...@@ -95,22 +91,22 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -95,22 +91,22 @@ bool profile_gemm_reduce_impl(int do_verification,
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_m_host_result( Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<DDataType> d0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<DDataType> d1_m_device_result( Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl; std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl;
std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl; std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl;
std::size_t num_thread = 1; std::size_t num_thread = 1;
switch(init_method) switch(init_method)
...@@ -130,34 +126,37 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -130,34 +126,37 @@ bool profile_gemm_reduce_impl(int do_verification,
using AElementOp = ck::tensor_operation::element_wise::PassThrough; using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough; using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp = ck::reduce::Add; using ReduceOp0 = ck::reduce::Add;
using D1ReduceOp = ck::reduce::Add; using ReduceOp1 = ck::reduce::Add;
using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare;
using DxsInElementOps = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>; using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide;
using DxsOutElementOps = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
std::array<void*, 3> gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op};
const auto a_element_op = AElementOp{}; const auto reduce0_op = ReduceOp0{};
const auto b_element_op = BElementOp{}; const auto reduce1_op = ReduceOp1{};
const auto c_element_op = CElementOp{};
const auto d0_reduce_op = D0ReduceOp{};
const auto d1_reduce_op = D1ReduceOp{};
auto dxs_in_element_op = DxsInElementOps{}; auto passthrough = UnaryIdenticElementOp{};
auto dxs_out_element_op = DxsOutElementOps{N, N}; auto square = UnarySquareElementOp{};
auto div = UnaryDivElementOp{N};
std::array<void*, 2> reduce_in_element_ops = {&passthrough, &square};
std::array<void*, 2> reduce_out_element_ops = {&div, &div};
if(do_verification) if(do_verification)
{ {
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
CDataType, CDataType,
DDataType, ReduceDataType,
AElementOp, AElementOp,
BElementOp, BElementOp,
CElementOp>; CElementOp>;
using ReduceAccDataType = DDataType; using ReduceAccDataType = ReduceDataType;
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_invoker = ref_gemm.MakeInvoker();
...@@ -169,37 +168,37 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -169,37 +168,37 @@ bool profile_gemm_reduce_impl(int do_verification,
for(int m = 0; m < M; ++m) for(int m = 0; m < M; ++m)
{ {
auto d0_acc = d0_reduce_op.GetIdentityValue<ReduceAccDataType>(); auto reduce0_acc = reduce0_op.GetIdentityValue<ReduceAccDataType>();
auto d1_acc = d1_reduce_op.GetIdentityValue<ReduceAccDataType>(); auto reduce1_acc = reduce1_op.GetIdentityValue<ReduceAccDataType>();
for(int n = 0; n < N; ++n) for(int n = 0; n < N; ++n)
{ {
ReduceAccDataType c_val = ReduceAccDataType d0_val =
ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n)); ck::type_convert<ReduceAccDataType>(c_m_n_host_result(m, n));
ReduceAccDataType d0_val;
ReduceAccDataType d1_val; ReduceAccDataType d1_val;
dxs_in_element_op(ck::Number<0>{})(d0_val, c_val); square(d1_val, d0_val);
dxs_in_element_op(ck::Number<1>{})(d1_val, c_val); reduce0_op(reduce0_acc, d0_val);
d0_reduce_op(d0_acc, d0_val); reduce1_op(reduce1_acc, d1_val);
d1_reduce_op(d1_acc, d1_val);
} }
dxs_out_element_op(ck::Number<0>{})(d0_acc, d0_acc); div(reduce0_acc, reduce0_acc);
dxs_out_element_op(ck::Number<1>{})(d1_acc, d1_acc); div(reduce1_acc, reduce1_acc);
d0_m_host_result(m) = ck::type_convert<DDataType>(d0_acc); reduce0_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce0_acc);
d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc); reduce1_m_host_result(m) = ck::type_convert<ReduceDataType>(reduce1_acc);
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
DeviceMem d0_device_buf(sizeof(DDataType) * d0_m_device_result.mDesc.GetElementSpace()); DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
DeviceMem d1_device_buf(sizeof(DDataType) * d1_m_device_result.mDesc.GetElementSpace()); reduce0_m_device_result.mDesc.GetElementSpace());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpace());
auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer())); reduce1_device_buf.GetDeviceBuffer()};
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
...@@ -258,30 +257,31 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -258,30 +257,31 @@ bool profile_gemm_reduce_impl(int do_verification,
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : gemm_ptrs)
{ {
auto argument_ptr = auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
gemm_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()), b_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()), nullptr,
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()), {},
&dxs_global, c_device_buf.GetDeviceBuffer(),
M, p_reduces,
N, M,
K, N,
StrideA, K,
StrideB, StrideA,
StrideC, StrideB,
a_element_op, StrideC,
b_element_op, {},
c_element_op, gemm_element_ops,
dxs_in_element_op, {},
dxs_out_element_op); reduce_in_element_ops,
reduce_out_element_ops);
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// init DO, D1 to 0 // init DO, D1 to 0
d0_device_buf.SetZero(); reduce0_device_buf.SetZero();
d1_device_buf.SetZero(); reduce1_device_buf.SetZero();
float ave_time = float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
...@@ -311,12 +311,12 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -311,12 +311,12 @@ bool profile_gemm_reduce_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.mData.data());
d0_device_buf.FromDevice(d0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
d1_device_buf.FromDevice(d1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(d0_m_device_result.mData, d0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData);
ck::utils::check_err(d1_m_device_result.mData, d1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData);
if(do_log) if(do_log)
{ {
...@@ -326,13 +326,17 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -326,13 +326,17 @@ bool profile_gemm_reduce_impl(int do_verification,
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",") LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d0_host: ", d0_m_host_result.mData, ",") LogRangeAsType<float>(
std::cout << "d0_host: ", reduce0_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d0_device: ", d0_m_device_result.mData, ",") LogRangeAsType<float>(
std::cout << "d0_device: ", reduce0_m_device_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d1_host: ", d1_m_host_result.mData, ",") LogRangeAsType<float>(
std::cout << "d1_host: ", reduce1_m_host_result.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "d1_device: ", d1_m_device_result.mData, ",") LogRangeAsType<float>(
std::cout << "d1_device: ", reduce1_m_device_result.mData, ",")
<< std::endl; << std::endl;
} }
} }
......
...@@ -232,6 +232,10 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -232,6 +232,10 @@ void profile_grouped_gemm_impl(int do_verification,
auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
// Forward declarations of the softmax instance factories. Each call appends
// device softmax instances for the in/out data types and tensor rank named in
// the function to the supplied vector. The definitions are not in this header.
namespace ck::tensor_operation::device::device_normalization_instance {

// half-precision input / half-precision output
void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>&);
void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>&);

// single-precision input / single-precision output
void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>&);
void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>&);

} // namespace ck::tensor_operation::device::device_normalization_instance
namespace ck {
namespace profiler {
// Normalization operation selected on the profiler command line.
// Only SOFTMAX has device instances registered by profile_normalization_impl.
enum class NormType
{
    LAYERNORM,
    BATCHNORM,
    SOFTMAX,
};
// Input/output data type pair, named <in>_<out>. The numeric value is what the
// command line passes as the data-type argument (0: fp32, 1: fp16, 2: bf16, 3: int8).
enum class NormDataType
{
    F32_F32,
    F16_F16,
    BF16_BF16,
    INT8_INT8,
};
// clang-format off
// Maps an element type to the short name used in the profiler's summary line.
// Only the specializations below are defined; any other type fails at link time.
// The specializations are `inline` because an explicit (full) specialization of a
// function template is an ordinary function: defining it in a header without
// `inline` produces duplicate symbols as soon as two translation units include it.
template <typename T> std::string type_to_string();
template <> inline std::string type_to_string<float>()   { return "f32"; }
template <> inline std::string type_to_string<half_t>()  { return "f16"; }
template <> inline std::string type_to_string<bhalf_t>() { return "bf16"; }
template <> inline std::string type_to_string<int8_t>()  { return "int8"; }
template <> inline std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on
/// @brief Runs every matching device normalization instance on one problem,
///        reports per-instance and best performance, and optionally verifies
///        the device output against the host reference softmax.
///
/// @param do_verification  non-zero: compare each instance's output with the host reference
/// @param init_method      0 / 1 / other: selects the tensor value generator (see switch below)
/// @param do_log           print input, reference and device tensors during verification
/// @param time_kernel      forwarded to StreamConfig for the kernel launch
/// @param in_length        tensor extents; its size selects rank-3 or rank-4 instances
/// @param in_strides       tensor strides; when empty the tensor is built from lengths only
/// @param reduce_dims      dimensions to reduce over
/// @param alpha            scaling value passed to the device op and the reference
/// @param beta             scaling value passed to the device op and the reference
/// @param norm_type        operation to profile; only NormType::SOFTMAX registers instances
///
/// @throws std::runtime_error when no instance is registered for the requested
///         operation / data types / rank.
template <typename InDataType, typename AccDataType, typename OutDataType>
void profile_normalization_impl(int do_verification,
                                int init_method,
                                bool do_log,
                                bool time_kernel,
                                std::vector<index_t> in_length,
                                std::vector<index_t> in_strides,
                                std::vector<index_t> reduce_dims,
                                AccDataType alpha,
                                AccDataType beta,
                                NormType norm_type)
{
    Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
                                               : Tensor<InDataType>(in_length, in_strides);
    Tensor<OutDataType> out(in.mDesc);

    switch(init_method)
    {
    // case 0: break;
    case 0:
        in.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
        out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
        break;
    case 1:
        in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
        out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
        break;
    default:
        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
    }

    // The reference output starts from the same initial contents as the device output.
    Tensor<OutDataType> out_ref(out);

    DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace());
    DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace());
    in_dev.ToDevice(in.mData.data());
    out_dev.ToDevice(out.mData.data());

    // Lengths/strides as actually held by the tensor descriptor, converted to index_t.
    std::vector<index_t> i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end());
    std::vector<index_t> i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end());

    // add device normalization instances
    std::vector<tensor_operation::device::DeviceNormalizationPtr> instances;

    if(norm_type == NormType::SOFTMAX)
    {
        if constexpr(is_same<InDataType, half_t>::value && is_same<OutDataType, half_t>::value &&
                     is_same<AccDataType, float>::value)
        {
            if(in_length.size() == 3)
                tensor_operation::device::device_normalization_instance::
                    add_device_softmax_f16_f16_rank3_instances(instances);

            if(in_length.size() == 4)
                tensor_operation::device::device_normalization_instance::
                    add_device_softmax_f16_f16_rank4_instances(instances);
        }
        else if constexpr(is_same<InDataType, float>::value && is_same<OutDataType, float>::value &&
                          is_same<AccDataType, float>::value)
        {
            if(in_length.size() == 3)
                tensor_operation::device::device_normalization_instance::
                    add_device_softmax_f32_f32_rank3_instances(instances);

            if(in_length.size() == 4)
                tensor_operation::device::device_normalization_instance::
                    add_device_softmax_f32_f32_rank4_instances(instances);
        }
    }

    if(instances.empty())
    {
        throw std::runtime_error("wrong! no device normalization instance found");
    }

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    for(auto& inst_ptr : instances)
    {
        // Open question from the original author: should the caller have to filter out
        // instances whose rank / number of reduce dims differ from the problem, or should
        // IsSupportedArgument() alone be enough? Until that is settled, filter here.
        if(!(inst_ptr->GetRank() == static_cast<index_t>(i_in_lengths.size()) &&
             inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
        {
            continue;
        }

        auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths,
                                                          i_in_strides,
                                                          reduce_dims,
                                                          &alpha,
                                                          &beta,
                                                          in_dev.GetDeviceBuffer(),
                                                          out_dev.GetDeviceBuffer());

        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
            LogRange(std::cout << "input lengths = [", in_length, ", ")
                << "], "
                << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
            // Skip only this instance. Returning here would abandon every remaining
            // instance (and the best-perf summary) because one kernel cannot handle
            // the problem.
            continue;
        }

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        // The output tensor is counted twice when beta is non-zero.
        std::size_t num_bytes =
            in.mDesc.GetElementSize() * sizeof(InDataType) +
            (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType);

        // bytes / 1e6 / milliseconds == GB/s
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
                  << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            // NOTE(review): out_dev and out_ref are not reset between instances (or between
            // timed repetitions). Presumably the previous output is blended in when beta != 0,
            // in which case device and reference inputs can drift apart -- TODO confirm.

            // TODO: factory method to dynamically switch between different reference normalizations
            using ReferenceFactory =
                tensor_operation::host::ReferenceSoftmax<InDataType, OutDataType, AccDataType>;

            ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims});

            out_dev.FromDevice(out.mData.data());

            bool pass;
            if(std::is_same<InDataType, int8_t>::value)
            {
                // int8 path: explicit message and tolerances, values logged as integers
                pass = ck::utils::check_err(
                    out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1);
                if(do_log)
                {
                    LogRangeAsType<int>(std::cout << "in  : ", in.mData, ",") << std::endl;
                    LogRangeAsType<int>(std::cout << "out_ref  : ", out_ref.mData, ",")
                        << std::endl;
                    LogRangeAsType<int>(std::cout << "out  : ", out.mData, ",") << std::endl;
                }
            }
            else
            {
                pass = ck::utils::check_err(out.mData, out_ref.mData);
                if(do_log)
                {
                    LogRangeAsType<float>(std::cout << "in  : ", in.mData, ",") << std::endl;
                    LogRangeAsType<float>(std::cout << "out_ref  : ", out_ref.mData, ",")
                        << std::endl;
                    LogRangeAsType<float>(std::cout << "out  : ", out.mData, ",") << std::endl;
                }
            }

            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "input lengths = [", in_length, ", ")
                    << "], "
                    << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
            }
        }
    }

    // Summary of the fastest instance that actually ran.
    std::cout << "Best Perf for datatype = " << type_to_string<InDataType>() << "_"
              << type_to_string<OutDataType>() << ", ";
    LogRange(std::cout << "length = ", i_in_lengths, ",") << ", ";
    LogRange(std::cout << "stride = ", i_in_strides, ",") << ", ";
    LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", ";
    std::cout << "alpha = " << alpha << ", "
              << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec
              << " GB/s, " << best_instance_name << std::endl;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
#include <unordered_map>
#include "profiler/include/profile_normalization_impl.hpp"
using ck::index_t;
using ck::profiler::NormDataType;
using ck::profiler::NormType;
// Minimal command-line helper for the normalization profiler.
//  - norm_dict maps the operation name given on the command line to a NormType.
//  - long_opts collects the integer values that follow each "--<key>" option.
struct ArgParser
{
    std::unordered_map<std::string, NormType> norm_dict = {{"layernorm", NormType::LAYERNORM},
                                                           {"batchnorm", NormType::BATCHNORM},
                                                           {"softmax", NormType::SOFTMAX}};

    std::unordered_map<std::string, std::vector<int>> long_opts = {
        {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}};

    // Returns true when argv[i] is exactly "--<key>". In that case every following
    // token, up to the next one starting with '-' (or the end of argv), is converted
    // with std::stoi and appended to long_opts[key]. Because of the '-' rule a negative
    // number ends the value list, and std::stoi keeps only the leading integer part of
    // a token such as "0.5".
    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
        const std::string flag = "--" + key;
        if(flag != argv[i])
        {
            return false;
        }

        for(int tok = i + 1; tok < argc && argv[tok][0] != '-'; ++tok)
        {
            long_opts[key].push_back(std::stoi(argv[tok]));
        }
        return true;
    }

    // For every known long option, scans argv from index 1 and consumes the first
    // occurrence of that option; later occurrences of the same option are ignored.
    void operator()(int argc, char* argv[])
    {
        for(auto& option : long_opts)
        {
            int idx = 1;
            while(idx < argc && !parse_opt(argc, argv, option.first, idx))
            {
                ++idx;
            }
        }
    }
};
// Writes the usage text for the normalization profiler to stdout, followed by
// an empty line, and flushes the stream.
void print_help()
{
    static const char* const usage_lines[] = {
        "arg1: tensor operation (layernorm/batchnorm/softmax)\n",
        "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n",
        "arg3: verification (0: no; 1: yes)\n",
        "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n",
        "arg5: print tensor value (0: no; 1: yes)\n",
        "arg6: time kernel (0=n0, 1=yes)\n",
        "--length: tensor extents (e.g, --length 8 4 256) \n",
        "--stride: tensor strides (e.g, --stride 1024 256 1)\n",
        "--reduce: to-reduce dimensions (e.g, --reduce 2)\n",
        "--alpha: alpha scaling value\n",
        "--beta: beta scaling value\n",
    };

    for(const char* line : usage_lines)
    {
        std::cout << line;
    }
    std::cout << std::endl;
}
// Command-line entry point of the normalization profiler.
//
// Positional arguments (see print_help): argv[1] operation name, argv[2] data type,
// argv[3] verification, argv[4] init method, argv[5] logging, argv[6] kernel timing.
// Long options (--length/--stride/--reduce/--alpha/--beta) are parsed by ArgParser;
// alpha defaults to 1 and beta to 0 when not given.
//
// Returns 0 after printing the help text (too few arguments) or after profiling.
// Throws std::runtime_error for an unknown operation name or an unimplemented data
// type; std::stoi may throw for non-numeric arguments.
int profile_normalization(int argc, char* argv[])
{
    // argv[1]..argv[6] are all read unconditionally below, so fewer than seven
    // entries would hand a null/out-of-range pointer to std::stoi.
    if(argc < 7)
    {
        print_help();
        return 0;
    }

    ArgParser arg_parser;

    // short unnamed options
    // Look the name up with find(): operator[] would silently insert an unknown
    // name and map it to the first enumerator.
    const auto norm_it = arg_parser.norm_dict.find(argv[1]);
    if(norm_it == arg_parser.norm_dict.end())
    {
        throw std::runtime_error(std::string("unknown normalization operation: ") + argv[1]);
    }
    const NormType norm_type     = norm_it->second;
    const NormDataType data_type = static_cast<NormDataType>(std::stoi(argv[2]));
    const bool do_verification   = std::stoi(argv[3]);
    const int init_method        = std::stoi(argv[4]);
    const bool do_log            = std::stoi(argv[5]);
    const bool time_kernel       = std::stoi(argv[6]);

    // parse the long options
    arg_parser(argc, argv);
    const std::vector<index_t> length = arg_parser.long_opts["length"];
    const std::vector<index_t> stride = arg_parser.long_opts["stride"];
    const std::vector<index_t> reduce = arg_parser.long_opts["reduce"];

    // ArgParser stores every option value as int, so alpha/beta are integral here
    // and only converted to float at the call sites below.
    const index_t alpha =
        arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0];
    const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0];

    if(data_type == NormDataType::F16_F16)
    {
        ck::profiler::profile_normalization_impl<ck::half_t, float, ck::half_t>(do_verification,
                                                                                init_method,
                                                                                do_log,
                                                                                time_kernel,
                                                                                length,
                                                                                stride,
                                                                                reduce,
                                                                                float(alpha),
                                                                                float(beta),
                                                                                norm_type);
    }
    else if(data_type == NormDataType::F32_F32)
    {
        ck::profiler::profile_normalization_impl<float, float, float>(do_verification,
                                                                      init_method,
                                                                      do_log,
                                                                      time_kernel,
                                                                      length,
                                                                      stride,
                                                                      reduce,
                                                                      float(alpha),
                                                                      float(beta),
                                                                      norm_type);
    }
    else
    {
        throw std::runtime_error("not implemented yet");
    }

    return 0;
}
// hijack main() for quick debugging
// int main(int argc, char* argv[])
// {
// profile_normalization(argc, argv);
// return 0;
// }
...@@ -20,6 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]); ...@@ -20,6 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
int profile_convnd_fwd(int argc, char* argv[]); int profile_convnd_fwd(int argc, char* argv[]);
int profile_convnd_bwd_data(int, char*[], int); int profile_convnd_bwd_data(int, char*[], int);
int profile_conv_bwd_weight(int, char*[]); int profile_conv_bwd_weight(int, char*[]);
int profile_normalization(int, char*[]);
int profile_reduce(int, char*[]); int profile_reduce(int, char*[]);
static void print_helper_message() static void print_helper_message()
...@@ -130,6 +131,11 @@ int main(int argc, char* argv[]) ...@@ -130,6 +131,11 @@ int main(int argc, char* argv[])
{ {
return profile_gemm_add_add_fastgelu(argc, argv); return profile_gemm_add_add_fastgelu(argc, argv);
} }
else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "layernorm") == 0 ||
strcmp(argv[1], "softmax") == 0)
{
return profile_normalization(argc, argv);
}
else else
{ {
print_helper_message(); print_helper_message();
......
...@@ -2,7 +2,10 @@ add_custom_target(test_softmax) ...@@ -2,7 +2,10 @@ add_custom_target(test_softmax)
add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp) add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp)
add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp) add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp)
add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp)
target_link_libraries(test_softmax_fp32 PRIVATE host_tensor) target_link_libraries(test_softmax_fp32 PRIVATE host_tensor)
target_link_libraries(test_softmax_fp16 PRIVATE host_tensor) target_link_libraries(test_softmax_fp16 PRIVATE host_tensor)
target_link_libraries(test_softmax_int8 PRIVATE host_tensor)
add_dependencies(test_softmax test_softmax_fp32) add_dependencies(test_softmax test_softmax_fp32)
add_dependencies(test_softmax test_softmax_fp16) add_dependencies(test_softmax test_softmax_fp16)
\ No newline at end of file add_dependencies(test_softmax test_softmax_int8)
\ No newline at end of file
...@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple> ...@@ -15,14 +15,19 @@ class TestSoftmaxFP16 : public ck::TestSoftmax<Tuple>
// clang-format off // clang-format off
using KernelTypes = ::testing::Types< using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<ck::half_t, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>, std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>> std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>,
std::tuple<ck::half_t, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>
>; >;
// clang-format on // clang-format on
TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes); TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes);
......
...@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple> ...@@ -15,14 +15,19 @@ class TestSoftmaxFP32 : public ck::TestSoftmax<Tuple>
// clang-format off // clang-format off
using KernelTypes = ::testing::Types< using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> // InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<float, float, ck::half_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>, std::tuple<float, float, float, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>> std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>,
std::tuple<float, float, float, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>
>; >;
// clang-format on // clang-format on
TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes); TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes);
......
#include "gtest/gtest.h"
#include "test_softmax_util.hpp"
// Shorthand for ck::Number<N>; lets the integer kernel parameters be written as types
// inside the std::tuple entries of KernelTypes.
template <ck::index_t N>
using I = ck::Number<N>;
// Typed-test fixture for the int8 softmax configurations. It adds no members of its own;
// everything comes from ck::TestSoftmax<Tuple>.
template <typename Tuple>
class TestSoftmaxINT8 : public ck::TestSoftmax<Tuple>
{
};
// Kernel configurations the typed test suite is instantiated with. Every tuple follows the
// column order given in the comment inside the list: the first six entries use
// NumReduceDim = 1, the last six use NumReduceDim = 2; all use Rank = 3 and BlockSize = 256.
// clang-format off
using KernelTypes = ::testing::Types<
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>,
std::tuple<int8_t, float, int8_t, I<3>, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>
>;
// clang-format on
// Instantiate the fixture once for every configuration listed in KernelTypes.
TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes);
// The test body only calls Run(), which the (empty) fixture gets from its base class.
TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); }
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector> #include <vector>
#include <iostream> #include <iostream>
#include <gtest/gtest.h> #include <gtest/gtest.h>
...@@ -16,6 +18,18 @@ ...@@ -16,6 +18,18 @@
namespace ck { namespace ck {
// Formats the elements of `range` as a single string, separated by ", "
// (e.g. {1, 2, 3} -> "1, 2, 3"). Elements are written with operator<<.
// An empty range yields an empty string.
template <typename Range>
std::string serialize_range(const Range& range)
{
    std::stringstream ss;
    bool first = true;
    for(const auto& r : range)
    {
        // Emit the separator *between* elements instead of trimming a trailing one:
        // the previous "str.end() - 2" was undefined behaviour for an empty range.
        if(!first)
        {
            ss << ", ";
        }
        ss << r;
        first = false;
    }
    return ss.str();
}
template <typename Tuple> template <typename Tuple>
class TestSoftmax : public ::testing::Test class TestSoftmax : public ::testing::Test
{ {
...@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test ...@@ -80,23 +94,43 @@ class TestSoftmax : public ::testing::Test
auto argument_ptr = device_instance.MakeArgumentPointer(i_in_lengths, auto argument_ptr = device_instance.MakeArgumentPointer(i_in_lengths,
i_in_strides, i_in_strides,
reduce_dims, reduce_dims,
alpha, &alpha,
beta, &beta,
in_dev.GetDeviceBuffer(), in_dev.GetDeviceBuffer(),
out_dev.GetDeviceBuffer()); out_dev.GetDeviceBuffer());
if(!device_instance.IsSupportedArgument(argument_ptr.get())) if(!device_instance.IsSupportedArgument(argument_ptr.get()))
{ {
FAIL() << "Unsupported argument"; // std::cout << "Skipped due to unsupported argument: "
// << "input lengths = [" << serialize_range(in_length) << "], "
// << "scaler = [" << alpha << ", " << beta << "]." << std::endl;
return;
} }
auto invoker_ptr = device_instance.MakeInvokerPointer(); auto invoker_ptr = device_instance.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get()); invoker_ptr->Run(argument_ptr.get());
ref_instance_invoker_.Run({in, out_ref, alpha, beta, Rank, reduce_dims}); ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims});
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.mData.data());
EXPECT_TRUE(ck::utils::check_err(out.mData, out_ref.mData));
bool pass;
if(std::is_same<InDataType, int8_t>::value)
{
EXPECT_TRUE(pass = ck::utils::check_err(
out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1));
}
else
{
EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData));
}
if(!pass)
{
FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], "
<< "scaler = [" << alpha << ", " << beta << "].";
}
} }
void Run() void Run()
...@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test ...@@ -105,13 +139,14 @@ class TestSoftmax : public ::testing::Test
{ {
for(auto scale : this->scales_) for(auto scale : this->scales_)
{ {
this->RunSingle(in_length, std::get<0>(scale), std::get<1>(scale)); this->RunSingle(in_length, scale[0], scale[1]);
} }
} }
} }
std::vector<std::vector<index_t>> in_lengths_ = {{1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}}; std::vector<std::vector<index_t>> in_lengths_ = {
std::vector<std::tuple<AccDataType, AccDataType>> scales_ = {{1, 0}, {2, 2}, {0, 1}}; {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}};
std::vector<std::vector<AccDataType>> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}};
typename ReferenceInstance::Invoker ref_instance_invoker_; typename ReferenceInstance::Invoker ref_instance_invoker_;
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment