Unverified Commit 24af0144 authored by Po Yen Chen's avatar Po Yen Chen Committed by GitHub
Browse files

Merge branch 'develop' into gemm_layernorm_welford

parents 961f5e9e b79bbbc2
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
...@@ -20,12 +20,12 @@ set(PROFILER_SOURCE ...@@ -20,12 +20,12 @@ set(PROFILER_SOURCE
src/profile_conv_fwd_bias_relu.cpp src/profile_conv_fwd_bias_relu.cpp
src/profile_conv_fwd_bias_relu_add.cpp src/profile_conv_fwd_bias_relu_add.cpp
src/profile_conv_bwd_data.cpp src/profile_conv_bwd_data.cpp
src/profile_conv_bwd_weight.cpp
src/profile_grouped_conv_fwd.cpp src/profile_grouped_conv_fwd.cpp
src/profile_grouped_conv_bwd_weight.cpp
src/profile_reduce.cpp src/profile_reduce.cpp
src/profile_groupnorm.cpp src/profile_groupnorm.cpp
src/profile_layernorm.cpp src/profile_layernorm.cpp
src/profile_normalization.cpp src/profile_softmax.cpp
) )
add_executable(ckProfiler ${PROFILER_SOURCE}) add_executable(ckProfiler ${PROFILER_SOURCE})
...@@ -49,10 +49,13 @@ target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance) ...@@ -49,10 +49,13 @@ target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_fwd_instance)
target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_data_instance)
target_link_libraries(ckProfiler PRIVATE device_conv1d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_grouped_conv1d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_grouped_conv2d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_grouped_conv3d_bwd_weight_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
target_link_libraries(ckProfiler PRIVATE device_normalization_instance) target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
target_link_libraries(ckProfiler PRIVATE device_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
rocm_install(TARGETS ckProfiler COMPONENT profiler)
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace ck { namespace ck {
...@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, ...@@ -111,15 +112,15 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
std::size_t stride, std::size_t stride,
std::size_t batch_stride, std::size_t batch_stride,
auto layout) { auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), Row>::value) if(std::is_same<decltype(layout), Row>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
std::vector<std::size_t>({batch_stride, stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
std::vector<std::size_t>({batch_stride, 1, stride}));
} }
}; };
...@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, ...@@ -330,8 +331,7 @@ bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification,
{ {
e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data()); e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data());
pass = pass & ck::utils::check_err(e1_g_m_o_device_result.mData, pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result);
e1_g_m_o_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace ck { namespace ck {
...@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, ...@@ -105,15 +106,15 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
std::size_t stride, std::size_t stride,
std::size_t batch_stride, std::size_t batch_stride,
auto layout) { auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), Row>::value) if(std::is_same<decltype(layout), Row>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
std::vector<std::size_t>({batch_stride, stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
std::vector<std::size_t>({batch_stride, 1, stride}));
} }
}; };
...@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification, ...@@ -283,8 +284,7 @@ bool profile_batched_gemm_gemm_impl(bool do_verification,
{ {
c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
pass = pass & pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace ck { namespace ck {
...@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -50,15 +51,15 @@ bool profile_batched_gemm_impl(int do_verification,
std::size_t stride, std::size_t stride,
std::size_t batch_stride, std::size_t batch_stride,
auto layout) { auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
std::vector<std::size_t>({batch_stride, stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
std::vector<std::size_t>({batch_stride, 1, stride}));
} }
}; };
...@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -202,8 +203,7 @@ bool profile_batched_gemm_impl(int do_verification,
{ {
c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_g_m_n_device_result.mData.data());
pass = pass & pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
namespace ck { namespace ck {
...@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -78,15 +79,15 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
std::size_t col, std::size_t col,
std::size_t stride, std::size_t stride,
auto layout) { auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz});
std::vector<std::size_t>({row * stride, stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride});
std::vector<std::size_t>({col * stride, 1, stride}));
} }
}; };
...@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -95,17 +96,13 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
Tensor<CDataType> c_g_m_n_host_result( Tensor<CDataType> c_g_m_n_host_result(
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> d0_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d0_g_m_host_result({BatchCount, M});
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> d1_g_m_host_result({BatchCount, M});
Tensor<ReduceDataType> d1_g_m_host_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
Tensor<CDataType> c_g_m_n_device_result( Tensor<CDataType> c_g_m_n_device_result(
f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> d0_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>( Tensor<ReduceDataType> d0_g_m_device_result({BatchCount, M});
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> d1_g_m_device_result({BatchCount, M});
Tensor<ReduceDataType> d1_g_m_device_result(HostTensorDescriptor(std::vector<std::size_t>(
{static_cast<std::size_t>(BatchCount), static_cast<std::size_t>(M)})));
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
...@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification, ...@@ -319,12 +316,9 @@ bool profile_batched_gemm_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data());
reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data());
bool c_error = bool c_error = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result);
ck::utils::check_err(c_g_m_n_device_result.mData, c_g_m_n_host_result.mData); bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result);
bool d0_error = bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result);
ck::utils::check_err(d0_g_m_device_result.mData, d0_g_m_host_result.mData);
bool d1_error =
ck::utils::check_err(d1_g_m_device_result.mData, d1_g_m_host_result.mData);
pass = pass && (c_error == true); pass = pass && (c_error == true);
pass = pass && (d0_error == true); pass = pass && (d0_error == true);
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
...@@ -29,7 +30,8 @@ template <typename ADataType, ...@@ -29,7 +30,8 @@ template <typename ADataType,
typename ALayout, typename ALayout,
typename B0Layout, typename B0Layout,
typename B1Layout, typename B1Layout,
typename CLayout> typename CLayout,
bool MaskOutUpperTriangle>
bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -46,16 +48,18 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -46,16 +48,18 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
int BatchStrideA = -1, int BatchStrideA = -1,
int BatchStrideB0 = -1, int BatchStrideB0 = -1,
int BatchStrideB1 = -1, int BatchStrideB1 = -1,
int BatchStrideC = -1) int BatchStrideC = -1,
float alpha = 1.f)
{ {
using Row = tensor_layout::gemm::RowMajor; using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor; using Col = tensor_layout::gemm::ColumnMajor;
using PassThrough = tensor_operation::element_wise::PassThrough; using PassThrough = tensor_operation::element_wise::PassThrough;
using Scale = tensor_operation::element_wise::Scale;
using AElementOp = PassThrough; using AElementOp = PassThrough;
using B0ElementOp = PassThrough; using B0ElementOp = PassThrough;
using Acc0ElementOp = PassThrough; using Acc0ElementOp = Scale;
using B1ElementOp = PassThrough; using B1ElementOp = PassThrough;
using CElementOp = PassThrough; using CElementOp = PassThrough;
using AccDataType = float; using AccDataType = float;
...@@ -67,7 +71,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -67,7 +71,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
AccDataType, AccDataType,
AElementOp, AElementOp,
B0ElementOp, B0ElementOp,
CElementOp>; Acc0ElementOp>;
// Ref Softmax: fp32 in, various type out // Ref Softmax: fp32 in, various type out
using ReferenceSoftmaxInstance = using ReferenceSoftmaxInstance =
...@@ -110,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -110,15 +114,15 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
std::size_t stride, std::size_t stride,
std::size_t batch_stride, std::size_t batch_stride,
auto layout) { auto layout) {
using namespace ck::literals;
if(std::is_same<decltype(layout), Row>::value) if(std::is_same<decltype(layout), Row>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz});
std::vector<std::size_t>({batch_stride, stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride});
std::vector<std::size_t>({batch_stride, 1, stride}));
} }
}; };
...@@ -185,7 +189,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -185,7 +189,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
auto a_element_op = AElementOp{}; auto a_element_op = AElementOp{};
auto b0_element_op = B0ElementOp{}; auto b0_element_op = B0ElementOp{};
auto acc0_element_op = Acc0ElementOp{}; auto acc0_element_op = Acc0ElementOp{alpha};
auto b1_element_op = B1ElementOp{}; auto b1_element_op = B1ElementOp{};
auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
...@@ -201,7 +205,8 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -201,7 +205,8 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
B0ElementOp, B0ElementOp,
Acc0ElementOp, Acc0ElementOp,
B1ElementOp, B1ElementOp,
CElementOp>; CElementOp,
MaskOutUpperTriangle>;
// get device op instances // get device op instances
const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -214,10 +219,16 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -214,10 +219,16 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
auto ref_gemm0 = ReferenceGemm0Instance{}; auto ref_gemm0 = ReferenceGemm0Instance{};
auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
auto ref_gemm0_argument = ref_gemm0.MakeArgument( auto ref_gemm0_argument = ref_gemm0.MakeArgument(
a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, PassThrough{}); a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, Scale{alpha});
ref_gemm0_invoker.Run(ref_gemm0_argument); ref_gemm0_invoker.Run(ref_gemm0_argument);
// mask out upper triangle
acc0_g_m_n.ForEach([&](auto& self, auto idx) {
if(MaskOutUpperTriangle && idx[1] < idx[2])
self(idx) = -ck::NumericLimits<float>::Infinity();
});
auto ref_softmax = ReferenceSoftmaxInstance{}; auto ref_softmax = ReferenceSoftmaxInstance{};
auto ref_softmax_invoker = ref_softmax.MakeInvoker(); auto ref_softmax_invoker = ref_softmax.MakeInvoker();
auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
...@@ -297,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, ...@@ -297,8 +308,7 @@ bool profile_batched_gemm_softmax_gemm_impl(bool do_verification,
{ {
c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
pass = pass & pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result);
ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -7,51 +7,48 @@ ...@@ -7,51 +7,48 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/batched_gemm_masking_scale_softmax_gemm_permute.hpp" #include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
template <typename ADataType, template <index_t NumDimG,
index_t NumDimM,
index_t NumDimN,
index_t NumDimK,
index_t NumDimO,
typename ADataType,
typename B0DataType, typename B0DataType,
typename B1DataType, typename B1DataType,
typename CDataType, typename CDataType,
typename ALayout, typename Acc0BiasesDataType,
typename B0Layout, typename Acc1BiasesDataType,
typename B1Layout, tensor_operation::device::MaskingSpecialization MaskingSpec>
typename CPermuteNumDims_G_M_O> bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification,
bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verification, int init_method,
int init_method, bool do_log,
bool do_log, bool time_kernel,
bool time_kernel, int M,
int M, int N,
int N, int K,
int K, int O,
int O, int G0,
int G0, int G1,
int G1, float alpha = 1.f)
int StrideA = -1,
int StrideB0 = -1,
int StrideB1 = -1,
int BatchStrideA = -1,
int BatchStrideB0 = -1,
int BatchStrideB1 = -1,
float alpha = 1.f)
{ {
using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor;
using PassThrough = tensor_operation::element_wise::PassThrough; using PassThrough = tensor_operation::element_wise::PassThrough;
using Scale = tensor_operation::element_wise::Scale; using Scale = tensor_operation::element_wise::Scale;
using AElementOp = PassThrough; using AElementOp = PassThrough;
...@@ -60,6 +57,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -60,6 +57,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
using B1ElementOp = PassThrough; using B1ElementOp = PassThrough;
using CElementOp = PassThrough; using CElementOp = PassThrough;
using AccDataType = float; using AccDataType = float;
using tensor_operation::device::MaskingSpecialization;
// Ref Gemm0: various type in, fp32 out // Ref Gemm0: various type in, fp32 out
using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm<ADataType, using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm<ADataType,
...@@ -85,67 +83,33 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -85,67 +83,33 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
bool pass = true; bool pass = true;
std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O}; // A layout [G0, M, G1, K]
std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
StrideA = (StrideA < 0) ? DefaultStrideA : StrideA; // B0 layout [G0, N, G1, K]
StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
const int DefaultBatchStrideA = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA; // B1 layout [G0, N, G1, O]
const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0; std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1; std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA; // C layout [G0, M, G1, O]
BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
const int BatchCount = G0 * G1; const int BatchCount = G0 * G1;
auto f_host_tensor_descriptor = [](std::size_t batch_count, Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
std::size_t row, Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
std::size_t col, Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
std::size_t stride, Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
std::size_t batch_stride, Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
auto layout) {
if(std::is_same<decltype(layout), Row>::value) std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
{ std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}), std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl;
std::vector<std::size_t>({batch_stride, stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
std::vector<std::size_t>({batch_stride, 1, stride}));
}
};
// C_m_o = A_m_k * B0_k_n * B1_n_o
Tensor<ADataType> a_g_m_k(
f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
Tensor<B0DataType> b0_g_k_n(
f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
Tensor<B1DataType> b1_g_n_o(
f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
Tensor<CDataType> c_gs_ms_os_host_result(
std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
Tensor<CDataType> c_gs_ms_os_device_result(
std::vector<std::size_t>(c_gs_ms_os_lengths.begin(), c_gs_ms_os_lengths.end()),
std::vector<std::size_t>(c_gs_ms_os_strides.begin(), c_gs_ms_os_strides.end()));
// Host verification: Output of Gemm0 is input A of Gemm1
Tensor<AccDataType> acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
Tensor<CDataType> c_g_m_o_host_result(std::vector<int>{BatchCount, M, O},
std::vector<int>{M * O, O, 1});
std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;
std::srand(1); // work around test flakiness std::srand(1); // work around test flakiness
...@@ -157,38 +121,38 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -157,38 +121,38 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
// or not. May want to try exact same approach as the GPU kernel in the host reference // or not. May want to try exact same approach as the GPU kernel in the host reference
// GEMM+Softmax+GEMM function to see if the accuracy discrepancy goes away. Until then, // GEMM+Softmax+GEMM function to see if the accuracy discrepancy goes away. Until then,
// shrink the input value range as it is less likely to produce errors of around ~1e-3. // shrink the input value range as it is less likely to produce errors of around ~1e-3.
// a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}); // a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
// b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5}); // b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
// b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5}); // b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2}); a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
break; break;
case 2: case 2:
a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
break; break;
case 3: case 3:
a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2}); a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
break; break;
default: default:
a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1}); a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
} }
DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize()); DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize()); DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize());
DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize()); DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize());
DeviceMem c_gs_ms_os_device_buf(sizeof(CDataType) * DeviceMem c_device_buf(sizeof(CDataType) *
c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); c_gs_ms_os_device_result.mDesc.GetElementSpaceSize());
a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data()); a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());
auto a_element_op = AElementOp{}; auto a_element_op = AElementOp{};
auto b0_element_op = B0ElementOp{}; auto b0_element_op = B0ElementOp{};
...@@ -196,20 +160,23 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -196,20 +160,23 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
auto b1_element_op = B1ElementOp{}; auto b1_element_op = B1ElementOp{};
auto c_element_op = CElementOp{}; auto c_element_op = CElementOp{};
using DeviceOp = using DeviceOp = tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<ALayout, 1,
B0Layout, 1,
B1Layout, 1,
CPermuteNumDims_G_M_O, 1,
ADataType, ADataType,
B0DataType, B0DataType,
B1DataType, B1DataType,
CDataType, CDataType,
AElementOp, ck::Tuple<>,
B0ElementOp, ck::Tuple<>,
Acc0ElementOp, AElementOp,
B1ElementOp, B0ElementOp,
CElementOp>; Acc0ElementOp,
B1ElementOp,
CElementOp,
MaskingSpec>;
// get device op instances // get device op instances
const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory<
...@@ -219,6 +186,26 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -219,6 +186,26 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
if(do_verification) if(do_verification)
{ {
c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
Tensor<ADataType> a_g_m_k({BatchCount, M, K});
Tensor<B0DataType> b0_g_k_n({BatchCount, K, N});
Tensor<B1DataType> b1_g_n_o({BatchCount, N, O});
Tensor<AccDataType> acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0
Tensor<ADataType> a1_g_m_n({BatchCount, M, N}); // scratch object after softmax
Tensor<CDataType> c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1
// permute
a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
});
b0_gs_ns_ks.ForEach([&](auto& self, auto idx) {
b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
});
b1_gs_os_ns.ForEach([&](auto& self, auto idx) {
b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
});
auto ref_gemm0 = ReferenceGemm0Instance{}; auto ref_gemm0 = ReferenceGemm0Instance{};
auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); auto ref_gemm0_invoker = ref_gemm0.MakeInvoker();
auto ref_gemm0_argument = ref_gemm0.MakeArgument( auto ref_gemm0_argument = ref_gemm0.MakeArgument(
...@@ -228,7 +215,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -228,7 +215,7 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
// mask out upper triangle // mask out upper triangle
acc0_g_m_n.ForEach([&](auto& self, auto idx) { acc0_g_m_n.ForEach([&](auto& self, auto idx) {
if(idx[1] < idx[2]) if(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle && idx[1] < idx[2])
self(idx) = -ck::NumericLimits<float>::Infinity(); self(idx) = -ck::NumericLimits<float>::Infinity();
}); });
...@@ -265,23 +252,24 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -265,23 +252,24 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = op_ptr->MakeArgumentPointer( auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()), static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()), static_cast<B0DataType*>(b0_device_buf.GetDeviceBuffer()),
static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()), static_cast<B1DataType*>(b1_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_gs_ms_os_device_buf.GetDeviceBuffer()), static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
M, {}, // std::array<void*, 1> p_acc0_biases;
N, {}, // std::array<void*, 1> p_acc1_biases;
K, a_gs_ms_ks_lengths,
O, a_gs_ms_ks_strides,
BatchCount, b0_gs_ns_ks_lengths,
b0_gs_ns_ks_strides,
b1_gs_os_ns_lengths,
b1_gs_os_ns_strides,
c_gs_ms_os_lengths, c_gs_ms_os_lengths,
c_gs_ms_os_strides, c_gs_ms_os_strides,
StrideA, {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
StrideB0, {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
StrideB1, {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
BatchStrideA, {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
BatchStrideB0,
BatchStrideB1,
a_element_op, a_element_op,
b0_element_op, b0_element_op,
acc0_element_op, acc0_element_op,
...@@ -319,18 +307,18 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi ...@@ -319,18 +307,18 @@ bool profile_batched_gemm_masking_scale_softmax_gemm_permute_impl(bool do_verifi
if(do_verification) if(do_verification)
{ {
c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
pass = pass & ck::utils::check_err(c_gs_ms_os_device_result.mData, pass =
c_gs_ms_os_host_result.mData); pass & ck::utils::check_err(c_gs_ms_os_device_result, c_gs_ms_os_host_result);
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "a_g_m_k: ", a_g_m_k.mData, ",") LogRangeAsType<float>(std::cout << "a_gs_ms_ks: ", a_gs_ms_ks.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "b0_g_k_n : ", b0_g_k_n.mData, ",") LogRangeAsType<float>(std::cout << "b0_gs_ns_ks : ", b0_gs_ns_ks.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>(std::cout << "b1_g_n_o : ", b1_g_n_o.mData, ",") LogRangeAsType<float>(std::cout << "b1_gs_os_ns : ", b1_gs_os_ns.mData, ",")
<< std::endl; << std::endl;
LogRangeAsType<float>( LogRangeAsType<float>(
std::cout << "c_gs_ms_os_host_result : ", c_gs_ms_os_host_result.mData, ",") std::cout << "c_gs_ms_os_host_result : ", c_gs_ms_os_host_result.mData, ",")
......
...@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -209,8 +209,7 @@ bool profile_conv_bwd_data_impl(int do_verification,
{ {
in_device_buf.FromDevice(input_device_result.mData.data()); in_device_buf.FromDevice(input_device_result.mData.data());
pass = pass = pass & ck::utils::check_err(input_device_result, input_host_result);
pass & ck::utils::check_err(input_device_result.mData, input_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp"
namespace ck { namespace ck {
...@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
using namespace ck::literals;
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value || if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value || is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value) is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
} }
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value || else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value || is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value) is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
} }
}; };
...@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector // bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k( Tensor<OutDataType> bias_k({K});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
// residual: assume same layout as output tensor // residual: assume same layout as output tensor
Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); Tensor<OutDataType> resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
...@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification, ...@@ -251,8 +251,7 @@ void profile_conv_fwd_bias_relu_add_impl(int do_verification,
{ {
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
ck::utils::check_err(out_n_k_ho_wo_device_result.mData, ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
out_n_k_ho_wo_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp"
namespace ck { namespace ck {
...@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -68,19 +69,19 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
using namespace ck::literals;
if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value || if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value || is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value) is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
} }
else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value || else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
is_same<decltype(layout), tensor_layout::convolution::KYXC>::value || is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
is_same<decltype(layout), tensor_layout::convolution::NHWK>::value) is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
} }
}; };
...@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -92,8 +93,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
// bias: assume contiguous 1d vector // bias: assume contiguous 1d vector
Tensor<OutDataType> bias_k( Tensor<OutDataType> bias_k({K});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
...@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification, ...@@ -239,8 +239,7 @@ void profile_conv_fwd_bias_relu_impl(int do_verification,
{ {
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
ck::utils::check_err(out_n_k_ho_wo_device_result.mData, ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result);
out_n_k_ho_wo_host_result.mData);
if(do_log) if(do_log)
{ {
......
...@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -191,7 +191,7 @@ bool profile_conv_fwd_impl(int do_verification,
{ {
out_device_buf.FromDevice(device_output.mData.data()); out_device_buf.FromDevice(device_output.mData.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData); pass = pass & ck::utils::check_err(device_output, host_output);
if(do_log) if(do_log)
{ {
......
...@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification, ...@@ -453,7 +453,7 @@ bool profile_convnd_bwd_data_impl(int do_verification,
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
} }
success = ck::utils::check_err(input_host_result.mData, input_device_result.mData); success = ck::utils::check_err(input_host_result, input_device_result);
if(do_log) if(do_log)
{ {
......
...@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification, ...@@ -433,7 +433,7 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(weights_device_result.mData.data()); wei_device_buf.FromDevice(weights_device_result.mData.data());
success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData); success = ck::utils::check_err(weights_host_result, weights_device_result);
if(success == false) if(success == false)
{ {
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace profiler {
template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise2D(HostTensorC& C,
const HostTensorA& A,
const HostTensorB& B,
const std::vector<std::size_t>& shape,
Functor functor)
{
using ctype = ck::remove_reference_t<decltype(C(0, 0))>;
for(std::size_t m = 0; m < shape[0]; ++m)
for(std::size_t n = 0; n < shape[1]; ++n)
{
auto a_val = A(m, n);
auto b_val = B(m, n);
ctype c_val = 0;
functor(c_val, a_val, b_val);
C(m, n) = c_val;
}
}
template <typename ADataType,
typename BDataType,
typename GammaDataType,
typename BetaDataType,
typename AccDataType,
typename YDataType>
bool profile_elementwise_layernorm_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> length)
{
using Add = ck::tensor_operation::element_wise::Add;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
if(length.size() != 2)
return false;
index_t M = length[0];
index_t N = length[1];
index_t Stride = N;
constexpr int Rank = 2;
constexpr int NumReduceDim = 1;
std::vector<index_t> reduce_dim = {1};
std::vector<index_t> gammaBetaLength = {N};
std::vector<index_t> gammaBetaStride = {0, 1};
auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
using namespace ck::literals;
return HostTensorDescriptor({row, col}, {stride, 1_uz});
};
Tensor<ADataType> a(length);
Tensor<BDataType> b(length);
Tensor<GammaDataType> gamma(gammaBetaLength);
Tensor<BetaDataType> beta(gammaBetaLength);
Tensor<YDataType> y(length);
Tensor<YDataType> host_y(length);
switch(init_method)
{
case 0:
a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
break;
case 1:
a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
break;
default:
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
}
DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize());
DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
a_dev.ToDevice(a.mData.data());
b_dev.ToDevice(b.mData.data());
gamma_dev.ToDevice(gamma.mData.data());
beta_dev.ToDevice(beta.mData.data());
std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};
// add device normalization instances
using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
ck::Tuple<ADataType, BDataType>,
GammaDataType,
BetaDataType,
AccDataType,
YDataType,
Add,
PassThrough,
2,
1>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
if(do_verification)
{
using XDataType = ADataType;
std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
static_cast<unsigned long>(N)};
Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
x, a, b, mn, Add{});
using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
PassThrough,
Rank,
NumReduceDim>;
ReferenceInstance ref;
auto ref_argument =
ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
auto ref_invoker = ref.MakeInvoker();
ref_invoker.Run(ref_argument);
}
int num_kernel = 0;
for(auto& inst_ptr : instance_ptrs)
{
auto argument_ptr = inst_ptr->MakeArgumentPointer(
length,
{
std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
},
gammaBetaStride,
gammaBetaStride,
std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
reduce_dim,
1e-4,
input,
gamma_dev.GetDeviceBuffer(),
beta_dev.GetDeviceBuffer(),
y_dev.GetDeviceBuffer(),
Add{},
PassThrough{});
if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
{
++num_kernel;
}
else
{
continue;
}
auto invoker_ptr = inst_ptr->MakeInvokerPointer();
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
b.mDesc.GetElementSize() * sizeof(BDataType) +
gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
y.mDesc.GetElementSize() * sizeof(YDataType);
float gb_per_sec = num_bytes / 1.E6 / avg_time;
if(time_kernel)
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< inst_ptr->GetTypeString() << std::endl;
if(avg_time < best_avg_time)
{
best_instance_name = inst_ptr->GetTypeString();
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
y_dev.FromDevice(y.mData.data());
bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "b : ", b.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
}
if(!pass)
{
std::cout << inst_ptr->GetTypeString() << " failed verification: ";
LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
return false;
}
else
{
if(time_kernel)
std::cout << "pass" << std::endl;
}
}
}
if(time_kernel)
{
LogRange(std::cout << "length = ", length, ",") << ", ";
std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
<< best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
}
if(num_kernel == 0)
{
std::cout << "Error: No kernel is tested" << std::endl;
return false;
}
return true;
}
} // namespace profiler
} // namespace ck
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -47,15 +48,15 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -121,8 +122,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
// run reference // run reference
if(do_verification) if(do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor( Tensor<AccDataType> c_m_n({M, N});
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -223,8 +223,7 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
} }
} }
else else
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -75,21 +76,20 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
int StrideD0) int StrideD0)
{ {
auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
return HostTensorDescriptor(std::vector<std::size_t>({len}), return HostTensorDescriptor({len}, {stride});
std::vector<std::size_t>({stride}));
}; };
auto f_host_tensor_descriptor2d = auto f_host_tensor_descriptor2d =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -99,16 +99,12 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1)); Tensor<BiasDataType> bias_n(f_host_tensor_descriptor1d(N, 1));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_host_result( Tensor<ReduceDataType> reduce0_m_host_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_host_result({M});
Tensor<ReduceDataType> reduce1_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
Tensor<ReduceDataType> reduce0_m_device_result( Tensor<ReduceDataType> reduce0_m_device_result({M});
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)}))); Tensor<ReduceDataType> reduce1_m_device_result({M});
Tensor<ReduceDataType> reduce1_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
...@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -347,9 +343,9 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data());
reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data());
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData); ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
ck::utils::check_err(reduce0_m_device_result.mData, reduce0_m_host_result.mData); ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result);
ck::utils::check_err(reduce1_m_device_result.mData, reduce1_m_host_result.mData); ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result);
if(do_log) if(do_log)
{ {
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -46,15 +47,15 @@ bool profile_gemm_bilinear_impl(int do_verification,
{ {
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -116,8 +117,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
// run reference // run reference
if(do_verification) if(do_verification)
{ {
Tensor<AccDataType> c_m_n(HostTensorDescriptor( Tensor<AccDataType> c_m_n({M, N});
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType, BDataType,
...@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -215,8 +215,7 @@ bool profile_gemm_bilinear_impl(int do_verification,
{ {
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
} }
} }
else else
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification, ...@@ -45,15 +46,15 @@ int profile_gemm_impl(int do_verification,
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification, ...@@ -187,8 +188,7 @@ int profile_gemm_impl(int do_verification,
{ {
c_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
pass & ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment