Commit aea62819 authored by Chaitanya Inumella's avatar Chaitanya Inumella
Browse files

Rebase branch 'develop' of...

Rebase branch 'develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into contraction_hipTENSOR
parents 75af5450 75ab874e
...@@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification, ...@@ -433,21 +433,17 @@ bool profile_convnd_bwd_weight_impl(int do_verification,
{ {
wei_device_buf.FromDevice(weights_device_result.mData.data()); wei_device_buf.FromDevice(weights_device_result.mData.data());
float max_error = check_error(weights_host_result, weights_device_result); success = ck::utils::check_err(weights_host_result.mData, weights_device_result.mData);
if(max_error > 8) if(success == false)
{ {
std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
success = false;
} }
else else
{ {
std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl;
} }
check_error(weights_host_result, weights_device_result);
if(do_log) if(do_log)
{ {
std::cout << "in : "; std::cout << "in : ";
......
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -29,7 +29,9 @@ template <typename ADataType, ...@@ -29,7 +29,9 @@ template <typename ADataType,
typename EDataType, typename EDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename DELayout> // assume Ds and E have same layout typename D0Layout,
typename D1Layout,
typename ELayout>
bool profile_gemm_add_add_fastgelu_impl(int do_verification, bool profile_gemm_add_add_fastgelu_impl(int do_verification,
int init_method, int init_method,
bool /*do_log*/, bool /*do_log*/,
...@@ -59,10 +61,10 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -59,10 +61,10 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, DELayout{})); Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, DELayout{})); Tensor<D1DataType> d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
...@@ -100,7 +102,8 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -100,7 +102,8 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout, ALayout,
BLayout, BLayout,
DELayout, ck::Tuple<D0Layout, D1Layout>,
ELayout,
ADataType, ADataType,
BDataType, BDataType,
ck::Tuple<D0DataType, D1DataType>, ck::Tuple<D0DataType, D1DataType>,
...@@ -146,11 +149,11 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification, ...@@ -146,11 +149,11 @@ bool profile_gemm_add_add_fastgelu_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace()); DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpace()); DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace()); DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -217,15 +217,15 @@ void profile_gemm_bias_add_reduce_impl(int do_verification, ...@@ -217,15 +217,15 @@ void profile_gemm_bias_add_reduce_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpace()); DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpace()); DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
reduce0_m_device_result.mDesc.GetElementSpace()); reduce0_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpace()); reduce1_m_device_result.mDesc.GetElementSpaceSize());
std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
reduce1_device_buf.GetDeviceBuffer()}; reduce1_device_buf.GetDeviceBuffer()};
......
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
#include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -28,7 +28,8 @@ template <typename ADataType, ...@@ -28,7 +28,8 @@ template <typename ADataType,
typename EDataType, typename EDataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename DELayout> // assume Ds and E have same layout typename DLayout,
typename ELayout>
bool profile_gemm_bilinear_impl(int do_verification, bool profile_gemm_bilinear_impl(int do_verification,
int init_method, int init_method,
bool /*do_log*/, bool /*do_log*/,
...@@ -59,9 +60,9 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -59,9 +60,9 @@ bool profile_gemm_bilinear_impl(int do_verification,
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DELayout{})); Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, DELayout{})); Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
...@@ -96,7 +97,8 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -96,7 +97,8 @@ bool profile_gemm_bilinear_impl(int do_verification,
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout, ALayout,
BLayout, BLayout,
DELayout, ck::Tuple<DLayout>,
ELayout,
ADataType, ADataType,
BDataType, BDataType,
ck::Tuple<DDataType>, ck::Tuple<DDataType>,
...@@ -142,10 +144,10 @@ bool profile_gemm_bilinear_impl(int do_verification, ...@@ -142,10 +144,10 @@ bool profile_gemm_bilinear_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpace()); DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpace()); DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
......
...@@ -15,21 +15,21 @@ ...@@ -15,21 +15,21 @@
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
namespace profiler { namespace profiler {
template <typename ADataType, template <typename ALayout,
typename BLayout,
typename CLayout,
typename ADataType,
typename BDataType, typename BDataType,
typename AccDataType, typename AccDataType,
typename CDataType, typename CDataType>
typename ALayout,
typename BLayout,
typename CLayout>
int profile_gemm_impl(int do_verification, int profile_gemm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
...@@ -86,13 +86,12 @@ int profile_gemm_impl(int do_verification, ...@@ -86,13 +86,12 @@ int profile_gemm_impl(int do_verification,
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
c_device_buf.ToDevice(c_m_n_device_result.mData.data());
using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout, using DeviceOp = ck::tensor_operation::device::DeviceGemm<ALayout,
BLayout, BLayout,
...@@ -110,7 +109,7 @@ int profile_gemm_impl(int do_verification, ...@@ -110,7 +109,7 @@ int profile_gemm_impl(int do_verification,
std::cout << "found " << op_ptrs.size() << " instances" << std::endl; std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// Run reference GEMM // Run reference op
if(do_verification) if(do_verification)
{ {
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType, using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
...@@ -131,11 +130,11 @@ int profile_gemm_impl(int do_verification, ...@@ -131,11 +130,11 @@ int profile_gemm_impl(int do_verification,
} }
std::string best_op_name; std::string best_op_name;
float best_ave_time = 0; float best_avg_time = 0;
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
// profile device GEMM instances // profile device op instances
for(auto& op_ptr : op_ptrs) for(auto& op_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr =
...@@ -161,7 +160,7 @@ int profile_gemm_impl(int do_verification, ...@@ -161,7 +160,7 @@ int profile_gemm_impl(int do_verification,
std::string op_name = op_ptr->GetTypeString(); std::string op_name = op_ptr->GetTypeString();
float ave_time = float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K; std::size_t flop = std::size_t(2) * M * N * K;
...@@ -169,18 +168,18 @@ int profile_gemm_impl(int do_verification, ...@@ -169,18 +168,18 @@ int profile_gemm_impl(int do_verification,
std::size_t num_btype = std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl; << gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops) if(tflops > best_tflops)
{ {
best_op_name = op_name; best_op_name = op_name;
best_tflops = tflops; best_tflops = tflops;
best_ave_time = ave_time; best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec; best_gb_per_sec = gb_per_sec;
} }
...@@ -244,7 +243,7 @@ int profile_gemm_impl(int do_verification, ...@@ -244,7 +243,7 @@ int profile_gemm_impl(int do_verification,
} }
std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
<< " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time
<< " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl; << best_op_name << std::endl;
......
...@@ -10,10 +10,10 @@ ...@@ -10,10 +10,10 @@
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -189,13 +189,13 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -189,13 +189,13 @@ bool profile_gemm_reduce_impl(int do_verification,
} }
} }
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * DeviceMem reduce0_device_buf(sizeof(ReduceDataType) *
reduce0_m_device_result.mDesc.GetElementSpace()); reduce0_m_device_result.mDesc.GetElementSpaceSize());
DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * DeviceMem reduce1_device_buf(sizeof(ReduceDataType) *
reduce1_m_device_result.mDesc.GetElementSpace()); reduce1_m_device_result.mDesc.GetElementSpaceSize());
std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(), std::array<void*, 2> p_reduces = {reduce0_device_buf.GetDeviceBuffer(),
reduce1_device_buf.GetDeviceBuffer()}; reduce1_device_buf.GetDeviceBuffer()};
......
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" #include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck { namespace ck {
...@@ -87,9 +87,9 @@ bool profile_gemm_splitk_impl(int do_verification, ...@@ -87,9 +87,9 @@ bool profile_gemm_splitk_impl(int do_verification,
const auto b_element_op = BElementOp{}; const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{}; const auto c_element_op = CElementOp{};
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace()); DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace()); DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace()); DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data()); a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data()); b_device_buf.ToDevice(b_k_n.mData.data());
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include <iostream>
#include <typeinfo>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
namespace ck {
namespace profiler {
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType>
bool profile_grouped_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};
const auto out_element_op = OutElementOp{};
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());
// run reference op
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>{};
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weight,
host_output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op);
// init host output to zero
host_output.SetZero();
ref_invoker.Run(ref_argument);
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device op instances
bool pass = true;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr =
op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init output to zero before profiling next kernel
out_device_buf.SetZero();
std::string op_name = op_ptr->GetTypeString();
auto invoker_ptr = op_ptr->MakeInvokerPointer();
float avg_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
out_device_buf.FromDevice(device_output.mData.data());
pass = pass & ck::utils::check_err(device_output.mData, host_output.mData);
if(do_log)
{
LogRangeAsType<float>(std::cout << "input : ", input.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "weight: ", weight.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "host_output : ", host_output.mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "device_output: ", device_output.mData, ",")
<< std::endl;
}
}
}
else
{
std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
}
}
std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
...@@ -7,40 +7,18 @@ ...@@ -7,40 +7,18 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using DeviceGroupedGemmNoOpPtr = ck::tensor_operation::device::DeviceGroupedGemmPtr<
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>;
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(
std::vector<DeviceGroupedGemmNoOpPtr>&);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
namespace ck { namespace ck {
namespace profiler { namespace profiler {
...@@ -51,7 +29,7 @@ template <typename ADataType, ...@@ -51,7 +29,7 @@ template <typename ADataType,
typename ALayout, typename ALayout,
typename BLayout, typename BLayout,
typename CLayout> typename CLayout>
void profile_grouped_gemm_impl(int do_verification, bool profile_grouped_gemm_impl(int do_verification,
int init_method, int init_method,
bool do_log, bool do_log,
bool time_kernel, bool time_kernel,
...@@ -62,6 +40,9 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -62,6 +40,9 @@ void profile_grouped_gemm_impl(int do_verification,
const std::vector<int>& StrideBs, const std::vector<int>& StrideBs,
const std::vector<int>& StrideCs) const std::vector<int>& StrideCs)
{ {
bool pass = true;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value) if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
...@@ -145,68 +126,47 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -145,68 +126,47 @@ void profile_grouped_gemm_impl(int do_verification,
p_b.reserve(group_count); p_b.reserve(group_count);
p_c.reserve(group_count); p_c.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmShape> gemm_shapes; std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_shapes.reserve(group_count); gemm_descs.reserve(group_count);
for(std::size_t i = 0; i < group_count; i++) for(std::size_t i = 0; i < group_count; i++)
{ {
a_device_buf.emplace_back( a_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpace())); std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
b_device_buf.emplace_back( b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpace())); std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>( c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpace())); sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data()); b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data()); c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data());
gemm_shapes.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i]}); gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
p_a.push_back(a_device_buf[i]->GetDeviceBuffer()); p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
p_b.push_back(b_device_buf[i]->GetDeviceBuffer()); p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
p_c.push_back(c_device_buf[i]->GetDeviceBuffer()); p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
} }
// add device GEMM instances using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm<ALayout,
std::vector<ck::tensor_operation::device::instance::DeviceGroupedGemmNoOpPtr> gemm_ptrs; BLayout,
ck::Tuple<>,
if constexpr(is_same<ADataType, half_t>::value && is_same<BDataType, half_t>::value && CLayout,
is_same<CDataType, half_t>::value) ADataType,
{ BDataType,
if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value && ck::Tuple<>,
is_same<BLayout, tensor_layout::gemm::RowMajor>::value && CDataType,
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) AElementOp,
{ BElementOp,
ck::tensor_operation::device::instance:: CElementOp>;
add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
} const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value && DeviceOp>::GetInstances();
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value) if(op_ptrs.size() <= 0)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
ck::tensor_operation::device::instance::
add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
}
}
if(gemm_ptrs.size() <= 0)
{ {
throw std::runtime_error("wrong! no device GEMM instance found"); throw std::runtime_error("wrong! no device GEMM instance found");
} }
...@@ -216,14 +176,17 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -216,14 +176,17 @@ void profile_grouped_gemm_impl(int do_verification,
float best_tflops = 0; float best_tflops = 0;
float best_gb_per_sec = 0; float best_gb_per_sec = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
// profile device GEMM instances // profile device GEMM instances
for(auto& gemm_ptr : gemm_ptrs) for(auto& gemm_ptr : op_ptrs)
{ {
auto argument_ptr = auto argument_ptr =
gemm_ptr->MakeArgumentPointer(p_a, gemm_ptr->MakeArgumentPointer(p_a,
p_b, p_b,
p_ds,
p_c, p_c,
gemm_shapes, gemm_descs,
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}, ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{}); ck::tensor_operation::element_wise::PassThrough{});
...@@ -242,7 +205,7 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -242,7 +205,7 @@ void profile_grouped_gemm_impl(int do_verification,
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = 0, num_btype = 0; std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
...@@ -266,7 +229,7 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -266,7 +229,7 @@ void profile_grouped_gemm_impl(int do_verification,
if(do_verification) if(do_verification)
{ {
for(std::size_t i = 0; i < gemm_shapes.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
...@@ -294,7 +257,8 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -294,7 +257,8 @@ void profile_grouped_gemm_impl(int do_verification,
c_element_op); c_element_op);
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
ck::utils::check_err(c_m_n_device_results[i].mData, c_m_n_host_result.mData); pass = pass && ck::utils::check_err(c_m_n_device_results[i].mData,
c_m_n_host_result.mData);
if(do_log) if(do_log)
{ {
...@@ -319,6 +283,8 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -319,6 +283,8 @@ void profile_grouped_gemm_impl(int do_verification,
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
return pass;
} // namespace profiler } // namespace profiler
} // namespace profiler } // namespace profiler
......
...@@ -9,10 +9,10 @@ ...@@ -9,10 +9,10 @@
#include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/conv_util.hpp" #include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
namespace ck { namespace ck {
...@@ -92,8 +92,8 @@ void profile_normalization_impl(int do_verification, ...@@ -92,8 +92,8 @@ void profile_normalization_impl(int do_verification,
Tensor<OutDataType> out_ref(out); Tensor<OutDataType> out_ref(out);
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.mData.data());
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.mData.data());
......
...@@ -8,10 +8,10 @@ ...@@ -8,10 +8,10 @@
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
#include "ck/library/host_tensor/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/host_tensor/host_reduction.hpp" #include "ck/library/utility/host_reduction.hpp"
#include "ck/library/host_tensor/host_common_util.hpp" #include "ck/library/utility/host_common_util.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
namespace ck { namespace ck {
namespace tensor_operation { namespace tensor_operation {
...@@ -245,13 +245,13 @@ bool profile_reduce_impl_impl(bool do_verification, ...@@ -245,13 +245,13 @@ bool profile_reduce_impl_impl(bool do_verification,
} }
if(beta != 0.0f) if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpace(); i++) for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i]; out.mData[i] = out_ref.mData[i];
}; };
// these buffers are usually provided by the user application // these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpace()); DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpace()); DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.mData.data());
......
...@@ -24,9 +24,9 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) ...@@ -24,9 +24,9 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
F16_F16_F16_F32_F32, // 1 F16_F16_F16_F32_F32, // 1
}; };
if(!(argc == 15 || argc == 16)) if(argc != 15)
{ {
printf("arg1: tensor operation (batched_gemm: BatchedGEMM+Reduce)\n"); printf("arg1: tensor operation (batched_gemm_reduce: BatchedGEMM+Reduce)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n"); printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
...@@ -37,7 +37,6 @@ int profile_batched_gemm_reduce(int argc, char* argv[]) ...@@ -37,7 +37,6 @@ int profile_batched_gemm_reduce(int argc, char* argv[])
printf("arg6: print tensor value (0: no; 1: yes)\n"); printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n"); printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n");
printf("arg15: split k into mulitiple batch\n");
exit(1); exit(1);
} }
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_bwd_data_impl.hpp"
namespace {
enum struct ConvLayout
{
NCHW_KCYX_NKHW, // 0
NHWC_KYXC_NHWK, // 1
};
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
static void print_helper_msg()
{
std::cout
<< "arg1: tensor operation (conv_bwd_data: Convolution Backward Data)\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, "
"K])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
}
} // namespace
int profile_conv_bwd_data(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;
using NWC = ck::tensor_layout::convolution::NWC;
using NHWC = ck::tensor_layout::convolution::NHWC;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
using KXC = ck::tensor_layout::convolution::KXC;
using KYXC = ck::tensor_layout::convolution::KYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC;
using NWK = ck::tensor_layout::convolution::NWK;
using NHWK = ck::tensor_layout::convolution::NHWK;
using NDHWK = ck::tensor_layout::convolution::NDHWK;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
auto wei_layout,
auto out_layout,
auto in_type,
auto wei_type,
auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
using InDataType = decltype(in_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_bwd_data_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
...@@ -8,141 +8,168 @@ ...@@ -8,141 +8,168 @@
#include "profiler/include/profile_conv_bwd_weight_impl.hpp" #include "profiler/include/profile_conv_bwd_weight_impl.hpp"
enum struct ConvDataType namespace {
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvInputLayout enum struct ConvLayout
{ {
NCHW, // 0 NCHW_KCYX_NKHW, // 0
NHWC, // 1 NHWC_KYXC_NHWK, // 1
}; };
enum struct ConvWeightLayout enum struct ConvDataType
{ {
KCYX, // 0 F32_F32_F32, // 0
KYXC, // 1 F16_F16_F16, // 1
BF16_F32_BF16, // 2
}; };
enum struct ConvOutputLayout static void print_helper_msg()
{ {
NKHW, // 0 std::cout
NHWK, // 1 << "arg1: tensor operation (conv_bwd_weight: Convolution Backward Weight\n"
}; << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight fp32, Output bf16)\n"
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, K]\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n"
<< std::endl;
}
} // namespace
int profile_conv_bwd_weight(int argc, char* argv[]) int profile_conv_bwd_weight(int argc, char* argv[])
{ {
if(argc != 26) // 8 for control, 1 for num_dim_spatial
if(argc < 9)
{ {
printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n"); print_helper_msg();
printf("arg2: data type (0: fp32; 1: fp16)\n"); return 1;
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: run kernel # of times (>1)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
printf("arg25: split k (>=1)\n");
exit(1);
} }
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2])); const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3])); const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4])); const bool do_verification = std::stoi(argv[4]);
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5])); const int init_method = std::stoi(argv[5]);
const bool do_verification = std::stoi(argv[6]); const bool do_log = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]); const bool time_kernel = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]); const int num_dim_spatial = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K
const ck::index_t N = std::stoi(argv[10]); if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1)
const ck::index_t K = std::stoi(argv[11]); {
const ck::index_t C = std::stoi(argv[12]); print_helper_msg();
const ck::index_t Y = std::stoi(argv[13]); return 1;
const ck::index_t X = std::stoi(argv[14]); }
const ck::index_t Hi = std::stoi(argv[15]);
const ck::index_t Wi = std::stoi(argv[16]); const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
const ck::index_t conv_stride_h = std::stoi(argv[17]); ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
const ck::index_t conv_stride_w = std::stoi(argv[18]); split_k = std::max(1, split_k);
const ck::index_t conv_dilation_h = std::stoi(argv[19]);
const ck::index_t conv_dilation_w = std::stoi(argv[20]); using F32 = float;
const ck::index_t in_left_pad_h = std::stoi(argv[21]); using F16 = ck::half_t;
const ck::index_t in_left_pad_w = std::stoi(argv[22]); using BF16 = ck::bhalf_t;
const ck::index_t in_right_pad_h = std::stoi(argv[23]);
const ck::index_t in_right_pad_w = std::stoi(argv[24]); using NWC = ck::tensor_layout::convolution::NWC;
ck::index_t split_k = std::stoi(argv[25]); using NHWC = ck::tensor_layout::convolution::NHWC;
split_k = std::max(1, split_k); using NDHWC = ck::tensor_layout::convolution::NDHWC;
const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; using KXC = ck::tensor_layout::convolution::KXC;
const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; using KYXC = ck::tensor_layout::convolution::KYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC;
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; using NWK = ck::tensor_layout::convolution::NWK;
using NHWK = ck::tensor_layout::convolution::NHWK;
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC && using NDHWK = ck::tensor_layout::convolution::NDHWK;
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
auto wei_layout,
auto out_layout,
auto in_type,
auto wei_type,
auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
using InDataType = decltype(in_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_bwd_weight_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType>(
do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1;
};
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
{ {
ck::profiler::profile_conv_bwd_weight_impl<2, if(data_type == ConvDataType::F32_F32_F32)
float, {
float, return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
float, }
ck::tensor_layout::convolution::NHWC, else if(data_type == ConvDataType::F16_F16_F16)
ck::tensor_layout::convolution::KYXC, {
ck::tensor_layout::convolution::NHWK>( return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
do_verification, }
init_method, else if(data_type == ConvDataType::BF16_F32_BF16)
do_log, {
time_kernel, // fp32 atomic add is used for weight tensor in bf16 kernel
N, return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, F32{}, BF16{});
K, }
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w},
split_k);
} }
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC && else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{ {
ck::profiler::profile_conv_bwd_weight_impl<2, if(data_type == ConvDataType::F32_F32_F32)
ck::half_t, {
ck::half_t, return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
ck::half_t, }
ck::tensor_layout::convolution::NHWC, else if(data_type == ConvDataType::F16_F16_F16)
ck::tensor_layout::convolution::KYXC, {
ck::tensor_layout::convolution::NHWK>( return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
do_verification, }
init_method, else if(data_type == ConvDataType::BF16_F32_BF16)
do_log, {
time_kernel, // fp32 atomic add is used for weight tensor in bf16 kernel
N, return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, F32{}, BF16{});
K, }
C,
std::vector<ck::index_t>{Hi, Wi},
std::vector<ck::index_t>{Y, X},
std::vector<ck::index_t>{Ho, Wo},
std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w},
split_k);
} }
else else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
{ {
throw std::runtime_error("wrong! this Conv data_type & layout is not implemented"); if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_F32_BF16)
{
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, F32{}, BF16{});
}
} }
return 0; std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
} }
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_conv_fwd_impl.hpp"
namespace {
enum struct ConvLayout
{
NCHW_KCYX_NKHW, // 0
NHWC_KYXC_NHWK, // 1
};
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
static void print_helper_msg()
{
std::cout
// clang-format-off
<< "arg1: tensor operation (conv_fwd: Convolution Forward)\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
<< " 1: Input fp16, Weight fp16, Output fp16\n"
<< " 2: Input bf16, Weight bf16, Output bf16\n"
<< " 3: Input int8, Weight int8, Output int8)\n"
<< "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n"
<< " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, "
"K])\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
// clang-format-on
}
} // namespace
int profile_conv_fwd(int argc, char* argv[])
{
// 8 for control, 1 for num_dim_spatial
if(argc < 9)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto layout = static_cast<ConvLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int num_dim_spatial = std::stoi(argv[8]);
// 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
{
print_helper_msg();
return 1;
}
const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
using INT8 = int8_t;
using NWC = ck::tensor_layout::convolution::NWC;
using NHWC = ck::tensor_layout::convolution::NHWC;
using NDHWC = ck::tensor_layout::convolution::NDHWC;
using KXC = ck::tensor_layout::convolution::KXC;
using KYXC = ck::tensor_layout::convolution::KYXC;
using KZYXC = ck::tensor_layout::convolution::KZYXC;
using NWK = ck::tensor_layout::convolution::NWK;
using NHWK = ck::tensor_layout::convolution::NHWK;
using NDHWK = ck::tensor_layout::convolution::NDHWK;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
auto wei_layout,
auto out_layout,
auto in_type,
auto wei_type,
auto out_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using InLayout = decltype(in_layout);
using WeiLayout = decltype(wei_layout);
using OutLayout = decltype(out_layout);
using InDataType = decltype(in_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
bool pass = ck::profiler::profile_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType>(
do_verification, init_method, do_log, time_kernel, params);
return pass ? 0 : 1;
};
if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{});
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8)
{
return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_convnd_bwd_data_impl.hpp"
namespace {
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx)
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::utils::conv::ConvParams params;
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
} // namespace
int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial)
{
const int preParams = 10;
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + preParams;
if(cmdline_nargs != argc)
{
printf("arg1: tensor operation (conv[1|2|3]d_bwd_data: BackwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
auto Run = [&](auto input_type, auto wei_type, auto out_type, auto acc_type) {
using InDataType = decltype(input_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
using AccDataType = decltype(acc_type);
switch(num_dim_spatial)
{
case 1:
ck::profiler::profile_convnd_bwd_data_impl<1,
InDataType,
WeiDataType,
OutDataType,
AccDataType,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_);
break;
case 2:
ck::profiler::profile_convnd_bwd_data_impl<2,
InDataType,
WeiDataType,
OutDataType,
AccDataType,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_);
break;
case 3:
ck::profiler::profile_convnd_bwd_data_impl<3,
InDataType,
WeiDataType,
OutDataType,
AccDataType,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_);
break;
default: break;
}
};
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(float{}, float{}, float{}, float{});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(ck::half_t{}, ck::half_t{}, ck::half_t{}, float{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(ck::bhalf_t{}, ck::bhalf_t{}, ck::bhalf_t{}, float{});
}
else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(int8_t{}, int8_t{}, int8_t{}, int32_t{});
}
else
{
std::cout << "wrong! this Conv data_type & layout is not implemented" << std::endl;
return 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/include/profile_convnd_bwd_weight_impl.hpp"
namespace {
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
};
enum struct ConvInputLayout
{
NCHW, // 0
NHWC, // 1
};
enum struct ConvWeightLayout
{
KCYX, // 0
KYXC, // 1
};
enum struct ConvOutputLayout
{
NKHW, // 0
NHWK, // 1
};
ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], int arg_idx)
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
ck::utils::conv::ConvParams params;
params.num_dim_spatial_ = num_dim_spatial;
params.N_ = std::stoi(argv[arg_idx++]);
params.K_ = std::stoi(argv[arg_idx++]);
params.C_ = std::stoi(argv[arg_idx++]);
params.filter_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.filter_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.input_spatial_lengths_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_spatial_lengths_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_strides_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_strides_[i] = std::stoi(argv[arg_idx++]);
}
params.conv_filter_dilations_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.conv_filter_dilations_[i] = std::stoi(argv[arg_idx++]);
}
params.input_left_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_left_pads_[i] = std::stoi(argv[arg_idx++]);
}
params.input_right_pads_.resize(num_dim_spatial);
for(int i = 0; i < num_dim_spatial; ++i)
{
params.input_right_pads_[i] = std::stoi(argv[arg_idx++]);
}
return params;
}
} // namespace
int profile_convnd_bwd_weight(int argc, char* argv[], int num_dim_spatial)
{
const int preParams = 11;
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + preParams;
if(cmdline_nargs != argc)
{
printf("arg1: tensor operation (convnd[1|2|3]d_bwd_weight: BackwardConvolution)\n");
printf("arg2: data type (0: fp32; 1: fp16, 2: bf16)\n");
printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
printf("arg6: verification (0: no; 1: yes)\n");
printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg8: print tensor value (0: no; 1: yes)\n");
printf("arg9: time kernel (0=n0, 1=yes)\n");
printf("arg10: splitk\n");
printf("arg11 to 25: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
"RightPx\n");
return 1;
}
const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
const auto in_layout = static_cast<ConvInputLayout>(std::stoi(argv[3]));
const auto wei_layout = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
const auto out_layout = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
const bool do_verification = std::stoi(argv[6]);
const int init_method = std::stoi(argv[7]);
const bool do_log = std::stoi(argv[8]);
const bool time_kernel = std::stoi(argv[9]);
ck::index_t split_k = std::stoi(argv[10]);
split_k = std::max(1, split_k);
ck::utils::conv::ConvParams params = parse_conv_params(num_dim_spatial, argv, preParams);
auto Run = [&](auto input_type, auto wei_type, auto out_type) {
using InDataType = decltype(input_type);
using WeiDataType = decltype(wei_type);
using OutDataType = decltype(out_type);
switch(num_dim_spatial)
{
case 1:
ck::profiler::profile_convnd_bwd_weight_impl<1,
InDataType,
WeiDataType,
OutDataType,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
split_k);
break;
case 2:
ck::profiler::profile_convnd_bwd_weight_impl<2,
InDataType,
WeiDataType,
OutDataType,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
split_k);
break;
case 3:
ck::profiler::profile_convnd_bwd_weight_impl<3,
InDataType,
WeiDataType,
OutDataType,
ck::tensor_layout::convolution::NDHWC,
ck::tensor_layout::convolution::KZYXC,
ck::tensor_layout::convolution::NDHWK>(
do_verification,
init_method,
do_log,
time_kernel,
params.N_,
params.K_,
params.C_,
params.input_spatial_lengths_,
params.filter_spatial_lengths_,
params.GetOutputSpatialLengths(),
params.conv_filter_strides_,
params.conv_filter_dilations_,
params.input_left_pads_,
params.input_right_pads_,
split_k);
break;
default: break;
}
};
if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(float{}, float{}, float{});
}
else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(ck::half_t{}, ck::half_t{}, ck::half_t{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC &&
wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
{
Run(ck::bhalf_t{}, ck::bhalf_t{}, ck::bhalf_t{});
}
else
{
std::cout << "wrong! this Conv data_type & layout is not implemented" << std::endl;
return 1;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/conv_util.hpp"
#include "ck/library/utility/fill.hpp"
namespace {
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
};
enum struct ConvDataLayout
{
NCHW, // 0
NHWC, // 1
};
namespace ctl = ck::tensor_layout::convolution;
template <int NDim, ConvDataLayout DataLayout>
struct ConvolutionLayouts;
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NHWC>
{
typedef ctl::NWC Input;
typedef ctl::KXC Weight;
typedef ctl::NWK Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NHWC>
{
typedef ctl::NHWC Input;
typedef ctl::KYXC Weight;
typedef ctl::NHWK Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NHWC>
{
typedef ctl::NDHWC Input;
typedef ctl::KZYXC Weight;
typedef ctl::NDHWK Output;
};
template <>
struct ConvolutionLayouts<1, ConvDataLayout::NCHW>
{
typedef ctl::NCW Input;
typedef ctl::KCX Weight;
typedef ctl::NKW Output;
};
template <>
struct ConvolutionLayouts<2, ConvDataLayout::NCHW>
{
typedef ctl::NCHW Input;
typedef ctl::KCYX Weight;
typedef ctl::NKHW Output;
};
template <>
struct ConvolutionLayouts<3, ConvDataLayout::NCHW>
{
typedef ctl::NCDHW Input;
typedef ctl::KCZYX Weight;
typedef ctl::NKDHW Output;
};
void print_use_msg()
{
std::cout << "arg1: tensor operation (conv_fwd: ForwardConvolution)\n"
<< "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"
<< "arg3: data layout (0: NCHW; 1: NHWC)\n"
<< "arg4: verification (0=no, 1=yes)\n"
<< "arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: run kernel # of times (>1)\n"
<< "arg8: N spatial dimensions (default 2)\n"
<< "Following arguments (depending on number of spatial dims):\n"
<< " N, K, C, \n"
<< " <filter spatial dimensions>, (ie Y, X for 2D)\n"
<< " <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
<< " <strides>, (ie Sy, Sx for 2D)\n"
<< " <dilations>, (ie Dy, Dx for 2D)\n"
<< " <left padding>, (ie LeftPy, LeftPx for 2D)\n"
<< " <right padding>, (ie RightPy, RightPx for 2D)\n"
<< std::endl;
}
ck::utils::conv::ConvParams parse_params(int num_dim_spatial, int argc, char* argv[])
{
// (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
int conv_args = 3 + num_dim_spatial * 6;
int cmdline_nargs = conv_args + 9;
if(cmdline_nargs != argc)
{
print_use_msg();
exit(1);
}
int arg_idx = 9;
return ck::utils::conv::parse_conv_params(num_dim_spatial, arg_idx, argv);
}
template <int NDim,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename ConvLayouts>
void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
bool time_kernel,
int init_method,
ConvLayouts)
{
using namespace std::placeholders;
using namespace ck::utils;
std::unique_ptr<OpInstance<OutDataType, InDataType, WeiDataType>> conv_instance;
switch(init_method)
{
case 0:
conv_instance =
std::make_unique<conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output>>(params, false);
break;
case 1:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniformDistributionIntegerValue<int>,
ck::utils::FillUniformDistributionIntegerValue<int>>>(
params,
true,
ck::utils::FillUniformDistributionIntegerValue<int>{},
ck::utils::FillUniformDistributionIntegerValue<int>{});
break;
case 2:
conv_instance = std::make_unique<
conv::ConvFwdOpInstance<InDataType,
WeiDataType,
OutDataType,
typename ConvLayouts::Input,
typename ConvLayouts::Weight,
typename ConvLayouts::Output,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::utils::FillUniformDistribution<InDataType>,
ck::utils::FillUniformDistribution<WeiDataType>>>(
params,
true,
ck::utils::FillUniformDistribution<InDataType>{},
ck::utils::FillUniformDistribution<WeiDataType>{});
break;
default: throw std::runtime_error("Unsupported init method!");
}
auto reference_conv_fwd_fun = std::bind(
conv::run_reference_convolution_forward<NDim, InDataType, WeiDataType, OutDataType>,
params,
_1,
_2,
_3);
OpInstanceRunEngine<InDataType, WeiDataType, OutDataType> run_engine(
*conv_instance, reference_conv_fwd_fun, do_verification);
auto best_conf = run_engine.Profile(
conv::ConvolutionFwdInstances<InDataType, WeiDataType, OutDataType>::template Get<NDim>(),
time_kernel,
do_verification,
do_log);
std::cout << "Best configuration parameters:"
<< "\nname: " << best_conf.best_op_name << "\navg_time: " << best_conf.best_avg_time
<< "\ntflops: " << best_conf.best_tflops << "\nGB/s: " << best_conf.best_gb_per_sec
<< std::endl;
}
template <int NDim>
void profile_convnd_instances(ConvDataType data_type,
ConvDataLayout data_layout,
const ck::utils::conv::ConvParams& params,
bool do_verification,
bool do_log,
bool time_kernel,
int init_method)
{
switch(data_layout)
{
case ConvDataLayout::NHWC: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NHWC>{});
break;
}
break;
}
case ConvDataLayout::NCHW: {
switch(data_type)
{
case ConvDataType::F32_F32_F32:
profile_convnd_instances_impl<NDim, float, float, float>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::F16_F16_F16:
profile_convnd_instances_impl<NDim, ck::half_t, ck::half_t, ck::half_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::BF16_BF16_BF16:
profile_convnd_instances_impl<NDim, ck::bhalf_t, ck::bhalf_t, ck::bhalf_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
case ConvDataType::INT8_INT8_INT8:
profile_convnd_instances_impl<NDim, int8_t, int8_t, int8_t>(
params,
do_verification,
do_log,
time_kernel,
init_method,
ConvolutionLayouts<NDim, ConvDataLayout::NCHW>{});
break;
}
break;
}
}
}
} // namespace
int profile_convnd_fwd(int argc, char* argv[])
{
using namespace ck::utils::conv;
ConvDataType data_type{ConvDataType::F32_F32_F32};
ConvDataLayout data_layout{ConvDataLayout::NHWC};
bool do_verification{true};
int init_method{2};
bool do_log{false};
bool time_kernel{false};
int num_dim_spatial{2};
ConvParams params;
if(argc >= 4)
{
data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
data_layout = static_cast<ConvDataLayout>(std::stoi(argv[3]));
}
if(argc >= 9)
{
do_verification = std::stoi(argv[4]);
init_method = std::stoi(argv[5]);
do_log = std::stoi(argv[6]);
time_kernel = std::stoi(argv[7]);
num_dim_spatial = std::stoi(argv[8]);
}
if(argc >= 10)
{
params = parse_params(num_dim_spatial, argc, argv);
}
// TODO Print nice message what is being profiled.
switch(num_dim_spatial)
{
case 1:
profile_convnd_instances<1>(
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 2:
profile_convnd_instances<2>(
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
case 3:
profile_convnd_instances<3>(
data_type, data_layout, params, do_verification, do_log, time_kernel, init_method);
break;
default:
throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " +
std::to_string(num_dim_spatial));
}
return 0;
}
...@@ -24,21 +24,27 @@ enum struct GemmDataType ...@@ -24,21 +24,27 @@ enum struct GemmDataType
INT8_INT8_INT8, // 3 INT8_INT8_INT8, // 3
}; };
static void print_helper_msg()
{
std::cout << "arg1: tensor operation (gemm: GEMM)\n"
<< "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
<< " 1: A[m, k] * B[n, k] = C[m, n];\n"
<< " 2: A[k, m] * B[k, n] = C[m, n];\n"
<< " 3: A[k, m] * B[n, k] = C[m, n])\n"
<< "arg4: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< "arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"
<< std::endl;
}
int profile_gemm(int argc, char* argv[]) int profile_gemm(int argc, char* argv[])
{ {
if(argc != 14) if(argc != 14)
{ {
printf("arg1: tensor operation (gemm: GEMM)\n"); print_helper_msg();
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
printf(" 1: A[m, k] * B[n, k] = C[m, n];\n");
printf(" 2: A[k, m] * B[k, n] = C[m, n];\n");
printf(" 3: A[k, m] * B[n, k] = C[m, n])\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
exit(1); exit(1);
} }
...@@ -109,67 +115,67 @@ int profile_gemm(int argc, char* argv[]) ...@@ -109,67 +115,67 @@ int profile_gemm(int argc, char* argv[])
if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{}); return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{}); return profile(Row{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{}); return profile(Col{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{}); return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); return profile(Row{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}); return profile(Col{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
} }
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}); return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{}); return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{}); return profile(Row{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{}); return profile(Col{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
} }
else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{}); return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{});
} }
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN) else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN)
{ {
return profile(INT8{}, INT8{}, INT32{}, INT8{}, Row{}, Row{}, Row{}); return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
} }
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN) else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN)
{ {
return profile(INT8{}, INT8{}, INT32{}, INT8{}, Row{}, Col{}, Row{}); return profile(Row{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
} }
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN) else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN)
{ {
return profile(INT8{}, INT8{}, INT32{}, INT8{}, Col{}, Row{}, Row{}); return profile(Col{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
} }
else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN) else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN)
{ {
return profile(INT8{}, INT8{}, INT32{}, INT8{}, Col{}, Col{}, Row{}); return profile(Col{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{});
} }
else else
{ {
......
...@@ -75,7 +75,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) ...@@ -75,7 +75,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
auto e_type, auto e_type,
auto a_layout, auto a_layout,
auto b_layout, auto b_layout,
auto de_layout) { auto d0_layout,
auto d1_layout,
auto e_layout) {
using ADataType = decltype(a_type); using ADataType = decltype(a_type);
using BDataType = decltype(b_type); using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type); using AccDataType = decltype(acc_type);
...@@ -85,13 +87,15 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) ...@@ -85,13 +87,15 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
using ALayout = decltype(a_layout); using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout); using BLayout = decltype(b_layout);
using DELayout = decltype(de_layout); using D0Layout = decltype(d0_layout);
using D1Layout = decltype(d1_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M; const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K; const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<DELayout, Row> ? N : M; const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideD1 = ck::is_same_v<DELayout, Row> ? N : M; const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<DELayout, Row> ? N : M; const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_add_add_fastgelu_impl<ADataType, bool pass = ck::profiler::profile_gemm_add_add_fastgelu_impl<ADataType,
BDataType, BDataType,
...@@ -101,7 +105,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) ...@@ -101,7 +105,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
EDataType, EDataType,
ALayout, ALayout,
BLayout, BLayout,
DELayout>( D0Layout,
D1Layout,
ELayout>(
do_verification, do_verification,
init_method, init_method,
do_log, do_log,
...@@ -120,22 +126,22 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[]) ...@@ -120,22 +126,22 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN) if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
} }
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN) layout == MatrixLayout::MK_NK_MN_MN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
} }
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_KN_MN_MN_MN) layout == MatrixLayout::KM_KN_MN_MN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{});
} }
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_NK_MN_MN_MN) layout == MatrixLayout::KM_NK_MN_MN_MN)
{ {
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}); return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{});
} }
else else
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment