Unverified commit 31b40352, authored by Chao Liu, committed by GitHub

Merge pull request #16 from ROCmSoftwarePlatform/develop

Merge develop into master
parents 5781adf5 b62bf8c3
-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R2
+#ifndef DRIVER_GEMM_DLOPS_V1R2
+#define DRIVER_GEMM_DLOPS_V1R2
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r2.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r2.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -43,23 +43,23 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks> typename BGridMoveSliceWindowStepHacks>
__host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid, __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AKMGridDesc& a_k_m_grid_desc, const AKMGridDesc& a_k_m_grid_desc,
const BKNGridDesc& b_k_n_grid_desc, const BKNGridDesc& b_k_n_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -72,49 +72,48 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
// GEMM // GEMM
-    using GridwiseGemm =
-        GridwiseDynamicGemmDlops_km_kn_mn_v1r2<BlockSize,
+    using GridwiseGemm = GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
                                               FloatAB,
                                               FloatAcc,
                                               FloatC,
                                               CGlobalMemoryDataOperation,
                                               AKMGridDesc,
                                               BKNGridDesc,
                                               CMNGridDesc,
                                               MPerBlock,
                                               NPerBlock,
                                               KPerBlock,
                                               M1PerThread,
                                               N1PerThread,
                                               KPerThread,
                                               M1N1ThreadClusterM10,
                                               M1N1ThreadClusterN10,
                                               M1N1ThreadClusterM11,
                                               M1N1ThreadClusterN11,
                                               ABlockTransferThreadSliceLengths_K_M0_M1,
                                               ABlockTransferThreadClusterLengths_K_M0_M1,
                                               ABlockTransferThreadClusterArrangeOrder,
                                               ABlockTransferSrcAccessOrder,
                                               ABlockTransferSrcVectorDim,
                                               ABlockTransferSrcScalarPerVector,
                                               ABlockTransferDstScalarPerVector_M1,
                                               AThreadTransferSrcResetCoordinateAfterRun,
                                               BBlockTransferThreadSliceLengths_K_N0_N1,
                                               BBlockTransferThreadClusterLengths_K_N0_N1,
                                               BBlockTransferThreadClusterArrangeOrder,
                                               BBlockTransferSrcAccessOrder,
                                               BBlockTransferSrcVectorDim,
                                               BBlockTransferSrcScalarPerVector,
                                               BBlockTransferDstScalarPerVector_N1,
                                               BThreadTransferSrcResetCoordinateAfterRun,
                                               CThreadTransferSrcDstAccessOrder,
                                               CThreadTransferSrcDstVectorDim,
                                               CThreadTransferDstScalarPerVector,
-                                              AGridIteratorHacks,
-                                              BGridIteratorHacks,
-                                              CGridIteratorHacks,
-                                              AGridMoveSliceWindowIteratorHacks,
-                                              BGridMoveSliceWindowIteratorHacks>;
+                                              AGridStepHacks,
+                                              BGridStepHacks,
+                                              CGridStepHacks,
+                                              AGridMoveSliceWindowStepHacks,
+                                              BGridMoveSliceWindowStepHacks>;
const auto M = a_k_m_grid_desc.GetLength(I1); const auto M = a_k_m_grid_desc.GetLength(I1);
const auto N = b_k_n_grid_desc.GetLength(I1); const auto N = b_k_n_grid_desc.GetLength(I1);
@@ -122,8 +121,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
    if(!GridwiseGemm::CheckValidity(a_k_m_grid_desc, b_k_n_grid_desc, c_m_n_grid_desc))
    {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r2 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r2 has invalid setting");
    }
const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc); const auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
@@ -174,22 +172,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -201,22 +198,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -228,22 +224,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -255,22 +250,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -299,15 +293,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -315,27 +309,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -343,27 +338,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -371,27 +367,28 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r2<GridwiseGemm, kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AKM0M1GridDesc>, remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>, remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -399,14 +396,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r2(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
return ave_time; return ave_time;
...
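A note on the launch pattern in the driver above (the pattern itself is unchanged by this commit): `has_main_k_block_loop` and `has_double_tail_k_block_loop` are runtime values, but the kernel receives them as compile-time `bool` template parameters, so the host code enumerates all four instantiations and selects one with a nested if/else ladder. Below is a minimal stand-alone sketch of that dispatch idiom, using hypothetical names and no composable_kernel dependencies; how the two flags are actually derived from the K dimension is not shown in the excerpt above, so the condition here is purely illustrative.

#include <iostream>

// Hypothetical stand-in for a GPU kernel: the two bools are baked in at
// compile time so each variant's body can be specialized without branching.
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
void run_gemm_variant(int k_blocks)
{
    std::cout << "k_blocks=" << k_blocks << " main_loop=" << HasMainKBlockLoop
              << " double_tail=" << HasDoubleTailKBlockLoop << '\n';
}

// Host-side dispatch: two runtime flags select one of four instantiations,
// mirroring the if/else ladder used by driver_gemm_dlops_v1r2 above.
void dispatch(int k_blocks)
{
    // Illustrative flag computation only; the real driver derives these
    // from K and KPerBlock in code not shown in this excerpt.
    const bool has_main_k_block_loop        = k_blocks > 2;
    const bool has_double_tail_k_block_loop = k_blocks % 2 == 0;

    if(has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_variant<true, true>(k_blocks);
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
        run_gemm_variant<true, false>(k_blocks);
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
        run_gemm_variant<false, true>(k_blocks);
    else
        run_gemm_variant<false, false>(k_blocks);
}

int main()
{
    for(int k = 1; k <= 4; ++k)
        dispatch(k);
}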
-#ifndef DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
-#define DRIVER_DYNAMIC_GEMM_DLOPS_V1R3
+#ifndef DRIVER_GEMM_DLOPS_V1R3
+#define DRIVER_GEMM_DLOPS_V1R3
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_dlops_v1r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_dlops_v1r3.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -39,23 +39,23 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks> typename BGridMoveSliceWindowStepHacks>
__host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid, __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc, const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc, const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -69,44 +69,44 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
// GEMM // GEMM
using GridwiseGemm = using GridwiseGemm =
GridwiseDynamicGemmDlops_km_kn_mn_v1r3<BlockSize, GridwiseGemmDlops_km_kn_mn_v1r3<BlockSize,
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
AK0MK1GridDesc, AK0MK1GridDesc,
BK0NK1GridDesc, BK0NK1GridDesc,
CMNGridDesc, CMNGridDesc,
MPerBlock, MPerBlock,
NPerBlock, NPerBlock,
KPerBlock, KPerBlock,
M1PerThread, M1PerThread,
N1PerThread, N1PerThread,
KPerThread, KPerThread,
M1N1ThreadClusterM1Xs, M1N1ThreadClusterM1Xs,
M1N1ThreadClusterN1Xs, M1N1ThreadClusterN1Xs,
ABlockTransferThreadSliceLengths_K0_M0_M1_K1, ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterLengths_K0_M0_M1_K1, ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
ABlockTransferThreadClusterArrangeOrder, ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
ABlockTransferSrcVectorTensorContiguousDimOrder, ABlockTransferSrcVectorTensorContiguousDimOrder,
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
BBlockTransferThreadSliceLengths_K0_N0_N1_K1, BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterLengths_K0_N0_N1_K1, BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
BBlockTransferThreadClusterArrangeOrder, BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
BBlockTransferSrcVectorTensorContiguousDimOrder, BBlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks>; BGridMoveSliceWindowStepHacks>;
const auto M = a_k0_m_k1_grid_desc.GetLength(I1); const auto M = a_k0_m_k1_grid_desc.GetLength(I1);
const auto N = b_k0_n_k1_grid_desc.GetLength(I1); const auto N = b_k0_n_k1_grid_desc.GetLength(I1);
@@ -114,8 +114,7 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
    if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
    {
-        throw std::runtime_error(
-            "wrong! GridwiseDynamicGemmDlops_km_kn_mn_v1r3 has invalid setting");
+        throw std::runtime_error("wrong! GridwiseGemmDlops_km_kn_mn_v1r3 has invalid setting");
    }
const auto a_k0_m0_m1_k1_grid_desc = const auto a_k0_m0_m1_k1_grid_desc =
@@ -170,22 +169,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -197,22 +195,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -224,22 +221,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -251,22 +247,21 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -295,15 +290,15 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
if(has_main_k_block_loop && has_double_tail_k_block_loop) if(has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -311,27 +306,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(has_main_k_block_loop && !has_double_tail_k_block_loop) else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true, true,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -339,27 +337,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else if(!has_main_k_block_loop && has_double_tail_k_block_loop) else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
true>; true>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -367,27 +368,30 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
else else
{ {
const auto kernel = const auto kernel =
kernel_dynamic_gemm_dlops_v1r3<GridwiseGemm, kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0M0M1K1GridDesc>, remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>, remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>, remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>, remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false, false,
false>; false>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
@@ -395,14 +399,17 @@ __host__ float driver_dynamic_gemm_dlops_v1r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
-            (void CONSTANT*)a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer(),
-            (void CONSTANT*)c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+            cast_pointer_to_constant_address_space(
+                a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
+            cast_pointer_to_constant_address_space(
+                c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
return ave_time; return ave_time;
...
-#ifndef DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
-#define DRIVER_DYNAMIC_GEMM_XDLOPS_V2R3
+#ifndef DRIVER_GEMM_XDLOPS_V2R3
+#define DRIVER_GEMM_XDLOPS_V2R3
#include "common_header.hpp"
-#include "dynamic_tensor_descriptor.hpp"
-#include "dynamic_tensor_descriptor_helper.hpp"
-#include "gridwise_dynamic_gemm_xdlops_v2r3.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
+#include "gridwise_gemm_xdlops_v2r3.hpp"
template <ck::index_t BlockSize, template <ck::index_t BlockSize,
typename FloatAB, typename FloatAB,
@@ -41,24 +41,24 @@ template <ck::index_t BlockSize,
typename CThreadTransferSrcDstAccessOrder, typename CThreadTransferSrcDstAccessOrder,
ck::index_t CThreadTransferSrcDstVectorDim, ck::index_t CThreadTransferSrcDstVectorDim,
ck::index_t CThreadTransferDstScalarPerVector, ck::index_t CThreadTransferDstScalarPerVector,
typename AGridIteratorHacks, typename AGridStepHacks,
typename BGridIteratorHacks, typename BGridStepHacks,
typename CGridIteratorHacks, typename CGridStepHacks,
typename AGridMoveSliceWindowIteratorHacks, typename AGridMoveSliceWindowStepHacks,
typename BGridMoveSliceWindowIteratorHacks, typename BGridMoveSliceWindowStepHacks,
bool CAccessOrderMRepeatNRepeat> bool CAccessOrderMRepeatNRepeat>
__host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid, __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const FloatAB* p_b_grid, const FloatAB* p_b_grid,
FloatC* p_c_grid, FloatC* p_c_grid,
const AK0MK1GridDesc& a_k0_m_k1_grid_desc, const AK0MK1GridDesc& a_k0_m_k1_grid_desc,
const BK0NK1GridDesc& b_k0_n_k1_grid_desc, const BK0NK1GridDesc& b_k0_n_k1_grid_desc,
const CMNGridDesc& c_m_n_grid_desc, const CMNGridDesc& c_m_n_grid_desc,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
ck::index_t nrepeat) ck::index_t nrepeat)
{ {
using namespace ck; using namespace ck;
@@ -66,52 +66,49 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{}; constexpr auto I2 = Number<2>{};
-    constexpr auto I3 = Number<3>{};
-    constexpr auto I4 = Number<4>{};
-    constexpr auto I5 = Number<5>{};
using GridwiseGemm = using GridwiseGemm =
GridwiseDynamicGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize, GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB, FloatAB,
FloatAcc, FloatAcc,
FloatC, FloatC,
CGlobalMemoryDataOperation, CGlobalMemoryDataOperation,
AK0MK1GridDesc, AK0MK1GridDesc,
BK0NK1GridDesc, BK0NK1GridDesc,
CMNGridDesc, CMNGridDesc,
MPerBlock, MPerBlock,
NPerBlock, NPerBlock,
KPerBlock, KPerBlock,
MPerWave, MPerWave,
NPerWave, NPerWave,
K1, K1,
MRepeat, MRepeat,
NRepeat, NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1, ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder, ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder, ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim, ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector, ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1, ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun, AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1, BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder, BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder, BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim, BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1, BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun, BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim, CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector, CThreadTransferDstScalarPerVector,
AGridIteratorHacks, AGridStepHacks,
BGridIteratorHacks, BGridStepHacks,
CGridIteratorHacks, CGridStepHacks,
AGridMoveSliceWindowIteratorHacks, AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowIteratorHacks, BGridMoveSliceWindowStepHacks,
CAccessOrderMRepeatNRepeat>; CAccessOrderMRepeatNRepeat>;
{ {
std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", " std::cout << "a_k0_m_k1_grid_desc{" << a_k0_m_k1_grid_desc.GetLength(I0) << ", "
@@ -129,7 +126,7 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc)) if(!GridwiseGemm::CheckValidity(a_k0_m_k1_grid_desc, b_k0_n_k1_grid_desc, c_m_n_grid_desc))
{ {
throw std::runtime_error( throw std::runtime_error(
"wrong! GridwiseDynamicGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting");
} }
const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc); const auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
@@ -142,13 +139,13 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc); const index_t grid_size = GridwiseGemm::CalculateGridSize(c_m_n_grid_desc);
const auto kernel = kernel_dynamic_gemm_xdlops_v2r3<GridwiseGemm, const auto kernel = kernel_gemm_xdlops_v2r3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AK0MK1GridDesc>, remove_reference_t<AK0MK1GridDesc>,
remove_reference_t<BK0NK1GridDesc>, remove_reference_t<BK0NK1GridDesc>,
remove_reference_t<CM0M1M2NGridDesc>, remove_reference_t<CM0M1M2NGridDesc>,
remove_reference_t<CBlockClusterAdaptor>>; remove_reference_t<CBlockClusterAdaptor>>;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE #if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float ave_time = launch_and_time_kernel(kernel, float ave_time = launch_and_time_kernel(kernel,
@@ -156,7 +153,6 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
dim3(grid_size), dim3(grid_size),
dim3(BlockSize), dim3(BlockSize),
0, 0,
0,
p_a_grid, p_a_grid,
p_b_grid, p_b_grid,
p_c_grid, p_c_grid,
@@ -176,20 +172,19 @@ __host__ float driver_dynamic_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc); c_m0_m1_m2_n_grid_desc_dev_buf.ToDevice(&c_m0_m1_m2_n_grid_desc);
c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor); c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor);
-    float ave_time =
-        launch_and_time_kernel(kernel,
-                               nrepeat,
-                               dim3(grid_size),
-                               dim3(BlockSize),
-                               0,
-                               0,
-                               p_a_grid,
-                               p_b_grid,
-                               p_c_grid,
-                               (void CONSTANT*)a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer(),
-                               (void CONSTANT*)c_block_cluster_adaptor_dev_buf.GetDeviceBuffer());
+    float ave_time = launch_and_time_kernel(
+        kernel,
+        nrepeat,
+        dim3(grid_size),
+        dim3(BlockSize),
+        0,
+        p_a_grid,
+        p_b_grid,
+        p_c_grid,
+        cast_pointer_to_constant_address_space(a_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_m0_m1_m2_n_grid_desc_dev_buf.GetDeviceBuffer()),
+        cast_pointer_to_constant_address_space(c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
#endif #endif
return ave_time; return ave_time;
} }
...
@@ -12,10 +12,10 @@
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv_bwd_data.hpp" #include "host_conv_bwd_data.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_MODE 1
#define USE_CONV_BWD_V4R1_XDL_NHWC 1 #define USE_CONV_BWD_V4R1_XDL_NHWC 1
#define USE_CONV_BWD_V4R1R2_XDL_NHWC 1 #define USE_CONV_BWD_V4R1R2_XDL_NHWC 1
@@ -37,7 +37,7 @@ int main(int argc, char* argv[])
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{}; constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE #if USE_MODE
// dynamic mode // dynamic mode
if(argc != 22) if(argc != 22)
{ {
@@ -46,29 +46,29 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2])); const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
const index_t N = atoi(argv[7]); const index_t N = std::stoi(argv[7]);
const index_t K = atoi(argv[8]); const index_t K = std::stoi(argv[8]);
const index_t C = atoi(argv[9]); const index_t C = std::stoi(argv[9]);
const index_t Y = atoi(argv[10]); const index_t Y = std::stoi(argv[10]);
const index_t X = atoi(argv[11]); const index_t X = std::stoi(argv[11]);
const index_t Hi = atoi(argv[12]); const index_t Hi = std::stoi(argv[12]);
const index_t Wi = atoi(argv[13]); const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]); const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]); const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]); const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]); const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]); const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]); const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]); const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]); const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1; const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1; const index_t XEff = (X - 1) * conv_dilation_w + 1;
@@ -83,12 +83,12 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(atoi(argv[2])); const ConvBackwardDataAlgo algo = static_cast<ConvBackwardDataAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 192; constexpr index_t C = 192;
...@@ -115,23 +115,19 @@ int main(int argc, char* argv[]) ...@@ -115,23 +115,19 @@ int main(int argc, char* argv[])
#endif #endif
#if 0 #if 0
-constexpr index_t in_vector_size = 1;
using in_data_t  = float;
using acc_data_t = float;
using out_data_t = float;
#elif 1
-constexpr index_t in_vector_size = 1;
using in_data_t  = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#endif #endif
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
-    switch(layout)
-    {
-    case ConvTensorLayout::NCHW:
-        // NCHW
+    if(layout == ConvTensorLayout::NCHW)
+    {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C); in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi); in_lengths_host[2] = static_cast<std::size_t>(Hi);
@@ -144,9 +140,9 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(K); out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho); out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo); out_lengths_host[3] = static_cast<std::size_t>(Wo);
-        break;
-    case ConvTensorLayout::NHWC:
-        // NHWC
+    }
+    else if(layout == ConvTensorLayout::NHWC)
+    {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi); in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi); in_lengths_host[2] = static_cast<std::size_t>(Wi);
@@ -159,8 +155,10 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(Ho); out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo); out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K); out_lengths_host[3] = static_cast<std::size_t>(K);
-        break;
-    default: throw std::runtime_error("wrong! not implemented");
-    }
+    }
+    else
+    {
+        throw std::runtime_error("wrong! not implemented");
+    }
Tensor<in_data_t> in_host(in_lengths_host); Tensor<in_data_t> in_host(in_lengths_host);
@@ -213,40 +211,8 @@ int main(int argc, char* argv[])
wei.GenerateTensorValue(gen_wei, num_thread); wei.GenerateTensorValue(gen_wei, num_thread);
} }
auto f_make_for_device_nchw = [&]() {
#if USE_DYNAMIC_MODE
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
const auto conv_strides_dev = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations_dev = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads_dev = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads_dev = make_tuple(in_right_pad_h, in_right_pad_w);
#else
const auto in_lengths_dev =
make_tuple(Number<N>{}, Number<C>{}, Number<Hi>{}, Number<Wi>{});
const auto wei_lengths_dev = make_tuple(Number<K>{}, Number<C>{}, Number<Y>{}, Number<X>{});
const auto out_lengths_dev =
make_tuple(Number<N>{}, Number<K>{}, Number<Ho>{}, Number<Wo>{});
const auto conv_strides_dev = make_tuple(Number<conv_stride_h>{}, Number<conv_stride_w>{});
const auto conv_dilations_dev =
make_tuple(Number<conv_dilation_h>{}, Number<conv_dilation_w>{});
const auto in_left_pads_dev = make_tuple(Number<in_left_pad_h>{}, Number<in_left_pad_w>{});
const auto in_right_pads_dev =
make_tuple(Number<in_right_pad_h>{}, Number<in_right_pad_w>{});
#endif
return make_tuple(in_lengths_dev,
wei_lengths_dev,
out_lengths_dev,
conv_strides_dev,
conv_dilations_dev,
in_left_pads_dev,
in_right_pads_dev);
};
auto f_make_for_device_nhwc = [&]() { auto f_make_for_device_nhwc = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
@@ -277,8 +243,6 @@ int main(int argc, char* argv[])
in_right_pads_dev); in_right_pads_dev);
}; };
const auto nhwc_desc = f_make_for_device_nhwc();
#if USE_CONV_BWD_V4R1_XDL_NHWC #if USE_CONV_BWD_V4R1_XDL_NHWC
if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC) if(algo == ConvBackwardDataAlgo::V4R1XDLNHWC)
{ {
@@ -289,20 +253,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
+        device_convolution_backward_data_implicit_gemm_v4r1_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                  acc_data_t,
+                                                                                  out_data_t>(
+            tmp[I0],
            tmp[I1],
            tmp[I2],
            tmp[I3],
            tmp[I4],
            tmp[I5],
            tmp[I6],
            in_device,
            wei,
            out,
            nrepeat);
} }
#endif #endif
@@ -316,20 +280,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
-        device_dynamic_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<
-            in_data_t,
-            acc_data_t,
-            out_data_t>(tmp[I0],
+        device_convolution_backward_data_implicit_gemm_v4r1r2_xdlops_nhwc_kyxc_nhwk<in_data_t,
+                                                                                    acc_data_t,
+                                                                                    out_data_t>(
+            tmp[I0],
            tmp[I1],
            tmp[I2],
            tmp[I3],
            tmp[I4],
            tmp[I5],
            tmp[I6],
            in_device,
            wei,
            out,
            nrepeat);
} }
#endif #endif
...
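Both driver `main()`s in this commit (the backward-data one above and the forward one below) switch argument parsing from `atoi` to `std::stoi`. Unlike `atoi`, which silently returns 0 on malformed input and has undefined behavior when the value is out of range, `std::stoi` throws `std::invalid_argument` or `std::out_of_range`, so a bad command line fails loudly. A small self-contained sketch of the difference follows; it is illustrative only and not part of the commit.

#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>

int main()
{
    const char* arg = "12x"; // a malformed size argument, e.g. taken from argv[]

    // atoi: parses the leading "12" and gives no indication of the trailing junk.
    std::cout << "atoi:      " << std::atoi(arg) << '\n';

    // std::stoi: also parses the leading digits, but reports how many
    // characters were consumed and throws on fully invalid input.
    try
    {
        std::size_t pos = 0;
        const int v = std::stoi(arg, &pos);
        if(pos != std::string(arg).size())
            std::cerr << "trailing characters after number: " << arg + pos << '\n';
        std::cout << "std::stoi: " << v << '\n';

        std::stoi("not-a-number"); // throws std::invalid_argument
    }
    catch(const std::exception& e)
    {
        std::cerr << "parse error: " << e.what() << '\n';
    }
}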
@@ -12,17 +12,17 @@
#include "conv_common.hpp" #include "conv_common.hpp"
#include "host_conv.hpp" #include "host_conv.hpp"
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp" #include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_MODE 1
#define USE_CONV_FWD_V4R4_NCHW 1 #define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V4R4R2_NHWC 1 #define USE_CONV_FWD_V4R4R2_NHWC 1
-#define USE_CONV_FWD_V6R1_NCHW 1
+#define USE_CONV_FWD_V6R1_NCHW 0
#define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0 #define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
@@ -49,7 +49,7 @@ int main(int argc, char* argv[])
constexpr auto I5 = Number<5>{}; constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{}; constexpr auto I6 = Number<6>{};
#if USE_DYNAMIC_MODE #if USE_MODE
// dynamic mode // dynamic mode
if(argc != 22) if(argc != 22)
{ {
@@ -58,29 +58,29 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
const index_t N = atoi(argv[7]); const index_t N = std::stoi(argv[7]);
const index_t K = atoi(argv[8]); const index_t K = std::stoi(argv[8]);
const index_t C = atoi(argv[9]); const index_t C = std::stoi(argv[9]);
const index_t Y = atoi(argv[10]); const index_t Y = std::stoi(argv[10]);
const index_t X = atoi(argv[11]); const index_t X = std::stoi(argv[11]);
const index_t Hi = atoi(argv[12]); const index_t Hi = std::stoi(argv[12]);
const index_t Wi = atoi(argv[13]); const index_t Wi = std::stoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]); const index_t conv_stride_h = std::stoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]); const index_t conv_stride_w = std::stoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]); const index_t conv_dilation_h = std::stoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]); const index_t conv_dilation_w = std::stoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]); const index_t in_left_pad_h = std::stoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]); const index_t in_left_pad_w = std::stoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]); const index_t in_right_pad_h = std::stoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]); const index_t in_right_pad_w = std::stoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1; const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1; const index_t XEff = (X - 1) * conv_dilation_w + 1;
...@@ -95,12 +95,12 @@ int main(int argc, char* argv[]) ...@@ -95,12 +95,12 @@ int main(int argc, char* argv[])
exit(1); exit(1);
} }
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1])); const ConvTensorLayout layout = static_cast<ConvTensorLayout>(std::stoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2])); const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(std::stoi(argv[2]));
const bool do_verification = atoi(argv[3]); const bool do_verification = std::stoi(argv[3]);
const int init_method = atoi(argv[4]); const int init_method = std::stoi(argv[4]);
const bool do_log = atoi(argv[5]); const bool do_log = std::stoi(argv[5]);
const int nrepeat = atoi(argv[6]); const int nrepeat = std::stoi(argv[6]);
constexpr index_t N = 128; constexpr index_t N = 128;
constexpr index_t C = 192; constexpr index_t C = 192;
...@@ -142,10 +142,8 @@ int main(int argc, char* argv[]) ...@@ -142,10 +142,8 @@ int main(int argc, char* argv[])
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4); std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C); in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi); in_lengths_host[2] = static_cast<std::size_t>(Hi);
...@@ -158,9 +156,9 @@ int main(int argc, char* argv[]) ...@@ -158,9 +156,9 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(K); out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho); out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo); out_lengths_host[3] = static_cast<std::size_t>(Wo);
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
// NHWC {
in_lengths_host[0] = static_cast<std::size_t>(N); in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi); in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi); in_lengths_host[2] = static_cast<std::size_t>(Wi);
...@@ -173,8 +171,10 @@ int main(int argc, char* argv[]) ...@@ -173,8 +171,10 @@ int main(int argc, char* argv[])
out_lengths_host[1] = static_cast<std::size_t>(Ho); out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo); out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K); out_lengths_host[3] = static_cast<std::size_t>(K);
break; }
default: throw std::runtime_error("wrong! not implemented"); else
{
throw std::runtime_error("wrong! not implemented");
} }
Tensor<in_data_t> in(in_lengths_host); Tensor<in_data_t> in(in_lengths_host);
...@@ -228,7 +228,7 @@ int main(int argc, char* argv[]) ...@@ -228,7 +228,7 @@ int main(int argc, char* argv[])
} }
auto f_make_for_device_nchw = [&]() { auto f_make_for_device_nchw = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi); const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X); const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo); const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
...@@ -260,7 +260,7 @@ int main(int argc, char* argv[]) ...@@ -260,7 +260,7 @@ int main(int argc, char* argv[])
}; };
auto f_make_for_device_nhwc = [&]() { auto f_make_for_device_nhwc = [&]() {
#if USE_DYNAMIC_MODE #if USE_MODE
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C); const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C); const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K); const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
...@@ -301,20 +301,19 @@ int main(int argc, char* argv[]) ...@@ -301,20 +301,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -328,20 +327,19 @@ int main(int argc, char* argv[]) ...@@ -328,20 +327,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
device_dynamic_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t, device_convolution_forward_implicit_gemm_v4r4r2_dlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -355,20 +353,19 @@ int main(int argc, char* argv[]) ...@@ -355,20 +353,19 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -382,21 +379,20 @@ int main(int argc, char* argv[]) ...@@ -382,21 +379,20 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v5r1_dlops_nchw_kcyx_nkhw<in_data_t,
16, 16,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(tmp[I0],
tmp[I0], tmp[I1],
tmp[I1], tmp[I2],
tmp[I2], tmp[I3],
tmp[I3], tmp[I4],
tmp[I4], tmp[I5],
tmp[I5], tmp[I6],
tmp[I6], in,
in, wei,
wei, out_device,
out_device, nrepeat);
nrepeat);
} }
#endif #endif
...@@ -410,9 +406,9 @@ int main(int argc, char* argv[]) ...@@ -410,9 +406,9 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nchw(); const auto tmp = f_make_for_device_nchw();
device_dynamic_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t, device_convolution_forward_implicit_gemm_v4r4r2_xdlops_nchw_kcyx_nkhw<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(
tmp[I0], tmp[I0],
tmp[I1], tmp[I1],
tmp[I2], tmp[I2],
...@@ -437,9 +433,9 @@ int main(int argc, char* argv[]) ...@@ -437,9 +433,9 @@ int main(int argc, char* argv[])
const auto tmp = f_make_for_device_nhwc(); const auto tmp = f_make_for_device_nhwc();
device_dynamic_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t, device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk<in_data_t,
acc_data_t, acc_data_t,
out_data_t>( out_data_t>(
tmp[I0], tmp[I0],
tmp[I1], tmp[I1],
tmp[I2], tmp[I2],
...@@ -467,7 +463,6 @@ int main(int argc, char* argv[]) ...@@ -467,7 +463,6 @@ int main(int argc, char* argv[])
check_error(out_host, out_device); check_error(out_host, out_device);
#if 0
if(do_log) if(do_log)
{ {
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
...@@ -475,6 +470,5 @@ int main(int argc, char* argv[]) ...@@ -475,6 +470,5 @@ int main(int argc, char* argv[])
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl; LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
} }
#endif
} }
} }
include_directories(BEFORE
include
${PROJECT_BINARY_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/online_compile/include
${PROJECT_SOURCE_DIR}/host/host_tensor/include
${PROJECT_SOURCE_DIR}/host/solver/include
${PROJECT_SOURCE_DIR}/composable_kernel/include
${PROJECT_SOURCE_DIR}/composable_kernel/include/utility
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description
${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/composable_kernel/include/driver
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/external/half/include
)
set(CONV_FWD_DRIVER_ONLINE_SOURCE conv_fwd_driver_online.cpp)
add_executable(conv_fwd_driver_online ${CONV_FWD_DRIVER_ONLINE_SOURCE})
target_link_libraries(conv_fwd_driver_online PRIVATE host_tensor)
target_link_libraries(conv_fwd_driver_online PRIVATE online_compile)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <thread> // for std::thread::hardware_concurrency used below
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "conv_common.hpp"
#include "host_conv.hpp"
#include "device_tensor.hpp"
#include "handle.hpp"
#include "hipCheck.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
#include "online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
#define USE_CONV_FWD_V4R4_NCHW 1
#define USE_CONV_FWD_V6R1_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NCHW 1
#define USE_CONV_FWD_V4R4_XDLOPS_NHWC 1
enum ConvForwardAlgo
{
V4R4NCHW, // 0
V6R1NCHW, // 1
V4R4XDLNCHW, // 2
V4R4XDLNHWC // 3
};
int main(int argc, char* argv[])
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
hipStream_t stream;
online_compile::Handle* handle;
MY_HIP_CHECK(hipStreamCreate(&stream));
handle = new online_compile::Handle(stream);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I4 = Number<4>{};
constexpr auto I5 = Number<5>{};
constexpr auto I6 = Number<6>{};
if(argc != 22)
{
printf("arg1 to 5: layout, algo, do_verification, init_method, do_log, nrepeat\n");
printf("rest: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx\n");
exit(1);
}
const ConvTensorLayout layout = static_cast<ConvTensorLayout>(atoi(argv[1]));
const ConvForwardAlgo algo = static_cast<ConvForwardAlgo>(atoi(argv[2]));
const bool do_verification = atoi(argv[3]);
const int init_method = atoi(argv[4]);
const bool do_log = atoi(argv[5]);
const int nrepeat = atoi(argv[6]);
const index_t N = atoi(argv[7]);
const index_t K = atoi(argv[8]);
const index_t C = atoi(argv[9]);
const index_t Y = atoi(argv[10]);
const index_t X = atoi(argv[11]);
const index_t Hi = atoi(argv[12]);
const index_t Wi = atoi(argv[13]);
const index_t conv_stride_h = atoi(argv[14]);
const index_t conv_stride_w = atoi(argv[15]);
const index_t conv_dilation_h = atoi(argv[16]);
const index_t conv_dilation_w = atoi(argv[17]);
const index_t in_left_pad_h = atoi(argv[18]);
const index_t in_left_pad_w = atoi(argv[19]);
const index_t in_right_pad_h = atoi(argv[20]);
const index_t in_right_pad_w = atoi(argv[21]);
const index_t YEff = (Y - 1) * conv_dilation_h + 1;
const index_t XEff = (X - 1) * conv_dilation_w + 1;
const index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
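// For illustration: a 3x3 filter with dilation 1 gives YEff = XEff = 3; with Hi = Wi = 28,
// stride 1 and 1-pixel padding on each side, Ho = (28 + 1 + 1 - 3) / 1 + 1 = 28, i.e. a
// "same"-size output (the integer division truncates when the window does not tile evenly).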
#if 1
using in_data_t = float;
using acc_data_t = float;
using out_data_t = float;
#elif 0
using in_data_t = half_t;
using acc_data_t = float;
using out_data_t = half_t;
#elif 1
using in_data_t = int8_t;
using acc_data_t = int32_t;
using out_data_t = int8_t;
#endif
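// Only one of the data-type triples above is compiled in; as written the first branch (float)
// is active, and the later "#elif 0" (half) and "#elif 1" (int8) branches stay dead until the
// leading "#if 1" is changed.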
std::vector<std::size_t> in_lengths_host(4), wei_lengths_host(4), out_lengths_host(4);
switch(layout)
{
case ConvTensorLayout::NCHW:
// NCHW
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(C);
in_lengths_host[2] = static_cast<std::size_t>(Hi);
in_lengths_host[3] = static_cast<std::size_t>(Wi);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(C);
wei_lengths_host[2] = static_cast<std::size_t>(Y);
wei_lengths_host[3] = static_cast<std::size_t>(X);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(K);
out_lengths_host[2] = static_cast<std::size_t>(Ho);
out_lengths_host[3] = static_cast<std::size_t>(Wo);
break;
case ConvTensorLayout::NHWC:
// NHWC
in_lengths_host[0] = static_cast<std::size_t>(N);
in_lengths_host[1] = static_cast<std::size_t>(Hi);
in_lengths_host[2] = static_cast<std::size_t>(Wi);
in_lengths_host[3] = static_cast<std::size_t>(C);
wei_lengths_host[0] = static_cast<std::size_t>(K);
wei_lengths_host[1] = static_cast<std::size_t>(Y);
wei_lengths_host[2] = static_cast<std::size_t>(X);
wei_lengths_host[3] = static_cast<std::size_t>(C);
out_lengths_host[0] = static_cast<std::size_t>(N);
out_lengths_host[1] = static_cast<std::size_t>(Ho);
out_lengths_host[2] = static_cast<std::size_t>(Wo);
out_lengths_host[3] = static_cast<std::size_t>(K);
break;
default: throw std::runtime_error("wrong! not implemented");
}
Tensor<in_data_t> in(in_lengths_host);
Tensor<in_data_t> wei(wei_lengths_host);
Tensor<out_data_t> out_host(out_lengths_host);
Tensor<out_data_t> out_device(out_lengths_host);
std::cout << "layout: " << layout << std::endl;
ostream_HostTensorDescriptor(in.mDesc, std::cout << "in: ");
ostream_HostTensorDescriptor(wei.mDesc, std::cout << "wei: ");
ostream_HostTensorDescriptor(out_host.mDesc, std::cout << "out: ");
print_array("InLeftPads", make_tuple(in_left_pad_h, in_left_pad_w));
print_array("InRightPads", make_tuple(in_right_pad_h, in_right_pad_w));
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency();
switch(init_method)
{
case 0:
// no initialization
break;
case 1:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 2:
in.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 3:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
break;
case 4:
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
break;
case 5:
in.GenerateTensorValue(GeneratorTensor_3<float>{0.0, 1.0}, num_thread);
wei.GenerateTensorValue(GeneratorTensor_3<float>{-0.5, 0.5}, num_thread);
break;
default:
in.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
auto gen_wei = [](auto... is) {
return GeneratorTensor_2{1, 5}(is...) * GeneratorTensor_Checkboard{}(is...);
};
wei.GenerateTensorValue(gen_wei, num_thread);
}
auto f_make_for_device_nchw = [&]() {
const auto in_lengths_dev = make_tuple(N, C, Hi, Wi);
const auto wei_lengths_dev = make_tuple(K, C, Y, X);
const auto out_lengths_dev = make_tuple(N, K, Ho, Wo);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
auto f_make_for_device_nhwc = [&]() {
const auto in_lengths_dev = make_tuple(N, Hi, Wi, C);
const auto wei_lengths_dev = make_tuple(K, Y, X, C);
const auto out_lengths_dev = make_tuple(N, Ho, Wo, K);
return make_tuple(in_lengths_dev, wei_lengths_dev, out_lengths_dev);
};
const auto conv_strides = make_tuple(conv_stride_h, conv_stride_w);
const auto conv_dilations = make_tuple(conv_dilation_h, conv_dilation_w);
const auto in_left_pads = make_tuple(in_left_pad_h, in_left_pad_w);
const auto in_right_pads = make_tuple(in_right_pad_h, in_right_pad_w);
#if USE_CONV_FWD_V4R4_NCHW
if(algo == ConvForwardAlgo::V4R4NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V6R1_NCHW
if(algo == ConvForwardAlgo::V6R1NCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
#if 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
1,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 1},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 1},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 0
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
2,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 2},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 2},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#elif 1
const CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw compile_param = {
get_datatype_enum_from_type<in_data_t>::value,
get_datatype_enum_from_type<acc_data_t>::value,
get_datatype_enum_from_type<out_data_t>::value,
256,
4,
4,
128,
32,
8,
4,
4,
1,
{8, 2},
{8, 2},
{4, 1, 1, 1, 4},
{2, 1, 1, 128, 1},
{4, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
{1, 4, 1, 1, 4},
{8, 1, 1, 32, 1},
{1, 1, 1, 1, 1},
{1, 1, 1, 1, 1},
4,
true,
true};
#endif
online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
compile_param,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NCHW
if(algo == ConvForwardAlgo::V4R4XDLNCHW)
{
if(layout != ConvTensorLayout::NCHW)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nchw();
tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
#if USE_CONV_FWD_V4R4_XDLOPS_NHWC
if(algo == ConvForwardAlgo::V4R4XDLNHWC)
{
if(layout != ConvTensorLayout::NHWC)
{
throw std::runtime_error("wrong! layout");
}
const auto tmp = f_make_for_device_nhwc();
tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable =
&default_tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk<
in_data_t,
acc_data_t,
out_data_t>(handle,
tmp[I0],
tmp[I1],
tmp[I2],
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads,
in,
wei,
out_device,
tunable,
nrepeat);
}
#endif
if(do_verification)
{
host_direct_convolution(in,
wei,
out_host,
make_tuple(conv_stride_h, conv_stride_w),
make_tuple(conv_dilation_h, conv_dilation_w),
make_tuple(in_left_pad_h, in_left_pad_w),
make_tuple(in_right_pad_h, in_right_pad_w),
layout);
check_error(out_host, out_device);
#if 0
if(do_log)
{
LogRangeAsType<float>(std::cout << "in : ", in.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "wei: ", wei.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_host : ", out_host.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "out_device: ", out_device.mData, ",") << std::endl;
}
#endif
}
delete handle;
MY_HIP_CHECK(hipStreamDestroy(stream));
}
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "conv_tunable_fwd_v4r4_dlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->M1PerThread) + "x" + std::to_string(pt->N1PerThread) + "x" +
std::to_string(pt->KPerThread) + "_";
out += std::to_string(pt->M1N1ThreadClusterM10) + "x" +
std::to_string(pt->M1N1ThreadClusterN10) + "x" +
std::to_string(pt->M1N1ThreadClusterM11) + "x" +
std::to_string(pt->M1N1ThreadClusterN11) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_M1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_N1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
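// The string produced above has the shape (values are taken from the tunable, not fixed here):
//   TUN_<BlockSize>_<MPerBlock>x<NPerBlock>x<KPerBlock>_<M1PerThread>x<N1PerThread>x<KPerThread>_...
// and, together with the data-type string, uniquely names one kernel configuration.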
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_M1PerThread=" + std::to_string(pt->M1PerThread) +
" -DCK_PARAM_N1PerThread=" + std::to_string(pt->N1PerThread) +
" -DCK_PARAM_KPerThread=" + std::to_string(pt->KPerThread);
out += " -DCK_PARAM_M1N1ThreadClusterM10=" + std::to_string(pt->M1N1ThreadClusterM10) +
" -DCK_PARAM_M1N1ThreadClusterN10=" + std::to_string(pt->M1N1ThreadClusterN10) +
" -DCK_PARAM_M1N1ThreadClusterM11=" + std::to_string(pt->M1N1ThreadClusterM11) +
" -DCK_PARAM_M1N1ThreadClusterN11=" + std::to_string(pt->M1N1ThreadClusterN11);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K_M0_M1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_M1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_M1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K_N0_N1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_N1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_N1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_dlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using namespace detail_dyn_conv_fwd_v4r4_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The following code is only used to compute grid_size, hasMainKBlockLoop and
// hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto a_k_m_grid_desc = descs[I0];
const auto c_m_n_grid_desc = descs[I2];
const auto M = c_m_n_grid_desc.GetLength(I0);
const auto N = c_m_n_grid_desc.GetLength(I1);
const auto K = a_k_m_grid_desc.GetLength(I0);
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
const bool hasMainKBlockLoop = ((K + tunable->KPerBlock) / (2 * tunable->KPerBlock) > 1);
const bool hasDoubleTailKBlockLoop = ((K / tunable->KPerBlock) % 2 == 0);
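// For illustration (numbers are examples, not from this file): with MPerBlock = NPerBlock = 128,
// KPerBlock = 8 and a GEMM of M = 256, N = 100352, K = 1728, grid_size = 2 * 784 = 1568
// workgroups; hasMainKBlockLoop holds because more than one double-buffered 2*KPerBlock step
// fits in K, and hasDoubleTailKBlockLoop holds because K / KPerBlock = 216 is even.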
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
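// The *_prepare kernel writes the device-side descriptors into this 4096-byte scratch buffer at
// fixed 1024-byte offsets: the A (k_m0_m1) descriptor at 0, the B (k_n0_n1) descriptor at 1024,
// the C (m0_m10_m11_n0_n10_n11) descriptor at 2048 and the block-id-to-tile adaptor at 3072;
// the main GEMM kernel then reads them back from the same addresses.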
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
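// vld is the workgroup (local) size; vgd1 launches a single workgroup for the prepare kernel,
// while vgd2 launches grid_size workgroups for the main GEMM kernel.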
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_dlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
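// param collects the -DCK_PARAM_* macro definitions handed to the online compiler when it builds
// program_name; network_config tags this exact configuration and is presumably used by
// Handle::AddKernel as the cache key for the compiled kernel.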
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " +
get_definition_string_from_tunable(tunable) +
" -DCK_PARAM_HAS_MAIN_KBLOCK_LOOP=" + std::to_string(hasMainKBlockLoop) +
" -DCK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP=" + std::to_string(hasDoubleTailKBlockLoop);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable) + "_" +
std::to_string(hasMainKBlockLoop) + "_" +
std::to_string(hasDoubleTailKBlockLoop);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
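// The first timed iteration is dropped from the averages below (std::next skips it), treating it
// as warm-up that also absorbs the online-compilation cost; this assumes nrepeat > 1, otherwise
// the division by (nrepeat - 1) is a division by zero.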
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
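// Forward convolution performs 2 * N * K * Ho * Wo * C * Y * X flops (one multiply and one add
// per filter tap and output element); dividing that by 1e9 gives GFLOP, and GFLOP per
// millisecond equals TFlop/s, which is what is printed below.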
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nchw_kcyx_nkhw.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
};
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const tunable_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nchw_kcyx_nkhw;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_c_hi_wi_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_c_hi_wi_lengths);
const auto wei_k_c_y_x_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_c_y_x_lengths);
const auto out_n_k_ho_wo_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_k_ho_wo_lengths);
const auto n = in_n_c_hi_wi_desc.GetLength(I0);
const auto c = in_n_c_hi_wi_desc.GetLength(I1);
const auto hi = in_n_c_hi_wi_desc.GetLength(I2);
const auto wi = in_n_c_hi_wi_desc.GetLength(I3);
const auto k = wei_k_c_y_x_desc.GetLength(I0);
const auto y = wei_k_c_y_x_desc.GetLength(I2);
const auto x = wei_k_c_y_x_desc.GetLength(I3);
const auto ho = out_n_k_ho_wo_desc.GetLength(I2);
const auto wo = out_n_k_ho_wo_desc.GetLength(I3);
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
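// Implicit-GEMM view of the forward convolution: M = k (output channels), N = n * ho * wo (one
// column per output pixel in the batch) and K = c * y * x (one reduction entry per input channel
// and filter tap); K1 is the inner split of the reduction dimension used by the xdlops layout
// (K0 x M x K1), so K0 = K / K1 counts the outer reduction steps.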
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// these buffers are usually provided by the user application
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
// these are workspace buffers that should be exposed to the user through the corresponding
// workspace API
DeviceMem workspace_buf(4096);
void* a_k_m0_m1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k_n0_n1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nchw";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " " + " -DCK_USE_AMD_XDLOPS" +
get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k_m0_m1_grid_desc_dev_buf,
b_k_n0_n1_grid_desc_dev_buf,
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(a_k_m0_m1_grid_desc_dev_buf),
(const void*)(b_k_n0_n1_grid_desc_dev_buf),
(const void*)(c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_c_hi_wi_lengths[I0];
const auto C = in_n_c_hi_wi_lengths[I1];
const auto K = out_n_k_ho_wo_lengths[I1];
const auto Ho = out_n_k_ho_wo_lengths[I2];
const auto Wo = out_n_k_ho_wo_lengths[I3];
const auto Y = wei_k_c_y_x_lengths[I2];
const auto X = wei_k_c_y_x_lengths[I3];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
};
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
#include "conv_tunable_fwd_v4r4_xdlops_nhwc_kyxc_nhwk.hpp"
namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk {
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_network_config_string_from_types()
{
using namespace ck;
std::string out;
out += std::to_string(get_datatype_enum_from_type<TInWei>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TAcc>::value) + "_" +
std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
};
static std::string
get_network_config_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out("TUN_");
out += std::to_string(pt->BlockSize) + "_";
out += std::to_string(pt->MPerBlock) + "x" + std::to_string(pt->NPerBlock) + "x" +
std::to_string(pt->KPerBlock) + "_";
out += std::to_string(pt->MPerWave) + "x" + std::to_string(pt->NPerWave) + "x" +
std::to_string(pt->MRepeat) + "x" + std::to_string(pt->NRepeat) + "x" +
std::to_string(pt->K1) + "_";
out += std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]) + "_";
out += std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->ABlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->ABlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->ABlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]) + "_";
out += std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "x" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "x" +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]) + "_";
out += std::to_string(pt->BBlockTransferSrcVectorDim) + "_";
out += std::to_string(pt->BBlockTransferSrcScalarPerVector) + "_";
out += std::to_string(pt->BBlockTransferDstScalarPerVector_K1) + "_";
out += std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun) + "_";
out += std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "x" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]) + "_";
out += std::to_string(pt->CThreadTransferSrcDstVectorDim) + "_";
out += std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
}
template <typename TInWei, typename TAcc, typename TOut>
static std::string get_definition_string_from_types()
{
using namespace ck;
std::string out;
out +=
" -DCK_PARAM_ABDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TInWei>::value) +
" -DCK_PARAM_AccDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TAcc>::value) +
" -DCK_PARAM_CDataTypeEnum=" + std::to_string(get_datatype_enum_from_type<TOut>::value);
return (out);
}
static std::string
get_definition_string_from_tunable(const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* pt)
{
std::string out;
out += " -DCK_PARAM_BlockSize=" + std::to_string(pt->BlockSize);
out += " -DCK_PARAM_MPerBlock=" + std::to_string(pt->MPerBlock) +
" -DCK_PARAM_NPerBlock=" + std::to_string(pt->NPerBlock) +
" -DCK_PARAM_KPerBlock=" + std::to_string(pt->KPerBlock);
out += " -DCK_PARAM_MPerWave=" + std::to_string(pt->MPerWave) +
" -DCK_PARAM_NPerWave=" + std::to_string(pt->NPerWave) +
" -DCK_PARAM_K1=" + std::to_string(pt->K1) +
" -DCK_PARAM_MRepeat=" + std::to_string(pt->MRepeat) +
" -DCK_PARAM_NRepeat=" + std::to_string(pt->NRepeat);
out += " -DCK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadSliceLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1=" +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterLengths_K0_M_K1[2]);
out += " -DCK_PARAM_ABlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->ABlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_ABlockTransferSrcAccessOrder=" +
std::to_string(pt->ABlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->ABlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_ABlockTransferSrcVectorDim=" + std::to_string(pt->ABlockTransferSrcVectorDim);
out += " -DCK_PARAM_ABlockTransferSrcScalarPerVector=" +
std::to_string(pt->ABlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_ABlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->ABlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_AThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->AThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadSliceLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1=" +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterLengths_K0_N_K1[2]);
out += " -DCK_PARAM_BBlockTransferThreadClusterArrangeOrder=" +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[0]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[1]) + "," +
std::to_string(pt->BBlockTransferThreadClusterArrangeOrder[2]);
out += " -DCK_PARAM_BBlockTransferSrcAccessOrder=" +
std::to_string(pt->BBlockTransferSrcAccessOrder[0]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[1]) + "," +
std::to_string(pt->BBlockTransferSrcAccessOrder[2]);
out +=
" -DCK_PARAM_BBlockTransferSrcVectorDim=" + std::to_string(pt->BBlockTransferSrcVectorDim);
out += " -DCK_PARAM_BBlockTransferSrcScalarPerVector=" +
std::to_string(pt->BBlockTransferSrcScalarPerVector);
out += " -DCK_PARAM_BBlockTransferDstScalarPerVector_K1=" +
std::to_string(pt->BBlockTransferDstScalarPerVector_K1);
out += " -DCK_PARAM_BThreadTransferSrcResetCoordinateAfterRun=" +
std::to_string(pt->BThreadTransferSrcResetCoordinateAfterRun);
out += " -DCK_PARAM_CThreadTransferSrcDstAccessOrder=" +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[0]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[1]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[2]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[3]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[4]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[5]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[6]) + "," +
std::to_string(pt->CThreadTransferSrcDstAccessOrder[7]);
out += " -DCK_PARAM_CThreadTransferSrcDstVectorDim=" +
std::to_string(pt->CThreadTransferSrcDstVectorDim);
out += " -DCK_PARAM_CThreadTransferDstScalarPerVector=" +
std::to_string(pt->CThreadTransferDstScalarPerVector);
return (out);
}
} // namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
online_compile::Handle* handle,
const InLengths& in_n_hi_wi_c_lengths,
const WeiLengths& wei_k_y_x_c_lengths,
const OutLengths& out_n_ho_wo_k_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_hi_wi_c,
const Tensor<TInWei>& wei_k_y_x_c,
Tensor<TOut>& out_n_ho_wo_k,
const tunable_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk* tunable,
ck::index_t nrepeat)
{
using namespace ck;
using namespace detail_dyn_conv_fwd_v4r4_xdlops_nhwc_kyxc_nhwk;
using size_t = std::size_t;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // The following code is only used to compute grid_size, hasMainKBlockLoop and
    // hasDoubleTailKBlockLoop
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const auto in_n_hi_wi_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(in_n_hi_wi_c_lengths);
const auto wei_k_y_x_c_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(wei_k_y_x_c_lengths);
const auto out_n_ho_wo_k_desc =
make_dynamic_naive_tensor_descriptor_packed_v2(out_n_ho_wo_k_lengths);
const auto n = in_n_hi_wi_c_desc.GetLength(I0);
const auto hi = in_n_hi_wi_c_desc.GetLength(I1);
const auto wi = in_n_hi_wi_c_desc.GetLength(I2);
const auto c = in_n_hi_wi_c_desc.GetLength(I3);
const auto k = wei_k_y_x_c_desc.GetLength(I0);
const auto y = wei_k_y_x_c_desc.GetLength(I1);
const auto x = wei_k_y_x_c_desc.GetLength(I2);
const auto ho = out_n_ho_wo_k_desc.GetLength(I1);
const auto wo = out_n_ho_wo_k_desc.GetLength(I2);
const auto M = k;
const auto N = n * ho * wo;
const auto K = c * y * x;
const auto K0 = K / tunable->K1;
const index_t grid_size = (M / tunable->MPerBlock) * (N / tunable->NPerBlock);
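    // hypothetical example: n=128, ho=wo=28, k=256, c=192, y=x=3 with MPerBlock=NPerBlock=128
    // gives M=256, N=128*28*28=100352, K=192*3*3=1728, so grid_size = (256/128)*(100352/128) = 1568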
// these buffers are usually provided by the user application
DeviceMem in_n_hi_wi_c_dev_buf(sizeof(TInWei) * in_n_hi_wi_c.mDesc.GetElementSpace());
DeviceMem wei_k_y_x_c_dev_buf(sizeof(TInWei) * wei_k_y_x_c.mDesc.GetElementSpace());
DeviceMem out_n_ho_wo_k_dev_buf(sizeof(TOut) * out_n_ho_wo_k.mDesc.GetElementSpace());
in_n_hi_wi_c_dev_buf.ToDevice(in_n_hi_wi_c.mData.data());
wei_k_y_x_c_dev_buf.ToDevice(wei_k_y_x_c.mData.data());
out_n_ho_wo_k_dev_buf.ToDevice(out_n_ho_wo_k.mData.data());
    // these are workspace buffers that should be exposed to the user through the corresponding
    // workspace API
DeviceMem workspace_buf(4096);
void* a_k0_m_k1_grid_desc_dev_buf = workspace_buf.GetDeviceBuffer();
void* b_k0_n_k1_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 1024);
void* c_m0_m1_m2_n_grid_desc_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 2048);
void* c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf =
static_cast<void*>(static_cast<unsigned char*>(workspace_buf.GetDeviceBuffer()) + 3072);
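    // the 4 KiB workspace is carved into four 1 KiB slots, one for each transformed descriptor /
    // adaptor written out by the prepare kernel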
const std::vector<size_t> vld = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd1 = {static_cast<size_t>(tunable->BlockSize), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * tunable->BlockSize), 1, 1};
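    // vld is the workgroup (local) size and vgd the total number of work-items: the prepare kernel
    // presumably runs as a single workgroup, while the main GEMM kernel launches grid_size workgroups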
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v4r4_xdlops_nhwc";
std::string param = " -std=c++17 ";
std::string network_config;
param += get_definition_string_from_types<TInWei, TAcc, TOut>() + " -DCK_USE_AMD_XDLOPS ";
param += get_definition_string_from_tunable(tunable);
network_config = get_network_config_string_from_types<TInWei, TAcc, TOut>() + "_" +
get_network_config_string_from_tunable(tunable);
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name =
"dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name, network_config_1, program_name, kernel_name, vld, vgd1, param)(
static_cast<index_t>(in_n_hi_wi_c_lengths[I0]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I1]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I2]),
static_cast<index_t>(in_n_hi_wi_c_lengths[I3]),
static_cast<index_t>(wei_k_y_x_c_lengths[I0]),
static_cast<index_t>(wei_k_y_x_c_lengths[I1]),
static_cast<index_t>(wei_k_y_x_c_lengths[I2]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
a_k0_m_k1_grid_desc_dev_buf,
b_k0_n_k1_grid_desc_dev_buf,
c_m0_m1_m2_n_grid_desc_dev_buf,
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf);
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name, network_config_2, program_name, kernel_name, vld, vgd2, param)(
reinterpret_cast<const TInWei*>(in_n_hi_wi_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(wei_k_y_x_c_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_ho_wo_k_dev_buf.GetDeviceBuffer()),
(const void*)(a_k0_m_k1_grid_desc_dev_buf),
(const void*)(b_k0_n_k1_grid_desc_dev_buf),
(const void*)(c_m0_m1_m2_n_grid_desc_dev_buf),
(const void*)(c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
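        // the first measurement is treated as warm-up: std::next() skips it and the sum is divided
        // by nrepeat - 1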
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
(nrepeat - 1);
const auto N = in_n_hi_wi_c_lengths[I0];
const auto C = in_n_hi_wi_c_lengths[I3];
const auto Ho = out_n_ho_wo_k_lengths[I1];
const auto Wo = out_n_ho_wo_k_lengths[I2];
const auto K = out_n_ho_wo_k_lengths[I3];
const auto Y = wei_k_y_x_c_lengths[I1];
const auto X = wei_k_y_x_c_lengths[I2];
float perf = (float)(std::size_t(2) * N * K * Ho * Wo * C * Y * X) /
(std::size_t(1000) * 1000 * 1000) / ave_time2;
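        // units check: flop / (1e9 * time_in_ms) = flop / (1e12 * time_in_s), so perf is in TFlop/s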
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    }
// copy result back to host
out_n_ho_wo_k_dev_buf.FromDevice(out_n_ho_wo_k.mData.data());
}
#pragma once
#include "device.hpp"
#include "host_tensor.hpp"
#include "handle.hpp"
#include "online_driver_common.hpp"
#include "convolution_problem_descriptor.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
#include "conv_igemm_fwd_v6r1_dlops_nchw_kcyx_nkhw.hpp"
template <typename TInWei,
typename TAcc,
typename TOut,
typename InLengths,
typename WeiLengths,
typename OutLengths,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void online_device_dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
online_compile::Handle* handle,
const InLengths& in_n_c_hi_wi_lengths,
const WeiLengths& wei_k_c_y_x_lengths,
const OutLengths& out_n_k_ho_wo_lengths,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads& in_right_pads,
const Tensor<TInWei>& in_n_c_hi_wi,
const Tensor<TInWei>& wei_k_c_y_x,
Tensor<TOut>& out_n_k_ho_wo,
const ck_driver::CompileParameterConvIgemmFwdV6r1DlopsNchwKcyxNkhw& compile_param,
ck::index_t nrepeat)
{
using namespace ck;
using namespace ck_driver;
using size_t = std::size_t;
std::cout << __func__ << std::endl;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
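    // the problem descriptor is aggregate-initialized as {N, K, C, Y, X, Hi, Wi, Ho, Wo,
    // conv strides, dilations, left/right pads, in/wei/out datatype enums} -- assumed to match
    // ConvolutionProblemDescriptor's member declaration order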
ConvolutionProblemDescriptor conv_problem_desc{in_n_c_hi_wi_lengths[I0],
out_n_k_ho_wo_lengths[I1],
in_n_c_hi_wi_lengths[I1],
wei_k_c_y_x_lengths[I2],
wei_k_c_y_x_lengths[I3],
in_n_c_hi_wi_lengths[I2],
in_n_c_hi_wi_lengths[I3],
out_n_k_ho_wo_lengths[I2],
out_n_k_ho_wo_lengths[I3],
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TInWei>::value,
get_datatype_enum_from_type<TOut>::value};
if(!ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::IsValidCompileParameter(conv_problem_desc,
compile_param))
{
throw std::runtime_error("wrong! IsValidCompileParameter fail");
}
DeviceMem in_n_c_hi_wi_dev_buf(sizeof(TInWei) * in_n_c_hi_wi.mDesc.GetElementSpace());
DeviceMem wei_k_c_y_x_dev_buf(sizeof(TInWei) * wei_k_c_y_x.mDesc.GetElementSpace());
DeviceMem out_n_k_ho_wo_dev_buf(sizeof(TOut) * out_n_k_ho_wo.mDesc.GetElementSpace());
in_n_c_hi_wi_dev_buf.ToDevice(in_n_c_hi_wi.mData.data());
wei_k_c_y_x_dev_buf.ToDevice(wei_k_c_y_x.mData.data());
out_n_k_ho_wo_dev_buf.ToDevice(out_n_k_ho_wo.mData.data());
    // workspace is used to save the transformed tensor descriptors created by the prepare kernel
DeviceMem workspace_dev_buf(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetWorkSpaceSize(conv_problem_desc, compile_param));
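    // unlike the fixed 4 KiB workspace used by the xdlops driver above, here the required size is
    // queried from the library for this particular problem / compile-parameter combination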
const auto block_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetBlockSize(conv_problem_desc, compile_param));
const auto grid_size = std::size_t(
ConvIgemmFwdV6r1DlopsNchwKcyxNkhw::GetGridSize(conv_problem_desc, compile_param));
const std::vector<size_t> vld1 = {1, 1, 1};
const std::vector<size_t> vgd1 = {1, 1, 1};
const std::vector<size_t> vld2 = {static_cast<size_t>(block_size), 1, 1};
const std::vector<size_t> vgd2 = {static_cast<size_t>(grid_size * block_size), 1, 1};
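    // the prepare kernel is launched as a single work-item (vgd1 = {1, 1, 1}); the main kernel uses
    // block_size threads per workgroup and grid_size workgroups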
std::string program_name =
"dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp";
std::string algo_name = "implicit_gemm_conv_fwd_v6r1_dlops_nchw";
    std::string compile_param_string =
        get_ck_hip_online_compile_common_flag() + compile_param.GetCompileParameterString();
std::string network_config = compile_param_string;
std::vector<float> kernel1_times;
std::vector<float> kernel2_times;
for(index_t i = 0; i < nrepeat + 1; ++i)
{
KernelTimer timer1, timer2;
std::string kernel_name;
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare";
auto network_config_1 = network_config + "_1";
timer1.Start();
handle->AddKernel(algo_name,
network_config_1,
program_name,
kernel_name,
vld1,
vgd1,
compile_param_string)(static_cast<index_t>(in_n_c_hi_wi_lengths[I0]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I1]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I2]),
static_cast<index_t>(in_n_c_hi_wi_lengths[I3]),
static_cast<index_t>(wei_k_c_y_x_lengths[I0]),
static_cast<index_t>(wei_k_c_y_x_lengths[I2]),
static_cast<index_t>(wei_k_c_y_x_lengths[I3]),
conv_strides[I0],
conv_strides[I1],
conv_dilations[I0],
conv_dilations[I1],
in_left_pads[I0],
in_left_pads[I1],
in_right_pads[I0],
in_right_pads[I1],
(void*)(workspace_dev_buf.GetDeviceBuffer()));
timer1.End();
kernel_name = "dynamic_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw";
auto network_config_2 = network_config + "_2";
timer2.Start();
handle->AddKernel(algo_name,
network_config_2,
program_name,
kernel_name,
vld2,
vgd2,
compile_param_string)(
reinterpret_cast<const TInWei*>(wei_k_c_y_x_dev_buf.GetDeviceBuffer()),
reinterpret_cast<const TInWei*>(in_n_c_hi_wi_dev_buf.GetDeviceBuffer()),
reinterpret_cast<TOut*>(out_n_k_ho_wo_dev_buf.GetDeviceBuffer()),
(const void*)(workspace_dev_buf.GetDeviceBuffer()));
timer2.End();
kernel1_times.push_back(timer1.GetElapsedTime());
kernel2_times.push_back(timer2.GetElapsedTime());
}
{
auto ave_time1 =
std::accumulate(
std::next(kernel1_times.begin()), kernel1_times.end(), 0., std::plus<float>{}) /
nrepeat;
auto ave_time2 =
std::accumulate(
std::next(kernel2_times.begin()), kernel2_times.end(), 0., std::plus<float>{}) /
nrepeat;
float perf = (float)(conv_problem_desc.CalculateFlop()) /
(std::size_t(1000) * 1000 * 1000) / (ave_time1 + ave_time2);
std::cout << "Average time : " << ave_time1 + ave_time2 << " ms(" << ave_time1 << ", "
<< ave_time2 << "), " << perf << " TFlop/s" << std::endl;
    }
// copy result back to host
out_n_k_ho_wo_dev_buf.FromDevice(out_n_k_ho_wo.mData.data());
}
...@@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE ...@@ -10,6 +10,8 @@ set(HOST_TENSOR_SOURCE
## the library target ## the library target
add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE}) add_library(host_tensor SHARED ${HOST_TENSOR_SOURCE})
target_include_directories(host_tensor SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
target_link_libraries(host_tensor PRIVATE hip::device) target_link_libraries(host_tensor PRIVATE hip::device)
target_link_libraries(host_tensor INTERFACE hip::host) target_link_libraries(host_tensor INTERFACE hip::host)
......
#ifndef CONV_COMMON_HPP #ifndef CONV_COMMON_HPP
#define CONV_COMMON_HPP #define CONV_COMMON_HPP
#include "dynamic_tensor_descriptor.hpp" #include "tensor_descriptor.hpp"
enum ConvTensorLayout enum ConvTensorLayout
{ {
...@@ -19,8 +19,8 @@ template <typename... InDesc, ...@@ -19,8 +19,8 @@ template <typename... InDesc,
typename LeftPads, typename LeftPads,
typename RightPads> typename RightPads>
constexpr auto get_convolution_output_default_4d_tensor_descriptor( constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const ck::DynamicTensorDescriptor<InDesc...>& in_desc, const ck::TensorDescriptor<InDesc...>& in_desc,
const ck::DynamicTensorDescriptor<WeiDesc...>& wei_desc, const ck::TensorDescriptor<WeiDesc...>& wei_desc,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations conv_dilations, const ConvDilations conv_dilations,
const LeftPads& left_pads, const LeftPads& left_pads,
...@@ -57,12 +57,12 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor( ...@@ -57,12 +57,12 @@ constexpr auto get_convolution_output_default_4d_tensor_descriptor(
const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1; const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1;
const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1; const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1;
return make_dynamic_naive_tensor_descriptor_packed_v2(make_tuple(N, K, Ho, Wo)); return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
} }
template <class InDesc, class WeiDesc, class OutDesc> template <class InDesc, class WeiDesc, class OutDesc>
constexpr std::size_t constexpr std::size_t
calculate_convolution_flops(const InDesc& in_desc, const WeiDesc& wei_desc, const OutDesc& out_desc) calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc)
{ {
using namespace ck; using namespace ck;
......
...@@ -34,24 +34,16 @@ struct KernelTimer ...@@ -34,24 +34,16 @@ struct KernelTimer
using device_stream_t = hipStream_t; using device_stream_t = hipStream_t;
template <typename... Args, typename F> template <typename... Args, typename F>
void launch_kernel(F kernel, void launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
hipStream_t stream_id,
Args... args)
{ {
hipStream_t stream_id = nullptr;
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
} }
template <typename... Args, typename F> template <typename... Args, typename F>
float launch_and_time_kernel(F kernel, float launch_and_time_kernel(
int nrepeat, F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
dim3 grid_dim,
dim3 block_dim,
std::size_t lds_byte,
hipStream_t stream_id,
Args... args)
{ {
KernelTimer timer; KernelTimer timer;
...@@ -66,6 +58,8 @@ float launch_and_time_kernel(F kernel, ...@@ -66,6 +58,8 @@ float launch_and_time_kernel(F kernel,
printf("Warm up\n"); printf("Warm up\n");
hipStream_t stream_id = nullptr;
// warm up // warm up
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...); hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_id, args...);
......
...@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in, ...@@ -14,15 +14,13 @@ void host_direct_convolution(const Tensor<TIn>& in,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads& in_right_pads, const InRightPads&,
const ConvTensorLayout layout = ConvTensorLayout::NCHW) const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{ {
using namespace ck; using namespace ck;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
double v = 0; double v = 0;
...@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in, ...@@ -68,23 +66,25 @@ void host_direct_convolution(const Tensor<TIn>& in,
out(n, ho, wo, k) = v; out(n, ho, wo, k) = v;
}; };
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc, make_ParallelTensorFunctor(f_nhwc,
out.mDesc.GetLengths()[0], out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1], out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2], out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
default: throw std::runtime_error("wrong! not supported layout"); else
{
throw std::runtime_error("wrong! not supported layout");
} }
} }
...@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -100,17 +100,15 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
constexpr std::size_t HoPerTile = 2; constexpr std::size_t HoPerTile = 2;
constexpr std::size_t WoPerTile = 2; constexpr std::size_t WoPerTile = 2;
std::size_t N = in_nchw.mDesc.GetLengths()[0]; std::size_t N = in_nchw.mDesc.GetLengths()[0];
std::size_t C = in_nchw.mDesc.GetLengths()[1]; std::size_t C = in_nchw.mDesc.GetLengths()[1];
std::size_t HI = in_nchw.mDesc.GetLengths()[2];
std::size_t WI = in_nchw.mDesc.GetLengths()[3];
std::size_t K = wei_kcyx.mDesc.GetLengths()[0]; std::size_t K = wei_kcyx.mDesc.GetLengths()[0];
std::size_t Y = wei_kcyx.mDesc.GetLengths()[2]; std::size_t Y = wei_kcyx.mDesc.GetLengths()[2];
std::size_t X = wei_kcyx.mDesc.GetLengths()[3]; std::size_t X = wei_kcyx.mDesc.GetLengths()[3];
std::size_t HO = out_nkhw.mDesc.GetLengths()[2]; std::size_t Ho = out_nkhw.mDesc.GetLengths()[2];
std::size_t WO = out_nkhw.mDesc.GetLengths()[3]; std::size_t Wo = out_nkhw.mDesc.GetLengths()[3];
index_t h_pad_low = InLeftPads{}.Get(Number<0>{}); index_t h_pad_low = InLeftPads{}.Get(Number<0>{});
index_t w_pad_low = InLeftPads{}.Get(Number<1>{}); index_t w_pad_low = InLeftPads{}.Get(Number<1>{});
...@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw, ...@@ -118,8 +116,8 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std::size_t HiPerTile = HoPerTile + Y - 1; std::size_t HiPerTile = HoPerTile + Y - 1;
std::size_t WiPerTile = WoPerTile + X - 1; std::size_t WiPerTile = WoPerTile + X - 1;
std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile; std::size_t HTile = (Ho + HoPerTile - 1) / HoPerTile;
std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile; std::size_t WTile = (Wo + WoPerTile - 1) / WoPerTile;
Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile}); Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile}); Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
......
...@@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -14,7 +14,7 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
const ConvStrides& conv_strides, const ConvStrides& conv_strides,
const ConvDilations& conv_dilations, const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads, const InLeftPads& in_left_pads,
const InRightPads& in_right_pads, const InRightPads& /* in_right_pads */,
const ConvTensorLayout layout = ConvTensorLayout::NCHW) const ConvTensorLayout layout = ConvTensorLayout::NCHW)
{ {
using namespace ck; using namespace ck;
...@@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -25,11 +25,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
constexpr auto I3 = Number<3>{}; constexpr auto I3 = Number<3>{};
auto f_nchw = [&](auto n, auto c, auto hi, auto wi) { auto f_nchw = [&](auto n, auto c, auto hi, auto wi) {
std::size_t N = in.mDesc.GetLengths()[I0];
std::size_t C = in.mDesc.GetLengths()[I1];
std::size_t Hi = in.mDesc.GetLengths()[I2];
std::size_t Wi = in.mDesc.GetLengths()[I3];
std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I2]; std::size_t Y = wei.mDesc.GetLengths()[I2];
std::size_t X = wei.mDesc.GetLengths()[I3]; std::size_t X = wei.mDesc.GetLengths()[I3];
...@@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -74,11 +69,6 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
}; };
auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) { auto f_nhwc = [&](auto n, auto hi, auto wi, auto c) {
std::size_t N = in.mDesc.GetLengths()[I0];
std::size_t Hi = in.mDesc.GetLengths()[I1];
std::size_t Wi = in.mDesc.GetLengths()[I2];
std::size_t C = in.mDesc.GetLengths()[I3];
std::size_t K = wei.mDesc.GetLengths()[I0]; std::size_t K = wei.mDesc.GetLengths()[I0];
std::size_t Y = wei.mDesc.GetLengths()[I1]; std::size_t Y = wei.mDesc.GetLengths()[I1];
std::size_t X = wei.mDesc.GetLengths()[I2]; std::size_t X = wei.mDesc.GetLengths()[I2];
...@@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in, ...@@ -122,22 +112,24 @@ void host_direct_convolution_backward_data(Tensor<TIn>& in,
in(n, hi, wi, c) = v; in(n, hi, wi, c) = v;
}; };
switch(layout) if(layout == ConvTensorLayout::NCHW)
{ {
case ConvTensorLayout::NCHW:
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
case ConvTensorLayout::NHWC: else if(layout == ConvTensorLayout::NHWC)
{
make_ParallelTensorFunctor(f_nhwc, make_ParallelTensorFunctor(f_nhwc,
in.mDesc.GetLengths()[0], in.mDesc.GetLengths()[0],
in.mDesc.GetLengths()[1], in.mDesc.GetLengths()[1],
in.mDesc.GetLengths()[2], in.mDesc.GetLengths()[2],
in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); in.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
break; }
default: throw std::runtime_error("wrong! not supported layout"); else
{
throw std::runtime_error("wrong! not supported layout");
} }
} }
...@@ -34,7 +34,7 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) ...@@ -34,7 +34,7 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
first = false; first = false;
else else
os << delim; os << delim;
os << T{v}; os << static_cast<T>(v);
} }
return os; return os;
} }
......
...@@ -9,7 +9,7 @@ struct GeneratorTensor_1 ...@@ -9,7 +9,7 @@ struct GeneratorTensor_1
int value = 1; int value = 1;
template <typename... Is> template <typename... Is>
float operator()(Is... is) float operator()(Is...)
{ {
return value; return value;
} }
......
...@@ -24,32 +24,32 @@ struct KernelTimerImpl ...@@ -24,32 +24,32 @@ struct KernelTimerImpl
{ {
KernelTimerImpl() KernelTimerImpl()
{ {
hipEventCreate(&mStart); hipGetErrorString(hipEventCreate(&mStart));
hipEventCreate(&mEnd); hipGetErrorString(hipEventCreate(&mEnd));
} }
~KernelTimerImpl() ~KernelTimerImpl()
{ {
hipEventDestroy(mStart); hipGetErrorString(hipEventDestroy(mStart));
hipEventDestroy(mEnd); hipGetErrorString(hipEventDestroy(mEnd));
} }
void Start() void Start()
{ {
hipDeviceSynchronize(); hipGetErrorString(hipDeviceSynchronize());
hipEventRecord(mStart, 0); hipGetErrorString(hipEventRecord(mStart, nullptr));
} }
void End() void End()
{ {
hipEventRecord(mEnd, 0); hipGetErrorString(hipEventRecord(mEnd, nullptr));
hipEventSynchronize(mEnd); hipGetErrorString(hipEventSynchronize(mEnd));
} }
float GetElapsedTime() const float GetElapsedTime() const
{ {
float time; float time;
hipEventElapsedTime(&time, mStart, mEnd); hipGetErrorString(hipEventElapsedTime(&time, mStart, mEnd));
return time; return time;
} }
......
set(CMAKE_CXX_COMPILER /opt/rocm/llvm/bin/clang++)
## for online-compiling of HIP kernels
set(OLC_HIP_COMPILER ${CMAKE_CXX_COMPILER} CACHE PATH "")
## reset to avoid inheriting the C++ options from the parent project
set(CMAKE_CXX_FLAGS "")
message("Compiling options for library and kernels: ${CMAKE_CXX_FLAGS}")
# look for and register clang-offload-bundler
if(OLC_HIP_COMPILER MATCHES ".*clang\\+\\+$")
find_program(OLC_OFFLOADBUNDLER_BIN clang-offload-bundler
PATH_SUFFIXES bin
PATHS
/opt/rocm/llvm
${CMAKE_INSTALL_PREFIX}/llvm
)
endif()
if(OLC_OFFLOADBUNDLER_BIN)
message(STATUS "clang-offload-bundler found: ${OLC_OFFLOADBUNDLER_BIN}")
set(OLC_OFFLOADBUNDLER_BIN "${OLC_OFFLOADBUNDLER_BIN}")
else()
# look for and register extractkernel
message(STATUS "clang-offload-bundler not found")
find_program(EXTRACTKERNEL_BIN extractkernel
PATH_SUFFIXES bin
PATHS
/opt/rocm/hip
/opt/rocm/hcc
/opt/rocm
${CMAKE_INSTALL_PREFIX}/hip
${CMAKE_INSTALL_PREFIX}/hcc
${CMAKE_INSTALL_PREFIX}
)
if(EXTRACTKERNEL_BIN)
message(STATUS "extractkernel found: ${EXTRACTKERNEL_BIN}")
set(EXTRACTKERNEL_BIN "${EXTRACTKERNEL_BIN}")
else()
message(FATAL_ERROR "extractkernel not found")
endif()
endif()
option(Boost_USE_STATIC_LIBS "Use boost static libraries" OFF)
set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})
# HIP is always required
find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}")
target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
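# (assumption: the target GPU arch is supplied later, at online-compile time, which is why any
#  pre-set arch flags are stripped here)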
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
set(OLC_hip_VERSION_MAJOR "${hip_VERSION_MAJOR}")
set(OLC_hip_VERSION_MINOR "${hip_VERSION_MINOR}")
set(OLC_hip_VERSION_PATCH "${hip_VERSION_PATCH}")
option(ENABLE_DEBUG "Build to enable debugging" ON)
if(ENABLE_DEBUG)
set(OLC_DEBUG 1)
else()
set(OLC_DEBUG 0)
endif()
configure_file("${PROJECT_SOURCE_DIR}/host/online_compile/include/config.h.in" "${PROJECT_BINARY_DIR}/host/online_compile/include/config.h")
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
)
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
## HIP_COMPILER_FLAGS will be used for on-line compiling of the HIP kernels
set(HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS} ${HIP_ONLINE_COMPILER_FLAGS}")
add_definitions("-DHIP_COMPILER_FLAGS=${HIP_COMPILER_FLAGS}")
file(GLOB_RECURSE COMPOSABLE_KERNEL_INCLUDE_1 "${PROJECT_SOURCE_DIR}/composable_kernel/include/*/*.hpp")
file(GLOB COMPOSABLE_KERNEL_INCLUDE_2 "${PROJECT_SOURCE_DIR}/external/rocm/include/bfloat16_dev.hpp")
set(MCONV_KERNEL_INCLUDES
${COMPOSABLE_KERNEL_INCLUDE_1}
${COMPOSABLE_KERNEL_INCLUDE_2}
)
file(GLOB_RECURSE MCONV_KERNELS "${PROJECT_SOURCE_DIR}/composable_kernel/src/kernel_wrapper/*.cpp")
add_kernels(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNELS}")
add_kernel_includes(${CMAKE_CURRENT_SOURCE_DIR} "${MCONV_KERNEL_INCLUDES}")
set(ONLINE_COMPILATION_SOURCE
${PROJECT_BINARY_DIR}/kernel.cpp
${PROJECT_BINARY_DIR}/kernel_includes.cpp
)
include_directories(BEFORE
${PROJECT_BINARY_DIR}/host/online_compile/include
include
)
set(OLC_HIP_UTILITY_CPPS
hip_utility/logger.cpp
hip_utility/tmp_dir.cpp
hip_utility/md5.cpp
hip_utility/exec_utils.cpp
hip_utility/target_properties.cpp
hip_utility/handlehip.cpp
hip_utility/kernel_build_params.cpp
hip_utility/hip_build_utils.cpp
hip_utility/hipoc_program.cpp
hip_utility/hipoc_kernel.cpp
hip_utility/kernel_cache.cpp
hip_utility/binary_cache.cpp
)
list(APPEND OLC_SOURCES ${OLC_HIP_UTILITY_CPPS} ${OLC_HIP_UTILITY_HEADERS})
## addkernels provides the tool to create inlined kernels in one header
add_subdirectory(addkernels)
function(inline_kernels_src KERNELS KERNEL_INCLUDES)
set(KERNEL_SRC_HPP_FILENAME batch_all.cpp.hpp)
set(KERNEL_SRC_HPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/${KERNEL_SRC_HPP_FILENAME})
set(KERNEL_SRC_CPP_PATH ${PROJECT_BINARY_DIR}/inlined_kernels/batch_all.cpp)
add_custom_command(
OUTPUT ${KERNEL_SRC_HPP_PATH}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${KERNELS} ${KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -target ${KERNEL_SRC_HPP_PATH} -extern -source ${KERNELS}
COMMENT "Inlining All kernels"
)
configure_file(kernels_batch.cpp.in ${KERNEL_SRC_CPP_PATH})
list(APPEND OLC_SOURCES ${KERNEL_SRC_CPP_PATH} ${KERNEL_SRC_HPP_PATH})
set(OLC_SOURCES ${OLC_SOURCES} PARENT_SCOPE)
endfunction()
inline_kernels_src("${MCONV_KERNELS}" "${MCONV_KERNEL_INCLUDES}")
list(APPEND ONLINE_COMPILATION_SOURCE ${OLC_SOURCES} ${PROJECT_BINARY_DIR}/olc_kernel_includes.h)
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/olc_kernel_includes.h
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
DEPENDS addkernels ${MCONV_KERNEL_INCLUDES}
COMMAND $<TARGET_FILE:addkernels> -no-recurse -guard GUARD_OLC_KERNEL_INCLUDES_HPP_ -target ${PROJECT_BINARY_DIR}/olc_kernel_includes.h -source ${MCONV_KERNEL_INCLUDES}
COMMENT "Inlining HIP kernel includes"
)
## the library target
add_library(online_compile SHARED ${ONLINE_COMPILATION_SOURCE})
target_include_directories(online_compile PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/online_compile/include/)
target_include_directories(online_compile PRIVATE ${PROJECT_BINARY_DIR})
target_include_directories(online_compile PRIVATE ${PROJECT_SOURCE_DIR}/external/half/include/)
target_link_libraries(online_compile PRIVATE hip::device)
target_link_libraries(online_compile INTERFACE hip::host)
target_link_libraries(online_compile PRIVATE Boost::filesystem)
target_compile_features(online_compile PUBLIC)
set_target_properties(online_compile PROPERTIES POSITION_INDEPENDENT_CODE ON)
install(TARGETS online_compile LIBRARY DESTINATION lib)