Commit cfc80c01 authored by ltqin

Merge branch 'develop' into ck_conv_bww_fp16

parents 69ea9ad9 6d4450ef
@@ -365,16 +365,14 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
std::cerr << "has_main_e0_block_loop = " << has_main_e0_block_loop << std::endl;
-const auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
+const auto cblockid_to_k_n_h_w_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToKNHoWoBlockClusterAdaptor(c_k_n_hop_wop_grid_desc);
using CBlockIdToBlockClusterAdaptor_K_N_H_W =
-decltype(c_blockid_to_k_n_h_w_block_cluster_adaptor);
+decltype(cblockid_to_k_n_h_w_block_cluster_adaptor);
float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
@@ -403,7 +401,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-c_blockid_to_k_n_h_w_block_cluster_adaptor);
+cblockid_to_k_n_h_w_block_cluster_adaptor);
}
else
{
@@ -433,136 +431,9 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
-c_blockid_to_k_n_h_w_block_cluster_adaptor);
+cblockid_to_k_n_h_w_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_e0_e1_k0_k1_e2_grid_desc_dev_buf(sizeof(AGridDesc_E0_E1_K0_K1_E2));
DeviceMem b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf(
sizeof(BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2));
DeviceMem c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf(
sizeof(CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2));
DeviceMem d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf(
sizeof(DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx));
DeviceMem c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToBlockClusterAdaptor_K_N_H_W));
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.ToDevice(&a_e0_e1_k0_k1_e2_grid_desc);
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.ToDevice(
&b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc);
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.ToDevice(
&c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.ToDevice(
&d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc);
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_k_n_h_w_block_cluster_adaptor);
if(has_main_e0_block_loop)
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false,
activ_type>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid,
cast_pointer_to_constant_address_space(
a_e0_e1_k0_k1_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_k_n_h_w_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
} }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
{
static_assert(a_e0_e1_k_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_bias_grid,
p_c_grid,
p_d_grid);
}
#endif
return ave_time;
}
};
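The hunk above deletes the CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER and CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR fallbacks, leaving pass-by-value as the only way tensor descriptors reach the kernel. A minimal sketch of the two schemes being consolidated follows; DeviceMem, ToDevice, and cast_pointer_to_constant_address_space appear in the removed code above, while MyDesc, my_desc, and my_kernel are hypothetical stand-ins.

// Scheme kept by this commit: the descriptor travels as an ordinary kernel argument.
// launch_and_time_kernel(my_kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0,
//                        p_data, my_desc);

// Scheme removed by this commit: stage the descriptor in device memory first, then pass
// the kernel a pointer cast into the constant address space, as the deleted branches did.
DeviceMem my_desc_dev_buf(sizeof(MyDesc)); // device-side staging buffer for the descriptor
my_desc_dev_buf.ToDevice(&my_desc);        // host -> device copy
// launch_and_time_kernel(my_kernel, nrepeat, dim3(grid_size), dim3(BlockSize), 0,
//                        p_data,
//                        cast_pointer_to_constant_address_space(my_desc_dev_buf.GetDeviceBuffer()));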
...
@@ -136,11 +136,11 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);
-// c_blockid_to_m0_n0_block_cluster_adaptor
-const auto c_blockid_to_m0_n0_block_cluster_adaptor =
+// cblockid_to_m0_n0_block_cluster_adaptor
+const auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
-using CBlockIdToM0N0BlockClusterAdaptor = decltype(c_blockid_to_m0_n0_block_cluster_adaptor);
+using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);
@@ -166,7 +166,6 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
<< c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
}
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
@@ -193,7 +192,7 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
@@ -219,7 +218,7 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
@@ -245,7 +244,7 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else
{
@@ -271,143 +270,9 @@ __host__ float driver_gemm_dlops_v1r2(const FloatAB* p_a_grid,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
return ave_time;
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_k_m0_m1_grid_desc_dev_buf(sizeof(AKM0M1GridDesc));
DeviceMem b_k_n0_n1_grid_desc_dev_buf(sizeof(BKN0N1GridDesc));
DeviceMem c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf(sizeof(CM0M10M11N0N10N11GridDesc));
DeviceMem c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToM0N0BlockClusterAdaptor));
a_k_m0_m1_grid_desc_dev_buf.ToDevice(&a_k_m0_m1_grid_desc);
b_k_n0_n1_grid_desc_dev_buf.ToDevice(&b_k_n0_n1_grid_desc);
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.ToDevice(&c_m0_m10_m11_n0_n10_n11_grid_desc);
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_m0_n0_block_cluster_adaptor);
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel =
kernel_gemm_dlops_v1r2<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AKM0M1GridDesc>,
remove_reference_t<BKN0N1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_k_m0_m1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_k_n0_n1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
return ave_time;
#endif
}
#endif
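Each driver above maps the runtime flags has_main_k_block_loop and has_double_tail_k_block_loop onto compile-time template booleans by enumerating all four combinations. A self-contained sketch of that dispatch pattern, with run_kernel standing in for the kernel_gemm_dlops_v1r2/v1r3 instantiations:

#include <iostream>

// Stand-in for the kernel template; the real code bakes these two bools into the
// kernel type so the K-loop structure is known at compile time.
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
void run_kernel()
{
    std::cout << HasMainKBlockLoop << " " << HasDoubleTailKBlockLoop << "\n";
}

// Runtime-to-compile-time dispatch: every bool combination gets its own branch,
// mirroring the if/else-if chain in the drivers above.
void dispatch(bool has_main_k_block_loop, bool has_double_tail_k_block_loop)
{
    if(has_main_k_block_loop && has_double_tail_k_block_loop)
        run_kernel<true, true>();
    else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
        run_kernel<true, false>();
    else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
        run_kernel<false, true>();
    else
        run_kernel<false, false>();
}

int main() { dispatch(true, false); }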
@@ -131,11 +131,11 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc);
-// c_blockid_to_m0_n0_block_cluster_adaptor
-const auto c_blockid_to_m0_n0_block_cluster_adaptor =
+// cblockid_to_m0_n0_block_cluster_adaptor
+const auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
-using CBlockIdToM0N0BlockClusterAdaptor = decltype(c_blockid_to_m0_n0_block_cluster_adaptor);
+using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor);
const index_t grid_size = GridwiseGemm::CalculateGridSize(M, N);
@@ -163,7 +163,6 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
<< c_m0_m10_m11_n0_n10_n11_grid_desc.GetLength(I5) << "}" << std::endl;
}
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
@@ -190,7 +189,7 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
@@ -216,7 +215,7 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
@@ -242,7 +241,7 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
else
{
@@ -268,151 +267,9 @@ __host__ float driver_gemm_dlops_v1r3(const FloatAB* p_a_grid,
a_k0_m0_m1_k1_grid_desc,
b_k0_n0_n1_k1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
-c_blockid_to_m0_n0_block_cluster_adaptor);
+cblockid_to_m0_n0_block_cluster_adaptor);
}
return ave_time;
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_k0_m0_m1_k1_grid_desc_dev_buf(sizeof(AK0M0M1K1GridDesc));
DeviceMem b_k0_n0_n1_k1_grid_desc_dev_buf(sizeof(BK0N0N1K1GridDesc));
DeviceMem c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf(sizeof(CM0M10M11N0N10N11GridDesc));
DeviceMem c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf(
sizeof(CBlockIdToM0N0BlockClusterAdaptor));
a_k0_m0_m1_k1_grid_desc_dev_buf.ToDevice(&a_k0_m0_m1_k1_grid_desc);
b_k0_n0_n1_k1_grid_desc_dev_buf.ToDevice(&b_k0_n0_n1_k1_grid_desc);
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.ToDevice(&c_m0_m10_m11_n0_n10_n11_grid_desc);
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.ToDevice(
&c_blockid_to_m0_n0_block_cluster_adaptor);
float ave_time = 0;
if(has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else if(has_main_k_block_loop && !has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
true,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else if(!has_main_k_block_loop && has_double_tail_k_block_loop)
{
const auto kernel =
kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel =
kernel_gemm_dlops_v1r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AK0M0M1K1GridDesc>,
remove_reference_t<BK0N0N1K1GridDesc>,
remove_reference_t<CM0M10M11N0N10N11GridDesc>,
remove_reference_t<CBlockIdToM0N0BlockClusterAdaptor>,
false,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(
a_k0_m0_m1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
b_k0_n0_n1_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_m10_m11_n0_n10_n11_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_blockid_to_m0_n0_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
return ave_time;
#endif
}
#endif
@@ -138,7 +138,8 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
-const auto block_2_ctile_map = GridwiseGemm::MakeBlock2CTileMap(c_grid_desc_m_n, M01, N01);
+const auto block_2_ctile_map =
GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, M01, N01);
using Block2CTileMap = decltype(block_2_ctile_map);
@@ -152,7 +153,6 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
auto element_op_ = ElementwiseOperation{};
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_k0_block_loop)
{
const auto kernel =
@@ -215,74 +215,6 @@ __host__ float driver_gemm_xdlops_v2r3(const FloatAB* p_a_grid,
element_op_,
block_2_ctile_map);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_grid_desc_k0_m_k1_dev_buf(sizeof(AGridDesc_K0_M_K1));
DeviceMem b_grid_desc_k0_n_k1_dev_buf(sizeof(BGridDesc_K0_N_K));
DeviceMem c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf(
sizeof(CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2));
DeviceMem block_2_ctile_map_dev_buf(sizeof(Block2CTileMap));
a_grid_desc_k0_m_k1_dev_buf.ToDevice(&a_grid_desc_k0_m_k1);
b_grid_desc_k0_n_k1_dev_buf.ToDevice(&b_grid_desc_k0_n_k1);
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.ToDevice(&c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
block_2_ctile_map_dev_buf.ToDevice(&block_2_ctile_map);
if(has_main_k0_block_loop)
{
const auto kernel =
kernel_gemm_xdlops_v2r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_K0_M_K1>,
remove_reference_t<BGridDesc_K0_N_K>,
remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<Block2CTileMap>,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_grid_desc_k0_m_k1_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_grid_desc_k0_n_k1_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(block_2_ctile_map_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel =
kernel_gemm_xdlops_v2r3<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<AGridDesc_K0_M_K1>,
remove_reference_t<BGridDesc_K0_N_K>,
remove_reference_t<CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>,
remove_reference_t<Block2CTileMap>,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_grid_desc_k0_m_k1_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_grid_desc_k0_n_k1_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(block_2_ctile_map_dev_buf.GetDeviceBuffer()));
}
}
#endif
return ave_time;
}
#endif
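The v2r3 driver above now calls MakeDefaultBlock2CTileMap, whose job is to assign each workgroup an (m0, n0) output tile. A hedged sketch of what such a map computes; the grouping arithmetic here is illustrative only, and the exact indexing inside MakeDefaultBlock2CTileMap may differ. It shows the role of the M01 parameter: M01 consecutive tile-rows are interleaved so that neighbouring blocks touch nearby C tiles.

#include <cassert>
#include <utility>

// Map a linear block id onto an (m0, n0) tile of an M0 x N0 tile grid, walking the
// grid in bands of M01 tile-rows for better cache locality between adjacent blocks.
std::pair<int, int> block_to_c_tile(int block_id, int N0, int M01)
{
    const int band    = block_id / (M01 * N0); // which band of M01 tile-rows
    const int in_band = block_id % (M01 * N0);
    const int m0      = band * M01 + in_band % M01;
    const int n0      = in_band / M01;
    return {m0, n0};
}

int main()
{
    // On a 4 x 4 tile grid with M01 = 2, block 5 lands on tile (m0 = 1, n0 = 2).
    assert(block_to_c_tile(5, 4, 2) == std::make_pair(1, 2));
}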
@@ -161,7 +161,6 @@ __host__ float driver_gemm_xdlops_v2r4(const FloatAB* p_a_grid,
const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0);
float ave_time = 0;
#if CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE
if(has_main_k0_block_loop)
{
const auto kernel = kernel_gemm_xdlops_v2r4<GridwiseGemm,
@@ -209,70 +208,6 @@ __host__ float driver_gemm_xdlops_v2r4(const FloatAB* p_a_grid,
c_block_cluster_adaptor);
}
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
DeviceMem a_b_k0_m_k1_grid_desc_dev_buf(sizeof(ABK0MK1GridDesc));
DeviceMem b_b_k0_n_k1_grid_desc_dev_buf(sizeof(BBK0NK1GridDesc));
DeviceMem c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf(sizeof(CM0N0M1N1M2M3M4N2GridDesc));
DeviceMem c_block_cluster_adaptor_dev_buf(sizeof(CBlockClusterAdaptor));
a_b_k0_m_k1_grid_desc_dev_buf.ToDevice(&a_b_k0_m_k1_grid_desc);
b_b_k0_n_k1_grid_desc_dev_buf.ToDevice(&b_b_k0_n_k1_grid_desc);
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.ToDevice(&c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc);
c_block_cluster_adaptor_dev_buf.ToDevice(&c_block_cluster_adaptor);
if(has_main_k0_block_loop)
{
const auto kernel = kernel_gemm_xdlops_v2r4<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<ABK0MK1GridDesc>,
remove_reference_t<BBK0NK1GridDesc>,
remove_reference_t<CM0N0M1N1M2M3M4N2GridDesc>,
remove_reference_t<CBlockClusterAdaptor>,
true>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_b_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
else
{
const auto kernel = kernel_gemm_xdlops_v2r4<GridwiseGemm,
FloatAB,
FloatC,
remove_reference_t<ABK0MK1GridDesc>,
remove_reference_t<BBK0NK1GridDesc>,
remove_reference_t<CM0N0M1N1M2M3M4N2GridDesc>,
remove_reference_t<CBlockClusterAdaptor>,
false>;
ave_time = launch_and_time_kernel(
kernel,
nrepeat,
dim3(grid_size),
dim3(BlockSize),
0,
p_a_grid,
p_b_grid,
p_c_grid,
cast_pointer_to_constant_address_space(a_b_k0_m_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(b_b_k0_n_k1_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc_dev_buf.GetDeviceBuffer()),
cast_pointer_to_constant_address_space(
c_block_cluster_adaptor_dev_buf.GetDeviceBuffer()));
}
#endif
return ave_time;
}
#endif
@@ -48,3 +48,102 @@ void host_conv_nchw_kcyx_nkhw(const Tensor<TIn>& in,
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
}
template <typename TIn,
typename TWei,
typename TOut,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor<TIn>& in,
const Tensor<TWei>& wei,
Tensor<TOut>& out,
const ConvStrides& conv_strides,
const ConvDilations& conv_dilations,
const InLeftPads& in_left_pads,
const InRightPads&)
{
using namespace ck;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const auto Di = in.mDesc.GetLengths()[1];
const auto Hi = in.mDesc.GetLengths()[2];
const auto Wi = in.mDesc.GetLengths()[3];
const auto Z = wei.mDesc.GetLengths()[1];
const auto Y = wei.mDesc.GetLengths()[2];
const auto X = wei.mDesc.GetLengths()[3];
const auto C = wei.mDesc.GetLengths()[4];
auto f_ndhwc = [&](auto n, auto do__, auto ho_, auto wo_, auto k) {
// do__ must be converted to a signed integer, otherwise zmin might be wrong in cases
// involving negative values.
const int do_ = static_cast<int>(do__);
const int ho = static_cast<int>(ho_);
const int wo = static_cast<int>(wo_);
const int zmin =
std::max(0,
(in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) /
conv_dilations[I0]);
const int ymin =
std::max(0,
(in_left_pads[I1] - ho * conv_strides[I1] + conv_dilations[I1] - 1) /
conv_dilations[I1]);
const int xmin =
std::max(0,
(in_left_pads[I2] - wo * conv_strides[I2] + conv_dilations[I2] - 1) /
conv_dilations[I2]);
const int zmax =
std::min(Z, (in_left_pads[I0] - do_ * conv_strides[I0] + Di) / conv_dilations[I0]);
const int ymax =
std::min(Y, (in_left_pads[I1] - ho * conv_strides[I1] + Hi) / conv_dilations[I1]);
const int xmax =
std::min(X, (in_left_pads[I2] - wo * conv_strides[I2] + Wi) / conv_dilations[I2]);
const int di_min = do_ * conv_strides[I0] + zmin * conv_dilations[I0] - in_left_pads[I0];
const int hi_min = ho * conv_strides[I1] + ymin * conv_dilations[I1] - in_left_pads[I1];
const int wi_min = wo * conv_strides[I2] + xmin * conv_dilations[I2] - in_left_pads[I2];
double v = 0;
const TIn* in_n = in.mData.data() + n * Di * Hi * Wi * C;
const TWei* wei_k = wei.mData.data() + k * Z * Y * X * C;
int di = di_min;
for(int z = zmin; z < zmax; ++z, di += conv_dilations[I0])
{
const TIn* in_n_di = in_n + di * Hi * Wi * C;
const TWei* wei_k_z = wei_k + z * Y * X * C;
int hi = hi_min;
for(int y = ymin; y < ymax; ++y, hi += conv_dilations[I1])
{
const TIn* in_n_di_hi = in_n_di + hi * Wi * C;
const TWei* wei_k_z_y = wei_k_z + y * X * C;
int wi = wi_min;
for(int x = xmin; x < xmax; ++x, wi += conv_dilations[I2])
{
const TIn* in_n_di_hi_wi = in_n_di_hi + wi * C;
const TWei* wei_k_z_y_x = wei_k_z_y + x * C;
for(int c = 0; c < C; ++c)
{
v += static_cast<const double>(in_n_di_hi_wi[c]) *
static_cast<const double>(wei_k_z_y_x[c]);
}
}
}
}
out(n, do_, ho, wo, k) = v;
};
make_ParallelTensorFunctor(f_ndhwc,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3],
out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency() - 4);
}
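The zmin/ymin/xmin bounds in f_ndhwc come from requiring the input coordinate to be non-negative: di = do_ * stride + z * dilation - pad >= 0 gives z >= (pad - do_ * stride) / dilation rounded up, and the "+ dilation - 1" term is the usual integer ceiling trick. A self-contained check of that derivation (zmin_of mirrors the formula above; the test values are illustrative):

#include <algorithm>
#include <cassert>

// For a non-negative numerator, ceil(a / b) == (a + b - 1) / b in integer arithmetic;
// the std::max(0, ...) clamp covers the case where the numerator goes negative.
int zmin_of(int do_, int stride, int dilation, int pad)
{
    return std::max(0, (pad - do_ * stride + dilation - 1) / dilation);
}

int main()
{
    // do_ = 0, stride = 2, dilation = 2, pad = 3: need 2 * z >= 3, so zmin = 2.
    assert(zmin_of(0, 2, 2, 3) == 2);
    // do_ = 1, stride = 2, dilation = 2, pad = 1: 2 + 2 * z >= 1 already holds at z = 0.
    assert(zmin_of(1, 2, 2, 1) == 0);
}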
@@ -144,7 +144,7 @@ struct GeneratorTensor_Checkboard
template <typename... Ts>
float operator()(Ts... Xs) const
{
-std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
+std::array<ck::index_t, sizeof...(Ts)> dims = {static_cast<ck::index_t>(Xs)...};
return std::accumulate(dims.begin(),
dims.end(),
true,
@@ -154,4 +154,15 @@ struct GeneratorTensor_Checkboard
}
};
template <ck::index_t Dim>
struct GeneratorTensor_Sequential
{
template <typename... Ts>
float operator()(Ts... Xs) const
{
std::array<ck::index_t, sizeof...(Ts)> dims = {{static_cast<ck::index_t>(Xs)...}};
return dims[Dim];
}
};
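GeneratorTensor_Sequential<Dim> simply returns the coordinate along the chosen dimension, which makes layout and transpose bugs visible when a tensor is filled with it. A self-contained usage sketch (ck::index_t is replaced by int here so the snippet compiles on its own; the generator body is copied from the hunk above):

#include <array>
#include <iostream>

template <int Dim>
struct GeneratorTensor_Sequential
{
    template <typename... Ts>
    float operator()(Ts... Xs) const
    {
        std::array<int, sizeof...(Ts)> dims = {{static_cast<int>(Xs)...}};
        return dims[Dim];
    }
};

int main()
{
    // Filling an NCHW tensor with Dim = 1 stores each element's channel index in it.
    GeneratorTensor_Sequential<1> gen{};
    std::cout << gen(0, 7, 2, 3) << "\n"; // prints 7: the index along dimension 1
}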
#endif
#pragma once
#include <iomanip>
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
@@ -30,6 +31,9 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(std::vector<De
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(
std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
@@ -40,6 +44,11 @@ void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(std::vector<Devic
void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(std::vector<DeviceGemmNoOpPtr>&);
} // namespace device_gemm_instance
} // namespace device
} // namespace tensor_operation
@@ -64,7 +73,7 @@ void profile_gemm_impl(int do_verification,
int StrideA,
int StrideB,
int StrideC,
-int KBatch = 1)
+int KBatch)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
@@ -177,7 +186,6 @@ void profile_gemm_impl(int do_verification,
{
if(KBatch > 1)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(gemm_ptrs);
}
@@ -210,41 +218,76 @@ void profile_gemm_impl(int do_verification,
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
if(KBatch > 1)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
}
else
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(gemm_ptrs);
}
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::RowMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
if(KBatch > 1)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
}
else
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(gemm_ptrs);
}
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::RowMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
if(KBatch > 1)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
}
else
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(gemm_ptrs);
}
}
else if constexpr(is_same<ALayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<BLayout, tensor_layout::gemm::ColumnMajor>::value &&
is_same<CLayout, tensor_layout::gemm::RowMajor>::value)
{
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
-ck::tensor_operation::device::device_gemm_instance::
-add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
if(KBatch > 1)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
}
else
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(gemm_ptrs);
}
}
}
@@ -293,8 +336,8 @@ void profile_gemm_impl(int do_verification,
float gb_per_sec = num_btype / 1.E6 / ave_time;
-std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-<< " GB/s, " << gemm_name << std::endl;
+std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+<< gb_per_sec << " GB/s, " << gemm_name << std::endl;
if(tflops > best_tflops)
{
...
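The KBatch plumbing above selects split-K GEMM instances when KBatch > 1. Split-K partitions the reduction dimension into KBatch slices whose partial products are summed afterwards; a minimal scalar sketch of that equivalence for a single C element (the device instances above perform the same tiling on the GPU):

#include <cassert>
#include <vector>

// One dot product reduced in k_batch slices of the K dimension; summing the partial
// results reproduces the single-pass reduction, which is the invariant split-K needs.
float dot_split_k(const std::vector<float>& a, const std::vector<float>& b, int k_batch)
{
    const int K = static_cast<int>(a.size());
    float c = 0;
    for(int kb = 0; kb < k_batch; ++kb)
    {
        float partial = 0;
        for(int k = kb * K / k_batch; k < (kb + 1) * K / k_batch; ++k)
            partial += a[k] * b[k];
        c += partial; // on the GPU this merge is an atomic add or a follow-up kernel
    }
    return c;
}

int main()
{
    std::vector<float> a{1, 2, 3, 4, 5, 6, 7, 8}, b{8, 7, 6, 5, 4, 3, 2, 1};
    float ref = 0;
    for(std::size_t k = 0; k < a.size(); ++k)
        ref += a[k] * b[k];
    assert(dot_split_k(a, b, 4) == ref);
}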
@@ -78,7 +78,8 @@ int profile_gemm(int argc, char* argv[])
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? N : StrideB,
-(StrideC < 0) ? N : StrideC);
+(StrideC < 0) ? N : StrideC,
KBatch);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
@@ -97,7 +98,8 @@ int profile_gemm(int argc, char* argv[])
K,
(StrideA < 0) ? K : StrideA,
(StrideB < 0) ? K : StrideB,
-(StrideC < 0) ? N : StrideC);
+(StrideC < 0) ? N : StrideC,
KBatch);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
{
@@ -116,7 +118,8 @@ int profile_gemm(int argc, char* argv[])
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? N : StrideB,
-(StrideC < 0) ? N : StrideC);
+(StrideC < 0) ? N : StrideC,
KBatch);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
{
@@ -135,7 +138,8 @@ int profile_gemm(int argc, char* argv[])
K,
(StrideA < 0) ? M : StrideA,
(StrideB < 0) ? K : StrideB,
-(StrideC < 0) ? N : StrideC);
+(StrideC < 0) ? N : StrideC,
KBatch);
}
else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
{
...
@@ -2,6 +2,7 @@
#define REFERENCE_CONV_FWD_HPP
#include <iostream>
#include <type_traits>
#include <sstream>
#include "device_base.hpp"
#include "host_tensor.hpp"
@@ -10,21 +11,38 @@ namespace ck {
namespace tensor_operation {
namespace host {
-// out[N, K, Ho, Wo] = in[N, C, Hi, Wi] * wei[K, C, Y, X]
+//
// @brief Reference implementation for forward convolution.
//
// @paragraph Supported tensor layouts. Input tensor supports NCHiWi data layout.
// Weights tensor supports KCYX data layout. Output tensor supports
// NKHoWo data layout.
//
// @tparam InDataType Input tensor data type.
// @tparam WeiDataType Weights tensor data type.
// @tparam OutDataType Output tensor data type.
// @tparam InElementwiseOperation Functor for input tensor elementwise
// operation.
// @tparam WeiElementwiseOperation Functor for weights tensor elementwise
// operation.
// @tparam NumDimSpatial Number of spatial dimensions.
//
template <typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
-typename OutElementwiseOperation>
+typename OutElementwiseOperation,
ck::index_t NumDimSpatial = 2,
typename std::enable_if<NumDimSpatial >= 1 && NumDimSpatial <= 3, bool>::type = false>
struct ReferenceConvFwd : public device::BaseOperator
{
// Argument
struct Argument : public device::BaseArgument
{
-Argument(const Tensor<InDataType>& in_n_c_hi_wi,
-const Tensor<WeiDataType>& wei_k_c_y_x,
-Tensor<OutDataType>& out_n_k_ho_wo,
+Argument(const Tensor<InDataType>& input,
+const Tensor<WeiDataType>& weight,
+Tensor<OutDataType>& output,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
@@ -32,9 +50,9 @@ struct ReferenceConvFwd : public device::BaseOperator
InElementwiseOperation in_element_op,
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
-: in_n_c_hi_wi_{in_n_c_hi_wi},
-wei_k_c_y_x_{wei_k_c_y_x},
-out_n_k_ho_wo_{out_n_k_ho_wo},
+: input_{input},
+weight_{weight},
+output_{output},
conv_strides_{conv_filter_strides},
conv_dilations_{conv_filter_dilations},
in_left_pads_{input_left_pads},
@@ -45,9 +63,9 @@ struct ReferenceConvFwd : public device::BaseOperator
{
}
-const Tensor<InDataType>& in_n_c_hi_wi_;
-const Tensor<WeiDataType>& wei_k_c_y_x_;
-Tensor<OutDataType>& out_n_k_ho_wo_;
+const Tensor<InDataType>& input_;
+const Tensor<WeiDataType>& weight_;
+Tensor<OutDataType>& output_;
std::vector<index_t> conv_strides_;
std::vector<index_t> conv_dilations_;
@@ -59,58 +77,98 @@ struct ReferenceConvFwd : public device::BaseOperator
OutElementwiseOperation out_element_op_;
};
// Invoker
struct Invoker : public device::BaseInvoker
{
using Argument = ReferenceConvFwd::Argument;
float Run(const Argument& arg)
{
-auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
-float v_acc = 0;
-for(int c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c)
-{
-for(int y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y)
-{
-int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
-arg.in_left_pads_[0];
-for(int x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x)
-{
-int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
-arg.in_left_pads_[1];
-if(hi >= 0 && hi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && wi >= 0 &&
-wi < arg.in_n_c_hi_wi_.mDesc.GetLengths()[3])
-{
-float v_in;
-float v_wei;
-arg.in_element_op_(
-v_in, ck::type_convert<float>(arg.in_n_c_hi_wi_(n, c, hi, wi)));
-arg.wei_element_op_(
-v_wei, ck::type_convert<float>(arg.wei_k_c_y_x_(k, c, y, x)));
-v_acc += v_in * v_wei;
-}
-}
-}
-}
-float v_out;
-arg.out_element_op_(v_out, v_acc);
-arg.out_n_k_ho_wo_(n, k, ho, wo) = ck::type_convert<OutDataType>(v_out);
-};
-make_ParallelTensorFunctor(f_nchw,
-arg.out_n_k_ho_wo_.mDesc.GetLengths()[0],
-arg.out_n_k_ho_wo_.mDesc.GetLengths()[1],
-arg.out_n_k_ho_wo_.mDesc.GetLengths()[2],
-arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])(
-std::thread::hardware_concurrency());
-return 0;
if constexpr(NumDimSpatial == 1)
{
auto f_ncw = [&](auto n, auto k, auto wo) {
float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[2]; ++x)
{
int wi = wo * arg.conv_strides_[0] + x * arg.conv_dilations_[0] -
arg.in_left_pads_[0];
if(wi >= 0 && wi < arg.input_.mDesc.GetLengths()[2])
{
float v_in;
float v_wei;
arg.in_element_op_(v_in,
static_cast<const float>(arg.input_(n, c, wi)));
arg.wei_element_op_(v_wei,
static_cast<const float>(arg.weight_(k, c, x)));
v_acc += v_in * v_wei;
}
}
}
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, wo) = v_out;
};
make_ParallelTensorFunctor(f_ncw,
arg.output_.mDesc.GetLengths()[0],
arg.output_.mDesc.GetLengths()[1],
arg.output_.mDesc.GetLengths()[2])(
std::thread::hardware_concurrency());
return 0;
}
else if constexpr(NumDimSpatial == 2)
{
auto f_nchw = [&](auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{
for(int y = 0; y < arg.weight_.mDesc.GetLengths()[2]; ++y)
{
int hi = ho * arg.conv_strides_[0] + y * arg.conv_dilations_[0] -
arg.in_left_pads_[0];
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[3]; ++x)
{
int wi = wo * arg.conv_strides_[1] + x * arg.conv_dilations_[1] -
arg.in_left_pads_[1];
if(hi >= 0 && hi < arg.input_.mDesc.GetLengths()[2] && wi >= 0 &&
wi < arg.input_.mDesc.GetLengths()[3])
{
float v_in;
float v_wei;
arg.in_element_op_(
v_in, ck::type_convert<float>(arg.input_(n, c, hi, wi)));
arg.wei_element_op_(
v_wei, ck::type_convert<float>(arg.weight_(k, c, y, x)));
v_acc += v_in * v_wei;
}
}
}
}
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, ho, wo) = ck::type_convert<OutDataType>(v_out);
};
make_ParallelTensorFunctor(f_nchw,
arg.output_.mDesc.GetLengths()[0],
arg.output_.mDesc.GetLengths()[1],
arg.output_.mDesc.GetLengths()[2],
arg.output_.mDesc.GetLengths()[3])(
std::thread::hardware_concurrency());
return 0;
}
} }
float Run(const device::BaseArgument* p_arg, int) override float Run(const device::BaseArgument* p_arg, int) override
@@ -127,9 +185,9 @@ struct ReferenceConvFwd : public device::BaseOperator
bool IsSupportedArgument(const device::BaseArgument*) override { return true; }
-static auto MakeArgument(const Tensor<InDataType>& in_n_c_hi_wi,
-const Tensor<WeiDataType>& wei_k_c_y_x,
-Tensor<OutDataType>& out_n_k_ho_wo,
+static auto MakeArgument(const Tensor<InDataType>& input,
+const Tensor<WeiDataType>& weight,
+Tensor<OutDataType>& output,
std::vector<ck::index_t> conv_filter_strides,
std::vector<ck::index_t> conv_filter_dilations,
std::vector<ck::index_t> input_left_pads,
@@ -138,9 +196,9 @@ struct ReferenceConvFwd : public device::BaseOperator
WeiElementwiseOperation wei_element_op,
OutElementwiseOperation out_element_op)
{
-return Argument{in_n_c_hi_wi,
-wei_k_c_y_x,
-out_n_k_ho_wo,
+return Argument{input,
+weight,
+output,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
...
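With the new NumDimSpatial parameter, a single reference operator now covers 1D and 2D forward convolution (the enable_if bound already admits 3D). A hedged usage sketch follows; the PassThrough functor and the pre-built input/weight/output tensors are assumptions, while ReferenceConvFwd, MakeArgument, and Invoker::Run are taken from the header above.

// Assumed no-op elementwise functor; CK ships an equivalent PassThrough in its
// element-wise operation headers.
struct PassThrough
{
    void operator()(float& y, const float& x) const { y = x; }
};

using RefConv2d = ck::tensor_operation::host::
    ReferenceConvFwd<float, float, float, PassThrough, PassThrough, PassThrough, 2>;

// input (NCHiWi), weight (KCYX), and output (NKHoWo) are Tensor<float> built elsewhere.
auto argument = RefConv2d::MakeArgument(input,
                                        weight,
                                        output,
                                        {2, 2}, // conv_filter_strides
                                        {1, 1}, // conv_filter_dilations
                                        {1, 1}, // input_left_pads
                                        {1, 1}, // input_right_pads
                                        PassThrough{},
                                        PassThrough{},
                                        PassThrough{});
RefConv2d::Invoker{}.Run(argument);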
@@ -10,6 +10,7 @@ include_directories(BEFORE
${PROJECT_SOURCE_DIR}/composable_kernel/include/problem_transform
${PROJECT_SOURCE_DIR}/external/rocm/include
${PROJECT_SOURCE_DIR}/reference_operation/include
${PROJECT_SOURCE_DIR}/test/include
)
# test_magic_number_division
@@ -30,3 +31,22 @@ add_executable(test_split_k ${SPLIT_K_SOURCE})
target_link_libraries(test_split_k PRIVATE host_tensor)
target_link_libraries(test_split_k PRIVATE device_gemm_instance)
# test_conv_util
set(CONV_UTIL_SOURCE conv_util/main.cpp)
add_executable(test_conv_util ${CONV_UTIL_SOURCE})
target_link_libraries(test_conv_util PRIVATE host_tensor)
# test_reference_conv_fwd
set(REFERENCE_CONV_FWD_SOURCE reference_conv_fwd/main.cpp)
add_executable(test_reference_conv_fwd ${REFERENCE_CONV_FWD_SOURCE})
target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
# test_convnd_fwd_xdl
set(CONVND_FWD_XDL_SOURCE convnd_fwd_xdl/main.cpp)
add_executable(test_convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
target_link_libraries(test_convnd_fwd_xdl PRIVATE host_tensor)
# test_space_filling_curve
set(SPACE_FILLING_CURVE_SOURCE space_filling_curve/space_filling_curve.cpp)
add_executable(space_filling_curve ${SPACE_FILLING_CURVE_SOURCE})
target_link_libraries(space_filling_curve PRIVATE host_tensor)
@@ -130,13 +130,13 @@ int main(int argc, char* argv[])
const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-const std::vector<ck::index_t> input_spatial_lengths{{Hi, Wi}};
-const std::vector<ck::index_t> filter_spatial_lengths{{Y, X}};
-const std::vector<ck::index_t> output_spatial_lengths{{Ho, Wo}};
-const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
-const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
-const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
-const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+const std::vector<ck::index_t> input_spatial_lengths{Hi, Wi};
+const std::vector<ck::index_t> filter_spatial_lengths{Y, X};
+const std::vector<ck::index_t> output_spatial_lengths{Ho, Wo};
+const std::vector<ck::index_t> conv_filter_strides{conv_stride_h, conv_stride_w};
+const std::vector<ck::index_t> conv_filter_dilations{conv_dilation_h, conv_dilation_w};
+const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
+const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
...
#include <iostream>
#include <string>
#include <vector>
#include "config.hpp"
#include "conv_utils.hpp"
#include "tensor_layout.hpp"
namespace {
template <typename T>
bool cmp_vec(const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg)
{
if(out.size() != ref.size())
{
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl
<< msg << std::endl;
return false;
}
for(std::size_t i = 0; i < ref.size(); ++i)
{
if(out[i] != ref[i])
{
std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << "!=" << ref[i]
<< std::endl
<< msg << std::endl;
return false;
}
}
return true;
}
bool TestConvParams_GetOutputSpatialLengths()
{
bool res{true};
// -------------------------- default 2D ------------------------------------
// input NCHW {128,192,71,71},
// weights KCYX {256,192,3,3},
// stride {2,2},
// dilations {1,1},
// padding {{1,1}, {1,1}}
ck::conv_util::ConvParams conv_params;
std::vector<ck::index_t> out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(out_spatial_len,
std::vector<ck::index_t>{36, 36},
"Error: ConvParams 2D default constructor.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res &&
cmp_vec(out_spatial_len, std::vector<ck::index_t>{71, 71}, "Error: ConvParams 2D stride {1,1}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
conv_params.input_left_pads = std::vector<ck::index_t>{2, 2};
conv_params.input_right_pads = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(out_spatial_len,
std::vector<ck::index_t>{37, 37},
"Error: ConvParams 2D padding left/right {2,2}.");
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res &&
cmp_vec(out_spatial_len, std::vector<ck::index_t>{36, 36}, "Error: ConvParams 2D dilation {2,2}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{3, 3};
conv_params.input_left_pads = std::vector<ck::index_t>{1, 1};
conv_params.input_right_pads = std::vector<ck::index_t>{1, 1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(out_spatial_len,
std::vector<ck::index_t>{23, 23},
"Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.");
// -------------------------- 1D ------------------------------------
conv_params.num_dim_spatial = 1;
conv_params.filter_spatial_lengths = std::vector<ck::index_t>{3};
conv_params.input_spatial_lengths = std::vector<ck::index_t>{71};
conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{1};
conv_params.input_left_pads = std::vector<ck::index_t>{1};
conv_params.input_right_pads = std::vector<ck::index_t>{1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D default constructor.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{1};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res &&
cmp_vec(out_spatial_len, std::vector<ck::index_t>{71}, "Error: ConvParams 1D stride {1}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{2};
conv_params.input_left_pads = std::vector<ck::index_t>{2};
conv_params.input_right_pads = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(out_spatial_len,
std::vector<ck::index_t>{37},
"Error: ConvParams 1D padding left/right {2}.");
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(
out_spatial_len, std::vector<ck::index_t>{36}, "Error: ConvParams 1D dilation {2}.");
conv_params.conv_filter_strides = std::vector<ck::index_t>{3};
conv_params.input_left_pads = std::vector<ck::index_t>{1};
conv_params.input_right_pads = std::vector<ck::index_t>{1};
conv_params.conv_filter_dilations = std::vector<ck::index_t>{2};
out_spatial_len = conv_params.GetOutputSpatialLengths();
res = res && cmp_vec(out_spatial_len,
std::vector<ck::index_t>{23},
"Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.");
return res;
}
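// Check GetHostTensorDescriptor() lengths and strides for NHWC/NCHW (2D) and NWC/NCW
// (1D). Dimensions are always given in N, C, <spatial...> order; the layout tag only
// changes the strides, e.g. NHWC with (N, C, H, W) = (2, 3, 4, 5) yields strides
// {H*W*C, 1, W*C, C} = {60, 1, 15, 3}.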
bool TestGetHostTensorDescriptor()
{
bool res{true};
namespace tl = ck::tensor_layout::convolution;
std::vector<std::size_t> dims{2, 3, 4, 5};
HostTensorDescriptor h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
res = res && cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NHWC dimensions lengths!");
res = res &&
cmp_vec(h.GetStrides(), {3 * 4 * 5, 1, 3 * 5, 3}, "Error: wrong NHWC dimensions strides!");
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCHW{});
res = res && cmp_vec(h.GetLengths(), {2, 3, 4, 5}, "Error: wrong NCHW dimensions lengths!");
res = res &&
cmp_vec(h.GetStrides(), {3 * 4 * 5, 4 * 5, 5, 1}, "Error: wrong NCHW dimensions strides!");
dims = std::vector<std::size_t>{2, 3, 4};
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
res = res && cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NWC dimensions lengths!");
res = res && cmp_vec(h.GetStrides(), {3 * 4, 1, 3}, "Error: wrong NWC dimensions strides!");
h = ck::conv_util::GetHostTensorDescriptor(dims, tl::NCW{});
res = res && cmp_vec(h.GetLengths(), {2, 3, 4}, "Error: wrong NCW dimensions lengths!");
res = res && cmp_vec(h.GetStrides(), {3 * 4, 4, 1}, "Error: wrong NCW dimensions strides!");
return res;
}
} // namespace
int main(void)
{
bool res1 = TestConvParams_GetOutputSpatialLengths();
std::cout << "TestConvParams_GetOutputSpatialLengths ..... " << (res1 ? "SUCCESS" : "FAILURE")
<< std::endl;
bool res2 = TestGetHostTensorDescriptor();
std::cout << "TestGetHostTensorDescriptor ..... " << (res2 ? "SUCCESS" : "FAILURE") << std::endl;
return (res1 && res2) ? 0 : 1;
}
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <tuple>
#include <vector>
#include "config.hpp"
#include "conv_utils.hpp"
#include "device.hpp"
#include "device_tensor.hpp"
#include "device_convnd_fwd_xdl_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
namespace {
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
static constexpr auto ConvFwdDefault =
ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
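// A single, deliberately small instance of the XDL forward-convolution kernel:
// 64 threads per block on a 16x16 GEMM tile, with scalar (length-1) vector accesses
// so the tiny test problems below satisfy its alignment requirements.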
template <ck::index_t SpatialDims, typename InDataType, typename WeiDataType, typename OutDataType>
using DeviceConvNDFwdInstance = ck::tensor_operation::device::
DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
// clang-format off
InDataType, // Input data type
WeiDataType, // Weights data type
OutDataType, // Output data type
InDataType, // AccDataType: accumulation type (the tests reuse the input type)
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
SpatialDims, // Number of spatial dimensions
64, // BlockSize
16, // MPerBlock
16, // NPerBlock
4, // K0PerBlock
1, // K1
16, // MPerXDL
16, // NPerXDL
1, // MXdlPerWave
1, // NXdlPerWave
S<1, 16, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
1, // ABlockTransferDstScalarPerVector_K1
true, // ABlockLdsAddExtraM
S<1, 16, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
1, // BBlockTransferDstScalarPerVector_K1
true, // BBlockTransferAddExtraN
7, // CThreadTransferSrcDstVectorDim
1>; // CThreadTransferDstScalarPerVector
// clang-format on
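// Build input/weight/output host tensors for the given conv problem: the input is
// filled with a 0.1-spaced ramp, the weights with a constant 0.5, the outputs with
// zeros, so host and device paths start from identical data.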
template <typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
typename InLayout = ck::tensor_layout::convolution::NHWC,
typename WeiLayout = ck::tensor_layout::convolution::KYXC,
typename OutLayout = ck::tensor_layout::convolution::NHWK>
auto GetHostTensors(const ck::conv_util::ConvParams& params)
{
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{}));
Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{}));
Tensor<OutDataType> host_output(
ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
Tensor<OutDataType> device_output(
ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
std::generate(input.begin(), input.end(), [n = 0]() mutable {
return InDataType(n++) * InDataType(0.1f);
});
std::fill(weights.begin(), weights.end(), WeiDataType(0.5f));
std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
std::fill(device_output.begin(), device_output.end(), OutDataType(0.f));
return std::make_tuple(input, weights, host_output, device_output);
}
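// Produce the ground-truth output by running the host reference convolution.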
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
void RunReferenceConv(const ck::conv_util::ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NDim>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
output,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
}
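// Copy the inputs to device memory, run the XDL device convolution instance,
// and read the result back into the caller's output tensor.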
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float>
void RunConv(const ck::conv_util::ConvParams& params,
const Tensor<InDataType>& input,
const Tensor<WeiDataType>& weights,
Tensor<OutDataType>& output)
{
DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace());
in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weights.mData.data());
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
auto conv = DeviceConvNDFwdInstance<NDim, InDataType, WeiDataType, OutDataType>();
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
params.N,
params.K,
params.C,
params.input_spatial_lengths,
params.filter_spatial_lengths,
output_spatial_lengths,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error(
"Error! device_conv with the specified compilation parameters does "
"not support this Conv problem");
}
invoker.Run(argument);
out_device_buf.FromDevice(output.mData.data());
}
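// End-to-end 2D NHWC check: run the reference and device convolutions on identical
// inputs and compare the outputs within floating-point tolerance.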
bool TestConv2DNHWC()
{
bool res{true};
ck::conv_util::ConvParams params;
params.N = 2;
params.K = 16;
params.C = 4;
params.input_spatial_lengths = std::vector<ck::index_t>{16, 16};
params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
auto host_tensors = GetHostTensors(params);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& host_output = std::get<2>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);
RunReferenceConv<2>(params, input, weights, host_output);
RunConv<2>(params, input, weights, device_output);
res = res &&
test_util::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
return res;
}
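// Same end-to-end check for a 1D NWC / KXC / NWK problem.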
bool TestConv1DNWC()
{
bool res{true};
ck::conv_util::ConvParams params;
params.num_dim_spatial = 1;
params.N = 2;
params.K = 16;
params.C = 4;
params.filter_spatial_lengths = std::vector<ck::index_t>{3};
params.input_spatial_lengths = std::vector<ck::index_t>{16};
params.conv_filter_strides = std::vector<ck::index_t>{1};
params.conv_filter_dilations = std::vector<ck::index_t>{1};
params.input_left_pads = std::vector<ck::index_t>{1};
params.input_right_pads = std::vector<ck::index_t>{1};
auto host_tensors = GetHostTensors<float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
const Tensor<float>& input = std::get<0>(host_tensors);
const Tensor<float>& weights = std::get<1>(host_tensors);
Tensor<float>& host_output = std::get<2>(host_tensors);
Tensor<float>& device_output = std::get<3>(host_tensors);
RunReferenceConv<1>(params, input, weights, host_output);
RunConv<1>(params, input, weights, device_output);
res = res &&
test_util::check_err(
device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
return res;
}
} // anonymous namespace
int main()
{
bool res1 = TestConv1DNWC();
std::cout << "TestConv1DNWC ..... " << (res1 ? "SUCCESS" : "FAILURE") << std::endl;
bool res2 = TestConv2DNHWC();
std::cout << "TestConv2DNHWC ..... " << (res2 ? "SUCCESS" : "FAILURE") << std::endl;
return (res1 && res2) ? 0 : 1;
}
#ifndef TEST_UTIL_HPP
#define TEST_UTIL_HPP
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <limits>
#include <type_traits>
#include <string>
#include <vector>
namespace test_util {
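// Floating-point comparison with mixed tolerances: element i fails when
// |out[i] - ref[i]| > atol + rtol * |ref[i]| or either value is non-finite
// (the same criterion as numpy.allclose); the first few mismatches are printed.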
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, bool>::type
check_err(const std::vector<T>& out,
const std::vector<T>& ref,
const std::string& msg,
T rtol = static_cast<T>(1e-5),
T atol = static_cast<T>(1e-8))
{
if(out.size() != ref.size())
{
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl
<< msg << std::endl;
return false;
}
bool res{true};
int err_count = 0;
T err = 0;
T max_err = std::numeric_limits<T>::lowest();
for(std::size_t i = 0; i < ref.size(); ++i)
{
err = std::abs(out[i] - ref[i]);
if(err > atol + rtol * std::abs(ref[i]) || !std::isfinite(out[i]) || !std::isfinite(ref[i]))
{
max_err = err > max_err ? err : max_err;
err_count++;
if(err_count < 5)
{
std::cout << std::setw(12) << std::setprecision(7) << "out[" << i << "] != ref["
<< i << "]: " << out[i] << "!=" << ref[i] << std::endl
<< msg << std::endl;
}
res = false;
}
}
if(!res)
{
std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
}
return res;
}
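// Integral overload: exact element-wise equality; the tolerance arguments are ignored.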
template <typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type check_err(
const std::vector<T>& out, const std::vector<T>& ref, const std::string& msg, T = 0, T = 0)
{
if(out.size() != ref.size())
{
std::cout << "out.size() != ref.size(), :" << out.size() << " != " << ref.size()
<< std::endl
<< msg << std::endl;
return false;
}
for(std::size_t i = 0; i < ref.size(); ++i)
{
if(out[i] != ref[i])
{
std::cout << "out[" << i << "] != ref[" << i << "]: " << out[i] << "!=" << ref[i]
<< std::endl
<< msg << std::endl;
return false;
}
}
return true;
}
} // namespace test_util
#endif
@@ -41,6 +41,19 @@ gpu_naive_division(int32_t divisor, const int32_t* p_dividend, int32_t* p_result
     }
 }
 
+__host__ void cpu_magic_number_division(uint32_t magic_multiplier,
+                                        uint32_t magic_shift,
+                                        const int32_t* p_dividend,
+                                        int32_t* p_result,
+                                        uint64_t num)
+{
+    for(uint64_t data_id = 0; data_id < num; ++data_id)
+    {
+        p_result[data_id] =
+            ck::MagicDivision::DoMagicDivision(p_dividend[data_id], magic_multiplier, magic_shift);
+    }
+}
+
 template <typename T>
 T check_error(const std::vector<T>& ref, const std::vector<T>& result)
 {
@@ -90,6 +103,7 @@ int main(int, char*[])
     std::vector<int32_t> naive_result_host(num_dividend);
     std::vector<int32_t> magic_result_host(num_dividend);
+    std::vector<int32_t> magic_result_host2(num_dividend);
 
     dividends_dev_buf.ToDevice(dividends_host.data());
@@ -128,6 +142,20 @@ int main(int, char*[])
             pass = false;
             continue;
         }
+
+        cpu_magic_number_division(magic_multiplier,
+                                  magic_shift,
+                                  dividends_host.data(),
+                                  magic_result_host2.data(),
+                                  num_dividend);
+
+        max_diff = check_error(naive_result_host, magic_result_host2);
+
+        if(max_diff != 0)
+        {
+            pass = false;
+            continue;
+        }
     }
 
     if(pass)
...
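The host loop added above mirrors the GPU kernel, so naive and magic-number results can be cross-checked on the CPU as well; the magic_multiplier/magic_shift pair it consumes is computed elsewhere in this file, outside the hunks shown. The idea being tested: for a fixed divisor d, a precomputed pair (m, s) turns n / d into (n * m) >> s, trading the slow integer-divide instruction for a multiply and a shift. CK's MagicDivision has its own calculation of the pair; the following is only a minimal standalone sketch of the textbook round-up variant, assuming non-negative int32 dividends, with Magic, make_magic, and magic_divide as illustrative names rather than CK API:

#include <cassert>
#include <cstdint>

// Magic pair for dividing a non-negative int32 n by a fixed divisor d >= 1:
// with s = ceil(log2(d)) and m = ceil(2^(31+s) / d), m always fits in 32 bits
// and n / d == (uint64_t(n) * m) >> (31 + s) for all 0 <= n < 2^31.
struct Magic
{
    uint32_t multiplier;
    uint32_t shift; // total shift: 31 + ceil(log2(d))
};

Magic make_magic(uint32_t d)
{
    uint32_t s = 0;
    while((uint64_t{1} << s) < d) // s = ceil(log2(d))
        ++s;
    const uint64_t m = (((uint64_t{1} << 31) << s) + d - 1) / d; // ceil(2^(31+s) / d)
    return {static_cast<uint32_t>(m), 31 + s};
}

int32_t magic_divide(int32_t n, Magic mg) // n must be non-negative
{
    return static_cast<int32_t>((static_cast<uint64_t>(n) * mg.multiplier) >> mg.shift);
}

int main()
{
    // Brute-force check against the hardware division, like the test above does.
    for(uint32_t d = 1; d <= 1000; ++d)
    {
        const Magic mg = make_magic(d);
        for(int32_t n : {0, 1, 7, 100, 12345, 2147483647})
            assert(magic_divide(n, mg) == n / static_cast<int32_t>(d));
    }
    return 0;
}

The exactness argument: writing m * d = 2^(31+s) + e with 0 <= e < d <= 2^s, the error term n * e stays below 2^(31+s) for every n < 2^31, so the shifted product truncates to exactly floor(n / d).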
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <half.hpp>
#include <numeric>
#include <type_traits>
#include <vector>
#include "config.hpp"
#include "conv_utils.hpp"
#include "element_wise_operation.hpp"
#include "host_tensor.hpp"
#include "reference_conv_fwd.hpp"
#include "tensor_layout.hpp"
#include "test_util.hpp"
namespace {
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
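// Deterministic tensor initializers: an increasing sequence for inputs and a fixed
// constant for weights, so the expected outputs can be computed by hand.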
template <typename T>
struct FillMonotonicSeq
{
T m_init_value{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::iota(first, last, m_init_value);
}
};
template <typename T>
struct FillConstant
{
T m_value{0};
template <typename ForwardIter>
void operator()(ForwardIter first, ForwardIter last) const
{
std::fill(first, last, m_value);
}
};
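// Build and fill tensors for the given ConvParams, run the host reference
// convolution, and return its output tensor for comparison against literals.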
template <ck::index_t NDim,
typename InDataType = float,
typename WeiDataType = float,
typename OutDataType = float,
typename InLayout = ck::tensor_layout::convolution::NHWC,
typename WeiLayout = ck::tensor_layout::convolution::KYXC,
typename OutLayout = ck::tensor_layout::convolution::NHWK,
typename FillInputOp = FillMonotonicSeq<InDataType>,
typename FillWeightsOp = FillConstant<WeiDataType>>
Tensor<OutDataType> RunReferenceConv(const ck::conv_util::ConvParams& params,
const FillInputOp& fill_input_op = FillInputOp{0},
const FillWeightsOp& fill_weights_op = FillWeightsOp{0.5f})
{
std::vector<std::size_t> input_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.C)};
input_dims.insert(std::end(input_dims),
std::begin(params.input_spatial_lengths),
std::end(params.input_spatial_lengths));
std::vector<std::size_t> filter_dims{static_cast<std::size_t>(params.K),
static_cast<std::size_t>(params.C)};
filter_dims.insert(std::end(filter_dims),
std::begin(params.filter_spatial_lengths),
std::end(params.filter_spatial_lengths));
const std::vector<ck::index_t>& output_spatial_lengths = params.GetOutputSpatialLengths();
std::vector<std::size_t> output_dims{static_cast<std::size_t>(params.N),
static_cast<std::size_t>(params.K)};
output_dims.insert(std::end(output_dims),
std::begin(output_spatial_lengths),
std::end(output_spatial_lengths));
Tensor<InDataType> input(ck::conv_util::GetHostTensorDescriptor(input_dims, InLayout{}));
Tensor<WeiDataType> weights(ck::conv_util::GetHostTensorDescriptor(filter_dims, WeiLayout{}));
Tensor<OutDataType> host_output(
ck::conv_util::GetHostTensorDescriptor(output_dims, OutLayout{}));
fill_input_op(input.begin(), input.end());
fill_weights_op(weights.begin(), weights.end());
std::fill(host_output.begin(), host_output.end(), OutDataType(0.f));
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
NDim>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(input,
weights,
host_output,
params.conv_filter_strides,
params.conv_filter_dilations,
params.input_left_pads,
params.input_right_pads,
InElementOp{},
WeiElementOp{},
OutElementOp{});
ref_invoker.Run(ref_argument);
return host_output;
}
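// Validate the 2D reference convolution against hand-computed outputs: first a plain
// 3x3 stride-1 case, then one with stride, dilation, and padding all non-trivial.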
bool TestConv2DNHWC()
{
bool res{true};
ck::conv_util::ConvParams params;
params.N = 1;
params.K = 1;
params.C = 2;
params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
params.input_spatial_lengths = std::vector<ck::index_t>{6, 6};
params.conv_filter_strides = std::vector<ck::index_t>{1, 1};
params.conv_filter_dilations = std::vector<ck::index_t>{1, 1};
params.input_left_pads = std::vector<ck::index_t>{0, 0};
params.input_right_pads = std::vector<ck::index_t>{0, 0};
auto out_tensor = RunReferenceConv<2>(params);
std::vector<std::size_t> ref_dims{1, 1, 4, 4};
std::vector<float> ref_data{130.5,
148.5,
166.5,
184.5,
238.5,
256.5,
274.5,
292.5,
346.5,
364.5,
382.5,
400.5,
454.5,
472.5,
490.5,
508.5};
res = res && test_util::check_err(out_tensor.mDesc.GetLengths(),
ref_dims,
"Error: wrong output tensor dimensions!");
res = res && test_util::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
params.N = 1;
params.K = 2;
params.C = 2;
params.filter_spatial_lengths = std::vector<ck::index_t>{3, 3};
params.input_spatial_lengths = std::vector<ck::index_t>{12, 12};
params.conv_filter_strides = std::vector<ck::index_t>{2, 2};
params.conv_filter_dilations = std::vector<ck::index_t>{2, 2};
params.input_left_pads = std::vector<ck::index_t>{1, 1};
params.input_right_pads = std::vector<ck::index_t>{1, 1};
out_tensor = RunReferenceConv<2>(params);
ref_dims = std::vector<std::size_t>{1, 2, 5, 5};
ref_data = std::vector<float>{
210., 210., 327., 327., 351., 351., 375., 375., 399., 399.,
459., 459., 706.5, 706.5, 742.5, 742.5, 778.5, 778.5, 814.5, 814.5,
747., 747., 1138.5, 1138.5, 1174.5, 1174.5, 1210.5, 1210.5, 1246.5, 1246.5,
1035., 1035., 1570.5, 1570.5, 1606.5, 1606.5, 1642.5, 1642.5, 1678.5, 1678.5,
1323., 1323., 2002.5, 2002.5, 2038.5, 2038.5, 2074.5, 2074.5, 2110.5, 2110.5};
res = res && test_util::check_err(out_tensor.mDesc.GetLengths(),
ref_dims,
"Error: wrong output tensor dimensions!");
res = res && test_util::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
return res;
}
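// Validate the 1D reference convolution the same way, including a larger NWC case
// whose input is filled with the 0.1-spaced ramp used by the device tests.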
bool TestConv1DNWC()
{
bool res{true};
ck::conv_util::ConvParams params;
params.num_dim_spatial = 1;
params.N = 1;
params.K = 1;
params.C = 2;
params.filter_spatial_lengths = std::vector<ck::index_t>{3};
params.input_spatial_lengths = std::vector<ck::index_t>{6};
params.conv_filter_strides = std::vector<ck::index_t>{1};
params.conv_filter_dilations = std::vector<ck::index_t>{1};
params.input_left_pads = std::vector<ck::index_t>{0};
params.input_right_pads = std::vector<ck::index_t>{0};
auto out_tensor = RunReferenceConv<1,
float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
std::vector<std::size_t> ref_dims{1, 1, 4};
std::vector<float> ref_data{7.5, 13.5, 19.5, 25.5};
res = res && test_util::check_err(out_tensor.mDesc.GetLengths(),
ref_dims,
"Error: wrong output tensor dimensions!");
res = res && test_util::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
params.num_dim_spatial = 1;
params.N = 1;
params.K = 2;
params.C = 2;
params.filter_spatial_lengths = std::vector<ck::index_t>{3};
params.input_spatial_lengths = std::vector<ck::index_t>{12};
params.conv_filter_strides = std::vector<ck::index_t>{2};
params.conv_filter_dilations = std::vector<ck::index_t>{2};
params.input_left_pads = std::vector<ck::index_t>{1};
params.input_right_pads = std::vector<ck::index_t>{1};
out_tensor = RunReferenceConv<1,
float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params);
ref_dims = std::vector<std::size_t>{1, 2, 5};
ref_data = std::vector<float>{9., 9., 19.5, 19.5, 31.5, 31.5, 43.5, 43.5, 55.5, 55.5};
res = res && test_util::check_err(out_tensor.mDesc.GetLengths(),
ref_dims,
"Error: wrong output tensor dimensions!");
res = res && test_util::check_err(out_tensor.mData, ref_data, "Error: incorrect results!");
params.num_dim_spatial = 1;
params.N = 2;
params.K = 16;
params.C = 4;
params.filter_spatial_lengths = std::vector<ck::index_t>{3};
params.input_spatial_lengths = std::vector<ck::index_t>{16};
params.conv_filter_strides = std::vector<ck::index_t>{1};
params.conv_filter_dilations = std::vector<ck::index_t>{1};
params.input_left_pads = std::vector<ck::index_t>{1};
params.input_right_pads = std::vector<ck::index_t>{1};
auto out_tensor2 =
RunReferenceConv<1,
float,
float,
float,
ck::tensor_layout::convolution::NWC,
ck::tensor_layout::convolution::KXC,
ck::tensor_layout::convolution::NWK>(params, [](auto first, auto last) {
std::generate(first, last, [n = 0]() mutable { return float(n++) * float(0.1f); });
});
ref_dims = std::vector<std::size_t>{2, 16, 16};
ref_data = std::vector<float>{
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4, 1.4,
3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3,
3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3, 3.3,
5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7,
5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7, 5.7,
8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1,
8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1, 8.1,
10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5,
10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5, 10.5,
12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001,
12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001, 12.900001,
15.3, 15.3, 15.3, 15.3, 15.3, 15.3, 15.3, 15.3,
15.3, 15.3, 15.3, 15.3, 15.3, 15.3, 15.3, 15.3,
17.7, 17.7, 17.7, 17.7, 17.7, 17.7, 17.7, 17.7,
17.7, 17.7, 17.7, 17.7, 17.7, 17.7, 17.7, 17.7,
20.1, 20.1, 20.1, 20.1, 20.1, 20.1, 20.1, 20.1,
20.1, 20.1, 20.1, 20.1, 20.1, 20.1, 20.1, 20.1,
22.5, 22.5, 22.5, 22.5, 22.5, 22.5, 22.5, 22.5,
22.5, 22.5, 22.5, 22.5, 22.5, 22.5, 22.5, 22.5,
24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002,
24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002, 24.900002,
27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001,
27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001, 27.300001,
29.7, 29.7, 29.7, 29.7, 29.7, 29.7, 29.7, 29.7,
29.7, 29.7, 29.7, 29.7, 29.7, 29.7, 29.7, 29.7,
32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002,
32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002, 32.100002,
34.5, 34.5, 34.5, 34.5, 34.5, 34.5, 34.5, 34.5,
34.5, 34.5, 34.5, 34.5, 34.5, 34.5, 34.5, 34.5,
23.8, 23.8, 23.8, 23.8, 23.8, 23.8, 23.8, 23.8,
23.8, 23.8, 23.8, 23.8, 23.8, 23.8, 23.8, 23.8,
27., 27., 27., 27., 27., 27., 27., 27.,
27., 27., 27., 27., 27., 27., 27., 27.,
41.7, 41.7, 41.7, 41.7, 41.7, 41.7, 41.7, 41.7,
41.7, 41.7, 41.7, 41.7, 41.7, 41.7, 41.7, 41.7,
44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002,
44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002, 44.100002,
46.5, 46.5, 46.5, 46.5, 46.5, 46.5, 46.5, 46.5,
46.5, 46.5, 46.5, 46.5, 46.5, 46.5, 46.5, 46.5,
48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998,
48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998, 48.899998,
51.3, 51.3, 51.3, 51.3, 51.3, 51.3, 51.3, 51.3,
51.3, 51.3, 51.3, 51.3, 51.3, 51.3, 51.3, 51.3,
53.7, 53.7, 53.7, 53.7, 53.7, 53.7, 53.7, 53.7,
53.7, 53.7, 53.7, 53.7, 53.7, 53.7, 53.7, 53.7,
56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002,
56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002, 56.100002,
58.5, 58.5, 58.5, 58.5, 58.5, 58.5, 58.5, 58.5,
58.5, 58.5, 58.5, 58.5, 58.5, 58.5, 58.5, 58.5,
60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998,
60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998, 60.899998,
63.3, 63.3, 63.3, 63.3, 63.3, 63.3, 63.3, 63.3,
63.3, 63.3, 63.3, 63.3, 63.3, 63.3, 63.3, 63.3,
65.7, 65.7, 65.7, 65.7, 65.7, 65.7, 65.7, 65.7,
65.7, 65.7, 65.7, 65.7, 65.7, 65.7, 65.7, 65.7,
68.1, 68.1, 68.1, 68.1, 68.1, 68.1, 68.1, 68.1,
68.1, 68.1, 68.1, 68.1, 68.1, 68.1, 68.1, 68.1,
70.5, 70.5, 70.5, 70.5, 70.5, 70.5, 70.5, 70.5,
70.5, 70.5, 70.5, 70.5, 70.5, 70.5, 70.5, 70.5,
72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9,
72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9, 72.9,
49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4,
49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4, 49.4};
res = res && test_util::check_err(out_tensor2.mDesc.GetLengths(),
ref_dims,
"Error: wrong output tensor dimensions!");
res = res && test_util::check_err(out_tensor2.mData, ref_data, "Error: incorrect results!");
return res;
}
} // anonymous namespace
int main(void)
{
bool res1 = TestConv2DNHWC();
std::cout << "TestConv2DNHWC ..... " << (res1 ? "SUCCESS" : "FAILURE") << std::endl;
bool res2 = TestConv1DNWC();
std::cout << "TestConv1DNWC ..... " << (res2 ? "SUCCESS" : "FAILURE") << std::endl;
return (res1 && res2) ? 0 : 1;
}
#include <vector>
#include <iostream>
#include <numeric>
#include <cassert>
#include <hip/hip_runtime.h> // for hipDeviceSynchronize / hipSuccess
#include "tensor_space_filling_curve.hpp"
using namespace ck;
void traverse_using_space_filling_curve();
int main(int argc, char** argv)
{
(void)argc;
(void)argv;
{
traverse_using_space_filling_curve();
auto err = hipDeviceSynchronize();
(void)err;
assert(err == hipSuccess);
}
return 0;
}
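// Compile-time walk of a 4x10x9 tensor in dimension order {2, 0, 1} with {1, 2, 3}
// scalars per access: every visited index and every forward/backward step is checked
// against the hand-written snake-order sequence via static_assert.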
void traverse_using_space_filling_curve()
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
using TensorLengths = Sequence<4, 10, 9>;
using DimAccessOrder = Sequence<2, 0, 1>;
using ScalarsPerAccess = Sequence<1, 2, 3>;
using Curve = SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess>;
constexpr auto expected = make_tuple(make_tuple(0, 0, 0),
make_tuple(0, 2, 0),
make_tuple(0, 4, 0),
make_tuple(0, 6, 0),
make_tuple(0, 8, 0),
make_tuple(1, 8, 0),
make_tuple(1, 6, 0),
make_tuple(1, 4, 0),
make_tuple(1, 2, 0),
make_tuple(1, 0, 0),
make_tuple(2, 0, 0),
make_tuple(2, 2, 0),
make_tuple(2, 4, 0),
make_tuple(2, 6, 0),
make_tuple(2, 8, 0),
make_tuple(3, 8, 0),
make_tuple(3, 6, 0),
make_tuple(3, 4, 0),
make_tuple(3, 2, 0),
make_tuple(3, 0, 0),
make_tuple(3, 0, 3),
make_tuple(3, 2, 3),
make_tuple(3, 4, 3),
make_tuple(3, 6, 3),
make_tuple(3, 8, 3),
make_tuple(2, 8, 3),
make_tuple(2, 6, 3),
make_tuple(2, 4, 3),
make_tuple(2, 2, 3),
make_tuple(2, 0, 3),
make_tuple(1, 0, 3),
make_tuple(1, 2, 3),
make_tuple(1, 4, 3),
make_tuple(1, 6, 3),
make_tuple(1, 8, 3),
make_tuple(0, 8, 3),
make_tuple(0, 6, 3),
make_tuple(0, 4, 3),
make_tuple(0, 2, 3),
make_tuple(0, 0, 3),
make_tuple(0, 0, 6),
make_tuple(0, 2, 6),
make_tuple(0, 4, 6),
make_tuple(0, 6, 6),
make_tuple(0, 8, 6),
make_tuple(1, 8, 6),
make_tuple(1, 6, 6),
make_tuple(1, 4, 6),
make_tuple(1, 2, 6),
make_tuple(1, 0, 6),
make_tuple(2, 0, 6),
make_tuple(2, 2, 6),
make_tuple(2, 4, 6),
make_tuple(2, 6, 6),
make_tuple(2, 8, 6),
make_tuple(3, 8, 6),
make_tuple(3, 6, 6),
make_tuple(3, 4, 6),
make_tuple(3, 2, 6),
make_tuple(3, 0, 6));
constexpr index_t num_accesses = Curve::GetNumOfAccess();
static_assert(num_accesses == reduce_on_sequence(TensorLengths{} / ScalarsPerAccess{},
math::multiplies{},
Number<1>{}));
static_for<1, num_accesses, 1>{}([&](auto i) {
constexpr auto idx_curr = Curve::GetIndex(i);
static_assert(idx_curr[I0] == expected[i][I0]);
static_assert(idx_curr[I1] == expected[i][I1]);
static_assert(idx_curr[I2] == expected[i][I2]);
constexpr auto backward_step = Curve::GetBackwardStep(i);
constexpr auto expected_step = expected[i - I1] - expected[i];
static_assert(backward_step[I0] == expected_step[I0]);
static_assert(backward_step[I1] == expected_step[I1]);
static_assert(backward_step[I2] == expected_step[I2]);
});
static_for<0, num_accesses - 1, 1>{}([&](auto i) {
constexpr auto idx_curr = Curve::GetIndex(i);
static_assert(idx_curr[I0] == expected[i][I0]);
static_assert(idx_curr[I1] == expected[i][I1]);
static_assert(idx_curr[I2] == expected[i][I2]);
constexpr auto forward_step = Curve::GetForwardStep(i);
constexpr auto expected_step = expected[i + I1] - expected[i];
static_assert(forward_step[I0] == expected_step[I0]);
static_assert(forward_step[I1] == expected_step[I1]);
static_assert(forward_step[I2] == expected_step[I2]);
});
}