Commit 41852668 authored by Jing Zhang's avatar Jing Zhang
Browse files

add activ_type as arguments

parent e5c9f039
...@@ -20,7 +20,8 @@ template <typename GridwiseGemm, ...@@ -20,7 +20,8 @@ template <typename GridwiseGemm,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -49,7 +50,8 @@ __global__ void ...@@ -49,7 +50,8 @@ __global__ void
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -60,7 +62,8 @@ template <typename GridwiseGemm, ...@@ -60,7 +62,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -81,7 +84,7 @@ __global__ void ...@@ -81,7 +84,7 @@ __global__ void
__shared__ FloatAB p_shared_block[shared_block_size]; __shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivResizeAddRun(p_a_grid, GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_d_grid, p_d_grid,
...@@ -91,7 +94,8 @@ __global__ void ...@@ -91,7 +94,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -102,7 +106,8 @@ template <typename GridwiseGemm, ...@@ -102,7 +106,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -124,7 +129,7 @@ __global__ void ...@@ -124,7 +129,7 @@ __global__ void
__shared__ FloatAB p_shared_block[shared_block_size]; __shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivMaxpoolRun(p_a_grid, GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid, p_c_grid,
...@@ -135,7 +140,8 @@ __global__ void ...@@ -135,7 +140,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
#elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER #elif CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER
// pass tensor descriptor by CONSTANT void pointer // pass tensor descriptor by CONSTANT void pointer
...@@ -148,7 +154,8 @@ template <typename GridwiseGemm, ...@@ -148,7 +154,8 @@ template <typename GridwiseGemm,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -191,7 +198,8 @@ __global__ void ...@@ -191,7 +198,8 @@ __global__ void
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
// pass tensor descriptor by CONSTANT void pointer // pass tensor descriptor by CONSTANT void pointer
...@@ -205,7 +213,8 @@ template <typename GridwiseGemm, ...@@ -205,7 +213,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -244,7 +253,7 @@ __global__ void ...@@ -244,7 +253,7 @@ __global__ void
__shared__ FloatAB p_shared_block[shared_block_size]; __shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivResizeAddRun(p_a_grid, GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_d_grid, p_d_grid,
...@@ -254,7 +263,8 @@ __global__ void ...@@ -254,7 +263,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -265,7 +275,8 @@ template <typename GridwiseGemm, ...@@ -265,7 +275,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -305,7 +316,7 @@ __global__ void ...@@ -305,7 +316,7 @@ __global__ void
__shared__ FloatAB p_shared_block[shared_block_size]; __shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::ConvBiasActivMaxpoolRun(p_a_grid, GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid, p_c_grid,
...@@ -316,7 +327,8 @@ __global__ void ...@@ -316,7 +327,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
#elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR #elif CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -327,7 +339,8 @@ template <typename GridwiseGemm, ...@@ -327,7 +339,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -350,7 +363,7 @@ __global__ void ...@@ -350,7 +363,7 @@ __global__ void
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor = constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{}; CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::ConvBiasActivResizeAddRun(p_a_grid, GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_d_grid, p_d_grid,
...@@ -360,7 +373,8 @@ __global__ void ...@@ -360,7 +373,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -371,7 +385,8 @@ template <typename GridwiseGemm, ...@@ -371,7 +385,8 @@ template <typename GridwiseGemm,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -395,7 +410,7 @@ __global__ void ...@@ -395,7 +410,7 @@ __global__ void
constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor = constexpr auto c_blockid_to_k_n_h_w_block_cluster_adaptor =
CBlockIdToBlockClusterAdaptor_K_N_H_W{}; CBlockIdToBlockClusterAdaptor_K_N_H_W{};
GridwiseGemm::ConvBiasActivMaxpoolRun(p_a_grid, GridwiseGemm::ConvBiasActivMaxpool(p_a_grid,
p_b_grid, p_b_grid,
p_bias_grid, p_bias_grid,
p_c_grid, p_c_grid,
...@@ -406,7 +421,8 @@ __global__ void ...@@ -406,7 +421,8 @@ __global__ void
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
template <typename GridwiseGemm, template <typename GridwiseGemm,
...@@ -416,7 +432,8 @@ template <typename GridwiseGemm, ...@@ -416,7 +432,8 @@ template <typename GridwiseGemm,
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__global__ void __global__ void
#if CK_USE_LAUNCH_BOUNDS #if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
...@@ -447,7 +464,8 @@ __global__ void ...@@ -447,7 +464,8 @@ __global__ void
b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
c_blockid_to_k_n_h_w_block_cluster_adaptor, c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>{}); integral_constant<bool, HasMainE0BlockLoop>{},
integral_constant<ActivTypeEnum_t, ActivType>{});
} }
#endif #endif
...@@ -510,8 +528,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -510,8 +528,6 @@ struct GridwiseGemmDlops_km_kn_mn_v3
static constexpr FloatAcc alpha = 0.3; static constexpr FloatAcc alpha = 0.3;
static constexpr auto activ_type = I1;
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{ {
constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{}; constexpr auto max_lds_align = Number<ABlockTransferDstScalarPerVector_E2>{};
...@@ -912,9 +928,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -912,9 +928,10 @@ struct GridwiseGemmDlops_km_kn_mn_v3
}); });
} }
template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, index_t activ_type_> template <typename CThreadBuff, typename CThreadDesc_K1_N_H2_W2, ActivTypeEnum_t activ_type_>
__device__ static void __device__ static void Activation(CThreadBuff& c_thread_buf,
Activation(CThreadBuff& c_thread_buf, const CThreadDesc_K1_N_H2_W2&, Number<activ_type_>) const CThreadDesc_K1_N_H2_W2&,
integral_constant<ActivTypeEnum_t, activ_type_>)
{ {
constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{};
...@@ -1652,7 +1669,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1652,7 +1669,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3
typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2, typename BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2,
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActiv( __device__ static void ConvBiasActiv(
const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global, const FloatAB* __restrict__ p_b_global,
...@@ -1663,8 +1681,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1663,8 +1681,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc,
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>) integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{ {
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc = const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
...@@ -1727,8 +1748,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1727,8 +1748,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
__device__ static void ConvBiasActivMaxpoolRun( ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActivMaxpool(
const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global, const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global, const FloatC* __restrict__ p_bias_global,
...@@ -1740,8 +1762,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1740,8 +1762,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>) integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{ {
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc = const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
...@@ -1814,8 +1839,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1814,8 +1839,9 @@ struct GridwiseGemmDlops_km_kn_mn_v3
typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2, typename CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2,
typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx, typename DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx,
typename CBlockIdToBlockClusterAdaptor_K_N_H_W, typename CBlockIdToBlockClusterAdaptor_K_N_H_W,
bool HasMainE0BlockLoop> bool HasMainE0BlockLoop,
__device__ static void ConvBiasActivResizeAddRun( ActivTypeEnum_t ActivType>
__device__ static void ConvBiasActivResizeAdd(
const FloatAB* __restrict__ p_a_global, const FloatAB* __restrict__ p_a_global,
const FloatAB* __restrict__ p_b_global, const FloatAB* __restrict__ p_b_global,
const FloatC* __restrict__ p_bias_global, const FloatC* __restrict__ p_bias_global,
...@@ -1826,8 +1852,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3 ...@@ -1826,8 +1852,11 @@ struct GridwiseGemmDlops_km_kn_mn_v3
const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc,
const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc,
const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor, const CBlockIdToBlockClusterAdaptor_K_N_H_W& c_blockid_to_k_n_h_w_block_cluster_adaptor,
integral_constant<bool, HasMainE0BlockLoop>) integral_constant<bool, HasMainE0BlockLoop>,
integral_constant<ActivTypeEnum_t, ActivType>)
{ {
static constexpr auto activ_type = integral_constant<ActivTypeEnum_t, ActivType>{};
const auto bias_k0_k1_grid_desc = const auto bias_k0_k1_grid_desc =
MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc);
......
...@@ -90,9 +90,9 @@ ...@@ -90,9 +90,9 @@
#endif #endif
// pass tensor descriptor by value or void* // pass tensor descriptor by value or void*
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 1 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VALUE 0
#define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0 #define CK_EXPERIMENTAL_PASS_TENSOR_DESCRIPTOR_BY_VOID_POINTER 0
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 #define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 1
// merge transformation use magic number division // merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0 #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 0
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
template <typename TInWei, template <typename TInWei,
typename TAcc, typename TAcc,
typename TOut, typename TOut,
ck::index_t activ_type, ck::ActivTypeEnum_t activ_type,
typename InLengths, typename InLengths,
typename WeiLengths, typename WeiLengths,
typename MaxLengths, typename MaxLengths,
......
...@@ -87,16 +87,22 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -87,16 +87,22 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR #if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = Number<OutRightPadH * 2>{};
const auto OutRightPadWx = Number<OutRightPadW * 2>{};
#else #else
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
#endif
const auto OutRightPadH = Hop - Ho; const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo; const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = OutRightPadH * 2; const auto OutRightPadHx = OutRightPadH * 2;
const auto OutRightPadWx = OutRightPadW * 2; const auto OutRightPadWx = OutRightPadW * 2;
#endif
const auto InLeftPadH = in_left_pads[I0]; const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1]; const auto InLeftPadW = in_left_pads[I1];
...@@ -382,7 +388,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -382,7 +388,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -410,7 +417,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -410,7 +417,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -461,7 +469,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -461,7 +469,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -495,7 +504,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -495,7 +504,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -526,7 +536,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -526,7 +536,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = kernel_gemm_dlops_v2_resize_add< const auto kernel = kernel_gemm_dlops_v3_resize_add<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -535,7 +545,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -535,7 +545,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_H2x2_W0_W1_W2x2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>; has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
......
...@@ -337,7 +337,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -337,7 +337,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -363,7 +364,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -363,7 +364,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -408,7 +410,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -408,7 +410,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -440,7 +443,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -440,7 +443,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -469,14 +473,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -469,14 +473,15 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = const auto kernel =
kernel_gemm_dlops_v2<GridwiseGemm, kernel_gemm_dlops_v3<GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>, remove_reference_t<AGridDesc_E0_E1_K0_K1_E2>,
remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>, remove_reference_t<BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2>,
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>; has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
......
...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize, ...@@ -27,7 +27,7 @@ template <ck::index_t BlockSize,
ck::index_t ABlockTransferDstScalarPerVector_E2, ck::index_t ABlockTransferDstScalarPerVector_E2,
ck::index_t BThreadTransferSrcScalarPerVector_E2, ck::index_t BThreadTransferSrcScalarPerVector_E2,
ck::index_t CThreadTransferDstScalarPerVector_K, ck::index_t CThreadTransferDstScalarPerVector_K,
ck::index_t activ_type> ck::ActivTypeEnum_t activ_type>
struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0hwk1_maxpool
{ {
template <typename... Wei, template <typename... Wei,
...@@ -88,16 +88,22 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -88,16 +88,22 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR #if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR
const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{}; const auto Hop = Number<(Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock>{};
const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{}; const auto Wop = Number<(Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock>{};
const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = Number<OutRightPadH / 2>{};
const auto OutRightPadWx = Number<OutRightPadW / 2>{};
#else #else
const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock; const auto Hop = (Ho + HoPerBlock - 1) / HoPerBlock * HoPerBlock;
const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock; const auto Wop = (Wo + WoPerBlock - 1) / WoPerBlock * WoPerBlock;
#endif
const auto OutRightPadH = Hop - Ho; const auto OutRightPadH = Hop - Ho;
const auto OutRightPadW = Wop - Wo; const auto OutRightPadW = Wop - Wo;
const auto OutRightPadHx = OutRightPadH / 2; const auto OutRightPadHx = OutRightPadH / 2;
const auto OutRightPadWx = OutRightPadW / 2; const auto OutRightPadWx = OutRightPadW / 2;
#endif
const auto InLeftPadH = in_left_pads[I0]; const auto InLeftPadH = in_left_pads[I0];
const auto InLeftPadW = in_left_pads[I1]; const auto InLeftPadW = in_left_pads[I1];
...@@ -380,7 +386,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -380,7 +386,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -409,7 +416,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -409,7 +416,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
...@@ -461,7 +469,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -461,7 +469,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
true>; true,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -497,7 +506,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -497,7 +506,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
false>; false,
activ_type>;
ave_time = launch_and_time_kernel( ave_time = launch_and_time_kernel(
kernel, kernel,
...@@ -529,7 +539,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -529,7 +539,7 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), ""); static_assert(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.IsKnownAtCompileTime(), "");
static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), ""); static_assert(c_blockid_to_k_n_h_w_block_cluster_adaptor.IsKnownAtCompileTime(), "");
const auto kernel = kernel_gemm_dlops_v2_maxpool< const auto kernel = kernel_gemm_dlops_v3_maxpool<
GridwiseGemm, GridwiseGemm,
FloatAB, FloatAB,
FloatC, FloatC,
...@@ -538,7 +548,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0 ...@@ -538,7 +548,8 @@ struct DriverDynamicConvolutionForwardImplicitGemmDlops_v5r1_nc0hwc1_kc0yxc1_nk0
remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>, remove_reference_t<CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2>,
remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>, remove_reference_t<DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx>,
remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>, remove_reference_t<CBlockIdToBlockClusterAdaptor_K_N_H_W>,
has_main_e0_block_loop>; has_main_e0_block_loop,
activ_type>;
ave_time = launch_and_time_kernel(kernel, ave_time = launch_and_time_kernel(kernel,
nrepeat, nrepeat,
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_add_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1 #define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo enum ConvForwardAlgo
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1 #define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo enum ConvForwardAlgo
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include "device_tensor.hpp" #include "device_tensor.hpp"
#include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp" #include "device_convolution_maxpool_forward_implicit_gemm_v5r1_dlops_nc0hwc1_kc0yxc1_nk0hwk1.hpp"
#define USE_DYNAMIC_MODE 1 #define USE_DYNAMIC_MODE 0
#define USE_CONV_FWD_V5R1_NCHWC 1 #define USE_CONV_FWD_V5R1_NCHWC 1
enum ConvForwardAlgo enum ConvForwardAlgo
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment