Commit 34321734 authored by Jing Zhang's avatar Jing Zhang
Browse files

fixed comments

parent 5f11dccc
......@@ -180,7 +180,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}));
#if 1
// GEMM
using gridwise_gemm = GridwiseDynamicGemm_km_kn_mn_v3<
BlockSize,
......@@ -363,7 +362,6 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s"
<< std::endl;
}
#endif
}
};
} // namespace ck
......
......@@ -133,12 +133,13 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto EPerBlock = a_block_mtx.GetLength(I0);
constexpr auto KPerThreadSubC = 4;
constexpr auto HPerThreadSubC = 2;
constexpr auto WPerThreadSubC = 2;
constexpr auto HoPerThreadSubC = 2;
constexpr auto WoPerThreadSubC = 2;
static_assert(KPerThread % KPerThreadSubC == 0, "");
static_assert(HPerThread % HPerThreadSubC == 0, "");
static_assert(WPerThread % WPerThreadSubC == 0, "");
static_assert(HPerThread % HoPerThreadSubC == 0, "");
static_assert(WPerThread % WoPerThreadSubC == 0, "");
// thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
......@@ -161,8 +162,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v3<decltype(a_thread_mtx),
decltype(b_thread_mtx),
decltype(c_thread_mtx),
HPerThreadSubC,
WPerThreadSubC>{};
HoPerThreadSubC,
WoPerThreadSubC>{};
// loop over k
#pragma unroll
for(index_t e_begin = 0; e_begin < EPerBlock; e_begin += EPerThreadLoop)
......@@ -176,10 +177,10 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
p_a_thread);
#pragma unroll
for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HPerThreadSubC)
for(index_t h_begin = 0; h_begin < HPerThread; h_begin += HoPerThreadSubC)
{
#pragma unroll
for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WPerThreadSubC)
for(index_t w_begin = 0; w_begin < WPerThread; w_begin += WoPerThreadSubC)
{
threadwise_gemm.Run(p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(
......
......@@ -36,9 +36,9 @@
#endif
// buffer resourse
#if CK_AMD_GPU_GFX906 || CK_AMD_GPU_GFX908
#if defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif CK_AMD_GPU_GFX1030
#elif defined(CK_AMD_GPU_GFX1030)
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#endif
......
......@@ -164,26 +164,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
#endif
constexpr auto conv_driver =
// DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
#if 0
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_pad<
#else
DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad<
#endif
BlockSize,
typename vector_type<TInWei, InWeiVectorSize>::type,
TAcc,
TOut,
KPerBlock,
HoPerBlock,
WoPerBlock,
EPerBlock,
KPerThread,
HoPerThread,
WoPerThread,
EPerThread,
ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K,
ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K,
BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W>{};
typename vector_type<TInWei, InWeiVectorSize>::type, TAcc, TOut, KPerBlock,
HoPerBlock, WoPerBlock, EPerBlock, KPerThread, HoPerThread, WoPerThread,
EPerThread, ABlockTransferThreadSliceLengths_E_K,
ABlockTransferThreadClusterLengths_E_K, ABlockTransferSrcScalarPerVector_E,
ABlockTransferDstScalarPerVector_K, BThreadTransferSrcScalarPerVector_W,
CThreadTransferDstScalarPerVector_W > {};
conv_driver.Run(wei_k_c0_y_x_desc,
in_n_c0_hi_wi_desc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment