"vscode:/vscode.git/clone" did not exist on "d353c80ee7280a355f2ec72b45121fa7a3a7fb9c"
Commit abf75ac0 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 120ab94a
...@@ -391,7 +391,7 @@ int main() ...@@ -391,7 +391,7 @@ int main()
constexpr unsigned HPad = 0; constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0; constexpr unsigned WPad = 0;
#elif 0 #elif 1
// 3x3, 34x34 // 3x3, 34x34
constexpr unsigned N = 64; constexpr unsigned N = 64;
constexpr unsigned C = 256; constexpr unsigned C = 256;
...@@ -430,6 +430,9 @@ int main() ...@@ -430,6 +430,9 @@ int main()
constexpr unsigned K = 64; constexpr unsigned K = 64;
constexpr unsigned S = 5; constexpr unsigned S = 5;
constexpr unsigned R = 5; constexpr unsigned R = 5;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 0 #elif 0
// 7x7, 38x38 // 7x7, 38x38
constexpr unsigned N = 64; constexpr unsigned N = 64;
...@@ -439,6 +442,9 @@ int main() ...@@ -439,6 +442,9 @@ int main()
constexpr unsigned K = 64; constexpr unsigned K = 64;
constexpr unsigned S = 7; constexpr unsigned S = 7;
constexpr unsigned R = 7; constexpr unsigned R = 7;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 0 #elif 0
// 3x3, 58x58 // 3x3, 58x58
constexpr unsigned N = 16; constexpr unsigned N = 16;
...@@ -484,7 +490,7 @@ int main() ...@@ -484,7 +490,7 @@ int main()
constexpr unsigned HPad = 1; constexpr unsigned HPad = 1;
constexpr unsigned WPad = 1; constexpr unsigned WPad = 1;
#elif 1 #elif 0
// 1x1 filter, 28x28 image // 1x1 filter, 28x28 image
constexpr unsigned N = 16; constexpr unsigned N = 16;
constexpr unsigned C = 256; constexpr unsigned C = 256;
...@@ -608,7 +614,7 @@ int main() ...@@ -608,7 +614,7 @@ int main()
nrepeat); nrepeat);
#endif #endif
#if 0 #if 1
if(S == 3 && R == 3) if(S == 3 && R == 3)
{ {
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads); host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......
...@@ -137,12 +137,18 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, ...@@ -137,12 +137,18 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr unsigned HoPerThread = 1; constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1; constexpr unsigned WoPerThread = 1;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 0 #elif 1
// for 7x7, 38x38 // for 7x7, 38x38
constexpr unsigned NPerBlock = 8; constexpr unsigned NPerBlock = 8;
constexpr unsigned KPerBlock = 64; constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2; constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 4; constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 4; constexpr unsigned WoPerBlock = 4;
...@@ -152,6 +158,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc, ...@@ -152,6 +158,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr unsigned HoPerThread = 1; constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1; constexpr unsigned WoPerThread = 1;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 0 #elif 0
// for 3x3, 56x56 // for 3x3, 56x56
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_> template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptor struct ConstantMatrixDescriptor
{ {
__host__ __device__ ConstantMatrixDescriptor() __host__ __device__ constexpr ConstantMatrixDescriptor()
{ {
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!"); static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
} }
......
...@@ -124,12 +124,12 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC ...@@ -124,12 +124,12 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
{ {
if(TransA && (!TransB) && (!TransC)) if(TransA && (!TransB) && (!TransC))
{ {
constexpr auto True = Constant<bool, true>{}; constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = Constant<bool, false>{}; constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile constexpr auto a_block_mtx = BlockMatrixA{};
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile constexpr auto b_block_mtx = BlockMatrixB{};
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
...@@ -137,11 +137,11 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC ...@@ -137,11 +137,11 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
constexpr unsigned NPerThread = c_thread_mtx.NCol(); constexpr unsigned NPerThread = c_thread_mtx.NCol();
// a is transposed, b is not // a is transposed, b is not
const auto a_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto a_thread_mtx =
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto b_thread_mtx =
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
...@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
if(TransA && (!TransB) && (!TransC)) if(TransA && (!TransB) && (!TransC))
{ {
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(), static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
"wrong! k dimension not consistent!"); "wrong! k dimension not consistent!");
...@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned MPerBlock = a_block_mtx.NCol(); constexpr unsigned MPerBlock = a_block_mtx.NCol();
constexpr unsigned NPerBlock = b_block_mtx.NCol(); constexpr unsigned NPerBlock = b_block_mtx.NCol();
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
// divide thread work // divide thread work
constexpr unsigned MPerThread = c_thread_mtx.NRow(); constexpr unsigned MPerThread = c_thread_mtx.NRow();
...@@ -374,12 +374,12 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -374,12 +374,12 @@ struct BlockwiseGemmBlockABlockBThreadC
{ {
if(TransA && (!TransB) && (!TransC)) if(TransA && (!TransB) && (!TransC))
{ {
constexpr auto True = Constant<bool, true>{}; constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = Constant<bool, false>{}; constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile constexpr auto a_block_mtx = BlockMatrixA{};
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile constexpr auto b_block_mtx = BlockMatrixB{};
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
...@@ -387,11 +387,11 @@ struct BlockwiseGemmBlockABlockBThreadC ...@@ -387,11 +387,11 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NPerThread = c_thread_mtx.NCol(); constexpr unsigned NPerThread = c_thread_mtx.NCol();
// a is transposed, b is not // a is transposed, b is not
const auto a_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto a_thread_mtx =
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto b_thread_mtx =
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
FloatA p_a_thread[a_thread_mtx.GetElementSpace()]; FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()]; FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
...@@ -556,8 +556,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -556,8 +556,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC* p_c_thread, FloatC* p_c_thread,
Accumulator f_accum) const Accumulator f_accum) const
{ {
constexpr auto True = Constant<bool, true>{}; constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = Constant<bool, false>{}; constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
...@@ -648,12 +648,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -648,12 +648,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC* p_c_thread, FloatC* p_c_thread,
Accumulator f_accum) const Accumulator f_accum) const
{ {
constexpr auto True = Constant<bool, true>{}; constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = Constant<bool, false>{}; constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile constexpr auto a_block_mtx = BlockMatrixA{};
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile constexpr auto b_block_mtx = BlockMatrixB{};
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned M = a_block_mtx.NCol(); constexpr unsigned M = a_block_mtx.NCol();
constexpr unsigned N = b_block_mtx.NCol(); constexpr unsigned N = b_block_mtx.NCol();
...@@ -663,22 +663,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 ...@@ -663,22 +663,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr unsigned NPerThread = c_thread_mtx.NCol(); constexpr unsigned NPerThread = c_thread_mtx.NCol();
// thread A, B for GEMM // thread A, B for GEMM
const auto a_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto a_thread_mtx =
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor( constexpr auto b_thread_mtx =
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
// thread A-sub, B-sub for copy // thread A-sub, B-sub for copy
const auto a_thread_sub_mtx = constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
Number<MPerThreadSubC>{},
Number<MPerThread>{}); // constexpr doesn't compile
const auto b_thread_sub_mtx = constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
Number<NPerThreadSubC>{},
Number<NPerThread>{}); // constexpr doesn't compile
// register // register
FloatA p_a_thread_0[a_thread_mtx.GetElementSpace()]; FloatA p_a_thread_0[a_thread_mtx.GetElementSpace()];
......
#pragma once #pragma once
#define WARPSIZE 32; __device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
template <class T1, class T2> template <class T1, class T2>
struct is_same struct is_same
...@@ -14,20 +16,16 @@ struct is_same<T, T> ...@@ -14,20 +16,16 @@ struct is_same<T, T>
static const bool value = true; static const bool value = true;
}; };
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
template <class T, T N> template <class T, T N>
struct Constant struct integral_constant
{ {
static const T mValue = N; static const T value = N;
__host__ __device__ constexpr T Get() const { return mValue; } __host__ __device__ constexpr T Get() const { return value; }
}; };
template <unsigned N> template <unsigned N>
using Number = Constant<unsigned, N>; using Number = integral_constant<unsigned, N>;
template <unsigned... Is> template <unsigned... Is>
struct Sequence struct Sequence
......
...@@ -156,18 +156,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc, ...@@ -156,18 +156,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxwn_block_mtx_desc =
Number<CPerBlock>{}, make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{}, Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxwn_thread_mtx_desc =
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm = const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize, Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
...@@ -176,18 +176,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri ...@@ -176,18 +176,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K] // A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxwn_block_mtx_desc =
Number<CPerBlock>{}, make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{}, Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxwn_thread_mtx_desc =
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm = const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize, Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
...@@ -176,18 +176,16 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p ...@@ -176,18 +176,16 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxwn_block_mtx_desc =
Number<CPerBlock>{}, make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{}, Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxwn_thread_mtx_desc =
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm = const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize, Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
...@@ -115,16 +115,16 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc, ...@@ -115,16 +115,16 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc =
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxwn_block_mtx_desc =
Number<CPerBlock>{}, make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{}, Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxwn_thread_mtx_desc =
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm = const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize, Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
...@@ -121,16 +121,16 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc, ...@@ -121,16 +121,16 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K] // A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N] // B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N] // C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc =
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxwn_block_mtx_desc =
Number<CPerBlock>{}, make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{}, Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxwn_thread_mtx_desc =
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm = const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize, Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
...@@ -172,18 +172,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, ...@@ -172,18 +172,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K] // a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B] // c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxb_thread_mtx_desc =
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
#if 0 #if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize, const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
...@@ -258,7 +254,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc, ...@@ -258,7 +254,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
{ {
auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0 #if 1
blockwise_gemm.Run blockwise_gemm.Run
#else #else
blockwise_gemm.Run_RegisterDoubleBuffer blockwise_gemm.Run_RegisterDoubleBuffer
......
...@@ -172,18 +172,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b ...@@ -172,18 +172,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K] // a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B] // c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, constexpr auto c_kxb_thread_mtx_desc =
Number<BPerBlock>{}, make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
#if 0 #if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize, const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
......
...@@ -140,16 +140,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc, ...@@ -140,16 +140,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K] // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B] // c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc =
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxb_thread_mtx_desc =
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize, const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc), decltype(a_cxk_block_mtx_desc),
......
...@@ -156,16 +156,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline ...@@ -156,16 +156,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K] // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead] // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B] // c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto a_cxk_block_mtx_desc =
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor( constexpr auto c_kxb_thread_mtx_desc =
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize, const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc), decltype(a_cxk_block_mtx_desc),
......
...@@ -30,13 +30,13 @@ template <class MatrixA, ...@@ -30,13 +30,13 @@ template <class MatrixA,
class FloatC, class FloatC,
class Accumulator> class Accumulator>
__device__ void threadwise_gemm(MatrixA, __device__ void threadwise_gemm(MatrixA,
Constant<bool, TransA>, integral_constant<bool, TransA>,
FloatA* const p_a_thread, FloatA* const p_a_thread,
MatrixB, MatrixB,
Constant<bool, TransB>, integral_constant<bool, TransB>,
FloatB* const p_b_thread, FloatB* const p_b_thread,
MatrixC, MatrixC,
Constant<bool, TransC>, integral_constant<bool, TransC>,
FloatC* p_c_thread, FloatC* p_c_thread,
Accumulator f_accum) Accumulator f_accum)
{ {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment