"vscode:/vscode.git/clone" did not exist on "83ad7bda09bd969967c799bec101f0b8d8f23812"
Commit abf75ac0 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 120ab94a
......@@ -391,7 +391,7 @@ int main()
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 0
#elif 1
// 3x3, 34x34
constexpr unsigned N = 64;
constexpr unsigned C = 256;
......@@ -430,6 +430,9 @@ int main()
constexpr unsigned K = 64;
constexpr unsigned S = 5;
constexpr unsigned R = 5;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 0
// 7x7, 38x38
constexpr unsigned N = 64;
......@@ -439,6 +442,9 @@ int main()
constexpr unsigned K = 64;
constexpr unsigned S = 7;
constexpr unsigned R = 7;
constexpr unsigned HPad = 0;
constexpr unsigned WPad = 0;
#elif 0
// 3x3, 58x58
constexpr unsigned N = 16;
......@@ -484,7 +490,7 @@ int main()
constexpr unsigned HPad = 1;
constexpr unsigned WPad = 1;
#elif 1
#elif 0
// 1x1 filter, 28x28 image
constexpr unsigned N = 16;
constexpr unsigned C = 256;
......@@ -608,7 +614,7 @@ int main()
nrepeat);
#endif
#if 0
#if 1
if(S == 3 && R == 3)
{
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......
......@@ -137,12 +137,18 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#elif 0
#elif 1
// for 7x7, 38x38
constexpr unsigned NPerBlock = 8;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 2;
constexpr unsigned CPerBlock = 1;
constexpr unsigned HoPerBlock = 4;
constexpr unsigned WoPerBlock = 4;
......@@ -152,6 +158,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
constexpr unsigned HoPerThread = 1;
constexpr unsigned WoPerThread = 1;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
constexpr unsigned InBlockCopyDataPerRead = 4; // not used, yet
constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128;
#elif 0
// for 3x3, 56x56
......
......@@ -4,7 +4,7 @@
template <unsigned NRow_, unsigned NCol_, unsigned RowStride_>
struct ConstantMatrixDescriptor
{
__host__ __device__ ConstantMatrixDescriptor()
__host__ __device__ constexpr ConstantMatrixDescriptor()
{
static_assert(NCol_ <= RowStride_, "wrong! NCol > RowStride!");
}
......
......@@ -124,12 +124,12 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
{
if(TransA && (!TransB) && (!TransC))
{
constexpr auto True = Constant<bool, true>{};
constexpr auto False = Constant<bool, false>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
......@@ -137,11 +137,11 @@ struct Blockwise1dStridedBatchedGemmBlockABlockBThreadC
constexpr unsigned NPerThread = c_thread_mtx.NCol();
// a is transposed, b is not
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
constexpr auto a_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
constexpr auto b_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
......@@ -278,8 +278,8 @@ struct BlockwiseGemmBlockABlockBThreadC
if(TransA && (!TransB) && (!TransC))
{
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
constexpr auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
constexpr auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
static_assert(a_block_mtx.NRow() == b_block_mtx.NRow(),
"wrong! k dimension not consistent!");
......@@ -287,7 +287,7 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned MPerBlock = a_block_mtx.NCol();
constexpr unsigned NPerBlock = b_block_mtx.NCol();
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
constexpr auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
// divide thread work
constexpr unsigned MPerThread = c_thread_mtx.NRow();
......@@ -374,12 +374,12 @@ struct BlockwiseGemmBlockABlockBThreadC
{
if(TransA && (!TransB) && (!TransC))
{
constexpr auto True = Constant<bool, true>{};
constexpr auto False = Constant<bool, false>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned KPerBlock = a_block_mtx.NRow(); // A is transposed
......@@ -387,11 +387,11 @@ struct BlockwiseGemmBlockABlockBThreadC
constexpr unsigned NPerThread = c_thread_mtx.NCol();
// a is transposed, b is not
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
constexpr auto a_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
constexpr auto b_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
FloatA p_a_thread[a_thread_mtx.GetElementSpace()];
FloatB p_b_thread[b_thread_mtx.GetElementSpace()];
......@@ -556,8 +556,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC* p_c_thread,
Accumulator f_accum) const
{
constexpr auto True = Constant<bool, true>{};
constexpr auto False = Constant<bool, false>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
......@@ -648,12 +648,12 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
FloatC* p_c_thread,
Accumulator f_accum) const
{
constexpr auto True = Constant<bool, true>{};
constexpr auto False = Constant<bool, false>{};
constexpr auto True = integral_constant<bool, true>{};
constexpr auto False = integral_constant<bool, false>{};
const auto a_block_mtx = BlockMatrixA{}; // constexpr doesn't compile
const auto b_block_mtx = BlockMatrixB{}; // constexpr doesn't compile
const auto c_thread_mtx = ThreadMatrixC{}; // constexpr doesn't compile
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr unsigned M = a_block_mtx.NCol();
constexpr unsigned N = b_block_mtx.NCol();
......@@ -663,22 +663,18 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
constexpr unsigned NPerThread = c_thread_mtx.NCol();
// thread A, B for GEMM
const auto a_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<MPerThread>{}); // constexpr doesn't compile
constexpr auto a_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<MPerThread>{});
const auto b_thread_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThread>{}); // constexpr doesn't compile
constexpr auto b_thread_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{}, Number<NPerThread>{});
// thread A-sub, B-sub for copy
const auto a_thread_sub_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
Number<MPerThreadSubC>{},
Number<MPerThread>{}); // constexpr doesn't compile
constexpr auto a_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}, Number<MPerThread>{});
const auto b_thread_sub_mtx =
make_ConstantMatrixDescriptor(Number<KPerThreadLoop>{},
Number<NPerThreadSubC>{},
Number<NPerThread>{}); // constexpr doesn't compile
constexpr auto b_thread_sub_mtx = make_ConstantMatrixDescriptor(
Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}, Number<NPerThread>{});
// register
FloatA p_a_thread_0[a_thread_mtx.GetElementSpace()];
......
#pragma once
#define WARPSIZE 32;
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
template <class T1, class T2>
struct is_same
......@@ -14,20 +16,16 @@ struct is_same<T, T>
static const bool value = true;
};
__device__ unsigned get_thread_local_1d_id() { return threadIdx.x; }
__device__ unsigned get_block_1d_id() { return blockIdx.x; }
template <class T, T N>
struct Constant
struct integral_constant
{
static const T mValue = N;
static const T value = N;
__host__ __device__ constexpr T Get() const { return mValue; }
__host__ __device__ constexpr T Get() const { return value; }
};
template <unsigned N>
using Number = Constant<unsigned, N>;
using Number = integral_constant<unsigned, N>;
template <unsigned... Is>
struct Sequence
......
......@@ -156,18 +156,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
......@@ -176,18 +176,16 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restri
// A_matrix[C,K] is a sub-matrix of wei_block[C,S,R,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
......@@ -176,18 +176,16 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
......@@ -115,16 +115,16 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
......@@ -121,16 +121,16 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
constexpr auto b_cxwn_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
Number<in_chwn_block_desc.GetStride(I0)>{});
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxwn_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<WoPerThread * NPerThread>{});
const auto blockwise_batch_gemm =
Blockwise1dStridedBatchedGemmBlockABlockBThreadC<BlockSize,
......
......@@ -172,18 +172,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
#if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
......@@ -258,7 +254,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
#if 0
#if 1
blockwise_gemm.Run
#else
blockwise_gemm.Run_RegisterDoubleBuffer
......
......@@ -172,18 +172,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
// a_mtx[C,K] is a sub-matrix of wei_block[C,S,R,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}, Number<wei_csrk_block_desc.GetStride(I0)>{});
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
#if 0
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
......
......@@ -140,16 +140,14 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
......
......@@ -156,16 +156,14 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
constexpr auto a_cxk_block_mtx_desc =
make_ConstantMatrixDescriptor(Number<CPerBlock>{}, Number<KPerBlock>{});
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
constexpr auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<BPerBlock>{}, Number<in_cb_block_desc.GetStride(I0)>{});
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
constexpr auto c_kxb_thread_mtx_desc =
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
decltype(a_cxk_block_mtx_desc),
......
......@@ -30,13 +30,13 @@ template <class MatrixA,
class FloatC,
class Accumulator>
__device__ void threadwise_gemm(MatrixA,
Constant<bool, TransA>,
integral_constant<bool, TransA>,
FloatA* const p_a_thread,
MatrixB,
Constant<bool, TransB>,
integral_constant<bool, TransB>,
FloatB* const p_b_thread,
MatrixC,
Constant<bool, TransC>,
integral_constant<bool, TransC>,
FloatC* p_c_thread,
Accumulator f_accum)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment