"git@developer.sourcefind.cn:norm/vllm.git" did not exist on "c1376e0f825e88e32b5aca85c676fe547bcb03c9"
Commit adf4b173 authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 99d05ba7
...@@ -140,10 +140,10 @@ void device_convolution( ...@@ -140,10 +140,10 @@ void device_convolution(
constexpr unsigned YPerBlock = 8; constexpr unsigned YPerBlock = 8;
constexpr unsigned XPerBlock = 16; constexpr unsigned XPerBlock = 16;
constexpr unsigned NBlockCopyLen0 = 1; constexpr unsigned NBlockOpLen0 = 1;
constexpr unsigned NBlockCopyLen1 = 1; constexpr unsigned NBlockOpLen1 = 1;
constexpr unsigned NBlockCopyLen2 = 4; constexpr unsigned NBlockOpLen2 = 4;
constexpr unsigned NBlockCopyLen3 = 32; constexpr unsigned NBlockOpLen3 = 32;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
...@@ -174,10 +174,10 @@ void device_convolution( ...@@ -174,10 +174,10 @@ void device_convolution(
CPerBlock, CPerBlock,
YPerBlock, YPerBlock,
XPerBlock, XPerBlock,
NBlockCopyLen0, NBlockOpLen0,
NBlockCopyLen1, NBlockOpLen1,
NBlockCopyLen2, NBlockOpLen2,
NBlockCopyLen3, NBlockOpLen3,
BlockSize, BlockSize,
GridSize> GridSize>
<<<grid_dim, block_dim>>>(InDesc{}, <<<grid_dim, block_dim>>>(InDesc{},
...@@ -248,7 +248,7 @@ int main() ...@@ -248,7 +248,7 @@ int main()
int num_thread = std::thread::hardware_concurrency(); int num_thread = std::thread::hardware_concurrency();
#if 0 #if 1
in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); in.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread); wei.GenerateTensorValue(GeneratorTensor<float>{}, num_thread);
out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread); out_host.GenerateTensorValue(GeneratorConstant<float>{0}, num_thread);
...@@ -258,7 +258,7 @@ int main() ...@@ -258,7 +258,7 @@ int main()
device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device); device_convolution(in_desc, in, wei_desc, wei, out_desc, out_device);
#if 0 #if 1
host_convolution(in, wei, out_host); host_convolution(in, wei, out_host);
float error = 0; float error = 0;
......
...@@ -7,10 +7,10 @@ ...@@ -7,10 +7,10 @@
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -67,10 +67,10 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -67,10 +67,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -156,10 +156,10 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -156,10 +156,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template <class TFloat, template <class TFloat,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f) __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst, F f)
...@@ -240,10 +240,10 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst ...@@ -240,10 +240,10 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -259,34 +259,34 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -259,34 +259,34 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value); static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
constexpr unsigned NWorkStride3 = 1; constexpr unsigned NBlockOpStride3 = 1;
constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
unsigned itmp = threadIdx.x; unsigned itmp = threadIdx.x;
const unsigned did0_begin = itmp / NWorkStride0; const unsigned did0_begin = itmp / NBlockOpStride0;
itmp -= did0_begin * NWorkStride0; itmp -= did0_begin * NBlockOpStride0;
const unsigned did1_begin = itmp / NWorkStride1; const unsigned did1_begin = itmp / NBlockOpStride1;
itmp -= did1_begin * NWorkStride1; itmp -= did1_begin * NBlockOpStride1;
const unsigned did2_begin = itmp / NWorkStride2; const unsigned did2_begin = itmp / NBlockOpStride2;
itmp -= did2_begin * NWorkStride2; itmp -= did2_begin * NBlockOpStride2;
const unsigned did3_begin = itmp / NWorkStride3; const unsigned did3_begin = itmp / NBlockOpStride3;
for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
{ {
for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
{ {
for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
{ {
for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
{ {
const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
...@@ -304,10 +304,10 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -304,10 +304,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -323,63 +323,63 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -323,63 +323,63 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value); static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
constexpr unsigned NWorkStride3 = 1; constexpr unsigned NBlockOpStride3 = 1;
constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
unsigned itmp = threadIdx.x; unsigned itmp = threadIdx.x;
const unsigned did0_begin = itmp / NWorkStride0; const unsigned did0_begin = itmp / NBlockOpStride0;
itmp -= did0_begin * NWorkStride0; itmp -= did0_begin * NBlockOpStride0;
const unsigned did1_begin = itmp / NWorkStride1; const unsigned did1_begin = itmp / NBlockOpStride1;
itmp -= did1_begin * NWorkStride1; itmp -= did1_begin * NBlockOpStride1;
const unsigned did2_begin = itmp / NWorkStride2; const unsigned did2_begin = itmp / NBlockOpStride2;
itmp -= did2_begin * NWorkStride2; itmp -= did2_begin * NBlockOpStride2;
const unsigned did3_begin = itmp / NWorkStride3; const unsigned did3_begin = itmp / NBlockOpStride3;
unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
{ {
const unsigned sindex_save0 = sindex; const unsigned sindex_save0 = sindex;
const unsigned dindex_save0 = dindex; const unsigned dindex_save0 = dindex;
for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
{ {
const unsigned sindex_save1 = sindex; const unsigned sindex_save1 = sindex;
const unsigned dindex_save1 = dindex; const unsigned dindex_save1 = dindex;
for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
{ {
const unsigned sindex_save2 = sindex; const unsigned sindex_save2 = sindex;
const unsigned dindex_save2 = dindex; const unsigned dindex_save2 = dindex;
for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
{ {
f(p_src[sindex], p_dst[dindex]); f(p_src[sindex], p_dst[dindex]);
sindex += NWorkLen3 * src_desc.GetStride(I3); sindex += NBlockOpLen3 * src_desc.GetStride(I3);
dindex += NWorkLen3 * dst_desc.GetStride(I3); dindex += NBlockOpLen3 * dst_desc.GetStride(I3);
} }
sindex = sindex_save2 + NWorkLen2 * src_desc.GetStride(I2); sindex = sindex_save2 + NBlockOpLen2 * src_desc.GetStride(I2);
dindex = dindex_save2 + NWorkLen2 * dst_desc.GetStride(I2); dindex = dindex_save2 + NBlockOpLen2 * dst_desc.GetStride(I2);
} }
sindex = sindex_save1 + NWorkLen1 * src_desc.GetStride(I1); sindex = sindex_save1 + NBlockOpLen1 * src_desc.GetStride(I1);
dindex = dindex_save1 + NWorkLen1 * dst_desc.GetStride(I1); dindex = dindex_save1 + NBlockOpLen1 * dst_desc.GetStride(I1);
} }
sindex = sindex_save0 + NWorkLen0 * src_desc.GetStride(I0); sindex = sindex_save0 + NBlockOpLen0 * src_desc.GetStride(I0);
dindex = dindex_save0 + NWorkLen0 * dst_desc.GetStride(I0); dindex = dindex_save0 + NBlockOpLen0 * dst_desc.GetStride(I0);
} }
} }
#endif #endif
...@@ -388,10 +388,10 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -388,10 +388,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -407,65 +407,69 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -407,65 +407,69 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value); static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
constexpr unsigned NWorkStride3 = 1; constexpr unsigned NBlockOpStride3 = 1;
constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
unsigned itmp = threadIdx.x; unsigned itmp = threadIdx.x;
const unsigned did0_begin = itmp / NWorkStride0; const unsigned did0_begin = itmp / NBlockOpStride0;
itmp -= did0_begin * NWorkStride0; itmp -= did0_begin * NBlockOpStride0;
const unsigned did1_begin = itmp / NWorkStride1; const unsigned did1_begin = itmp / NBlockOpStride1;
itmp -= did1_begin * NWorkStride1; itmp -= did1_begin * NBlockOpStride1;
const unsigned did2_begin = itmp / NWorkStride2; const unsigned did2_begin = itmp / NBlockOpStride2;
itmp -= did2_begin * NWorkStride2; itmp -= did2_begin * NBlockOpStride2;
const unsigned did3_begin = itmp / NWorkStride3; const unsigned did3_begin = itmp / NBlockOpStride3;
unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned sindex = src_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin); unsigned dindex = dst_desc.Get1dIndex(did0_begin, did1_begin, did2_begin, did3_begin);
for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
{ {
unsigned i1 = 0; unsigned i1 = 0;
for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
{ {
unsigned i2 = 0; unsigned i2 = 0;
for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
{ {
unsigned i3 = 0; unsigned i3 = 0;
for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
{ {
f(p_src[sindex], p_dst[dindex]); f(p_src[sindex], p_dst[dindex]);
sindex += NWorkLen3 * src_desc.GetStride(I3); sindex += NBlockOpLen3 * src_desc.GetStride(I3);
dindex += NWorkLen3 * dst_desc.GetStride(I3); dindex += NBlockOpLen3 * dst_desc.GetStride(I3);
++i3; ++i3;
} }
sindex += sindex += NBlockOpLen2 * src_desc.GetStride(I2) -
NWorkLen2 * src_desc.GetStride(I2) - i3 * NWorkLen3 * src_desc.GetStride(I3); i3 * NBlockOpLen3 * src_desc.GetStride(I3);
dindex += dindex += NBlockOpLen2 * dst_desc.GetStride(I2) -
NWorkLen2 * dst_desc.GetStride(I2) - i3 * NWorkLen3 * dst_desc.GetStride(I3); i3 * NBlockOpLen3 * dst_desc.GetStride(I3);
++i2; ++i2;
} }
sindex += NWorkLen1 * src_desc.GetStride(I1) - i2 * NWorkLen2 * src_desc.GetStride(I2); sindex +=
dindex += NWorkLen1 * dst_desc.GetStride(I1) - i2 * NWorkLen2 * dst_desc.GetStride(I2); NBlockOpLen1 * src_desc.GetStride(I1) - i2 * NBlockOpLen2 * src_desc.GetStride(I2);
dindex +=
NBlockOpLen1 * dst_desc.GetStride(I1) - i2 * NBlockOpLen2 * dst_desc.GetStride(I2);
++i1; ++i1;
} }
sindex += NWorkLen0 * src_desc.GetStride(I0) - i1 * NWorkLen1 * src_desc.GetStride(I1); sindex +=
dindex += NWorkLen0 * dst_desc.GetStride(I0) - i1 * NWorkLen1 * dst_desc.GetStride(I1); NBlockOpLen0 * src_desc.GetStride(I0) - i1 * NBlockOpLen1 * src_desc.GetStride(I1);
dindex +=
NBlockOpLen0 * dst_desc.GetStride(I0) - i1 * NBlockOpLen1 * dst_desc.GetStride(I1);
} }
} }
#endif #endif
...@@ -474,10 +478,10 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -474,10 +478,10 @@ __device__ void blockwise_4d_tensor_op_binary(
template <class TFloat, template <class TFloat,
class SrcDesc, class SrcDesc,
class DstDesc, class DstDesc,
unsigned NWorkLen0, unsigned NBlockOpLen0,
unsigned NWorkLen1, unsigned NBlockOpLen1,
unsigned NWorkLen2, unsigned NBlockOpLen2,
unsigned NWorkLen3, unsigned NBlockOpLen3,
class F, class F,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_4d_tensor_op_binary( __device__ void blockwise_4d_tensor_op_binary(
...@@ -493,34 +497,34 @@ __device__ void blockwise_4d_tensor_op_binary( ...@@ -493,34 +497,34 @@ __device__ void blockwise_4d_tensor_op_binary(
static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value); static_assert(is_same<decltype(src_desc.GetLengths()), decltype(dst_desc.GetLengths())>::value);
constexpr unsigned NWorkStride3 = 1; constexpr unsigned NBlockOpStride3 = 1;
constexpr unsigned NWorkStride2 = NWorkLen3 * NWorkStride3; constexpr unsigned NBlockOpStride2 = NBlockOpLen3 * NBlockOpStride3;
constexpr unsigned NWorkStride1 = NWorkLen2 * NWorkStride2; constexpr unsigned NBlockOpStride1 = NBlockOpLen2 * NBlockOpStride2;
constexpr unsigned NWorkStride0 = NWorkLen1 * NWorkStride1; constexpr unsigned NBlockOpStride0 = NBlockOpLen1 * NBlockOpStride1;
unsigned itmp = threadIdx.x; unsigned itmp = threadIdx.x;
const unsigned did0_begin = itmp / NWorkStride0; const unsigned did0_begin = itmp / NBlockOpStride0;
itmp -= did0_begin * NWorkStride0; itmp -= did0_begin * NBlockOpStride0;
const unsigned did1_begin = itmp / NWorkStride1; const unsigned did1_begin = itmp / NBlockOpStride1;
itmp -= did1_begin * NWorkStride1; itmp -= did1_begin * NBlockOpStride1;
const unsigned did2_begin = itmp / NWorkStride2; const unsigned did2_begin = itmp / NBlockOpStride2;
itmp -= did2_begin * NWorkStride2; itmp -= did2_begin * NBlockOpStride2;
const unsigned did3_begin = itmp / NWorkStride3; const unsigned did3_begin = itmp / NBlockOpStride3;
for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NWorkLen0) for(unsigned did0 = did0_begin; did0 < src_desc.GetLength(I0); did0 += NBlockOpLen0)
{ {
for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NWorkLen1) for(unsigned did1 = did1_begin; did1 < src_desc.GetLength(I1); did1 += NBlockOpLen1)
{ {
for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NWorkLen2) for(unsigned did2 = did2_begin; did2 < src_desc.GetLength(I2); did2 += NBlockOpLen2)
{ {
for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NWorkLen3) for(unsigned did3 = did3_begin; did3 < src_desc.GetLength(I3); did3 += NBlockOpLen3)
{ {
const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3); const unsigned sindex = src_desc.Get1dIndex(did0, did1, did2, did3);
......
...@@ -12,11 +12,11 @@ template <class TFloat, ...@@ -12,11 +12,11 @@ template <class TFloat,
unsigned OutTileSizeW, unsigned OutTileSizeW,
unsigned BlockSize> unsigned BlockSize>
__device__ void blockwise_convolution(InDesc, __device__ void blockwise_convolution(InDesc,
TFloat* const __restrict__ p_in, TFloat* const __restrict__ p_in_lds,
WeiDesc, WeiDesc,
TFloat* const __restrict__ p_wei, TFloat* const __restrict__ p_wei_lds,
OutDesc, OutDesc,
TFloat* __restrict__ p_out) TFloat* __restrict__ p_out_lds)
{ {
constexpr auto I0 = Index<0>{}; constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{}; constexpr auto I1 = Index<1>{};
...@@ -97,8 +97,8 @@ __device__ void blockwise_convolution(InDesc, ...@@ -97,8 +97,8 @@ __device__ void blockwise_convolution(InDesc,
decltype(in_thread_dst_desc), decltype(in_thread_dst_desc),
decltype(f_copy)>( decltype(f_copy)>(
in_thread_src_desc, in_thread_src_desc,
p_in + in_desc.Get1dIndex( p_in_lds + in_desc.Get1dIndex(
n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin), n_thread_work_begin, 0, hi_thread_work_begin, wi_thread_work_begin),
in_thread_dst_desc, in_thread_dst_desc,
p_in_thread, p_in_thread,
f_copy); f_copy);
...@@ -112,7 +112,7 @@ __device__ void blockwise_convolution(InDesc, ...@@ -112,7 +112,7 @@ __device__ void blockwise_convolution(InDesc,
decltype(wei_thread_dst_desc), decltype(wei_thread_dst_desc),
decltype(f_copy)>( decltype(f_copy)>(
wei_thread_src_desc, wei_thread_src_desc,
p_wei + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0), p_wei_lds + wei_desc.Get1dIndex(k_thread_work_begin, 0, 0, 0),
wei_thread_dst_desc, wei_thread_dst_desc,
p_wei_thread, p_wei_thread,
f_copy); f_copy);
...@@ -123,10 +123,10 @@ __device__ void blockwise_convolution(InDesc, ...@@ -123,10 +123,10 @@ __device__ void blockwise_convolution(InDesc,
decltype(out_thread_dst_desc), decltype(out_thread_dst_desc),
decltype(f_copy)>( decltype(f_copy)>(
out_thread_src_desc, out_thread_src_desc,
p_out + out_desc.Get1dIndex(n_thread_work_begin, p_out_lds + out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin, k_thread_work_begin,
ho_thread_work_begin, ho_thread_work_begin,
wo_thread_work_begin), wo_thread_work_begin),
out_thread_dst_desc, out_thread_dst_desc,
p_out_thread, p_out_thread,
f_copy); f_copy);
...@@ -150,10 +150,10 @@ __device__ void blockwise_convolution(InDesc, ...@@ -150,10 +150,10 @@ __device__ void blockwise_convolution(InDesc,
out_thread_dst_desc, out_thread_dst_desc,
p_out_thread, p_out_thread,
out_thread_src_desc, out_thread_src_desc,
p_out + out_desc.Get1dIndex(n_thread_work_begin, p_out_lds + out_desc.Get1dIndex(n_thread_work_begin,
k_thread_work_begin, k_thread_work_begin,
ho_thread_work_begin, ho_thread_work_begin,
wo_thread_work_begin), wo_thread_work_begin),
f_copy); f_copy);
} }
} }
...@@ -170,18 +170,18 @@ template <class TFloat, ...@@ -170,18 +170,18 @@ template <class TFloat,
unsigned CPerBlock, unsigned CPerBlock,
unsigned YPerBlock, unsigned YPerBlock,
unsigned XPerBlock, unsigned XPerBlock,
unsigned NBlockCopyLen0, unsigned NBlockOpLen0,
unsigned NBlockCopyLen1, unsigned NBlockOpLen1,
unsigned NBlockCopyLen2, unsigned NBlockOpLen2,
unsigned NBlockCopyLen3, unsigned NBlockOpLen3,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_convolution(InDesc, __global__ void gridwise_convolution(InDesc,
TFloat* const __restrict__ p_in, TFloat* const __restrict__ p_in_glb,
WeiDesc, WeiDesc,
TFloat* const __restrict__ p_wei, TFloat* const __restrict__ p_wei_glb,
OutDesc, OutDesc,
TFloat* __restrict__ p_out) TFloat* __restrict__ p_out_glb)
{ {
constexpr auto I0 = Index<0>{}; constexpr auto I0 = Index<0>{};
constexpr auto I1 = Index<1>{}; constexpr auto I1 = Index<1>{};
...@@ -222,13 +222,13 @@ __global__ void gridwise_convolution(InDesc, ...@@ -222,13 +222,13 @@ __global__ void gridwise_convolution(InDesc,
constexpr auto out_block_lds_desc = constexpr auto out_block_lds_desc =
make_ConstantTensorDescriptor(out_block_glb_desc.GetLengths()); make_ConstantTensorDescriptor(out_block_glb_desc.GetLengths());
constexpr unsigned in_block_size = in_block_lds_desc.GetElementSize(); constexpr unsigned in_block_size = in_block_lds_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSize(); constexpr unsigned wei_block_size = wei_block_lds_desc.GetElementSpace();
constexpr unsigned out_block_size = out_block_lds_desc.GetElementSize(); constexpr unsigned out_block_size = out_block_lds_desc.GetElementSpace();
__shared__ TFloat p_in_block[in_block_size]; __shared__ TFloat p_in_block_lds[in_block_size];
__shared__ TFloat p_wei_block[wei_block_size]; __shared__ TFloat p_wei_block_lds[wei_block_size];
__shared__ TFloat p_out_block[out_block_size]; __shared__ TFloat p_out_block_lds[out_block_size];
const unsigned block_id = blockIdx.x; const unsigned block_id = blockIdx.x;
...@@ -286,12 +286,12 @@ __global__ void gridwise_convolution(InDesc, ...@@ -286,12 +286,12 @@ __global__ void gridwise_convolution(InDesc,
// set output tensor in LDS to 0 // set output tensor in LDS to 0
blockwise_4d_tensor_op_unary<TFloat, blockwise_4d_tensor_op_unary<TFloat,
decltype(out_block_lds_desc), decltype(out_block_lds_desc),
NBlockCopyLen0, NBlockOpLen0,
NBlockCopyLen1, NBlockOpLen1,
NBlockCopyLen2, NBlockOpLen2,
NBlockCopyLen3, NBlockOpLen3,
decltype(f_set0), decltype(f_set0),
BlockSize>(out_block_lds_desc, p_out_block, f_set0); BlockSize>(out_block_lds_desc, p_out_block_lds, f_set0);
for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1); for(unsigned c_block_work_begin = 0; c_block_work_begin < in_desc.GetLength(I1);
c_block_work_begin += CPerBlock) c_block_work_begin += CPerBlock)
...@@ -301,35 +301,35 @@ __global__ void gridwise_convolution(InDesc, ...@@ -301,35 +301,35 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op_binary<TFloat, blockwise_4d_tensor_op_binary<TFloat,
decltype(in_block_glb_desc), decltype(in_block_glb_desc),
decltype(in_block_lds_desc), decltype(in_block_lds_desc),
NBlockCopyLen0, NBlockOpLen0,
NBlockCopyLen1, NBlockOpLen1,
NBlockCopyLen2, NBlockOpLen2,
NBlockCopyLen3, NBlockOpLen3,
decltype(f_copy), decltype(f_copy),
BlockSize>( BlockSize>(
in_block_glb_desc, in_block_glb_desc,
p_in + in_block_glb_desc.Get1dIndex(n_block_work_begin, p_in_glb + in_block_glb_desc.Get1dIndex(n_block_work_begin,
c_block_work_begin, c_block_work_begin,
hi_block_work_begin, hi_block_work_begin,
wi_block_work_begin), wi_block_work_begin),
in_block_lds_desc, in_block_lds_desc,
p_in_block, p_in_block_lds,
f_copy); f_copy);
// copy weight tensor to LDS // copy weight tensor to LDS
blockwise_4d_tensor_op_binary<TFloat, blockwise_4d_tensor_op_binary<TFloat,
decltype(wei_block_glb_desc), decltype(wei_block_glb_desc),
decltype(wei_block_lds_desc), decltype(wei_block_lds_desc),
NBlockCopyLen0, NBlockOpLen0,
NBlockCopyLen1, NBlockOpLen1,
NBlockCopyLen2, NBlockOpLen2,
NBlockCopyLen3, NBlockOpLen3,
decltype(f_copy), decltype(f_copy),
BlockSize>( BlockSize>(
wei_block_glb_desc, wei_block_glb_desc,
p_wei + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0), p_wei_glb + wei_block_glb_desc.Get1dIndex(k_block_work_begin, c_block_work_begin, 0, 0),
wei_block_lds_desc, wei_block_lds_desc,
p_wei_block, p_wei_block_lds,
f_copy); f_copy);
#if 1 #if 1
...@@ -344,11 +344,11 @@ __global__ void gridwise_convolution(InDesc, ...@@ -344,11 +344,11 @@ __global__ void gridwise_convolution(InDesc,
OutTileSizeH, OutTileSizeH,
OutTileSizeW, OutTileSizeW,
BlockSize>(in_block_lds_desc, BlockSize>(in_block_lds_desc,
p_in_block, p_in_block_lds,
wei_block_lds_desc, wei_block_lds_desc,
p_wei_block, p_wei_block_lds,
out_block_lds_desc, out_block_lds_desc,
p_out_block); p_out_block_lds);
#if 1 #if 1
__syncthreads(); __syncthreads();
...@@ -359,16 +359,16 @@ __global__ void gridwise_convolution(InDesc, ...@@ -359,16 +359,16 @@ __global__ void gridwise_convolution(InDesc,
blockwise_4d_tensor_op_binary<TFloat, blockwise_4d_tensor_op_binary<TFloat,
decltype(out_block_lds_desc), decltype(out_block_lds_desc),
decltype(out_block_glb_desc), decltype(out_block_glb_desc),
NBlockCopyLen0, NBlockOpLen0,
NBlockCopyLen1, NBlockOpLen1,
NBlockCopyLen2, NBlockOpLen2,
NBlockCopyLen3, NBlockOpLen3,
decltype(f_copy), decltype(f_copy),
BlockSize>( BlockSize>(
out_block_lds_desc, out_block_lds_desc,
p_out_block, p_out_block_lds,
out_block_glb_desc, out_block_glb_desc,
p_out + p_out_glb +
out_block_glb_desc.Get1dIndex( out_block_glb_desc.Get1dIndex(
n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin), n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin),
f_copy); f_copy);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment