"aten/git@developer.sourcefind.cn:OpenDAS/torch-cluster.git" did not exist on "b992389e64a30859ed01196501523bf18d142db2"
Commit 4b616aad authored by Chao Liu

refactor

parent 0741a8ab
@@ -11,8 +11,9 @@
 #include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
 #include "device_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh"
 #include "device_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh"
-#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh"
+#include "device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
 #include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
+#include "device_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
 //#include "device_winograd_convolution.cuh"

 struct GeneratorTensor_1
@@ -382,11 +383,14 @@ int main()
 #if 0
     constexpr unsigned N = 1;
     constexpr unsigned C = 1;
-    constexpr unsigned HI = 10;
-    constexpr unsigned WI = 10;
+    constexpr unsigned HI = 28;
+    constexpr unsigned WI = 28;
     constexpr unsigned K = 1;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
+
+    constexpr unsigned HPad = 1;
+    constexpr unsigned WPad = 1;
 #elif 0
     // 3x3, 34x34
     constexpr unsigned N = 64;
@@ -567,6 +571,9 @@ int main()
 #elif 1
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#elif 1
+    in_nchw.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread);
+    wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #endif

     unsigned nrepeat = 100;
@@ -582,15 +589,17 @@ int main()
     device_implicit_gemm_convolution_1_nchw_srck_nkhw
 #elif 0
     device_implicit_gemm_convolution_1_chwn_csrk_khwn
-#elif 1
+#elif 0
     device_implicit_gemm_convolution_2_cnhw_srck_knhw
+#elif 1
+    device_implicit_gemm_convolution_2_cnhw_csrk_knhw
 #elif 0
     device_winograd_convolution
 #endif
     (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
-    device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(in_nchw_desc,
+#elif 1
+    device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(in_nchw_desc,
                                                               in_nchw,
                                                               wei_kcsr_desc,
                                                               wei_kcsr,
@@ -601,7 +610,7 @@ int main()
                                                               nrepeat);
 #endif

-#if 1
+#if 0
     if(S == 3 && R == 3)
     {
         host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......
@@ -226,7 +226,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

-        usleep(10000);
+        usleep(std::min(elapsedTime * 1000, float(10000)));
     }

     checkCudaErrors(cudaGetLastError());
......
 #pragma once
-#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding.cuh"
+#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh"
+#include "gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh"
 #include <unistd.h>
+#include <algorithm>

 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
-void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
+void device_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(InDesc,
                                                               const Tensor<T>& in_nchw,
                                                               WeiDesc,
                                                               const Tensor<T>& wei_kcsr,
@@ -88,6 +90,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
     constexpr unsigned HoPerThread = 1;
     constexpr unsigned WoPerThread = 1;

+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 1;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 1;
+
     constexpr unsigned BlockSize = 8;
 #elif 0
     // for 3x3, 34x34 | 3x3 58x58, NKC = 64, 64, 256
@@ -180,6 +185,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
     constexpr unsigned HoPerThread = 1;
     constexpr unsigned WoPerThread = 1;

+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 2;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 64;
+
     constexpr unsigned BlockSize = 128;
 #elif 0
     // for 5x5 filter, 20x84 image, 1x1 padding
@@ -225,6 +233,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
     constexpr unsigned HoPerThread = 1;
     constexpr unsigned WoPerThread = 1;

+    constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
+    constexpr unsigned WeiBlockCopyThreadPerDim1 = 32;
+
     constexpr unsigned BlockSize = 128;
 #endif
@@ -245,7 +256,12 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
         cudaEventCreate(&start);
         cudaEventRecord(start, 0);

-        gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding<GridSize,
+#if 1
+        gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded
+#elif 1
+        gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline
+#endif
+            <GridSize,
              BlockSize,
              T,
              decltype(in_chwn_desc),
@@ -262,7 +278,9 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
              KPerThread,
              CPerThread,
              HoPerThread,
-             WoPerThread>
+             WoPerThread,
+             WeiBlockCopyThreadPerDim0,
+             WeiBlockCopyThreadPerDim1>
             <<<grid_dim, block_dim>>>(static_cast<T*>(in_chwn_device_buf.GetDeviceBuffer()),
                                       static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
                                       static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
@@ -274,7 +292,7 @@ void device_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

-        usleep(elapsedTime * 1000);
+        usleep(std::min(elapsedTime * 1000, float(10000)));
     }

     checkCudaErrors(cudaGetLastError());
......
#pragma once
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh"
#include "gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline.cuh"
#include <unistd.h>
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_csrk_knhw(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcsr,
OutDesc,
Tensor<T>& out_nkhw,
unsigned nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcsr_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr unsigned N = in_nchw_desc.GetLength(I0);
constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
constexpr unsigned Wi = in_nchw_desc.GetLength(I3);
constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);
constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
constexpr unsigned R = wei_kcsr_desc.GetLength(I3);
constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
// convert in_nchw to in_cnhw
auto in_cnhw_desc = make_ConstantTensorDescriptor(Sequence<C, N, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_cnhw_desc, std::cout << "in_cnhw_desc: ");
Tensor<T> in_cnhw(make_TensorDescriptor(in_cnhw_desc));
auto f_reorder_nchw2cnhw = [&](auto n, auto c, auto hi, auto wi) {
in_cnhw(c, n, hi, wi) = in_nchw(n, c, hi, wi);
};
make_ParallelTensorFunctor(f_reorder_nchw2cnhw, N, C, Hi, Wi)(
std::thread::hardware_concurrency());
// convert wei_kcsr to wei_csrk
auto wei_csrk_desc = make_ConstantTensorDescriptor(Sequence<C, S, R, K>{});
ostream_ConstantTensorDescriptor(wei_csrk_desc, std::cout << "wei_csrk_desc: ");
Tensor<T> wei_csrk(make_TensorDescriptor(wei_csrk_desc));
auto f_reorder_kcsr2csrk = [&](auto k, auto c, auto s, auto r) {
wei_csrk(c, s, r, k) = wei_kcsr(k, c, s, r);
};
make_ParallelTensorFunctor(f_reorder_kcsr2csrk, K, C, S, R)(
std::thread::hardware_concurrency());
// convert out_nkhw to out_knhw
auto out_knhw_desc = make_ConstantTensorDescriptor(Sequence<K, N, Ho, Wo>{});
ostream_ConstantTensorDescriptor(out_knhw_desc, std::cout << "out_knhw_desc: ");
Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));
#if 1
// 1x1, 28x28
constexpr unsigned BPerBlock = 64;
constexpr unsigned KPerBlock = 64;
constexpr unsigned CPerBlock = 8;
constexpr unsigned BPerThread = 4;
constexpr unsigned KPerThread = 16;
constexpr unsigned CPerThread = 1;
constexpr unsigned GemmRowThreadPerCluster = 4;
constexpr unsigned GemmColumnThreadPerCluster = 8;
constexpr unsigned InBlockCopyThreadPerDim0 = 4;
constexpr unsigned InBlockCopyThreadPerDim1 = 16;
constexpr unsigned WeiBlockCopyThreadPerDim0 = 4;
constexpr unsigned WeiBlockCopyThreadPerDim1 = 16;
constexpr unsigned BlockSize = 64;
#endif
constexpr unsigned GridSize =
((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);
dim3 block_dim(BlockSize);
dim3 grid_dim(GridSize);
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
// mem
std::size_t data_sz = sizeof(T);
DeviceMem in_cnhw_device_buf(data_sz * (in_cnhw.mDesc.GetElementSpace() + BGhostRead +
BPerBlock)); // reserve extra space for BGhostRead
DeviceMem wei_csrk_device_buf(data_sz * wei_csrk.mDesc.GetElementSpace());
DeviceMem out_knhw_device_buf(data_sz * out_knhw.mDesc.GetElementSpace());
in_cnhw_device_buf.ToDevice(in_cnhw.mData.data());
wei_csrk_device_buf.ToDevice(wei_csrk.mData.data());
out_knhw_device_buf.ToDevice(out_knhw.mData.data());
for(unsigned i = 0; i < nrepeat; ++i)
{
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventRecord(start, 0);
#if 1
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw
#else
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline
#endif
<GridSize,
BlockSize,
T,
decltype(in_cnhw_desc),
decltype(wei_csrk_desc),
decltype(out_knhw_desc),
BPerBlock,
KPerBlock,
CPerBlock,
BPerThread,
KPerThread,
CPerThread,
GemmRowThreadPerCluster,
GemmColumnThreadPerCluster,
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1,
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>
<<<grid_dim, block_dim>>>(in_cnhw_desc,
static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
wei_csrk_desc,
static_cast<T*>(wei_csrk_device_buf.GetDeviceBuffer()),
out_knhw_desc,
static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));
cudaEventCreate(&stop);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed time : %f ms\n", elapsedTime);
usleep(std::min(elapsedTime * 1000, float(10000)));
}
checkCudaErrors(cudaGetLastError());
out_knhw_device_buf.FromDevice(out_knhw.mData.data());
// convert out_knhw to out_nkhw
auto f_reorder_knhw2nkhw = [&](auto n, auto k, auto ho, auto wo) {
out_nkhw(n, k, ho, wo) = out_knhw(k, n, ho, wo);
};
make_ParallelTensorFunctor(f_reorder_knhw2nkhw, N, K, Ho, Wo)(
std::thread::hardware_concurrency());
}
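For reference, the grid-size and ghost-read arithmetic used by device_implicit_gemm_convolution_2_cnhw_csrk_knhw above can be checked in isolation. The host-only sketch below is not part of this commit: it recomputes B = N * Hi * Wi, BGhostRead = (S - 1) * Wi + (R - 1) and GridSize with the BPerBlock/KPerBlock values from the "1x1, 28x28" tuning branch, while the tensor sizes N, C, K, Hi, Wi, S, R are assumed purely for illustration.

// Standalone host-only sketch (compiles with nvcc or a plain C++ compiler).
// The problem sizes below are illustrative assumptions, not values from this commit.
#include <cstdio>

int main()
{
    // assumed problem size for a 1x1 filter on a 28x28 image
    constexpr unsigned N = 64, K = 64, Hi = 28, Wi = 28, S = 1, R = 1;

    // block tiling parameters from the "#if 1" branch above
    constexpr unsigned BPerBlock = 64;
    constexpr unsigned KPerBlock = 64;

    // B flattens (N, Hi, Wi); BGhostRead is the extra tail a block may read past
    // its BPerBlock window when the filter is larger than 1x1
    constexpr unsigned B          = N * Hi * Wi;
    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    constexpr unsigned GridSize =
        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("B %u, BGhostRead %u, GridSize %u\n", B, BGhostRead, GridSize);
    return 0;
}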
 #pragma once
 #include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
-#include "gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw.cuh"
+#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh"
 #include <unistd.h>

 template <class T, class InDesc, class WeiDesc, class OutDesc>
@@ -122,8 +122,8 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
     constexpr unsigned GemmRowThreadPerCluster = 8;
     constexpr unsigned GemmColumnThreadPerCluster = 8;

-    constexpr unsigned InBlockCopyThreadPerDim0 = 2;
-    constexpr unsigned InBlockCopyThreadPerDim1 = 64;
+    constexpr unsigned InBlockCopyThreadPerDim0 = 8;
+    constexpr unsigned InBlockCopyThreadPerDim1 = 16;

     constexpr unsigned BlockSize = 128;
 #endif
@@ -154,7 +154,12 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
         cudaEventCreate(&start);
         cudaEventRecord(start, 0);

-        gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw<GridSize,
+#if 0
+        gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw
+#else
+        gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
+#endif
+            <GridSize,
              BlockSize,
              T,
              decltype(in_cnhw_desc),
@@ -184,7 +189,7 @@ void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
         cudaEventElapsedTime(&elapsedTime, start, stop);
         printf("Elapsed time : %f ms\n", elapsedTime);

-        usleep(10000);
+        usleep(std::min(elapsedTime * 1000, float(10000)));
     }

     checkCudaErrors(cudaGetLastError());
......
@@ -187,6 +187,8 @@ struct blockwise_2d_tensor_copy_2
     __device__ blockwise_2d_tensor_copy_2()
     {
+        static_assert(is_same<Float, float>::value, "wrong! type is not float!\n");
+
         mThreadId0 = get_thread_local_1d_id() / ThreadPerDim1;
         mThreadId1 = get_thread_local_1d_id() - mThreadId0 * ThreadPerDim1;
     }
@@ -225,7 +227,14 @@ struct blockwise_2d_tensor_copy_2
             for(unsigned d1v4loop = 0; d1v4loop < Dim1V4Loop; ++d1v4loop)
             {
                 unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
+#if 1
+                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
+                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+
+                *(reinterpret_cast<float4*>(p_dst + dindex)) =
+                    *(reinterpret_cast<float4*>(p_src + sindex));
+#else
                 for(unsigned i = 0; i < 4; ++i)
                 {
                     const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
@@ -233,6 +242,7 @@ struct blockwise_2d_tensor_copy_2
                     p_dst[dindex] = p_src[sindex];
                 }
+#endif
             }

             // v2
@@ -241,6 +251,14 @@ struct blockwise_2d_tensor_copy_2
                 unsigned did1 =
                     Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1;
+#if 1
+                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
+                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+
+                *(reinterpret_cast<float2*>(p_dst + dindex)) =
+                    *(reinterpret_cast<float2*>(p_src + sindex));
+#else
                 for(unsigned i = 0; i < 2; ++i)
                 {
                     const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
@@ -248,6 +266,7 @@ struct blockwise_2d_tensor_copy_2
                     p_dst[dindex] = p_src[sindex];
                 }
+#endif
             }

             // v1
@@ -291,6 +310,14 @@ struct blockwise_2d_tensor_copy_2
             {
                 unsigned did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1;
+#if 1
+                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
+                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+
+                *(reinterpret_cast<float4*>(p_dst + dindex)) =
+                    *(reinterpret_cast<float4*>(p_src + sindex));
+#else
                 for(unsigned i = 0; i < 4; ++i)
                 {
                     const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
@@ -298,6 +325,7 @@ struct blockwise_2d_tensor_copy_2
                     p_dst[dindex] = p_src[sindex];
                 }
+#endif
             }

             // v2
@@ -306,6 +334,14 @@ struct blockwise_2d_tensor_copy_2
                 unsigned did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 +
                                 2 * mThreadId1;
+#if 1
+                const unsigned sindex = src_desc.Get1dIndex(did0, did1);
+                const unsigned dindex = dst_desc.Get1dIndex(did0, did1);
+
+                *(reinterpret_cast<float2*>(p_dst + dindex)) =
+                    *(reinterpret_cast<float2*>(p_src + sindex));
+#else
                 for(unsigned i = 0; i < 2; ++i)
                 {
                     const unsigned sindex = src_desc.Get1dIndex(did0, did1 + i);
@@ -313,6 +349,7 @@ struct blockwise_2d_tensor_copy_2
                     p_dst[dindex] = p_src[sindex];
                 }
+#endif
             }

             // v1
......
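The blockwise_2d_tensor_copy_2 change above replaces the scalar inner loops with single float4/float2 transfers through reinterpret_cast. The stand-alone CUDA kernel below is only an illustration of that vectorization idea, not code from this commit; it assumes both buffers are 16-byte aligned and that n is a multiple of 4.

// Minimal float4 vectorized copy: each thread moves 4 consecutive floats with one
// 128-bit load/store. Not part of this commit.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void copy_float4(const float* __restrict__ src, float* __restrict__ dst, unsigned n)
{
    unsigned i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    if(i + 3 < n)
    {
        *reinterpret_cast<float4*>(dst + i) = *reinterpret_cast<const float4*>(src + i);
    }
}

int main()
{
    constexpr unsigned n = 1024;
    float *src, *dst;
    cudaMalloc(&src, n * sizeof(float));
    cudaMalloc(&dst, n * sizeof(float));

    copy_float4<<<(n / 4 + 255) / 256, 256>>>(src, dst, n);
    cudaDeviceSynchronize();
    printf("last error: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(src);
    cudaFree(dst);
    return 0;
}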
@@ -3,6 +3,7 @@
 #include "ConstantTensorDescriptor.cuh"
 #include "ConstantMatrixDescriptor.cuh"
 #include "blockwise_4d_tensor_op.cuh"
+#include "blockwise_2d_tensor_op.cuh"
 #include "threadwise_4d_tensor_op.cuh"
 #include "blockwise_gemm.cuh"
@@ -23,9 +24,11 @@ template <unsigned GridSize,
           unsigned KPerThread,
           unsigned CPerThread,
           unsigned HoPerThread,
-          unsigned WoPerThread>
-__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding(
-    Float* const __restrict__ p_in_global,
+          unsigned WoPerThread,
+          unsigned WeiBlockCopyThreadPerDim0,
+          unsigned WeiBlockCopyThreadPerDim1>
+__global__ void
+gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(Float* const __restrict__ p_in_global,
                                                            Float* const __restrict__ p_wei_global,
                                                            Float* __restrict__ p_out_global)
 {
@@ -82,6 +85,9 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
     const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
     const unsigned n_block_data_begin = n_block_work_id * NPerBlock;

+    // flattened (2d) tensor view of wei in global mem
+    constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
+
     // tensor view of blockwise input and weight in LDS
     constexpr auto in_chwn_block_desc =
         make_ConstantTensorDescriptor(Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});
@@ -89,6 +95,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
     constexpr auto wei_csrk_block_desc =
         make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});

+    // flattened (2d) tensor view of wei in LDS
+    constexpr auto wei_ek_block_desc =
+        make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
+
     // tensor view of threadwise output in register
     constexpr auto out_hkwn_thread_desc =
         make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
@@ -133,13 +143,33 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
                                                 decltype(in_chwn_block_desc.GetLengths()),
                                                 LowerPads>{};

-    // weight: format is [S,R,C,K]
+#if 1
+    // weight: format is [C,S,R,K]
     constexpr auto blockwise_wei_copy =
         blockwise_4d_tensor_copy_1<BlockSize,
                                    Float,
                                    decltype(wei_csrk_global_desc),
                                    decltype(wei_csrk_block_desc),
                                    decltype(wei_csrk_block_desc.GetLengths())>{};
+#elif 1
+    // weight: format is [C*S*R,K]
+    constexpr auto blockwise_wei_copy =
+        blockwise_2d_tensor_copy_1<BlockSize,
+                                   Float,
+                                   decltype(wei_ek_global_desc),
+                                   decltype(wei_ek_block_desc),
+                                   decltype(wei_ek_block_desc.GetLengths())>{};
+#elif 1
+    // weight: format is [C*S*R,K]
+    const auto blockwise_wei_copy =
+        blockwise_2d_tensor_copy_2<BlockSize,
+                                   Float,
+                                   decltype(wei_ek_global_desc),
+                                   decltype(wei_ek_block_desc),
+                                   decltype(wei_ek_block_desc.GetLengths()),
+                                   WeiBlockCopyThreadPerDim0,
+                                   WeiBlockCopyThreadPerDim1>{};
+#endif

     // a series of blockwise batched GEMM
     // C_matrix += transpose(A_matrix) * B_matrix
@@ -190,8 +220,12 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);

-    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
-        c_block_data_begin += CPerBlock, __syncthreads())
+    Float* p_wei_global_block_begin =
+        p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
+
+    for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
+                 p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
+                 __syncthreads())
     {
 #if 1
         // input: global mem to LDS,
@@ -209,9 +243,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_with_padding
 #if 1
         // weight: global mem to LDS,
-        blockwise_wei_copy.run(p_wei_global + wei_csrk_global_desc.Get1dIndex(
-                                   c_block_data_begin, 0, 0, k_block_data_begin),
-                               p_wei_block);
+        blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block);
 #endif

         __syncthreads();
......
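As the comments in the padded gridwise kernel describe, each blockwise step accumulates C_matrix += transpose(A_matrix) * B_matrix, with the weight tile A stored as [C, K] (reduction dimension C outermost) and the input tile B as [C, Wo*N]. The host-only snippet below spells out that accumulation order with tiny made-up tile sizes; it is an illustration only, not code from this commit.

// C[K,B] += transpose(A[C,K]) * B[C,B]: A is read transposed because C is its
// leading (reduction) dimension. Sizes here are arbitrary toy values.
#include <cstdio>

int main()
{
    constexpr unsigned C = 2, K = 3, B = 4;
    float a[C][K] = {{1, 2, 3}, {4, 5, 6}};      // weight tile, layout [C, K]
    float b[C][B] = {{1, 1, 1, 1}, {2, 2, 2, 2}}; // input tile, layout [C, B]
    float c[K][B] = {};                           // output tile, layout [K, B]

    for(unsigned ic = 0; ic < C; ++ic)
        for(unsigned ik = 0; ik < K; ++ik)
            for(unsigned ib = 0; ib < B; ++ib)
                c[ik][ib] += a[ic][ik] * b[ic][ib]; // c += transpose(a) * b

    printf("c[0][0] = %f (expect 1*1 + 4*2 = 9)\n", c[0][0]);
    return 0;
}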
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_2d_tensor_op.cuh"
#include "threadwise_4d_tensor_op.cuh"
#include "blockwise_gemm.cuh"
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
class LowerPads,
class UpperPads,
unsigned NPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned HoPerBlock,
unsigned WoPerBlock,
unsigned NPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned HoPerThread,
unsigned WoPerThread,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline(
Float* const __restrict__ p_in_global,
Float* const __restrict__ p_wei_global,
Float* __restrict__ p_out_global)
{
// NPerThread == NPerBlock, because the format of the input in LDS is [C,Hi,Wi,N];
// for the GEMM trans([C,K]) * [C,Wo*N], we need one thread to cover all of "N".
// If we used [C,Hi,N,Wi,N] in LDS instead, NPerThread could differ from NPerBlock.
static_assert(NPerBlock % NPerThread == 0, "wrong! NPerBlock % NPerThread !=0");
static_assert((NPerThread < NPerBlock && WoPerThread == 1) || NPerThread == NPerBlock,
"wrong!");
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_chwn_global_desc = InGlobalDesc{};
constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
constexpr auto out_khwn_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_chwn_global_desc.GetLength(I0);
constexpr unsigned K = out_khwn_global_desc.GetLength(I0);
constexpr unsigned Ho = out_khwn_global_desc.GetLength(I1);
constexpr unsigned Wo = out_khwn_global_desc.GetLength(I2);
constexpr unsigned N = out_khwn_global_desc.GetLength(I3);
constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
constexpr unsigned HPadLow = LowerPads{}.Get(I0);
constexpr unsigned WPadLow = LowerPads{}.Get(I1);
constexpr unsigned HPadUp = UpperPads{}.Get(I0);
constexpr unsigned WPadUp = UpperPads{}.Get(I1);
constexpr unsigned HiPerBlock = HoPerBlock + S - 1;
constexpr unsigned WiPerBlock = WoPerBlock + R - 1;
// divide block work: [K, Ho, Wo, N]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned HBlockWork = (Ho + HoPerBlock - 1) / HoPerBlock;
constexpr unsigned WBlockWork = (Wo + WoPerBlock - 1) / WoPerBlock;
constexpr unsigned NBlockWork = (N + NPerBlock - 1) / NPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / (HBlockWork * WBlockWork * NBlockWork);
unsigned itmp = get_block_1d_id() - k_block_work_id * (HBlockWork * WBlockWork * NBlockWork);
const unsigned h_block_work_id = itmp / (WBlockWork * NBlockWork);
itmp -= h_block_work_id * (WBlockWork * NBlockWork);
const unsigned w_block_work_id = itmp / NBlockWork;
const unsigned n_block_work_id = itmp - w_block_work_id * NBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned ho_block_data_begin = h_block_work_id * HoPerBlock;
const unsigned wo_block_data_begin = w_block_work_id * WoPerBlock;
const unsigned n_block_data_begin = n_block_work_id * NPerBlock;
// flattened (2d) tensor view of wei in global mem
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight in LDS
constexpr auto in_chwn_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, HiPerBlock, WiPerBlock, NPerBlock>{});
constexpr auto wei_csrk_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
// flattened (2d) tensor view of wei in LDS
constexpr auto wei_ek_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
// tensor view of threadwise output in register
constexpr auto out_hkwn_thread_desc =
make_ConstantTensorDescriptor(Sequence<HoPerThread, KPerThread, WoPerThread, NPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_chwn_block_desc, "in_chwn_block_desc");
print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
print_ConstantTensorDescriptor(out_hkwn_thread_desc, "out_hkwn_thread_desc");
}
#endif
// blockwise copy
// input: format is [C, Hi, Wi, N]
const unsigned h_block_pad_low = h_block_work_id == 0 ? HPadLow : 0;
const unsigned w_block_pad_low = w_block_work_id == 0 ? WPadLow : 0;
const unsigned h_block_pad_up = h_block_work_id == HBlockWork - 1 ? HPadUp : 0;
const unsigned w_block_pad_up = w_block_work_id == WBlockWork - 1 ? WPadUp : 0;
#if 0
if(get_thread_local_1d_id() == 0)
{
printf(
"%u %u, h_block_pad_low %u w_block_pad_low %u h_block_pad_up %u w_block_pad_up %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
}
#endif
constexpr auto blockwise_in_copy =
blockwise_chwn_tensor_copy_with_padding<BlockSize,
Float,
decltype(in_chwn_global_desc),
decltype(in_chwn_block_desc),
decltype(in_chwn_block_desc.GetLengths()),
LowerPads>{};
#if 0
// weight: format is [C,S,R,K]
constexpr auto blockwise_wei_copy =
blockwise_4d_tensor_copy_1<BlockSize,
Float,
decltype(wei_csrk_global_desc),
decltype(wei_csrk_block_desc),
decltype(wei_csrk_block_desc.GetLengths())>{};
#elif 0
// weight: format is [C*S*R,K]
constexpr auto blockwise_wei_copy =
blockwise_2d_tensor_copy_1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
// weight: format is [C*S*R,K]
const auto blockwise_wei_copy =
blockwise_2d_tensor_copy_2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#endif
// a series of blockwise batched GEMM
// C_matrix += transpose(A_matrix) * B_matrix
// A_matrix and B_matrix saved in LDS, C_matrix saved in register
// A_matrix[C,K] is a sub-matrix of wei_block[S,R,C,K]
// B_matrix[C,Wo*N] is a sub-matrix of in_block[C,Hi,Wi,N]
// C_matrix[K,Wo*N] is a sub-matrix of out_block[Ho,K,Wo,N]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<KPerBlock>{},
Number<wei_csrk_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto b_cxwn_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<WoPerBlock * NPerBlock>{},
Number<in_chwn_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxwn_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<WoPerThread * NPerThread>{}); // constexpr doesn't compile
const auto blockwise_batch_gemm =
blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxwn_block_mtx_desc),
decltype(c_kxwn_thread_mtx_desc),
true,
false,
false,
0,
in_chwn_block_desc.GetStride(I1),
out_hkwn_thread_desc.GetStride(
I0),
HoPerBlock,
HoPerThread,
CPerThread,
true>{};
// LDS
constexpr unsigned in_block_size = in_chwn_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();
// LDS double buffer
__shared__ Float p_in_block_0[in_block_size];
__shared__ Float p_wei_block_0[wei_block_size];
__shared__ Float p_in_block_1[in_block_size];
__shared__ Float p_wei_block_1[wei_block_size];
// register
Float p_out_thread[out_hkwn_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
Float* p_wei_global_block_begin =
p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
// prologue: preload data
// input: global mem to LDS,
blockwise_in_copy.run(p_in_global,
0,
ho_block_data_begin,
wo_block_data_begin,
n_block_data_begin,
p_in_block_0,
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
// weight: global mem to LDS,
blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_0);
p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0);
bool even_loop = true;
for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C;
c_block_data_begin += CPerBlock,
p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0),
even_loop = !even_loop)
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;
// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy.run(p_in_global,
c_block_data_begin,
ho_block_data_begin,
wo_block_data_begin,
n_block_data_begin,
p_in_block_next,
h_block_pad_low,
w_block_pad_low,
h_block_pad_up,
w_block_pad_up);
#endif
#if 1
// weight: global mem to LDS,
blockwise_wei_copy.run(p_wei_global_block_begin, p_wei_block_next);
#endif
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
blockwise_batch_gemm.run(p_wei_block_now +
wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
}
// last computation
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
// a series of batched GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& acc, const auto&& v) { acc += v; };
blockwise_batch_gemm.run(p_wei_block_now +
wei_csrk_block_desc.Get1dIndex(0, s, r, 0),
p_in_block_now + in_chwn_block_desc.Get1dIndex(0, s, r, 0),
p_out_thread,
f_accum);
}
}
}
const auto matrix_c_index =
blockwise_batch_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());
const unsigned ho_thread_data_begin = matrix_c_index.batch_begin;
const unsigned k_thread_data_begin = matrix_c_index.row_begin;
const unsigned wo_thread_data_begin = matrix_c_index.col_begin / NPerBlock;
const unsigned n_thread_data_begin =
matrix_c_index.col_begin - wo_thread_data_begin * NPerBlock;
#if 0
printf("block %u %u, %u %u %u %u, %u %u %u %u, %f \n",
get_block_1d_id(), get_thread_local_1d_id(),
ho_block_data_begin, k_block_data_begin, wo_block_data_begin, n_block_data_begin,
ho_thread_data_begin, k_thread_data_begin, wo_thread_data_begin, n_thread_data_begin,
p_out_thread[0]);
#endif
// output: register to global mem,
// convert out_thread[Ho,K,Wo,N] to out_global[K,Ho,Wo,N]
constexpr auto reorder_khwn_from_hkwn = Sequence<1, 0, 2, 3>{};
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(
out_hkwn_thread_desc,
p_out_thread,
out_khwn_global_desc,
p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_hkwn_thread_desc.GetLengths(),
reorder_khwn_from_hkwn);
}
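The *_lds_pipeline kernels introduced in this commit overlap the global-to-LDS copy of the next C-chunk with the GEMM on the current one by double-buffering LDS: a prologue fills buffer 0, the main loop toggles between the two buffers via even_loop while prefetching, and a trailing block consumes the last prefetched chunk. The host-only sketch below traces that schedule; the chunk sizes are assumed and the code is not part of the commit.

// Host-only trace of the LDS double-buffering (ping-pong) schedule. Not part of this commit.
#include <cstdio>

int main()
{
    constexpr unsigned C = 32, CPerBlock = 8;
    int buf[2] = {0, 0}; // stand-ins for p_*_block_0 / p_*_block_1

    // prologue: load the first chunk into buffer 0
    buf[0] = 0;

    bool even_loop = true;
    for(unsigned c = CPerBlock; c < C; c += CPerBlock, even_loop = !even_loop)
    {
        int now  = even_loop ? 0 : 1; // buffer being computed on
        int next = even_loop ? 1 : 0; // buffer being prefetched into

        buf[next] = c; // "global -> LDS" prefetch of the next chunk
        printf("compute on chunk %d, prefetch chunk %u\n", buf[now], c);
    }

    // epilogue: compute on the last prefetched chunk
    int now = even_loop ? 0 : 1;
    printf("compute on chunk %d (tail)\n", buf[now]);
    return 0;
}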
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_2d_tensor_op.cuh"
#include "threadwise_2d_tensor_op.cuh"
#include "blockwise_gemm.cuh"
// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned BPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmThreadPerClusterRow,
unsigned GemmThreadPerClusterColumn,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1>
__global__ void
gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
Float* const __restrict__ p_in_global,
WeiGlobalDesc,
Float* const __restrict__ p_wei_global,
OutGlobalDesc,
Float* __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_cnhw_global_desc = InGlobalDesc{};
constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
constexpr auto out_knhw_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_cnhw_global_desc.GetLength(I0);
constexpr unsigned N = in_cnhw_global_desc.GetLength(I1);
constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);
constexpr unsigned K = out_knhw_global_desc.GetLength(I0);
constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);
constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
constexpr unsigned B = N * Hi * Wi;
constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
#if 0
if(get_thread_local_1d_id() == 0)
{
printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
KBlockWork,
BBlockWork,
k_block_data_begin,
b_block_data_begin);
}
#endif
// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight
constexpr auto in_cb_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
constexpr auto wei_ek_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
constexpr auto wei_csrk_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// format is [CPerBlock, BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
blockwise_2d_tensor_copy_1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 1
const auto blockwise_in_copy =
blockwise_2d_tensor_copy_2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
blockwise_2d_tensor_copy_1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
const auto blockwise_wei_copy =
blockwise_2d_tensor_copy_2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
const auto blockwise_gemm =
blockwise_gemm_block_a_block_b_thread_c<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
true,
false,
false,
CPerThread,
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
true>{};
// LDS
constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();
__shared__ Float p_in_block[in_block_size];
__shared__ Float p_wei_block[wei_block_size];
// register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
Float* p_wei_global_block_offset =
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
for(unsigned c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2),
__syncthreads())
{
// input: global mem to LDS,
blockwise_in_copy.run(p_in_global_block_offset, p_in_block);
// weight: global mem to LDS,
blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block);
__syncthreads();
// a series of GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
blockwise_gemm.run(p_wei_block + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
p_in_block + s * Wi + r,
p_out_thread,
f_accum);
}
}
}
// output: register to global mem,
const auto matrix_c_index =
blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());
const unsigned k_thread_data_begin = matrix_c_index.row_begin;
const unsigned b_thread_data_begin = matrix_c_index.col_begin;
const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin;
const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin;
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
matrix_c_index.row_begin,
matrix_c_index.col_begin,
k_data_begin,
b_data_begin,
p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
}
#endif
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
unsigned k_data = k_data_begin + k;
unsigned b_data = b_data_begin + b;
unsigned n_data = b_data / (Hi * Wi);
unsigned itmp = b_data - n_data * (Hi * Wi);
unsigned h_data = itmp / Wi;
unsigned w_data = itmp - h_data * Wi;
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
k,
b,
k_data,
n_data,
h_data,
w_data,
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
}
#endif
if(n_data < N && h_data < Ho && w_data < Wo)
{
#if 1
p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
#endif
}
}
}
}
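In the write-back loop above, the flattened position b in B = N * Hi * Wi is unflattened into (n, h, w) and stored only when h < Ho and w < Wo, which discards the ghost region read past the output extent. The small host-only example below reproduces that index mapping; the sizes are assumed and it is not part of this commit.

// Unflatten b in [0, N*Hi*Wi) into (n, h, w) and apply the same validity guard
// as the kernel's output write-back. Not part of this commit.
#include <cstdio>

int main()
{
    constexpr unsigned N = 2, Hi = 4, Wi = 4, Ho = 4, Wo = 4; // 1x1 filter: Ho == Hi, Wo == Wi

    for(unsigned b = 0; b < N * Hi * Wi; ++b)
    {
        unsigned n    = b / (Hi * Wi);
        unsigned itmp = b - n * (Hi * Wi);
        unsigned h    = itmp / Wi;
        unsigned w    = itmp - h * Wi;

        bool valid = (n < N && h < Ho && w < Wo);
        if(b < 6) // print a few samples
            printf("b %u -> n %u h %u w %u, valid %d\n", b, n, h, w, valid);
    }
    return 0;
}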
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_2d_tensor_op.cuh"
#include "threadwise_2d_tensor_op.cuh"
#include "blockwise_gemm.cuh"
// define B = flatten(N, Hi, Wi)
template <unsigned GridSize,
unsigned BlockSize,
class Float,
class InGlobalDesc,
class WeiGlobalDesc,
class OutGlobalDesc,
unsigned BPerBlock,
unsigned KPerBlock,
unsigned CPerBlock,
unsigned BPerThread,
unsigned KPerThread,
unsigned CPerThread,
unsigned GemmThreadPerClusterRow,
unsigned GemmThreadPerClusterColumn,
unsigned InBlockCopyThreadPerDim0,
unsigned InBlockCopyThreadPerDim1,
unsigned WeiBlockCopyThreadPerDim0,
unsigned WeiBlockCopyThreadPerDim1>
__global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_pipeline(
InGlobalDesc,
Float* const __restrict__ p_in_global,
WeiGlobalDesc,
Float* const __restrict__ p_wei_global,
OutGlobalDesc,
Float* __restrict__ p_out_global)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_cnhw_global_desc = InGlobalDesc{};
constexpr auto wei_csrk_global_desc = WeiGlobalDesc{};
constexpr auto out_knhw_global_desc = OutGlobalDesc{};
constexpr unsigned C = in_cnhw_global_desc.GetLength(I0);
constexpr unsigned N = in_cnhw_global_desc.GetLength(I1);
constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);
constexpr unsigned K = out_knhw_global_desc.GetLength(I0);
constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);
constexpr unsigned S = wei_csrk_global_desc.GetLength(I1);
constexpr unsigned R = wei_csrk_global_desc.GetLength(I2);
constexpr unsigned B = N * Hi * Wi;
constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);
// divide block work by 2d: [K, B]
constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;
const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;
const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
const unsigned b_block_data_begin = b_block_work_id * BPerBlock;
#if 0
if(get_thread_local_1d_id() == 0)
{
printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
get_block_1d_id(),
get_thread_local_1d_id(),
KBlockWork,
BBlockWork,
k_block_data_begin,
b_block_data_begin);
}
#endif
// flattened (2d) tensor view of gridwise input
constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});
constexpr auto wei_ek_global_desc = make_ConstantTensorDescriptor(Sequence<C * S * R, K>{});
// tensor view of blockwise input and weight
constexpr auto in_cb_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
constexpr auto wei_ek_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock * S * R, KPerBlock>{});
constexpr auto wei_csrk_block_desc =
make_ConstantTensorDescriptor(Sequence<CPerBlock, S, R, KPerBlock>{});
// tensor view of threadwise output in register
constexpr auto out_kb_thread_desc =
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
{
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
print_ConstantTensorDescriptor(wei_csrk_block_desc, "wei_csrk_block_desc");
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
printf("KPerBlock %u\n", KPerBlock);
}
#endif
// blockwise in copy
// format is [CPerBlock, BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
blockwise_2d_tensor_copy_1<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
#elif 1
const auto blockwise_in_copy =
blockwise_2d_tensor_copy_2<BlockSize,
Float,
decltype(in_cb_global_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths()),
InBlockCopyThreadPerDim0,
InBlockCopyThreadPerDim1>{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
blockwise_2d_tensor_copy_1<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths())>{};
#elif 1
const auto blockwise_wei_copy =
blockwise_2d_tensor_copy_2<BlockSize,
Float,
decltype(wei_ek_global_desc),
decltype(wei_ek_block_desc),
decltype(wei_ek_block_desc.GetLengths()),
WeiBlockCopyThreadPerDim0,
WeiBlockCopyThreadPerDim1>{};
#endif
// a series of blockwise GEMM
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx and b_mtx saved in LDS, c_mtx saved in register
// a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
// b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
// c_mtx[K,B] is out_block[K,B]
const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile
const auto b_cxb_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<CPerBlock>{},
Number<BPerBlock>{},
Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile
const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile
const auto blockwise_gemm =
blockwise_gemm_block_a_block_b_thread_c<BlockSize,
decltype(a_cxk_block_mtx_desc),
decltype(b_cxb_block_mtx_desc),
decltype(c_kxb_thread_mtx_desc),
true,
false,
false,
CPerThread,
GemmThreadPerClusterRow,
GemmThreadPerClusterColumn,
true>{};
// LDS
constexpr unsigned in_block_size = in_cb_block_desc.GetElementSpace();
constexpr unsigned wei_block_size = wei_csrk_block_desc.GetElementSpace();
// LDS double buffer
__shared__ Float p_in_block_0[in_block_size];
__shared__ Float p_wei_block_0[wei_block_size];
__shared__ Float p_in_block_1[in_block_size];
__shared__ Float p_wei_block_1[wei_block_size];
// register
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
Float* p_in_global_block_offset =
p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
Float* p_wei_global_block_offset =
p_wei_global + wei_csrk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
// prologue: preload data
// input: global mem to LDS,
blockwise_in_copy.run(p_in_global_block_offset, p_in_block_0);
// weight: global mem to LDS,
blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_0);
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2);
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
bool even_loop = true;
for(unsigned c_block_data_begin = CPerBlock; c_block_data_begin < C;
c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0),
p_wei_global_block_offset += CPerBlock * wei_csrk_global_desc.GetStride(I2),
even_loop = !even_loop)
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
Float* p_in_block_next = even_loop ? p_in_block_1 : p_in_block_0;
Float* p_wei_block_next = even_loop ? p_wei_block_1 : p_wei_block_0;
// input: global mem to LDS,
blockwise_in_copy.run(p_in_global_block_offset, p_in_block_next);
// weight: global mem to LDS,
blockwise_wei_copy.run(p_wei_global_block_offset, p_wei_block_next);
// a series of GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
p_in_block_now + s * Wi + r,
p_out_thread,
f_accum);
}
}
}
// last computation
{
__syncthreads();
Float* p_in_block_now = even_loop ? p_in_block_0 : p_in_block_1;
Float* p_wei_block_now = even_loop ? p_wei_block_0 : p_wei_block_1;
// a series of GEMM
for(unsigned s = 0; s < S; ++s)
{
for(unsigned r = 0; r < R; ++r)
{
auto f_accum = [](auto& c, const auto&& ab) { c += ab; };
blockwise_gemm.run(p_wei_block_now + wei_csrk_block_desc.Get1dIndex(s, r, 0, 0),
p_in_block_now + s * Wi + r,
p_out_thread,
f_accum);
}
}
}
// output: register to global mem,
const auto matrix_c_index =
blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());
const unsigned k_thread_data_begin = matrix_c_index.row_begin;
const unsigned b_thread_data_begin = matrix_c_index.col_begin;
const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin;
const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin;
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
matrix_c_index.row_begin,
matrix_c_index.col_begin,
k_data_begin,
b_data_begin,
p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
}
#endif
for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
{
for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
{
unsigned k_data = k_data_begin + k;
unsigned b_data = b_data_begin + b;
unsigned n_data = b_data / (Hi * Wi);
unsigned itmp = b_data - n_data * (Hi * Wi);
unsigned h_data = itmp / Wi;
unsigned w_data = itmp - h_data * Wi;
#if 0
if(get_block_1d_id() == 0)
{
printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
get_block_1d_id(),
get_thread_local_1d_id(),
k,
b,
k_data,
n_data,
h_data,
w_data,
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
}
#endif
if(n_data < N && h_data < Ho && w_data < Wo)
{
#if 1
p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
#endif
}
}
}
}
@@ -115,7 +115,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
                                    decltype(in_cb_global_desc),
                                    decltype(in_cb_block_desc),
                                    decltype(in_cb_block_desc.GetLengths())>{};
-#elif 0
+#elif 1
     const auto blockwise_in_copy =
         blockwise_2d_tensor_copy_2<BlockSize,
                                    Float,
......
@@ -24,8 +24,8 @@ template <unsigned GridSize,
           unsigned GemmColumnThreadPerCluster,
           unsigned InBlockCopyThreadPerDim0,
           unsigned InBlockCopyThreadPerDim1>
-__global__ void
-gridwise_implicit_gemm_convolution_3_cnhw_srck_knhw(InGlobalDesc,
+__global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline(
+    InGlobalDesc,
     Float* const __restrict__ p_in_global,
     WeiGlobalDesc,
     Float* const __restrict__ p_wei_global,
......