"tests/git@developer.sourcefind.cn:OpenDAS/dgl.git" did not exist on "dc86bd421ed98777112c64f61940321631c11806"
Commit 88b77181 authored by Chao Liu's avatar Chao Liu
Browse files

rename files, add header guards, add namespace ck

parent 05e04665
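Every header in this diff gets the same mechanical treatment. As a minimal sketch of the new convention (the header name "example.hpp" and the guard CK_EXAMPLE_HPP are illustrative placeholders, not taken from this commit):

// Pattern this commit applies to each public header: an explicit include
// guard replaces #pragma once, and all declarations are wrapped in namespace ck.
#ifndef CK_EXAMPLE_HPP
#define CK_EXAMPLE_HPP

#include "common.hpp"

namespace ck {

// ... declarations that previously sat at global scope ...

} // namespace ck
#endif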
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                                                  const Tensor<T>& in,
@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
                            WoPerThread,
                            InBlockCopyDataPerRead,
                            WeiBlockCopyDataPerRead>;
-    float time = launch_kernel(run_gridwise_convolution<gridwise_conv, T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
 #elif 1
-        GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
+        GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
              WeiBlockCopyDataPerRead_K,
              OutThreadCopyDataPerWrite_N>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
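    // The CYXK descriptor above is packed, so after this reorder K is the
    // fastest-varying (stride-1) dimension; presumably this is what lets the
    // kernel read weights in vectors along K (see WeiBlockCopyDataPerRead_K below).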
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 2;
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
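    // Worked example of the GridSize formula (sizes are illustrative:
    // N = 64, K = 128, Ho = Wo = 32, with the Pascal v1r3 tile above):
    //   GridSize = ceil(64/2) * ceil(128/128) * ceil(32/2) * ceil(32/16)
    //            = 32 * 1 * 16 * 2 = 1024 workgroups.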
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
usleep(std::min(time * 1000, float(10000)));
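        // Units check on the printout above: FLOPs / 1e9 / milliseconds equals
        // FLOPs / (1e12 * seconds), i.e. TFlop/s, so the printed label is consistent.
        // usleep takes microseconds, so this sleeps for the kernel's elapsed
        // time (ms * 1000), capped at 10 ms, between repeats.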
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
         constexpr auto gridwise_conv =
-#if 1
+#if 0
             GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
 #else
-            GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
+            GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
                 <GridSize,
                  BlockSize,
@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
                  WeiBlockCopyDataPerRead_K,
                  OutThreadCopyDataPerWrite_W>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
                                                         const Tensor<T>& in_nchw,
@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
                  WeiBlockCopyDataPerRead,
                  OutThreadCopyDataPerWrite>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
 #if 0
         GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
 #else
-        GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
+        GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
              WeiBlockCopyDataPerAccess_K>{};
 #if 1
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
 #pragma once
 #include <unistd.h>
 #include "device.hpp"
-#include "gridwise_convolution_wrapper.hpp"
+#include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
-#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
+#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
 #if 0
         GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
 #else
-        GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
+        GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
 #endif
             <GridSize,
              BlockSize,
@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
              WeiBlockCopySrcDataPerRead_E,
              WeiBlockCopyDstDataPerWrite_K>{};
-    float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
+    float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                dim3(GridSize),
                                dim3(BlockSize),
                                0,
...
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
                                                            const Tensor<TInWei>& in_nchw,
...
@@ -3,6 +3,8 @@
 #include "device.hpp"
 #include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
+using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
                                                               const Tensor<T>& in_nchw,
...
@@ -3,19 +3,19 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <stdlib.h>
-#include "config.h"
+#include "config.hpp"
 #include "tensor.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "conv_common.hpp"
 #include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
-//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
-#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 #include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
+using namespace ck;
 struct GeneratorTensor_1
 {
     template <class... Is>
@@ -419,7 +419,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 3x3, 34x34
     constexpr index_t N = 64;
     constexpr index_t C = 256;
@@ -633,15 +633,9 @@ int main(int argc, char* argv[])
 #if 1
 #if 0
-    device_direct_convolution_1
-#elif 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
-#elif 0
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
 #elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
-#elif 0
-    device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
 #elif 0
     device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
 #elif 0
...
configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h") configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp")
set(TENSOR_SOURCE set(TENSOR_SOURCE
tensor.cpp; tensor.cpp;
......
#include "config.h" #include "config.hpp"
#include "device.hpp" #include "device.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
......
-#pragma once
+#ifndef CK_ARRAY_HPP
+#define CK_ARRAY_HPP
 #include "Sequence.hpp"
 #include "functional2.hpp"
+namespace ck {
 template <class TData, index_t NSize>
 struct Array
 {
@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData
     static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
-    return Array<TData, NSize>{old_array.mSize[IRs]...};
+    return Array<TData, NSize>{old_array[IRs]...};
 }
 template <class TData, index_t NSize, class MapOld2New>
@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
 {
     Array<TData, NSize> result;
-    auto f = mod_conv::plus<index_t>{};
+    auto f = math::plus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
 {
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::plus<index_t>{};
+    auto f = math::plus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
     Array<TData, NSize> result;
-    auto f = mod_conv::multiplies<index_t>{};
+    auto f = math::multiplies<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
     Array<TData, NSize> result;
-    auto f = mod_conv::minus<index_t>{};
+    auto f = math::minus<index_t>{};
     static_for<0, NSize, 1>{}(
         lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
             a[9]);
     });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
+#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
 #include "common.hpp"
+namespace ck {
 template <index_t NRow_, index_t NCol_, index_t RowStride_>
 struct ConstantMatrixDescriptor
 {
@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
     printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 // OriginalTensorDesc : ConstantTensorDescriptor<...>
 //     it's the tensor whose dimensions are to be merged
 // OriginalDimMergeSeqs : Sequence<...>...
@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
 {
     print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
 }
+} // namespace ck
+#endif
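To make the OriginalTensorDesc / OriginalDimMergeSeqs comments above concrete, a hedged usage sketch; the factory name make_ConstantMergedTensorDescriptor and its call shape are assumptions for illustration, not taken from this diff:

// Hypothetical illustration (assumed API): view a packed C-Y-X-K weight
// tensor as a three-dimensional C, (Y*X), K tensor by merging Y and X.
constexpr auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
constexpr auto wei_c_yx_k_desc = make_ConstantMergedTensorDescriptor(
    wei_cyxk_desc, Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{});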
-#pragma once
+#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
+#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
 #include "common.hpp"
+namespace ck {
 template <class Lengths>
 __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
 {
     return reverse_inclusive_scan_sequence(
-               Lengths{}.PopFront(), mod_conv::multiplies<index_t>{}, Number<1>{})
+               Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
         .PushBack(Number<1>{});
 }
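A quick sanity check on calculate_tensor_strides_packed (example lengths mine, not from the diff): for Lengths = Sequence<2, 3, 4>, the reverse inclusive scan of {3, 4} under multiplication seeded with Number<1> gives {12, 4}, and PushBack(Number<1>) yields the packed strides {12, 4, 1}.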
@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
 __host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
 {
     constexpr index_t L_back_align =
-        Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
+        Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
     return calculate_tensor_strides_packed(
         Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr index_t GetElementSize()
     {
-        return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+        return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
     }
     template <class Align = Number<1>>
@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
         // This is WRONG! align should be applied to the last memory rank, not the last tensor
         // dimension
         constexpr index_t element_space_unaligned = accumulate_on_sequence(
-            (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
         return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
     }
@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor
         constexpr auto multi_id = Sequence<Is...>{};
-        return accumulate_on_sequence(
-            multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
+        return accumulate_on_sequence(multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{});
     }
     // emulate constexpr lambda
@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
         constexpr index_t fold_intervals_product =
-            accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
         constexpr auto unfold_length = GetLength(Number<IDim>{});
         constexpr auto unfold_stride = GetStride(Number<IDim>{});
@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
         constexpr auto fold_strides =
             Number<unfold_stride>{} *
             reverse_inclusive_scan_sequence(
-                fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies<index_t>{}, Number<1>{});
+                fold_intervals.PushBack(Number<1>{}), math::multiplies<index_t>{}, Number<1>{});
         // left and right
         constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor
         // unfolded length, stride
         constexpr index_t unfold_length = accumulate_on_sequence(
-            GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
+            GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
         constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
             Strides...);
     });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_SEQUENCE_HPP
+#define CK_SEQUENCE_HPP
 #include "integral_constant.hpp"
 #include "functional.hpp"
+namespace ck {
 template <class Seq>
 struct is_valid_sequence_map;
@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
     static_if<nsize == 10>{}(
         [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_AMD_INLINE_ASM_HPP
+#define CK_AMD_INLINE_ASM_HPP
 #include "common.hpp"
 #define NO_VM_WAIT 0
@@ -7,6 +9,8 @@
 #define NO_DS_WRITE 0
 #define NO_GLB_READ 0
+namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
 }
 #endif
 }
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 template <index_t BlockSize, class Float, class DstDesc, class F>
 __device__ void
 blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
         // but we need to make sure dst stride0 is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
         constexpr index_t L1 = CopyLengths{}.Get(I1);
-        constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+        constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
         static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
         constexpr index_t L0 = CopyLengths{}.Get(I0);
         constexpr index_t L1 = CopyLengths{}.Get(I1);
-        constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
+        constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
         constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, read_per_d1>{});
@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
         }
     }
-#if USE_AMD_INLINE_ASM
+#if CK_USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {
@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
     }
 #endif
 };
+} // namespace ck
+#endif
-#pragma once
+#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
+#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
 #include "common.hpp"
 #include "ConstantTensorDescriptor.hpp"
+namespace ck {
 template <index_t BlockSize,
           class Float,
           class SrcDesc,
@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
         // but we need to make sure dst stride2 is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
         constexpr index_t L2 = CopyLengths{}.Get(I2);
-        constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+        constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
         static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
         constexpr index_t L1 = CopyLengths{}.Get(I1);
         constexpr index_t L2 = CopyLengths{}.Get(I2);
-        constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
+        constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
         constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, L1, read_per_d2>{});
@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
         // we allow out-of-bound read from src in D2 dimension,
         // but we need to make sure dst stride is big enough,
         // so that the out-of-bound write won't contaminate next line in dst
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         static_assert(nloop_d2 * thread_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
                       "wrong! out-of-bound write will contaminate next line!\n");
@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!"); "wrrong! BlockSize is not big enough for ThreadPerDims!");
         constexpr index_t num_active_thread =
-            accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
+            accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
         if(BlockSize > num_active_thread)
         {
@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
 #pragma unroll
         for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
     }
@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         constexpr auto clipboard_desc =
             make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3
         constexpr index_t nloop_d0 = L0 / thread_per_d0;
         constexpr index_t nloop_d1 = L1 / thread_per_d1;
-        constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
+        constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
         constexpr auto clipboard_desc =
             make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
     }
 }
 };
+} // namespace ck
+#endif