Commit 88b77181 authored by Chao Liu

rename files, add header guards, add namespaces

parent 05e04665
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
const Tensor<T>& in,
......@@ -79,7 +81,7 @@ void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
WoPerThread,
InBlockCopyDataPerRead,
WeiBlockCopyDataPerRead>;
float time = launch_kernel(run_gridwise_convolution<gridwise_conv, T>,
float time = launch_kernel(run_gridwise_convolution_kernel<gridwise_conv, T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
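// The launch target above is renamed from run_gridwise_convolution to
// run_gridwise_convolution_kernel, matching the new
// gridwise_convolution_kernel_wrapper.hpp header. A minimal sketch of such a
// wrapper, assuming it is a trivial __global__ entry point that instantiates
// the gridwise operator (the body here is an assumption, not copied from this
// commit):
template <class GridwiseConvolution, class T>
__global__ void run_gridwise_convolution_kernel(const T* const __restrict__ p_in,
                                                const T* const __restrict__ p_wei,
                                                T* const __restrict__ p_out)
{
    GridwiseConvolution{}.Run(p_in, p_wei, p_out);
}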
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
......@@ -478,7 +480,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
#endif
<GridSize,
BlockSize,
......@@ -509,7 +511,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
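// The *_lds_double_buffer variants selected above overlap the global-to-LDS
// copy of tile i+1 with the math on tile i, at the cost of twice the LDS.
// A self-contained sketch of the technique, assuming a single workgroup of
// BlockSize threads and a hypothetical elementwise-sum workload (illustrative
// only -- not the CK gridwise convolution):
template <index_t BlockSize, index_t Tile>
__global__ void lds_double_buffer_sum(const float* __restrict__ p_in,
                                      float* __restrict__ p_out,
                                      index_t num_tiles)
{
    __shared__ float lds[2][Tile];

    const index_t tid = threadIdx.x;
    float acc         = 0;

    // preload tile 0 into buffer 0
    for(index_t j = tid; j < Tile; j += BlockSize)
        lds[0][j] = p_in[j];
    __syncthreads();

    for(index_t i = 0; i < num_tiles; ++i)
    {
        // prefetch tile i+1 into the other buffer while tile i is consumed
        if(i + 1 < num_tiles)
        {
            for(index_t j = tid; j < Tile; j += BlockSize)
                lds[(i + 1) % 2][j] = p_in[(i + 1) * Tile + j];
        }

        // consume tile i
        for(index_t j = tid; j < Tile; j += BlockSize)
            acc += lds[i % 2][j];

        // one barrier per tile: all reads of tile i finish before its buffer
        // is refilled next iteration, and the prefetched tile becomes visible
        __syncthreads();
    }

    atomicAdd(p_out, acc);
}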
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hpp"
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
index_t nrepeat)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc = InDesc{};
constexpr auto wei_kcyx_desc = WeiDesc{};
constexpr auto out_nkhw_desc = OutDesc{};
constexpr index_t Hi = in_nchw_desc.GetLength(I2);
constexpr index_t Wi = in_nchw_desc.GetLength(I3);
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
constexpr index_t K = wei_kcyx_desc.GetLength(I0);
constexpr index_t C = wei_kcyx_desc.GetLength(I1);
constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
constexpr index_t X = wei_kcyx_desc.GetLength(I3);
// reorder weight
auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");
Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(
std::thread::hardware_concurrency());
// output
auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");
Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_cyxk_device_buf(data_sz * wei_cyxk.mDesc.GetElementSpace());
DeviceMem out_khwn_device_buf(data_sz * out_khwn.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
out_khwn_device_buf.ToDevice(out_khwn.mData.data());
#if 1
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 1, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 1
// for 3x3, 34x34, v1r3, Vega 20
constexpr index_t BlockSize = 256;
constexpr index_t NPerBlock = 2;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 4;
constexpr index_t WoPerBlock = 16;
constexpr index_t NPerThread = 2;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 4;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<2, 1, 2, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<1, 8, 2, 16>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 2;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r2, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 2>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 2;
constexpr index_t InBlockReorderDataPerWrite_N = 4;
using WeiBlockCopyClusterLengths = Sequence<4, 1, 32>;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#elif 0
// for 3x3, 28x28, v1r3, Pascal, bad
constexpr index_t BlockSize = 128;
constexpr index_t NPerBlock = 16;
constexpr index_t KPerBlock = 128;
constexpr index_t CPerBlock = 8;
constexpr index_t HoPerBlock = 2;
constexpr index_t WoPerBlock = 2;
constexpr index_t NPerThread = 4;
constexpr index_t KPerThread = 8;
constexpr index_t HoPerThread = 1;
constexpr index_t WoPerThread = 2;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 2;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t GemmDataPerReadA = 4;
constexpr index_t GemmDataPerReadB = 4;
using InBlockReorderSrcSubLengths_NCHW = Sequence<4, 1, 1, 1>;
using InBlockReorderSrcClusterLengths_NCHW = Sequence<4, 8, 2, 2>;
using InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW = Sequence<1, 2, 0, 3>;
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths = void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_N = 2;
#endif
constexpr index_t GridSize =
((N + NPerBlock - 1) / NPerBlock) * ((K + KPerBlock - 1) / KPerBlock) *
((Ho + HoPerBlock - 1) / HoPerBlock) * ((Wo + WoPerBlock - 1) / WoPerBlock);
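// worked example of the formula above, with illustrative sizes: Ho = Wo = 32
// follows from the 3x3 filter on 34x34 input, while N = 64 and K = 128 are
// assumed here (K is elided from this diff). Using the first config block:
//   GridSize = ceil(64/2) * ceil(128/128) * ceil(32/2) * ceil(32/16)
//            = 32 * 1 * 16 * 2 = 1024 workgroups of BlockSize = 128 threads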
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 0
GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#elif 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#elif 1
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
#endif
<GridSize,
BlockSize,
T,
decltype(in_nchw_desc),
decltype(wei_cyxk_desc),
decltype(out_khwn_desc),
NPerBlock,
KPerBlock,
CPerBlock,
HoPerBlock,
WoPerBlock,
NPerThread,
KPerThread,
HoPerThread,
WoPerThread,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
GemmDataPerReadA,
GemmDataPerReadB,
InBlockReorderSrcSubLengths_NCHW,
InBlockReorderSrcClusterLengths_NCHW,
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW,
InBlockReorderDataPerRead_W,
InBlockReorderDataPerWrite_N,
WeiBlockCopyClusterLengths,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_N>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<T*>(wei_cyxk_device_buf.GetDeviceBuffer()),
static_cast<T*>(out_khwn_device_buf.GetDeviceBuffer()));
printf("Elapsed time : %f ms, %f TFlop/s\n",
time,
(float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
(std::size_t(1000) * 1000 * 1000) / time);
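// note on units: `time` is in milliseconds, so flops / 1e9 / time_ms equals
// flops / 1e12 / time_s, i.e. TFlop/s. calculate_convolution_flops is
// presumably the usual direct-convolution count (an assumption -- its body is
// not part of this diff):
//   flops = 2 * N * K * Ho * Wo * C * Y * X   (one multiply + one add per MAC)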
usleep(std::min(time * 1000, float(10000)));
}
out_khwn_device_buf.FromDevice(out_khwn.mData.data());
// reorder output
auto f_reorder_khwn2nkhw = [&](auto k, auto ho, auto wo, auto n) {
out_nkhw(n, k, ho, wo) = out_khwn(k, ho, wo, n);
};
make_ParallelTensorFunctor(f_reorder_khwn2nkhw, K, Ho, Wo, N)(
std::thread::hardware_concurrency());
}
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
......@@ -313,10 +315,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
for(index_t i = 0; i < nrepeat; ++i)
{
constexpr auto gridwise_conv =
#if 1
#if 0
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#else
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
......@@ -351,7 +353,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerRead_K,
OutThreadCopyDataPerWrite_W>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
const Tensor<T>& in_nchw,
......@@ -303,7 +305,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
WeiBlockCopyDataPerRead,
OutThreadCopyDataPerWrite>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
......@@ -102,7 +104,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
#if 0
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
#else
GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw
GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
......@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
WeiBlockCopyDataPerAccess_K>{};
#if 1
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "gridwise_convolution_wrapper.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hpp"
#include "gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
......@@ -96,7 +98,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
#if 0
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw
#else
GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw
GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer
#endif
<GridSize,
BlockSize,
......@@ -133,7 +135,7 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc,
WeiBlockCopySrcDataPerRead_E,
WeiBlockCopyDstDataPerWrite_K>{};
float time = launch_kernel(run_gridwise_convolution<decltype(gridwise_conv), T>,
float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
dim3(GridSize),
dim3(BlockSize),
0,
......
......@@ -3,6 +3,8 @@
#include "device.hpp"
#include "gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
using namespace ck;
template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
const Tensor<TInWei>& in_nchw,
......
......@@ -3,6 +3,8 @@
#include "device.hpp"
#include "gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
using namespace ck;
template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
const Tensor<T>& in_nchw,
......
......@@ -3,19 +3,19 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "config.h"
#include "config.hpp"
#include "tensor.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "conv_common.hpp"
#include "device_convolution_direct_v2_nchw_kcyx_nkhw.hpp"
//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
#include "device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
#include "device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp"
using namespace ck;
struct GeneratorTensor_1
{
template <class... Is>
......@@ -419,7 +419,7 @@ int main(int argc, char* argv[])
constexpr index_t HPad = 0;
constexpr index_t WPad = 0;
#elif 0
#elif 1
// 3x3, 34x34
constexpr index_t N = 64;
constexpr index_t C = 256;
......@@ -633,15 +633,9 @@ int main(int argc, char* argv[])
#if 1
#if 0
device_direct_convolution_1
#elif 0
device_convolution_direct_v2_nchw_kcyx_nkhw
#elif 0
device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 0
device_convolution_implicit_gemm_v1_chwn_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn
#elif 0
device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw
#elif 0
......
configure_file("${PROJECT_SOURCE_DIR}/src/include/config.h.in" "${PROJECT_BINARY_DIR}/src/include/config.h")
configure_file("${PROJECT_SOURCE_DIR}/src/include/config.hpp.in" "${PROJECT_BINARY_DIR}/src/include/config.hpp")
set(TENSOR_SOURCE
tensor.cpp;
......
#include "config.h"
#include "config.hpp"
#include "device.hpp"
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
......
#pragma once
#ifndef CK_ARRAY_HPP
#define CK_ARRAY_HPP
#include "Sequence.hpp"
#include "functional2.hpp"
namespace ck {
template <class TData, index_t NSize>
struct Array
{
......@@ -96,7 +100,7 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array<TData
static_assert(is_valid_sequence_map<Sequence<IRs...>>::value, "wrong! invalid reorder map");
return Array<TData, NSize>{old_array.mSize[IRs]...};
return Array<TData, NSize>{old_array[IRs]...};
}
template <class TData, index_t NSize, class MapOld2New>
......@@ -180,7 +184,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Array<TData,
{
Array<TData, NSize> result;
auto f = mod_conv::plus<index_t>{};
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -195,7 +199,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
{
Array<TData, NSize> result;
auto f = mod_conv::minus<index_t>{};
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -212,7 +216,7 @@ __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is.
Array<TData, NSize> result;
auto f = mod_conv::plus<index_t>{};
auto f = math::plus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -229,7 +233,7 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Sequence<Is.
Array<TData, NSize> result;
auto f = mod_conv::minus<index_t>{};
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -246,7 +250,7 @@ __host__ __device__ constexpr auto operator*(Array<TData, NSize> a, Sequence<Is.
Array<TData, NSize> result;
auto f = mod_conv::multiplies<index_t>{};
auto f = math::multiplies<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -263,7 +267,7 @@ __host__ __device__ constexpr auto operator-(Sequence<Is...> a, Array<TData, NSi
Array<TData, NSize> result;
auto f = mod_conv::minus<index_t>{};
auto f = math::minus<index_t>{};
static_for<0, NSize, 1>{}(
lambda_array_math<decltype(f), decltype(a), decltype(b), decltype(result)>(
......@@ -368,3 +372,6 @@ __host__ __device__ void print_Array(const char* s, Array<T, NSize> a)
a[9]);
});
}
} // namespace ck
#endif
#pragma once
#ifndef CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#define CK_CONSTANT_MATRIX_DESCRIPTOR_HPP
#include "common.hpp"
namespace ck {
template <index_t NRow_, index_t NCol_, index_t RowStride_>
struct ConstantMatrixDescriptor
{
......@@ -57,3 +61,7 @@ __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
}
} // namespace ck
#endif
#pragma once
#ifndef CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
#define CK_CONSTANT_MERGED_TENSOR_DESCRIPTOR_HPP
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace ck {
// OriginalTensorDesc : ConstantTensorDescriptor<...>
// it's the tensor whose dimensions are to be merged
// OriginalDimMergeSeqs : Sequence<...>...
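// a merged dimension indexes the original tensor by mixed-radix decomposition:
// if original lengths (L0, L1, L2) are merged into one dimension of length
// L0 * L1 * L2, a merged index m maps back to
//   i0 = m / (L1 * L2),  i1 = (m / L2) % L1,  i2 = m % L2,
// and the flat offset is i0 * stride0 + i1 * stride1 + i2 * stride2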
......@@ -184,3 +188,6 @@ __host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDe
{
print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor());
}
} // namespace ck
#endif
#pragma once
#ifndef CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
#define CK_CONSTANT_TENSOR_DESCRIPTOR_HPP
#include "common.hpp"
namespace ck {
template <class Lengths>
__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths)
{
return reverse_inclusive_scan_sequence(
Lengths{}.PopFront(), mod_conv::multiplies<index_t>{}, Number<1>{})
Lengths{}.PopFront(), math::multiplies<index_t>{}, Number<1>{})
.PushBack(Number<1>{});
}
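// worked example of the packed-stride rule above (stride[i] is the product of
// the lengths to its right): for Lengths = Sequence<2, 3, 4, 5>, e.g. NCHW,
//   strides = <3*4*5, 4*5, 5, 1> = <60, 20, 5, 1>,
// so offset(n, c, h, w) = 60*n + 20*c + 5*h + w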
......@@ -13,7 +17,7 @@ template <class Lengths, index_t Align>
__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number<Align>)
{
constexpr index_t L_back_align =
Align * mod_conv::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
Align * math::integer_divide_ceiler<index_t>{}(Lengths{}.Back(), Align);
return calculate_tensor_strides_packed(
Lengths{}.Modify(Number<Lengths{}.GetSize() - 1>{}, Number<L_back_align>{}));
......@@ -100,7 +104,7 @@ struct ConstantTensorDescriptor
__host__ __device__ static constexpr index_t GetElementSize()
{
return accumulate_on_sequence(Lengths{}, mod_conv::multiplies<index_t>{}, Number<1>{});
return accumulate_on_sequence(Lengths{}, math::multiplies<index_t>{}, Number<1>{});
}
template <class Align = Number<1>>
......@@ -109,7 +113,7 @@ struct ConstantTensorDescriptor
// This is WRONG! align should be applied to the last memory rank, not the last tensor
// dimension
constexpr index_t element_space_unaligned = accumulate_on_sequence(
(GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus<index_t>{}, Number<1>{});
(GetLengths() - Number<1>{}) * GetStrides(), math::plus<index_t>{}, Number<1>{});
return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get());
}
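// worked example: for packed lengths <2, 3, 4, 5> and strides <60, 20, 5, 1>,
//   element_space_unaligned = 1 + (2-1)*60 + (3-1)*20 + (4-1)*5 + (5-1)*1
//                           = 1 + 60 + 40 + 15 + 4 = 120 = GetElementSize(),
// i.e. for a packed tensor the space equals the element count; aligned or
// padded strides make it strictly larger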
......@@ -161,8 +165,7 @@ struct ConstantTensorDescriptor
constexpr auto multi_id = Sequence<Is...>{};
return accumulate_on_sequence(
multi_id * GetStrides(), mod_conv::plus<index_t>{}, Number<0>{});
return accumulate_on_sequence(multi_id * GetStrides(), math::plus<index_t>{}, Number<0>{});
}
// emulate constexpr lambda
......@@ -323,7 +326,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_intervals = Sequence<FoldIntervals...>{};
constexpr index_t fold_intervals_product =
accumulate_on_sequence(fold_intervals, mod_conv::multiplies<index_t>{}, Number<1>{});
accumulate_on_sequence(fold_intervals, math::multiplies<index_t>{}, Number<1>{});
constexpr auto unfold_length = GetLength(Number<IDim>{});
constexpr auto unfold_stride = GetStride(Number<IDim>{});
......@@ -341,7 +344,7 @@ struct ConstantTensorDescriptor
constexpr auto fold_strides =
Number<unfold_stride>{} *
reverse_inclusive_scan_sequence(
fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies<index_t>{}, Number<1>{});
fold_intervals.PushBack(Number<1>{}), math::multiplies<index_t>{}, Number<1>{});
// left and right
constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{};
......@@ -376,7 +379,7 @@ struct ConstantTensorDescriptor
// unfolded length, stride
constexpr index_t unfold_length = accumulate_on_sequence(
GetLengths().Extract(middle), mod_conv::multiplies<index_t>{}, Number<1>{});
GetLengths().Extract(middle), math::multiplies<index_t>{}, Number<1>{});
constexpr index_t unfold_stride = GetStride(Number<LastUnfoldDim>{});
......@@ -511,3 +514,6 @@ print_ConstantTensorDescriptor(const char* s,
Strides...);
});
}
} // namespace ck
#endif
#pragma once
#ifndef CK_SEQUENCE_HPP
#define CK_SEQUENCE_HPP
#include "integral_constant.hpp"
#include "functional.hpp"
namespace ck {
template <class Seq>
struct is_valid_sequence_map;
......@@ -547,3 +551,6 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
static_if<nsize == 10>{}(
[&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
}
} // namespace ck
#endif
#pragma once
#ifndef CK_AMD_INLINE_ASM_HPP
#define CK_AMD_INLINE_ASM_HPP
#include "common.hpp"
#define NO_VM_WAIT 0
......@@ -7,6 +9,8 @@
#define NO_DS_WRITE 0
#define NO_GLB_READ 0
namespace ck {
// cast a pointer of LDS to its address
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
......@@ -759,3 +763,6 @@ ds_write_b128(const vector_type<float, 4>::MemoryType& r, void* lds, index_t off
}
#endif
}
} // namespace ck
#endif
#pragma once
#ifndef CK_BLOCKWISE_2D_TENSOR_OP_HPP
#define CK_BLOCKWISE_2D_TENSOR_OP_HPP
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace ck {
template <index_t BlockSize, class Float, class DstDesc, class F>
__device__ void
blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
......@@ -192,7 +196,7 @@ struct Blockwise2dTensorCopy1
// but we need to make sure dst stride0 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t L1 = CopyLengths{}.Get(I1);
constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
static_assert(read_per_d1 * DataPerRead <= DstDesc{}.GetStride(I0),
"wrong! out-of-bound write will contaminate next line!\n");
......@@ -209,7 +213,7 @@ struct Blockwise2dTensorCopy1
constexpr index_t L0 = CopyLengths{}.Get(I0);
constexpr index_t L1 = CopyLengths{}.Get(I1);
constexpr index_t read_per_d1 = mod_conv::integer_divide_ceil(L1, DataPerRead);
constexpr index_t read_per_d1 = math::integer_divide_ceil(L1, DataPerRead);
constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, read_per_d1>{});
......@@ -676,7 +680,7 @@ struct Blockwise2dTensorCopy3
}
}
#if USE_AMD_INLINE_ASM
#if CK_USE_AMD_INLINE_ASM
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
......@@ -796,3 +800,7 @@ struct Blockwise2dTensorCopy3
}
#endif
};
} // namespace ck
#endif
#pragma once
#ifndef CK_BLOCKWISE_3D_TENSOR_OP_HPP
#define CK_BLOCKWISE_3D_TENSOR_OP_HPP
#include "common.hpp"
#include "ConstantTensorDescriptor.hpp"
namespace ck {
template <index_t BlockSize,
class Float,
class SrcDesc,
......@@ -33,7 +37,7 @@ struct Blockwise3dTensorCopy1
// but we need to make sure dst stride2 is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t L2 = CopyLengths{}.Get(I2);
constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
static_assert(read_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
"wrong! out-of-bound write will contaminate next line!\n");
......@@ -52,7 +56,7 @@ struct Blockwise3dTensorCopy1
constexpr index_t L1 = CopyLengths{}.Get(I1);
constexpr index_t L2 = CopyLengths{}.Get(I2);
constexpr index_t read_per_d2 = mod_conv::integer_divide_ceil(L2, DataPerRead);
constexpr index_t read_per_d2 = math::integer_divide_ceil(L2, DataPerRead);
constexpr auto ref_desc = make_ConstantTensorDescriptor(Sequence<L0, L1, read_per_d2>{});
......@@ -146,7 +150,7 @@ struct Blockwise3dTensorCopy3
// we allow out-of-bound read from src in D2 dimension,
// but we need to make sure dst stride is big enough,
// so that the out-of-bound write won't contaminate next line in dst
constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
static_assert(nloop_d2 * thread_per_d2 * DataPerRead <= DstDesc{}.GetStride(I1),
"wrong! out-of-bound write will contaminate next line!\n");
......@@ -158,7 +162,7 @@ struct Blockwise3dTensorCopy3
"wrrong! BlockSize is not big enough for ThreadPerDims!");
constexpr index_t num_active_thread =
accumulate_on_sequence(ThreadPerDims{}, mod_conv::multiplies<index_t>{}, Number<1>{});
accumulate_on_sequence(ThreadPerDims{}, math::multiplies<index_t>{}, Number<1>{});
if(BlockSize > num_active_thread)
{
......@@ -205,7 +209,7 @@ struct Blockwise3dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
#pragma unroll
for(index_t iloop_d0 = 0; iloop_d0 < nloop_d0; ++iloop_d0)
......@@ -251,7 +255,7 @@ struct Blockwise3dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
return DataPerRead * nloop_d0 * nloop_d1 * nloop_d2;
}
......@@ -283,7 +287,7 @@ struct Blockwise3dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr auto clipboard_desc =
make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
......@@ -339,7 +343,7 @@ struct Blockwise3dTensorCopy3
constexpr index_t nloop_d0 = L0 / thread_per_d0;
constexpr index_t nloop_d1 = L1 / thread_per_d1;
constexpr index_t nloop_d2 = mod_conv::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr index_t nloop_d2 = math::integer_divide_ceil(L2, thread_per_d2 * DataPerRead);
constexpr auto clipboard_desc =
make_ConstantTensorDescriptor(Sequence<nloop_d0, nloop_d1, nloop_d2 * DataPerRead>{});
......@@ -368,3 +372,7 @@ struct Blockwise3dTensorCopy3
}
}
};
} // namespace ck
#endif