Commit 8c923db4 authored by Chao Liu

hip build

parent e72eece8
@@ -51,7 +51,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
            in_nchw_vec(n, c, h, w) =
                vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#elif 1
            in_nchw_vec(n, c, h, w) = vector_t::Pack(in_nchw(n, 4 * c, h, w),
                                                     in_nchw(n, 4 * c + 1, h, w),
                                                     in_nchw(n, 4 * c + 2, h, w),
                                                     in_nchw(n, 4 * c + 3, h, w));
@@ -113,37 +113,37 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, fp32, vector = 2
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 2;
    constexpr unsigned HoPerBlock = 2;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 2;
    constexpr unsigned KPerThread  = 4;
    constexpr unsigned CPerThread  = 1;
    constexpr unsigned HoPerThread = 2;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
#elif 0
    // 3x3, 34x34, 128 thread, int8, vector = 4
    constexpr unsigned NPerBlock  = 2;
    constexpr unsigned KPerBlock  = 32;
    constexpr unsigned CPerBlock  = 8;
    constexpr unsigned HoPerBlock = 4;
    constexpr unsigned WoPerBlock = 32;
    constexpr unsigned NPerThread  = 1;
    constexpr unsigned KPerThread  = 8;
    constexpr unsigned CPerThread  = 2;
    constexpr unsigned HoPerThread = 4;
    constexpr unsigned WoPerThread = 2;
    constexpr unsigned InBlockCopyDataPerRead  = 2;
    constexpr unsigned WeiBlockCopyDataPerRead = 2;
    constexpr unsigned BlockSize = 128;
...
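The vectorized path above packs 2 or 4 input channels into one element before the kernel is launched, matching the "fp32, vector = 2" and "int8, vector = 4" tuning blocks. For orientation, a minimal host-side sketch of what a 4-wide int8 pack amounts to; the helper name pack_int8x4 is hypothetical and vector_t::Pack in the repo may be implemented differently:

    #include <cstdint>
    // Pack four signed 8-bit values into one 32-bit lane, lowest channel in the
    // least significant byte - the layout consumed by dp4a-style instructions.
    inline uint32_t pack_int8x4(int8_t a, int8_t b, int8_t c, int8_t d)
    {
        return (uint32_t(uint8_t(a))) | (uint32_t(uint8_t(b)) << 8) |
               (uint32_t(uint8_t(c)) << 16) | (uint32_t(uint8_t(d)) << 24);
    }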
@@ -74,7 +74,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data());
    out_khwn_device_buf.ToDevice(out_khwn.mData.data());
-#if 1
+#if 0
    // for 3x3, 34x34
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 64;
@@ -213,7 +213,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned WoPerThread = 1;
    constexpr unsigned BlockSize = 128;
-#elif 1
+#elif 0
    // for 1x1, 28x28
    constexpr unsigned NPerBlock = 16;
    constexpr unsigned KPerBlock = 128;
@@ -245,6 +245,39 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn(InDesc,
    constexpr unsigned OutThreadCopyDataPerWrite = 2;
+    constexpr unsigned BlockSize = 128;
+#elif 1
+    // for 1x1, 14x14
+    constexpr unsigned NPerBlock  = 16;
+    constexpr unsigned KPerBlock  = 128;
+    constexpr unsigned CPerBlock  = 8;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 2;
+    constexpr unsigned NPerThread  = 4;
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 1;
+    constexpr unsigned WoPerThread = 1;
+    constexpr unsigned InBlockCopy_ThreadPerDimC = 8;
+    constexpr unsigned InBlockCopy_ThreadPerDimH = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimW = 2;
+    constexpr unsigned InBlockCopy_ThreadPerDimN = 4;
+    constexpr unsigned InBlockCopyDataPerRead  = 4;
+    constexpr unsigned WeiBlockCopyDataPerRead = 4;
+    constexpr unsigned GemmMPerThreadSubC = 4;
+    constexpr unsigned GemmNPerThreadSubC = 4;
+    constexpr unsigned GemmMLevel0Cluster = 4;
+    constexpr unsigned GemmNLevel0Cluster = 2;
+    constexpr unsigned GemmMLevel1Cluster = 2;
+    constexpr unsigned GemmNLevel1Cluster = 4;
+    constexpr unsigned GemmKPerThreadLoop = 1;
+    constexpr unsigned OutThreadCopyDataPerWrite = 2;
    constexpr unsigned BlockSize = 128;
#endif
...
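The newly enabled "1x1, 14x14" tuning block above keeps the usual invariant that the per-block / per-thread tile ratios, and likewise the InBlockCopy thread grid, multiply out to the declared BlockSize. A quick arithmetic check, written as hypothetical static_asserts rather than code from the repo:

    // (NPerBlock/NPerThread) * (KPerBlock/KPerThread) * (HoPerBlock/HoPerThread) * (WoPerBlock/WoPerThread)
    static_assert((16 / 4) * (128 / 16) * (2 / 1) * (2 / 1) == 128, "tile decomposition must equal BlockSize");
    // InBlockCopy_ThreadPerDimC * ...H * ...W * ...N
    static_assert(8 * 2 * 2 * 4 == 128, "input-copy thread grid must equal BlockSize");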
@@ -8,11 +8,11 @@
#include "ConstantTensorDescriptor.hip.hpp"
#include "conv_common.hip.hpp"
//#include "device_direct_convolution_1.hpp"
-#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_nchw_kcyx_nkhw.hpp"
-#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
+//#include "device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp"
-//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
+#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn.hpp"
//#include "device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp"
-//#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
+#include "device_implicit_gemm_convolution_2_chwn_cyxk_khwn.hpp"
struct GeneratorTensor_1
{
@@ -353,7 +353,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
                    std::size_t ho = HoPerTile * htile + j;
                    for(int i = 0; i < WoPerTile; ++i)
                    {
                        std::size_t wo = WoPerTile * wtile + i;
                        out_nkhw(n, k, ho, wo) = out_hold(n, k, htile, wtile, j, i);
                    }
                }
@@ -568,7 +568,7 @@ int main(int argc, char* argv[])
    constexpr unsigned HPad = 2;
    constexpr unsigned WPad = 2;
-#elif 1
+#elif 0
    // 1x1 filter, 32x32 image
    constexpr unsigned N = 64;
    constexpr unsigned C = 256;
@@ -578,6 +578,18 @@ int main(int argc, char* argv[])
    constexpr unsigned Y = 1;
    constexpr unsigned X = 1;
+    constexpr unsigned HPad = 0;
+    constexpr unsigned WPad = 0;
+#elif 1
+    // 1x1 filter, 14x14 image
+    constexpr unsigned N = 128;
+    constexpr unsigned C = 2048;
+    constexpr unsigned HI = 14;
+    constexpr unsigned WI = 14;
+    constexpr unsigned K = 512;
+    constexpr unsigned Y = 1;
+    constexpr unsigned X = 1;
    constexpr unsigned HPad = 0;
    constexpr unsigned WPad = 0;
#endif
@@ -594,8 +606,8 @@ int main(int argc, char* argv[])
    ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
    ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
-    using in_data_t = char;
-    using out_data_t = int32_t;
+    using in_data_t = float;
+    using out_data_t = float;
    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
@@ -635,9 +647,9 @@ int main(int argc, char* argv[])
    device_direct_convolution_1
#elif 0
    device_direct_convolution_2_nchw_kcyx_nkhw
-#elif 1
-    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
#elif 0
+    device_direct_convolution_2_vectorized_nchw_kcyx_nkhw
+#elif 1
    device_implicit_gemm_convolution_1_chwn_cyxk_khwn
#elif 0
    device_implicit_gemm_convolution_2_chwn_cyxk_khwn
...
@@ -10,7 +10,7 @@ struct Array
    unsigned mData[nSize];
    template <class... Xs>
-    __host__ __device__ Array(Xs... xs) : mData({static_cast<TData>(xs)...})
+    __host__ __device__ Array(Xs... xs) : mData{static_cast<TData>(xs)...}
    {
    }
...
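The Array change above drops the parentheses around the brace list: a raw-array member has no constructor, so the parenthesized form mData({...}) is rejected (clang/hipcc in particular refuse it), while direct list-initialization mData{...} aggregate-initializes the array from the pack. A self-contained illustration with a hypothetical SmallArray type, not the repo's Array:

    // Variadic constructor that list-initializes a member array.
    template <class TData, unsigned nSize>
    struct SmallArray
    {
        TData mData[nSize];

        template <class... Xs>
        constexpr SmallArray(Xs... xs) : mData{static_cast<TData>(xs)...} // aggregate init, OK
        {
            // ": mData({static_cast<TData>(xs)...})" does not compile: an array member
            // cannot be initialized with a parenthesized initializer.
        }
    };

    constexpr SmallArray<unsigned, 3> lengths{1u, 2u, 3u}; // mData = {1, 2, 3}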
@@ -340,10 +340,11 @@ struct BlockwiseChwnTensorCopyPadded
        constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;
        const Float* p_src_tmp =
-            p_src + src_desc.Get1dIndex(c_block_data_begin,
-                                        (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
-                                        (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
-                                        n_block_data_begin);
+            p_src +
+            src_desc.Get1dIndex(c_block_data_begin,
+                                (ho_block_data_begin + h_block_pad_low) - h_global_pad_low,
+                                (wo_block_data_begin + w_block_pad_low) - w_global_pad_low,
+                                n_block_data_begin);
#if 0
        if(get_thread_local_1d_id() == 0)
...
@@ -93,10 +93,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        Float p_out_thread[out_thread_desc.GetElementSpace()];
        threadwise_4d_tensor_copy(out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc,
                                  p_out_thread,
                                  out_thread_desc.GetLengths());
@@ -107,10 +108,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
                // threadwise convolution
                threadwise_direct_convolution_2(
                    in_thread_block_desc,
-                    p_in_block + in_block_desc.Get1dIndex(n_thread_data_begin,
-                                                          c_thread_data_begin,
-                                                          hi_thread_data_begin,
-                                                          wi_thread_data_begin),
+                    p_in_block +
+                        in_block_desc.Get1dIndex(n_thread_data_begin,
+                                                 c_thread_data_begin,
+                                                 hi_thread_data_begin,
+                                                 wi_thread_data_begin),
                    wei_thread_block_desc,
                    p_wei_block +
                        wei_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data_begin, 0, 0),
@@ -122,10 +124,11 @@ __device__ void blockwise_direct_convolution(InBlockDesc,
        threadwise_4d_tensor_copy(out_thread_desc,
                                  p_out_thread,
                                  out_block_desc,
-                                  p_out_block + out_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                          k_thread_data_begin,
-                                                                          ho_thread_data_begin,
-                                                                          wo_thread_data_begin),
+                                  p_out_block +
+                                      out_block_desc.Get1dIndex(n_thread_data_begin,
+                                                                k_thread_data_begin,
+                                                                ho_thread_data_begin,
+                                                                wo_thread_data_begin),
                                  out_thread_desc.GetLengths());
    }
}
@@ -431,12 +431,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
        constexpr unsigned MRepeat = MPerThread / MPerThreadSubC;
        constexpr unsigned NRepeat = NPerThread / NPerThreadSubC;
        // loop over k
#pragma unroll
        for(unsigned k_begin = 0; k_begin < KPerBlock; k_begin += KPerThreadLoop)
        {
            // read first batch of A, B
            // copy A-sub to form A
#pragma unroll
            for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
            {
@@ -449,7 +449,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    a_thread_sub_mtx.GetLengths());
            }
            // copy B-sub to form B
#pragma unroll
            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
@@ -462,7 +462,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
                    b_thread_sub_mtx.GetLengths());
            }
            // loop over batch
#pragma unroll
            for(unsigned ib = 0; ib + 1 < BatchPerThread; ++ib)
            {
@@ -551,14 +551,15 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
            c_thread_mtx_begin.batch * BlockMatrixStrideC +
            c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col);
-        for(unsigned m_repeat = 0; m_repeat, MRepeat; ++m_repeat)
+        for(unsigned m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
        {
-            for(unsigned n_repeat = 0; n_repeat, NRepeat; ++n_repeat)
+            for(unsigned n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
            {
                threadwise_matrix_copy(
                    c_thread_sub_mtx,
-                    p_c_thread + c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
-                                                             n_repeat * NPerLevel1Cluster),
+                    p_c_thread +
+                        c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
+                                                    n_repeat * NPerLevel1Cluster),
                    c_block_mtx,
                    p_c_block +
                        c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster,
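The two loop conditions fixed in the hunk above ("m_repeat, MRepeat" and "n_repeat, NRepeat") used the comma operator, which evaluates and discards the left operand; the condition therefore reduced to MRepeat / NRepeat being nonzero, and the counter never bounded the loop. A minimal host-side illustration of the difference, not code from the repo:

    #include <cstdio>
    int main()
    {
        const unsigned MRepeat = 4;
        unsigned iters = 0;
        for(unsigned m = 0; m < MRepeat; ++m) // correct: compares m against MRepeat
            ++iters;
        std::printf("%u\n", iters); // prints 4
        // "for(unsigned m = 0; m, MRepeat; ++m)" would spin until m wraps around,
        // because (m, MRepeat) is always 4, i.e. always true; compilers flag this
        // with -Wunused-value.
        return 0;
    }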
@@ -656,8 +657,9 @@ struct BlockwiseGemmBlockABlockBThreadC
        constexpr unsigned NClusterWork =
            (NPerBlock + NPerThread * NThreadPerCluster - 1) / (NPerThread * NThreadPerCluster);
-        static_assert(BlockSize == (MClusterWork * MThreadPerCluster) *
-                                       (NClusterWork * NThreadPerCluster),
+        static_assert(BlockSize ==
+                          (MClusterWork * MThreadPerCluster) *
+                              (NClusterWork * NThreadPerCluster),
                      "wrong! wrong BlockSize");
        if(DistributeThreadAlongColumnFirst)
@@ -1256,8 +1258,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                    p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC),
                    c_thread_sub_mtx,
                    False,
-                    p_c_thread + c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
-                                                         n_repeat * NPerThreadSubC),
+                    p_c_thread +
+                        c_thread_mtx.Get1dIndex(m_repeat * MPerThreadSubC,
+                                                n_repeat * NPerThreadSubC),
                    f_accum);
            }
        }
...
@@ -21,6 +21,7 @@ struct is_same<T, T>
    static const bool value = true;
};
+#if 0
template <typename T>
__host__ __device__ constexpr T max(T a, T b)
{
@@ -32,6 +33,7 @@ __host__ __device__ constexpr T min(T a, T b)
{
    return a < b ? a : b;
}
+#endif
__host__ __device__ constexpr unsigned integer_divide_ceil(unsigned a, unsigned b)
{
...
@@ -4,8 +4,10 @@
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
+#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#endif
...
#pragma once
#include "config.h"
-#if DEVICE_BACKEND_CUDA
-namespace CUDA {
-#include "cuda_fp16.h"
-}
-#endif
-using half = CUDA::half;
-using half2 = CUDA::half2;
template <class T, unsigned N>
struct vector_type
{
@@ -52,6 +43,7 @@ struct vector_type<float2, 2>
    using MemoryType = float4;
};
+#if 0
template <>
struct vector_type<half, 1>
{
@@ -91,24 +83,6 @@ struct vector_type<half, 8>
    using MemoryType = float4;
};
-template <>
-struct vector_type<half2, 1>
-{
-    using MemoryType = half2;
-};
-template <>
-struct vector_type<half2, 2>
-{
-    using MemoryType = float2;
-};
-template <>
-struct vector_type<half2, 4>
-{
-    using MemoryType = float4;
-};
template <>
struct vector_type<char, 1>
{
@@ -169,7 +143,6 @@ struct vector_type<int32_t, 2>
    using MemoryType = int64_t;
};
-#if 0
template <>
struct vector_type<char2, 2>
{
@@ -214,6 +187,7 @@ __device__ void fused_multiply_accumulate(float& d, const float4& s0, const floa
    d += s0.w * s1.w;
}
+#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
@@ -222,12 +196,10 @@ __device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2&
    d += s0.y * s1.y;
}
-#if 0
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
    d += s0.x * s1.x + s0.y * s1.y;
}
-#endif
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
@@ -239,3 +211,4 @@ __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const i
    d = __dp4a(s0, s1, d);
#endif
}
+#endif
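The int32 overload inside the now-disabled block accumulates a packed 4x int8 dot product via __dp4a where the backend provides it. A scalar equivalent, useful as a mental model only and not present in the commit:

    #include <cstdint>
    // Treat each 32-bit operand as four signed 8-bit lanes, multiply lane-wise,
    // and accumulate into d - the same contract as a dp4a instruction.
    inline int32_t dp4a_scalar(int32_t d, uint32_t s0, uint32_t s1)
    {
        for(int lane = 0; lane < 4; ++lane)
        {
            const int8_t a = static_cast<int8_t>((s0 >> (8 * lane)) & 0xFF);
            const int8_t b = static_cast<int8_t>((s1 >> (8 * lane)) & 0xFF);
            d += static_cast<int32_t>(a) * static_cast<int32_t>(b);
        }
        return d;
    }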
@@ -113,10 +113,11 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
            c_block_work_begin += CPerBlock)
        {
            // copy input tensor to LDS
-            blockwise_in_copy.Run(p_in_global + in_global_desc.Get1dIndex(n_block_work_begin,
-                                                                          c_block_work_begin,
-                                                                          hi_block_work_begin,
-                                                                          wi_block_work_begin),
+            blockwise_in_copy.Run(p_in_global +
+                                      in_global_desc.Get1dIndex(n_block_work_begin,
+                                                                c_block_work_begin,
+                                                                hi_block_work_begin,
+                                                                wi_block_work_begin),
                                  p_in_block);
            // copy weight tensor to LDS
@@ -143,9 +144,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
    }
    // copy output tensor from LDS to device mem
-    blockwise_out_copy.Run(p_out_block,
-                           p_out_global + out_global_desc.Get1dIndex(n_block_work_begin,
-                                                                     k_block_work_begin,
-                                                                     ho_block_work_begin,
-                                                                     wo_block_work_begin));
+    blockwise_out_copy.Run(
+        p_out_block,
+        p_out_global +
+            out_global_desc.Get1dIndex(
+                n_block_work_begin, k_block_work_begin, ho_block_work_begin, wo_block_work_begin));
}
@@ -176,16 +176,18 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // copy input tensor to LDS
-        blockwise_in_copy.Run(p_in_global + in_nchw_global_desc.Get1dIndex(n_block_data_begin,
-                                                                           c_block_data_begin,
-                                                                           hi_block_data_begin,
-                                                                           wi_block_data_begin),
+        blockwise_in_copy.Run(p_in_global +
+                                  in_nchw_global_desc.Get1dIndex(n_block_data_begin,
+                                                                 c_block_data_begin,
+                                                                 hi_block_data_begin,
+                                                                 wi_block_data_begin),
                              p_in_block);
        // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_global + wei_kcyx_global_desc.Get1dIndex(
-                                   k_block_data_begin, c_block_data_begin, 0, 0),
-                               p_wei_block);
+        blockwise_wei_copy.Run(
+            p_wei_global +
+                wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_block);
        __syncthreads();
@@ -195,10 +197,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
#if 1
            threadwise_direct_convolution_2(
                in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           c_thread_data,
-                                                           hi_thread_data_begin,
-                                                           wi_thread_data_begin),
+                p_in_block +
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -207,10 +210,11 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
#elif 0
            threadwise_direct_convolution_3(
                in_nchw_thread_block_desc,
-                p_in_block + in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
-                                                           c_thread_data,
-                                                           hi_thread_data_begin,
-                                                           wi_thread_data_begin),
+                p_in_block +
+                    in_nchw_block_desc.Get1dIndex(n_thread_data_begin,
+                                                  c_thread_data,
+                                                  hi_thread_data_begin,
+                                                  wi_thread_data_begin),
                wei_kcyx_thread_block_desc,
                p_wei_block +
                    wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -225,9 +229,10 @@ gridwise_direct_convolution_2_nchw_kcyx_nkhw(const Float* const __restrict__ p_i
            out_nkhw_thread_desc,
            p_out_thread,
            out_nkhw_global_desc,
-            p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                           k_block_data_begin + k_thread_data_begin,
-                                                           ho_block_data_begin + ho_thread_data_begin,
-                                                           wo_block_data_begin + wo_thread_data_begin),
+            p_out_global +
+                out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin),
            out_nkhw_thread_desc.GetLengths());
    }
@@ -200,9 +200,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
                              p_in_vec_block);
        // copy weight tensor to LDS
-        blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex(
-                                   k_block_data_begin, c_block_data_begin, 0, 0),
-                               p_wei_vec_block);
+        blockwise_wei_copy.Run(
+            p_wei_vec_global +
+                wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0),
+            p_wei_vec_block);
        __syncthreads();
@@ -212,10 +213,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#if 1
            threadwise_direct_convolution_2(
                in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   c_thread_data,
-                                                                   hi_thread_data_begin,
-                                                                   wi_thread_data_begin),
+                p_in_vec_block +
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -224,10 +226,11 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#elif 0
            threadwise_direct_convolution_3(
                in_nchw_vec_thread_block_desc,
-                p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
-                                                                   c_thread_data,
-                                                                   hi_thread_data_begin,
-                                                                   wi_thread_data_begin),
+                p_in_vec_block +
+                    in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
+                                                      c_thread_data,
+                                                      hi_thread_data_begin,
+                                                      wi_thread_data_begin),
                wei_kcyx_vec_thread_block_desc,
                p_wei_vec_block +
                    wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
@@ -242,9 +245,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
            out_nkhw_thread_desc,
            p_out_thread,
            out_nkhw_global_desc,
-            p_out_global + out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
-                                                           k_block_data_begin + k_thread_data_begin,
-                                                           ho_block_data_begin + ho_thread_data_begin,
-                                                           wo_block_data_begin + wo_thread_data_begin),
+            p_out_global +
+                out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin,
+                                                k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin),
            out_nkhw_thread_desc.GetLengths());
    }
@@ -184,8 +184,9 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
        threadwise_4d_tensor_set_zero(out_khwn_thread_desc, p_out_thread);
        const Float* p_in_global_block_begin =
-            p_in_global + in_chwn_global_desc.Get1dIndex(
-                              0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
+            p_in_global +
+            in_chwn_global_desc.Get1dIndex(
+                0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin);
        const Float* p_wei_global_block_begin =
            p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
@@ -216,7 +217,7 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
            }
        }
        // output: register to global mem,
#if 0
        const auto c_thread_mtx_begin =
            blockwise_batch_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
@@ -286,16 +287,17 @@ gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn(const Float* const __restric
        }
#endif
-            threadwise_8d_tensor_copy(out_8d_thread_desc,
-                                      p_out_thread,
-                                      out_8d_global_desc,
-                                      p_out_global + out_khwn_global_desc.Get1dIndex(
-                                                         k_block_data_begin + k_thread_data_begin,
-                                                         ho_block_data_begin + ho_thread_data_begin,
-                                                         wo_block_data_begin + wo_thread_data_begin,
-                                                         n_block_data_begin + n_thread_data_begin),
-                                      out_8d_thread_desc.GetLengths(),
-                                      Number<OutThreadCopyDataPerWrite>{});
+            threadwise_8d_tensor_copy(
+                out_8d_thread_desc,
+                p_out_thread,
+                out_8d_global_desc,
+                p_out_global +
+                    out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                    ho_block_data_begin + ho_thread_data_begin,
+                                                    wo_block_data_begin + wo_thread_data_begin,
+                                                    n_block_data_begin + n_thread_data_begin),
+                out_8d_thread_desc.GetLengths(),
+                Number<OutThreadCopyDataPerWrite>{});
        }
        else if(NPerThread == NPerBlock)
        {
...
@@ -283,10 +283,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
            out_hkwn_thread_desc,
            p_out_thread,
            out_khwn_global_desc,
-            p_out_global + out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
-                                                           ho_block_data_begin + ho_thread_data_begin,
-                                                           wo_block_data_begin + wo_thread_data_begin,
-                                                           n_block_data_begin + n_thread_data_begin),
+            p_out_global +
+                out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin,
+                                                ho_block_data_begin + ho_thread_data_begin,
+                                                wo_block_data_begin + wo_thread_data_begin,
+                                                n_block_data_begin + n_thread_data_begin),
            out_hkwn_thread_desc.GetLengths(),
            reorder_khwn_from_hkwn);
    }
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
                                                          decltype(in_cb_block_desc),
                                                          decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn(const Float* const __restric
                                                          InBlockCopyThreadPerDim0,
                                                          InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
...
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                                                          decltype(in_cb_block_desc),
                                                          decltype(in_cb_block_desc.GetLengths())>{};
#elif 0
    const auto blockwise_in_copy = Blockwise2dTensorCopy2<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_chwn_cyxk_khwn_lds_double_b
                                                          InBlockCopyThreadPerDim0,
                                                          InBlockCopyThreadPerDim1>{};
#elif 1
    const auto blockwise_in_copy = Blockwise2dTensorCopy3<BlockSize,
                                                          Float,
                                                          decltype(in_cb_global_desc),
                                                          decltype(in_cb_block_desc),
...
@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
}
-typedef enum
-{
+typedef enum {
    Half  = 0,
    Float = 1,
} DataType_t;
...
@@ -10,7 +10,7 @@ __device__ void threadwise_6d_tensor_copy(SrcDesc,
                                          SrcOpLengths,
                                          Number<DataPerRead>)
{
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
    static_assert(SrcDesc{}.GetDimension() == 6 && DstDesc{}.GetDimension() == 6 &&
                      SrcOpLengths::nDim == 6,
@@ -80,7 +80,7 @@ __device__ void threadwise_8d_tensor_copy(SrcDesc,
                                          SrcOpLengths,
                                          Number<DataPerRead>)
{
-    using vector_t = typename vector_type<Float, DataPerRead>::type;
+    using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
    static_assert(SrcDesc{}.GetDimension() == 8 && DstDesc{}.GetDimension() == 8 &&
                      SrcOpLengths::nDim == 8,
...
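The last two hunks switch the threadwise copy helpers from vector_type<...>::type to vector_type<...>::MemoryType, which is the member the specializations in this codebase define (see the float4 MemoryType lines earlier in the diff). A hedged sketch of how the trait is used; the float specializations and the copy_floats helper below are assumptions for illustration, not copied from the repo:

    #include <hip/hip_runtime.h> // float2 / float4 on the HIP backend

    // Map (element type, elements per access) to the wider built-in type used
    // for one vectorized load/store.
    template <class T, unsigned N>
    struct vector_type;

    template <>
    struct vector_type<float, 1> { using MemoryType = float; };
    template <>
    struct vector_type<float, 2> { using MemoryType = float2; };
    template <>
    struct vector_type<float, 4> { using MemoryType = float4; };

    // A copy loop then moves DataPerRead floats per iteration through one
    // MemoryType-sized access (n is assumed to be a multiple of DataPerRead).
    template <unsigned DataPerRead>
    __device__ void copy_floats(const float* src, float* dst, unsigned n)
    {
        using vector_t = typename vector_type<float, DataPerRead>::MemoryType;
        for(unsigned i = 0; i < n; i += DataPerRead)
            *reinterpret_cast<vector_t*>(dst + i) = *reinterpret_cast<const vector_t*>(src + i);
    }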