Commit 79d9b108 authored by Chao Liu's avatar Chao Liu
Browse files

Add fp16 direct convolution that reads pre-vectorized data

parent 28325204
...@@ -13,8 +13,8 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, ...@@ -13,8 +13,8 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
unsigned nrepeat) unsigned nrepeat)
{ {
constexpr unsigned NVector = 1; constexpr unsigned NVector = 1;
using vector_type_t = vector_type<T, NVector>; using vector_t = vector_type<T, NVector>;
using vector_t = typename vector_type_t::VectorType; using vector_mem_t = typename vector_t::MemoryType;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -41,40 +41,41 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, ...@@ -41,40 +41,41 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{}); auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: "); ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");
Tensor<vector_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc)); Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) { auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
#if 1 #if 1
in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w); in_nchw_vec(n, c, h, w) = in_nchw(n, c, h, w);
#else #else
in_nchw_vec(n, c, h, w) = in_nchw_vec(n, c, h, w) =
vector_type_t::pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w)); vector_t::Pack(in_nchw(n, 2 * c, h, w), in_nchw(n, 2 * c + 1, h, w));
#endif #endif
}; };
make_ParallelTensorFunctor(f_vectorized_nchw, N, C, Hi, Wi)( make_ParallelTensorFunctor(f_vectorized_nchw, N, C / NVector, Hi, Wi)(
std::thread::hardware_concurrency()); std::thread::hardware_concurrency());
// vectorize weight // vectorize weight
auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{}); auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: "); ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");
Tensor<vector_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc)); Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) { auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
#if 1 #if 1
wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x); wei_kcyx_vec(k, c, y, x) = wei_kcyx(k, c, y, x);
#else #else
wei_kcyx_vec(k, c, y, x) = wei_kcyx_vec(k, c, y, x) =
vector_type_t::pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x)); vector_t::Pack(wei_kcyx(k, 2 * c, y, x), wei_kcyx(k, 2 * c + 1, y, x));
#endif #endif
}; };
make_ParallelTensorFunctor(f_vectorized_kcyx, K, C, Y, X)(std::thread::hardware_concurrency()); make_ParallelTensorFunctor(f_vectorized_kcyx, K, C / NVector, Y, X)(
std::thread::hardware_concurrency());
// //
DeviceMem in_nchw_vec_device_buf(sizeof(vector_t) * in_nchw_vec.mDesc.GetElementSpace()); DeviceMem in_nchw_vec_device_buf(sizeof(vector_mem_t) * in_nchw_vec.mDesc.GetElementSpace());
DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_t) * wei_kcyx_vec.mDesc.GetElementSpace()); DeviceMem wei_kcyx_vec_device_buf(sizeof(vector_mem_t) * wei_kcyx_vec.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace()); DeviceMem out_nkhw_device_buf(sizeof(T) * out_nkhw.mDesc.GetElementSpace());
in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data()); in_nchw_vec_device_buf.ToDevice(in_nchw_vec.mData.data());
...@@ -82,7 +83,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, ...@@ -82,7 +83,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
#if 1 #if 1
// 3x3, 34x34, 128 thread // 3x3, 34x34, 128 thread, fp32, vector = 1
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32; constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4; constexpr unsigned CPerBlock = 4;
...@@ -96,24 +97,42 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc, ...@@ -96,24 +97,42 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
constexpr unsigned WoPerThread = 2; constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
#elif 1 #elif 1
// 3x3, 34x34, 128 thread, fp16 // 3x3, 34x34, 128 thread, fp32, vector = 2
constexpr unsigned NPerBlock = 2; constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32; constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4; constexpr unsigned CPerBlock = 2;
constexpr unsigned HoPerBlock = 2; constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32; constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2; constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4; constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2; constexpr unsigned CPerThread = 1;
constexpr unsigned HoPerThread = 2; constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2; constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2; constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 2;
constexpr unsigned BlockSize = 128;
#elif 1
// 3x3, 34x34, 128 thread, fp16
constexpr unsigned NPerBlock = 2;
constexpr unsigned KPerBlock = 32;
constexpr unsigned CPerBlock = 4;
constexpr unsigned HoPerBlock = 2;
constexpr unsigned WoPerBlock = 32;
constexpr unsigned NPerThread = 2;
constexpr unsigned KPerThread = 4;
constexpr unsigned CPerThread = 2;
constexpr unsigned HoPerThread = 2;
constexpr unsigned WoPerThread = 2;
constexpr unsigned InBlockCopyDataPerRead = 2;
constexpr unsigned WeiBlockCopyDataPerRead = 4; constexpr unsigned WeiBlockCopyDataPerRead = 4;
constexpr unsigned BlockSize = 128; constexpr unsigned BlockSize = 128;
......
...@@ -373,7 +373,7 @@ template <unsigned BlockSize, ...@@ -373,7 +373,7 @@ template <unsigned BlockSize,
unsigned DataPerRead> unsigned DataPerRead>
struct Blockwise2dTensorCopy3 struct Blockwise2dTensorCopy3
{ {
using vector_t = typename vector_type<Float, DataPerRead>::VectorType; using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
unsigned mSrcMyThreadOffset; unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset; unsigned mDstMyThreadOffset;
......
...@@ -207,7 +207,7 @@ template <unsigned BlockSize, ...@@ -207,7 +207,7 @@ template <unsigned BlockSize,
unsigned DataPerRead> unsigned DataPerRead>
struct Blockwise4dTensorCopy1 struct Blockwise4dTensorCopy1
{ {
using vector_t = typename vector_type<Float, DataPerRead>::VectorType; using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
__device__ constexpr Blockwise4dTensorCopy1() __device__ constexpr Blockwise4dTensorCopy1()
{ {
...@@ -444,7 +444,7 @@ template <unsigned BlockSize, ...@@ -444,7 +444,7 @@ template <unsigned BlockSize,
unsigned DataPerRead> unsigned DataPerRead>
struct Blockwise4dTensorCopy3 struct Blockwise4dTensorCopy3
{ {
using vector_t = typename vector_type<Float, DataPerRead>::VectorType; using vector_t = typename vector_type<Float, DataPerRead>::MemoryType;
unsigned mSrcMyThreadOffset; unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset; unsigned mDstMyThreadOffset;
......
#pragma once #pragma once
#include "data_type.hip.hpp"
#include "constant_integral.hip.hpp" #include "constant_integral.hip.hpp"
#include "Sequence.hip.hpp" #include "Sequence.hip.hpp"
#include "Array.hip.hpp" #include "Array.hip.hpp"
...@@ -20,97 +21,6 @@ struct is_same<T, T> ...@@ -20,97 +21,6 @@ struct is_same<T, T>
static const bool value = true; static const bool value = true;
}; };
template <class T, unsigned N>
struct vector_type
{
};
template <>
struct vector_type<float, 1>
{
using VectorType = float;
};
template <>
struct vector_type<float, 2>
{
using VectorType = float2;
};
template <>
struct vector_type<float, 4>
{
using VectorType = float4;
};
#if 0
template <>
struct vector_type<half_float::half, 1>
{
using VectorType = half_float::half;
};
template <>
struct vector_type<half_float::half, 2>
{
using VectorType = float;
};
template <>
struct vector_type<half_float::half, 4>
{
using VectorType = float2;
};
template <>
struct vector_type<half_float::half, 8>
{
using VectorType = float4;
};
#endif
#if 1
template <>
struct vector_type<half, 1>
{
using VectorType = half;
__host__ __device__ static VectorType pack(half s) { return s; }
};
template <>
struct vector_type<half, 2>
{
using VectorType = half2;
union Data
{
VectorType vector;
half scalar[2];
};
__host__ __device__ static VectorType pack(half s0, half s1)
{
Data data;
data.scalar[0] = s0;
data.scalar[1] = s1;
return data.vector;
}
};
template <>
struct vector_type<half, 4>
{
using VectorType = float2;
};
template <>
struct vector_type<half, 8>
{
using VectorType = float4;
};
#endif
template <typename T> template <typename T>
__host__ __device__ constexpr T max(T a, T b) __host__ __device__ constexpr T max(T a, T b)
{ {
......
...@@ -4,10 +4,8 @@ ...@@ -4,10 +4,8 @@
#if DEVICE_BACKEND_HIP #if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h" #include "hip/hip_runtime.h"
#include "half.hpp"
#elif DEVICE_BACKEND_CUDA #elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "nvToolsExt.h" #include "nvToolsExt.h"
#include "helper_cuda.h" #include "helper_cuda.h"
#include "cuda_fp16.h"
#endif #endif
...@@ -47,3 +47,11 @@ struct static_const_reduce_n<1> ...@@ -47,3 +47,11 @@ struct static_const_reduce_n<1>
return f(Number<0>{}); return f(Number<0>{});
} }
}; };
#if 0
template<class F>
__host__ __device__ constexpr auto unpacker(F f)
{
return [=](auto xs_array){ f(xs...); };
}
#endif
\ No newline at end of file
...@@ -27,12 +27,14 @@ template <class Float, ...@@ -27,12 +27,14 @@ template <class Float,
unsigned BlockSize, unsigned BlockSize,
unsigned GridSize> unsigned GridSize>
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
const typename vector_type<Float, ScalarPerVector>::VectorType* const __restrict__ p_in_vec_global, const typename vector_type<Float,
const typename vector_type<Float, ScalarPerVector>::VectorType* const __restrict__ p_wei_vec_global, ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
const typename vector_type<Float,
ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
Float* const __restrict__ p_out_global) Float* const __restrict__ p_out_global)
{ {
using scalar_t = Float; using scalar_t = Float;
using vector_t = typename vector_type<scalar_t, ScalarPerVector>::VectorType; using vector_mem_t = typename vector_type<scalar_t, ScalarPerVector>::MemoryType;
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -69,6 +71,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -69,6 +71,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
// shared mem // shared mem
constexpr unsigned in_block_size = constexpr unsigned in_block_size =
in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{}); in_nchw_vec_block_desc.GetElementSpace(Number<InBlockCopyDataPerRead>{});
constexpr unsigned wei_block_size = constexpr unsigned wei_block_size =
wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{}); wei_kcyx_vec_block_desc.GetElementSpace(Number<WeiBlockCopyDataPerRead>{});
...@@ -76,8 +79,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -76,8 +79,10 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
? InBlockCopyDataPerRead ? InBlockCopyDataPerRead
: WeiBlockCopyDataPerRead; : WeiBlockCopyDataPerRead;
__shared__ vector_t p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)]; __shared__ vector_mem_t
__shared__ vector_t p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)]; p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
__shared__ vector_mem_t
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
// threadwise tensors // threadwise tensors
constexpr unsigned HiPerThread = HoPerThread + Y - 1; constexpr unsigned HiPerThread = HoPerThread + Y - 1;
...@@ -150,7 +155,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -150,7 +155,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
constexpr auto blockwise_in_copy = constexpr auto blockwise_in_copy =
Blockwise4dTensorCopy1<BlockSize, Blockwise4dTensorCopy1<BlockSize,
vector_t, vector_mem_t,
decltype(in_nchw_vec_global_desc), decltype(in_nchw_vec_global_desc),
decltype(in_nchw_vec_block_desc), decltype(in_nchw_vec_block_desc),
decltype(in_nchw_vec_block_desc.GetLengths()), decltype(in_nchw_vec_block_desc.GetLengths()),
...@@ -159,7 +164,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -159,7 +164,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#if 0 #if 0
constexpr auto blockwise_wei_copy = constexpr auto blockwise_wei_copy =
Blockwise4dTensorCopy1<BlockSize, Blockwise4dTensorCopy1<BlockSize,
vector_t, vector_mem_t,
decltype(wei_kcyx_vec_global_desc), decltype(wei_kcyx_vec_global_desc),
decltype(wei_kcyx_vec_block_desc), decltype(wei_kcyx_vec_block_desc),
decltype(wei_kcyx_vec_block_desc.GetLengths()), decltype(wei_kcyx_vec_block_desc.GetLengths()),
...@@ -167,7 +172,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -167,7 +172,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
#elif 1 #elif 1
const auto blockwise_wei_copy = const auto blockwise_wei_copy =
Blockwise2dTensorCopy3<BlockSize, Blockwise2dTensorCopy3<BlockSize,
vector_t, vector_mem_t,
decltype(wei_ke_vec_global_desc), decltype(wei_ke_vec_global_desc),
decltype(wei_ke_vec_block_desc), decltype(wei_ke_vec_block_desc),
decltype(wei_ke_vec_block_desc.GetLengths()), decltype(wei_ke_vec_block_desc.GetLengths()),
...@@ -181,15 +186,16 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -181,15 +186,16 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
c_block_data_begin += CPerBlock, __syncthreads()) c_block_data_begin += CPerBlock, __syncthreads())
{ {
// copy input tensor to LDS // copy input tensor to LDS
blockwise_in_copy.Run(p_in_vec_global + in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, blockwise_in_copy.Run(p_in_vec_global +
c_block_data_begin, in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin,
hi_block_data_begin, c_block_data_begin,
wi_block_data_begin), hi_block_data_begin,
wi_block_data_begin),
p_in_vec_block); p_in_vec_block);
// copy weight tensor to LDS // copy weight tensor to LDS
blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex( blockwise_wei_copy.Run(p_wei_vec_global + wei_kcyx_vec_global_desc.Get1dIndex(
k_block_data_begin, c_block_data_begin, 0, 0), k_block_data_begin, c_block_data_begin, 0, 0),
p_wei_vec_block); p_wei_vec_block);
__syncthreads(); __syncthreads();
...@@ -201,9 +207,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -201,9 +207,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
threadwise_direct_convolution_2( threadwise_direct_convolution_2(
in_nchw_vec_thread_block_desc, in_nchw_vec_thread_block_desc,
p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc,
p_wei_vec_block + p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
...@@ -213,9 +219,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( ...@@ -213,9 +219,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
threadwise_direct_convolution_3( threadwise_direct_convolution_3(
in_nchw_vec_thread_block_desc, in_nchw_vec_thread_block_desc,
p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, p_in_vec_block + in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin,
c_thread_data, c_thread_data,
hi_thread_data_begin, hi_thread_data_begin,
wi_thread_data_begin), wi_thread_data_begin),
wei_kcyx_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc,
p_wei_vec_block + p_wei_vec_block +
wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0),
......
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#include "ConstantTensorDescriptor.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp"
// optimized for scenario if p_in, p_wei, p_out are in register // optimized for scenario if p_in, p_wei, p_out are in register
template <class Float, class InDesc, class WeiDesc, class OutDesc> template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_1(InDesc, __device__ void threadwise_direct_convolution_1(InDesc,
Float* const __restrict__ p_in, TInWei* const __restrict__ p_in,
WeiDesc, WeiDesc,
Float* const __restrict__ p_wei, TInWei* const __restrict__ p_wei,
OutDesc, OutDesc,
Float* __restrict__ p_out) TOut* __restrict__ p_out)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -51,25 +51,10 @@ __device__ void threadwise_direct_convolution_1(InDesc, ...@@ -51,25 +51,10 @@ __device__ void threadwise_direct_convolution_1(InDesc,
const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo); const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
p_out[out_index] += p_wei[wei_index] * p_in[in_index]; fused_multiply_add(p_out[out_index],
p_wei[wei_index],
#if 0 p_in[in_index],
// if(threadIdx.x == 0) p_out[out_index]);
{
printf("threadwise_direct_convolution: \t"
"threadIdx.x %u\t"
"out_index %u, p_out[out_index] %f, \t"
"wei_index %u, p_wei[wei_index] %f, \t"
"in_index %u, p_in[in_index] %f\n",
threadIdx.x,
out_index,
p_out[out_index],
wei_index,
p_wei[wei_index],
in_index,
p_in[in_index]);
}
#endif
} }
} }
} }
...@@ -81,13 +66,13 @@ __device__ void threadwise_direct_convolution_1(InDesc, ...@@ -81,13 +66,13 @@ __device__ void threadwise_direct_convolution_1(InDesc,
// Optimized for scenario if p_in and p_wei are in LDS, p_out are in register // Optimized for scenario if p_in and p_wei are in LDS, p_out are in register
// Copy in and wei into register before doing convolution // Copy in and wei into register before doing convolution
template <class Float, class InDesc, class WeiDesc, class OutDesc> template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_2(InDesc, __device__ void threadwise_direct_convolution_2(InDesc,
Float* const __restrict__ p_in, TInWei* const __restrict__ p_in,
WeiDesc, WeiDesc,
Float* const __restrict__ p_wei, TInWei* const __restrict__ p_wei,
OutDesc, OutDesc,
Float* __restrict__ p_out) TOut* __restrict__ p_out)
{ {
constexpr auto in_desc = InDesc{}; constexpr auto in_desc = InDesc{};
constexpr auto wei_desc = WeiDesc{}; constexpr auto wei_desc = WeiDesc{};
...@@ -97,8 +82,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, ...@@ -97,8 +82,8 @@ __device__ void threadwise_direct_convolution_2(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths()); constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(wei_desc.GetLengths());
// register // register
Float p_in_reg[in_reg_desc.GetElementSpace()]; TInWei p_in_reg[in_reg_desc.GetElementSpace()];
Float p_wei_reg[wei_reg_desc.GetElementSpace()]; TInWei p_wei_reg[wei_reg_desc.GetElementSpace()];
// copy input tensor into register // copy input tensor into register
threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); threadwise_4d_tensor_copy(in_desc, p_in, in_reg_desc, p_in_reg, in_reg_desc.GetLengths());
...@@ -114,13 +99,13 @@ __device__ void threadwise_direct_convolution_2(InDesc, ...@@ -114,13 +99,13 @@ __device__ void threadwise_direct_convolution_2(InDesc,
// optimized for scenario where p_in and p_wei are in LDS, p_out is in register // optimized for scenario where p_in and p_wei are in LDS, p_out is in register
// break down a non-1x1 convolution into a sequence of 1x1 convolutions, // break down a non-1x1 convolution into a sequence of 1x1 convolutions,
// load 1x1 weight into register, and do 1x1 convolution in register. // load 1x1 weight into register, and do 1x1 convolution in register.
template <class Float, class InDesc, class WeiDesc, class OutDesc> template <class Data, class InDesc, class WeiDesc, class OutDesc>
__device__ void threadwise_direct_convolution_3(InDesc, __device__ void threadwise_direct_convolution_3(InDesc,
Float* const __restrict__ p_in, Data* const __restrict__ p_in,
WeiDesc, WeiDesc,
Float* const __restrict__ p_wei, Data* const __restrict__ p_wei,
OutDesc, OutDesc,
Float* __restrict__ p_out) Data* __restrict__ p_out)
{ {
constexpr auto I0 = Number<0>{}; constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{}; constexpr auto I1 = Number<1>{};
...@@ -139,8 +124,8 @@ __device__ void threadwise_direct_convolution_3(InDesc, ...@@ -139,8 +124,8 @@ __device__ void threadwise_direct_convolution_3(InDesc,
constexpr auto wei_reg_desc = make_ConstantTensorDescriptor( constexpr auto wei_reg_desc = make_ConstantTensorDescriptor(
Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{}); Sequence<wei_desc.GetLength(I0), wei_desc.GetLength(I1), 1, 1>{});
Float p_in_reg[in_reg_desc.GetElementSpace()]; Data p_in_reg[in_reg_desc.GetElementSpace()];
Float p_wei_reg[wei_reg_desc.GetElementSpace()]; Data p_wei_reg[wei_reg_desc.GetElementSpace()];
constexpr unsigned in_w_new_read = 1; constexpr unsigned in_w_new_read = 1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment