Commit 04c5527d authored by Chao Liu

refactor

parent 5fd40ad7
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
+#if 1
         if(Y == 3 && X == 3)
         {
            host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
            host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
         }
         check_error(out_nkhw_host, out_nkhw_device);
+#endif
 #if 0
         LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......
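The new #if 1 guard simply makes the host-side reference check removable at compile time. A minimal, self-contained sketch of the same pattern; the vector-based reference_conv and the simplified check_error below are stand-ins for the driver's Tensor objects and host_*_convolution helpers, not code from this repository:

// Toy driver: flip the #if to 0 to skip host verification entirely.
#include <cassert>
#include <vector>

static void reference_conv(const std::vector<float>& in, std::vector<float>& out) { out = in; }

static void check_error(const std::vector<float>& host, const std::vector<float>& dev)
{
    assert(host == dev); // the real check_error reports numerical error instead of asserting
}

int main()
{
    std::vector<float> in(16, 1.0f), out_host, out_device(16, 1.0f);
#if 1 // same role as the guard added in the hunk above
    reference_conv(in, out_host);
    check_error(out_host, out_device);
#endif
    return 0;
}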
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise2dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] =
-                    p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
         };

         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
     __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
                                              Float* p_clipboard) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_clipboard + iloop * 2)) =
-                    *(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_clipboard + iloop * 4)) =
-                    *(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
-                                                      iloop * src_loop_stride));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
+                *(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
+                                                    iloop * src_loop_stride));
         };

         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
     __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
                                               Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
         constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;

         auto f_copy = [&](unsigned iloop) {
-            if(DataPerRead == 1)
-            {
-                p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop];
-            }
-            else if(DataPerRead == 2)
-            {
-                *(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float2*>(p_clipboard + iloop * 2));
-            }
-            else if(DataPerRead == 4)
-            {
-                *(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
-                    *(reinterpret_cast<const Float4*>(p_clipboard + iloop * 4));
-            }
-            else
-            {
-                assert(false);
-            }
+            *(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
+                *(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
         };

         for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
......
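The three member functions above all change in the same way: the Float2/Float4 if/else chain is replaced by a single copy through the class-level vector_t alias, so the vector width is resolved at compile time from DataPerRead. A standalone sketch of the pattern, assuming a float-only build as before; copy_vectorized is an illustrative name, not a function in this repository, and both pointers are assumed to be aligned to the vector width, which the old DataPerRead == 2/4 branches also required implicitly:

#include "hip/hip_runtime.h"

template <class T, unsigned N>
struct vector_type;

template <> struct vector_type<float, 1> { using type = float;  };
template <> struct vector_type<float, 2> { using type = float2; };
template <> struct vector_type<float, 4> { using type = float4; };

template <class Float, unsigned DataPerRead>
__device__ void copy_vectorized(const Float* __restrict__ p_src, Float* __restrict__ p_dst)
{
    using vector_t = typename vector_type<Float, DataPerRead>::type;

    // One vector load and one vector store instead of branching on DataPerRead;
    // an unsupported width now fails at compile time (vector_type<> has no
    // ::type for it), which replaces the old runtime assert(false).
    *(reinterpret_cast<vector_t*>(p_dst)) = *(reinterpret_cast<const vector_t*>(p_src));
}

One behavioral detail worth noting: in RunLoadRegisterClipboard and RunStoreRegisterClipboard the clipboard offset is now hard-coded as iloop * 4, which matches the old DataPerRead == 4 branch but not the == 1 and == 2 branches, so those widths now consume four floats of clipboard per iteration.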
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
           unsigned DataPerRead>
 struct Blockwise4dTensorCopy3
 {
+    using vector_t = typename vector_type<Float, DataPerRead>::type;
+
     unsigned mSrcMyThreadOffset;
     unsigned mDstMyThreadOffset;
@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
     __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
     {
-        static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
-
-        using Float2 = float2;
-        using Float4 = float4;
-
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
                                                            iloop_d2 * thread_per_d2,
                                                            iloop_d3 * thread_per_d3 * DataPerRead);

-                    if(DataPerRead == 1)
-                    {
-                        p_dst[dst_offset + mDstMyThreadOffset] =
-                            p_src[src_offset + mSrcMyThreadOffset];
-                    }
-                    else if(DataPerRead == 2)
-                    {
-                        *(reinterpret_cast<Float2*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                            *(reinterpret_cast<const Float2*>(p_src + src_offset +
-                                                              mSrcMyThreadOffset));
-                    }
-                    else if(DataPerRead == 4)
-                    {
-                        *(reinterpret_cast<Float4*>(p_dst + dst_offset + mDstMyThreadOffset)) =
-                            *(reinterpret_cast<const Float4*>(p_src + src_offset +
-                                                              mSrcMyThreadOffset));
-                    }
-                    else
-                    {
-                        assert(false);
-                    }
+                    *(reinterpret_cast<vector_t*>(p_dst + dst_offset + mDstMyThreadOffset)) =
+                        *(reinterpret_cast<const vector_t*>(p_src + src_offset +
+                                                            mSrcMyThreadOffset));
                 }
             }
         }
......
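Blockwise4dTensorCopy3 follows the same refactor, with vectorization applied only along the innermost dimension d3 (note the iloop_d3 * thread_per_d3 * DataPerRead term above). An illustrative compile-time guard for that usage; is_valid_data_per_read and CopyLength3 are hypothetical names, not identifiers from this repository:

template <unsigned CopyLength3, unsigned DataPerRead>
constexpr bool is_valid_data_per_read()
{
    // For float, only widths 1, 2 and 4 have a vector_type specialization, and
    // the width must divide the innermost copy length so no read straddles a row.
    return (DataPerRead == 1 || DataPerRead == 2 || DataPerRead == 4) &&
           (CopyLength3 % DataPerRead == 0);
}

static_assert(is_valid_data_per_read<32, 4>(), "32 floats -> 8 float4 reads per row");
static_assert(!is_valid_data_per_read<30, 4>(), "4 does not divide 30");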
@@ -16,6 +16,81 @@ struct is_same<T, T>
     static const bool value = true;
 };

+template <class T, unsigned N>
+struct vector_type
+{
+};
+
+template <>
+struct vector_type<float, 1>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<float, 2>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<float, 4>
+{
+    using type = float4;
+};
+
+#if 0
+template <>
+struct vector_type<half_float::half, 1>
+{
+    using type = half_float::half;
+};
+
+template <>
+struct vector_type<half_float::half, 2>
+{
+    using type = float;
+};
+
+template <>
+struct vector_type<half_float::half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half_float::half, 8>
+{
+    using type = float4;
+};
+#endif
+
+#if 1
+template <>
+struct vector_type<half, 1>
+{
+    using type = half;
+};
+
+template <>
+struct vector_type<half, 2>
+{
+    using type = half2;
+};
+
+template <>
+struct vector_type<half, 4>
+{
+    using type = float2;
+};
+
+template <>
+struct vector_type<half, 8>
+{
+    using type = float4;
+};
+#endif
+
 template <class T, T N>
 struct integral_constant
 {
......
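The mapping defined above can be summarized with a few compile-time checks, using the is_same helper from the same header. This is illustrative only and assumes the backend's float2/float4 and half/half2 types are already in scope, which is what the include changes in the next hunk arrange:

static_assert(is_same<vector_type<float, 1>::type, float>::value, "");
static_assert(is_same<vector_type<float, 2>::type, float2>::value, "");
static_assert(is_same<vector_type<float, 4>::type, float4>::value, "");

// Four and eight halfs deliberately map to float2 and float4: the byte count per
// transaction is the same (8 and 16 bytes), and the payload is reinterpreted on
// load/store rather than handled as a wide half vector.
static_assert(is_same<vector_type<half, 4>::type, float2>::value, "");
static_assert(is_same<vector_type<half, 8>::type, float4>::value, "");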
@@ -4,8 +4,10 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
+#include "half.hpp"
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#include "cuda_fp16.h"
 #endif
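The two new includes give each backend a 16-bit float type: half.hpp is a header-only, host-side half_float::half class, while cuda_fp16.h provides the half/half2 types used by the vector_type specializations on the CUDA side. A small host-only illustration of the former; nothing here is from the repository:

#include "half.hpp"
#include <iostream>

int main()
{
    half_float::half a(1.5f), b(0.25f);

    // half_float::half supports ordinary arithmetic on the host; 1.75 is exactly
    // representable in fp16, so this prints 1.75.
    std::cout << static_cast<float>(a + b) << std::endl;
    return 0;
}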