"...ops/git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "e9029c0e6dd6f30216c3ac65502357a06d99e8a8"
Commit 04c5527d authored by Chao Liu's avatar Chao Liu
Browse files

refactor

parent 5fd40ad7
......@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
#if 1
if(Y == 3 && X == 3)
{
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
......@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
}
check_error(out_nkhw_host, out_nkhw_device);
#endif
#if 0
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
......
......@@ -373,6 +373,8 @@ template <unsigned BlockSize,
unsigned DataPerRead>
struct Blockwise2dTensorCopy3
{
using vector_t = typename vector_type<Float, DataPerRead>::type;
unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset;
......@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
using Float2 = float2;
using Float4 = float4;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
auto f_copy = [&](unsigned iloop) {
if(DataPerRead == 1)
{
p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] =
p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
}
else if(DataPerRead == 2)
{
*(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
}
else if(DataPerRead == 4)
{
*(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
}
else
{
assert(false);
}
*(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
};
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
......@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
Float* p_clipboard) const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
using Float2 = float2;
using Float4 = float4;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
auto f_copy = [&](unsigned iloop) {
if(DataPerRead == 1)
{
p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
}
else if(DataPerRead == 2)
{
*(reinterpret_cast<Float2*>(p_clipboard + iloop * 2)) =
*(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
}
else if(DataPerRead == 4)
{
*(reinterpret_cast<Float4*>(p_clipboard + iloop * 4)) =
*(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
}
else
{
assert(false);
}
*(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
*(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
iloop * src_loop_stride));
};
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
......@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
Float* __restrict__ p_dst) const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
using Float2 = float2;
using Float4 = float4;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
......@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
auto f_copy = [&](unsigned iloop) {
if(DataPerRead == 1)
{
p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop];
}
else if(DataPerRead == 2)
{
*(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const Float2*>(p_clipboard + iloop * 2));
}
else if(DataPerRead == 4)
{
*(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const Float4*>(p_clipboard + iloop * 4));
}
else
{
assert(false);
}
*(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
*(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
};
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
......
......@@ -349,6 +349,8 @@ template <unsigned BlockSize,
unsigned DataPerRead>
struct Blockwise4dTensorCopy3
{
using vector_t = typename vector_type<Float, DataPerRead>::type;
unsigned mSrcMyThreadOffset;
unsigned mDstMyThreadOffset;
......@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
{
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
using Float2 = float2;
using Float4 = float4;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
......@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
iloop_d2 * thread_per_d2,
iloop_d3 * thread_per_d3 * DataPerRead);
if(DataPerRead == 1)
{
p_dst[dst_offset + mDstMyThreadOffset] =
p_src[src_offset + mSrcMyThreadOffset];
}
else if(DataPerRead == 2)
{
*(reinterpret_cast<Float2*>(p_dst + dst_offset + mDstMyThreadOffset)) =
*(reinterpret_cast<const Float2*>(p_src + src_offset +
mSrcMyThreadOffset));
}
else if(DataPerRead == 4)
{
*(reinterpret_cast<Float4*>(p_dst + dst_offset + mDstMyThreadOffset)) =
*(reinterpret_cast<const Float4*>(p_src + src_offset +
mSrcMyThreadOffset));
}
else
{
assert(false);
}
*(reinterpret_cast<vector_t*>(p_dst + dst_offset + mDstMyThreadOffset)) =
*(reinterpret_cast<const vector_t*>(p_src + src_offset +
mSrcMyThreadOffset));
}
}
}
......
......@@ -16,6 +16,81 @@ struct is_same<T, T>
static const bool value = true;
};
// Trait mapping an element type T and an element count N to the type used to move N
// elements of T at once (the blockwise tensor copies alias it as `vector_t`).
//
// The primary template deliberately declares no `type` member: only the explicitly
// specialized (T, N) pairs are usable, and naming `vector_type<T, N>::type` for any other
// pair is a compile-time error.
template <class T, unsigned N>
struct vector_type
{
};
// vector_type specializations for float: N = 1, 2, 4 map to float, float2, float4.

// One float: the scalar type itself.
template <>
struct vector_type<float, 1>
{
using type = float;
};
// Two floats: float2.
template <>
struct vector_type<float, 2>
{
using type = float2;
};
// Four floats: float4.
template <>
struct vector_type<float, 4>
{
using type = float4;
};
// Compiled out (#if 0): vector_type specializations keyed on half_float::half.
// Unlike the specializations keyed on `half`, the 2-element case here maps to `float`
// rather than `half2`; the 4- and 8-element cases map to float2 and float4.
// NOTE(review): float/float2/float4 are presumably used purely as same-sized storage for
// 2/4/8 16-bit values -- confirm before enabling.
#if 0
template <>
struct vector_type<half_float::half, 1>
{
using type = half_float::half;
};
template <>
struct vector_type<half_float::half, 2>
{
using type = float;
};
template <>
struct vector_type<half_float::half, 4>
{
using type = float2;
};
template <>
struct vector_type<half_float::half, 8>
{
using type = float4;
};
#endif
// Enabled (#if 1): vector_type specializations keyed on `half`.
// N = 1 and 2 map to half and half2; N = 4 and 8 map to float2 and float4.
// NOTE(review): float2/float4 are presumably used only as same-sized storage for 4/8
// 16-bit values (8 and 16 bytes), not as arithmetic types -- confirm.
// NOTE(review): `half`/`half2` must be declared by the active backend's headers; verify
// this holds for every DEVICE_BACKEND_* configuration that compiles this block.
#if 1
// One half: the scalar type itself.
template <>
struct vector_type<half, 1>
{
using type = half;
};
// Two halves: half2.
template <>
struct vector_type<half, 2>
{
using type = half2;
};
// Four halves: carried in a float2.
template <>
struct vector_type<half, 4>
{
using type = float2;
};
// Eight halves: carried in a float4.
template <>
struct vector_type<half, 8>
{
using type = float4;
};
#endif
template <class T, T N>
struct integral_constant
{
......
......@@ -4,8 +4,10 @@
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "half.hpp"
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "cuda_fp16.h"
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment