Commit 6ed96265 authored by Chao Liu

added col2im

parent ad3ac5cc
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
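// Generic kernel entry point: forwards all kernel arguments to the gridwise
// operator's Run() method, so a single wrapper serves every gridwise operation.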
template <typename GridwiseOp, typename... Xs>
__global__ void run_gridwise_operation(GridwiseOp& gridwise_op, Xs... xs)
{
gridwise_op.Run(xs...);
}
#endif
#ifndef CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#define CK_GRIDWISE_COL2IM_EB_NCHW_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
namespace ck {
// B = merge(N, Ho, Wo)
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename ColGlobalDesc,
typename ImgGlobalDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads,
index_t EPerBlock,
index_t BPerBlock,
typename BlockCopySubLengths_E_B,
typename BlockCopyClusterLengths_E_B,
typename BlockCopyThreadClusterArrangeOrder,
typename BlockCopySrcAccessOrder,
typename BlockCopyDstAccessOrder,
index_t BlockCopyDataPerAccess_B>
struct GridwiseCol2Im_eb_nchw
{
__device__ void Run(const Float* const __restrict__ p_col_global,
Float* const __restrict__ p_img_global) const
{
constexpr auto col_e_b_global_desc = ColGlobalDesc{};
constexpr auto img_n_c_hi_wi_global_desc = ImgGlobalDesc{};
constexpr index_t N = img_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = img_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = img_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = img_n_c_hi_wi_global_desc.GetLengths()[3];
constexpr index_t Ho = OutputSizes{}[0];
constexpr index_t Wo = OutputSizes{}[1];
constexpr index_t Y = FilterSizes{}[0];
constexpr index_t X = FilterSizes{}[1];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
constexpr index_t E = C * Y * X;
constexpr index_t B = N * Ho * Wo;
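// E indexes the (c, y, x) element of the filter window, B indexes the (n, ho, wo)
// output position; col2im scatters col[E, B] back into img[N, C, Hi, Wi]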
// sanity-check for vectorized memory load
static_assert((Wo == 1 || (ConvStrideW == 1 || BlockCopyDataPerAccess_B == 1)) &&
(X == 1 || ConvDilationW % BlockCopyDataPerAccess_B == 0),
"wrong! aligment requirement for vectorized global load of input tensor will "
"be violated");
// divide block work by [E, B]
static_assert(E % EPerBlock == 0 && B % BPerBlock == 0,
"wrong! cannot divide work evenly among block");
constexpr index_t EBlockWork = E / EPerBlock;
constexpr index_t BBlockWork = B / BPerBlock;
constexpr auto block_work_desc =
make_cluster_descriptor(Sequence<EBlockWork, BBlockWork>{});
const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());
const index_t e_block_data_on_global = block_work_id[0] * EPerBlock;
const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;
// construct img_eb_global_desc
constexpr auto img_n_c_hip_wip_global_desc = transform_tensor_descriptor(
img_n_c_hi_wi_global_desc,
make_tuple(
PassThrough<N>{}, PassThrough<C>{}, Pad<Sequence<Hi, Wi>, LeftPads, RightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr auto img_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
img_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto img_e_b_global_desc = transform_tensor_descriptor(
img_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
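// the padded image is now exposed as a 2D [E, B] view (via pad -> embed -> merge),
// so an [E, B] tile copied from the column matrix lands on the matching image elements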
// blockwise atomic accumulation
auto blockwise_copy = BlockwiseGenericTensorSliceCopy_v4<BlockSize,
decltype(col_e_b_global_desc),
decltype(img_e_b_global_desc),
Sequence<EPerBlock, BPerBlock>,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
1,
1,
BlockCopyDataPerAccess_B,
BlockCopyDataPerAccess_B>(
{e_block_data_on_global, b_block_data_on_global},
{e_block_data_on_global, b_block_data_on_global});
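// overlapping filter windows mean different (e, b) entries can map to the same
// (n, c, hi, wi) image element, so the destination store must accumulate rather than
// overwrite; this commit switches the blockwise copy's store path to atomic add (see below)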
// blockwise copy
blockwise_copy.Run(p_col_global, p_img_global);
}
};
} // namespace ck
#endif
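For reference, the scatter this kernel performs can be written as a plain host-side loop nest. The sketch below is illustrative only (it is not the project's host_col2im): it assumes packed NCHW and [E, B] layouts with E = (c*Y + y)*X + x and B = (n*Ho + ho)*Wo + wo, and assumes img is zero-initialized by the caller.

// Naive col2im reference over plain arrays (illustrative sketch).
// Padding is handled by skipping window elements that fall outside the image.
void col2im_reference(const float* col, float* img,
                      int N, int C, int Hi, int Wi,
                      int Y, int X, int Ho, int Wo,
                      int stride_h, int stride_w,
                      int dilation_h, int dilation_w,
                      int pad_h, int pad_w)
{
    for(int n = 0; n < N; ++n)
        for(int c = 0; c < C; ++c)
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                    for(int ho = 0; ho < Ho; ++ho)
                        for(int wo = 0; wo < Wo; ++wo)
                        {
                            const int hi = ho * stride_h + y * dilation_h - pad_h;
                            const int wi = wo * stride_w + x * dilation_w - pad_w;

                            if(hi < 0 || hi >= Hi || wi < 0 || wi >= Wi)
                                continue; // this window element lies in the padding

                            const int e = (c * Y + y) * X + x;
                            const int b = (n * Ho + ho) * Wo + wo;

                            // several (e, b) entries accumulate into the same image element
                            img[((n * C + c) * Hi + hi) * Wi + wi] +=
                                col[e * (N * Ho * Wo) + b];
                        }
}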
@@ -134,8 +134,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
         }
         else
         {
+#if 0 // debug
             mThreadwiseStore.Run(
                 p_thread_buffer, p_block_dst, thread_buffer_address_space, block_dst_address_space);
+#else
+            constexpr auto True = integral_constant<bool, true>{};
+
+            mThreadwiseStore.Run(p_thread_buffer,
+                                 p_block_dst,
+                                 thread_buffer_address_space,
+                                 block_dst_address_space,
+                                 True);
+#endif
         }
     }
@@ -69,11 +69,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
     template <typename SrcData,
               typename DstData,
               AddressSpace SrcAddressSpace,
-              AddressSpace DstAddressSpace>
+              AddressSpace DstAddressSpace,
+              bool DoAtomicAdd = false>
     __device__ void Run(const SrcData* p_src,
                         DstData* p_dst,
                         integral_constant<AddressSpace, SrcAddressSpace>,
-                        integral_constant<AddressSpace, DstAddressSpace>) const
+                        integral_constant<AddressSpace, DstAddressSpace>,
+                        integral_constant<bool, DoAtomicAdd> do_atomic_add =
+                            integral_constant<bool, DoAtomicAdd>{}) const
     {
         using src_vector_t = typename vector_type<SrcData, SrcDataPerAccess>::MemoryType;
         using dst_vector_t = typename vector_type<DstData, DstDataPerAccess>::MemoryType;
@@ -160,21 +163,27 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 // has the same padding situation
                 if(dst_coord.IsUpperIndexMappedToValidOffset())
                 {
-                    static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+                    static_if<!DoAtomicAdd>{}([&](auto) {
+                        static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
                             amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                                 *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                                 fwd(p_dst),
                                 dst_coord.GetOffset(),
                                 0);
 #else
                             *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                                 *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
 #endif
+                        }).Else([&](auto) {
+                            // dst can be all kinds of memory-space
+                            *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
+                                *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                        });
                     }).Else([&](auto) {
-                        // dst can be all kinds of memory-space
-                        *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
-                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
+                        atomicAdd(
+                            reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]),
+                            *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]));
                     });
                 }
             }
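The new DoAtomicAdd path exists because col2im is a scatter with overlap: several threads may write the same image element, and plain overwrite stores would drop contributions. The following standalone sketch shows the idea; it is illustrative only, not the library's API, and assumes a scalar float destination (the code above applies atomicAdd to its vector type).

// Illustrative sketch: compile-time choice between an overwrite store and an
// atomic accumulation, mirroring the DoAtomicAdd switch above. The flag is a
// template parameter, so the compiler can drop the untaken branch.
template <bool DoAtomicAdd>
__device__ void store_or_accumulate(float* p_dst, float value)
{
    if(DoAtomicAdd) // resolved at compile time; static_if plays this role in the library
    {
        atomicAdd(p_dst, value); // several threads may target the same image element
    }
    else
    {
        *p_dst = value; // the element is written by exactly one thread
    }
}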
#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_col2im_eb_nchw.hpp"
template <typename T,
typename ColDesc,
typename ImgDesc,
typename FilterSizes,
typename OutputSizes,
typename ConvStrides,
typename ConvDilations,
typename LeftPads,
typename RightPads>
void device_col2im_eb_nchw(ColDesc,
const Tensor<T>& col_eb,
ImgDesc,
Tensor<T>& img_nchw,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
std::size_t nrepeat)
{
using namespace ck;
constexpr auto col_eb_desc = ColDesc{};
constexpr auto img_nchw_desc = ImgDesc{};
constexpr index_t N = img_nchw_desc.GetLengths()[0];
constexpr index_t C = img_nchw_desc.GetLengths()[1];
constexpr index_t Hi = img_nchw_desc.GetLengths()[2];
constexpr index_t Wi = img_nchw_desc.GetLengths()[3];
constexpr index_t E = col_eb_desc.GetLengths()[0];
constexpr index_t B = col_eb_desc.GetLengths()[1];
std::size_t data_sz = sizeof(T);
DeviceMem col_eb_device_buf(data_sz * col_eb.mDesc.GetElementSpace());
DeviceMem img_nchw_device_buf(data_sz * img_nchw.mDesc.GetElementSpace());
col_eb_device_buf.ToDevice(col_eb.mData.data());
img_nchw_device_buf.ToDevice(img_nchw.mData.data());
#if 1
constexpr index_t BlockSize = 256;
constexpr index_t EPerBlock = 128;
constexpr index_t BPerBlock = 128;
using BlockCopySubLengths_E_B = Sequence<8, 8>;
using BlockCopyClusterLengths_E_B = Sequence<16, 16>;
using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
using BlockCopySrcAccessOrder = Sequence<0, 1>; // [E, B]
using BlockCopyDstAccessOrder = Sequence<0, 1>; // [E, B]
constexpr index_t BlockCopyDataPerAccess_B = 1;
#endif
constexpr index_t GridSize =
((E + EPerBlock - 1) / EPerBlock) * ((B + BPerBlock - 1) / BPerBlock);
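// one workgroup handles one EPerBlock x BPerBlock tile of the [E, B] column matrix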
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto gridwise_col2im = GridwiseCol2Im_eb_nchw<GridSize,
BlockSize,
T,
ColDesc,
ImgDesc,
FilterSizes,
OutputSizes,
ConvStrides,
ConvDilations,
LeftPads,
RightPads,
EPerBlock,
BPerBlock,
BlockCopySubLengths_E_B,
BlockCopyClusterLengths_E_B,
BlockCopyThreadClusterArrangeOrder,
BlockCopySrcAccessOrder,
BlockCopyDstAccessOrder,
BlockCopyDataPerAccess_B>{};
for(index_t i = 0; i < nrepeat; ++i)
{
float time = launch_kernel(run_gridwise_operation<decltype(gridwise_col2im),
const T* const __restrict__,
T* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
gridwise_col2im,
const_cast<const T* const __restrict__>(
static_cast<T*>(col_eb_device_buf.GetDeviceBuffer())),
const_cast<T* const __restrict__>(
static_cast<T*>(img_nchw_device_buf.GetDeviceBuffer())));
printf("Elapsed time : %f ms\n", time);
usleep(std::min(time * 1000, float(10000)));
}
img_nchw_device_buf.FromDevice(img_nchw.mData.data());
}
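The tuning parameters above are coupled: per dimension, BlockCopySubLengths_E_B times BlockCopyClusterLengths_E_B must reproduce the EPerBlock x BPerBlock tile (8 * 16 = 128 in both E and B), and the cluster lengths must multiply to BlockSize (16 * 16 = 256). Below is a sketch of compile-time checks one could add inside device_col2im_eb_nchw, right after the tuning block; it is illustrative and relies on Sequence's constexpr operator[] as used elsewhere in this commit.

// Illustrative consistency checks for the block-copy tuning parameters above.
static_assert(BlockCopySubLengths_E_B{}[0] * BlockCopyClusterLengths_E_B{}[0] == EPerBlock,
              "sub-lengths x cluster-lengths must cover EPerBlock");
static_assert(BlockCopySubLengths_E_B{}[1] * BlockCopyClusterLengths_E_B{}[1] == BPerBlock,
              "sub-lengths x cluster-lengths must cover BPerBlock");
static_assert(BlockCopyClusterLengths_E_B{}[0] * BlockCopyClusterLengths_E_B{}[1] == BlockSize,
              "thread cluster must use exactly BlockSize threads");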
@@ -2,7 +2,7 @@
 #include <unistd.h>
 #include "device.hpp"
 #include "tensor.hpp"
-#include "gridwise_convolution_kernel_wrapper.hpp"
+#include "gridwise_operation_wrapper.hpp"
 #include "convolution_common.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
@@ -221,13 +221,20 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     for(index_t i = 0; i < nrepeat; ++i)
     {
-        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
+        float time = launch_kernel(run_gridwise_operation<decltype(gridwise_conv),
+                                                          const T* const __restrict__,
+                                                          const T* const __restrict__,
+                                                          T* const __restrict__>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
                                    0,
-                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
-                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));
+                                   gridwise_conv,
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer())),
+                                   const_cast<const T* const __restrict__>(
+                                       static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer())),
+                                   const_cast<T* const __restrict__>(
+                                       static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer())));

         printf("Elapsed time : %f ms, %f TFlop/s\n",
                time,
@@ -37,7 +37,7 @@ void host_col2im(const Tensor<T>& in_eb,
         {
             int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

-            if(h_tmp % ConvStrides{}[0] == 0)
+            if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
             {
                 int ho = h_tmp / ConvStrides{}[0];
@@ -45,7 +45,7 @@ void host_col2im(const Tensor<T>& in_eb,
                 {
                     int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];

-                    if(w_tmp % ConvStrides{}[1] == 0)
+                    if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
                     {
                         int wo = w_tmp / ConvStrides{}[1];
@@ -5,10 +5,12 @@
 struct GeneratorTensor_1
 {
+    int value = 1;
+
     template <class... Is>
     double operator()(Is... is)
     {
-        return 1;
+        return value;
     }
 };
@@ -13,26 +13,26 @@
 #include "device_tensor.hpp"
 #include "conv_common.hpp"
 #include "host_col2im.hpp"
-//#include "device_col2im.hpp"
+#include "device_col2im_eb_nchw.hpp"

 int main(int argc, char* argv[])
 {
     using namespace ck;

 #if 1
-    constexpr index_t N = 1;
-    constexpr index_t C = 1;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K = 1;
-    constexpr index_t Y = 3;
-    constexpr index_t X = 3;
+    constexpr index_t N = 2;
+    constexpr index_t C = 8;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 4;
+    constexpr index_t X = 4;

     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

     using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using RightPads = Sequence<2, 2>;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N = 64;
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
     using RightPads = Sequence<0, 3>;
 #endif

-    constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
     constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
     constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

     constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
     constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

-    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+    constexpr auto col_eb_desc =
+        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

     using FilterSizes = Sequence<Y, X>;
     using OutputSizes = Sequence<HO, WO>;

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
+    ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
     print_sequence("FilterSizes", FilterSizes{});
     print_sequence("OutputSizes", OutputSizes{});
     print_sequence("LeftPads", LeftPads{});
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
     print_sequence("ConvStrides", ConvStrides{});
     print_sequence("ConvDilations", ConvDilations{});

-    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
-    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
+    Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
+    Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));

     std::size_t num_thread = std::thread::hardware_concurrency();
@@ -339,35 +340,35 @@ int main(int argc, char* argv[])
     }

     bool do_verification = atoi(argv[1]);
-    index_t nrepeat = atoi(argv[2]);
+    std::size_t nrepeat = atoi(argv[2]);

     if(do_verification)
     {
 #if 1
-        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #else
-        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif
     }

-#if 0
-    device_col2im(in_eb_desc,
-                  in_eb,
-                  in_nchw_desc,
-                  in_nchw_device,
-                  FilterSizes{},
-                  OutputSizes{},
-                  ConvStrides{},
-                  ConvDilations{},
-                  LeftPads{},
-                  RightPads{},
-                  nrepeat);
+#if 1
+    device_col2im_eb_nchw(col_eb_desc,
+                          col_eb,
+                          img_nchw_desc,
+                          img_nchw_device,
+                          FilterSizes{},
+                          OutputSizes{},
+                          ConvStrides{},
+                          ConvDilations{},
+                          LeftPads{},
+                          RightPads{},
+                          nrepeat);
 #endif

     if(do_verification)
     {
-        host_col2im(in_eb,
-                    in_nchw_host,
+        host_col2im(col_eb,
+                    img_nchw_host,
                     FilterSizes{},
                     OutputSizes{},
                     ConvStrides{},
@@ -375,12 +376,12 @@ int main(int argc, char* argv[])
                     LeftPads{},
                     RightPads{});

-        check_error(in_nchw_host, in_nchw_device);
+        check_error(img_nchw_host, img_nchw_device);

 #if 1
-        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
 #endif
     }
 }
@@ -13,26 +13,26 @@
 #include "device_tensor.hpp"
 #include "conv_common.hpp"
 #include "host_col2im.hpp"
-//#include "device_col2im.hpp"
+#include "device_col2im_eb_nchw.hpp"

 int main(int argc, char* argv[])
 {
     using namespace ck;

 #if 1
-    constexpr index_t N = 1;
-    constexpr index_t C = 1;
-    constexpr index_t HI = 17;
-    constexpr index_t WI = 17;
-    constexpr index_t K = 1;
-    constexpr index_t Y = 3;
-    constexpr index_t X = 3;
+    constexpr index_t N = 2;
+    constexpr index_t C = 8;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
+    constexpr index_t K = 128;
+    constexpr index_t Y = 4;
+    constexpr index_t X = 4;

     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

     using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using RightPads = Sequence<2, 2>;
 #elif 0
     // 3x3, 34x34
     constexpr index_t N = 64;
@@ -303,21 +303,22 @@ int main(int argc, char* argv[])
     using RightPads = Sequence<0, 3>;
 #endif

-    constexpr auto in_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
+    constexpr auto img_nchw_desc = make_native_tensor_descriptor_packed(Sequence<N, C, HI, WI>{});
     constexpr auto wei_kcyx_desc = make_native_tensor_descriptor_packed(Sequence<K, C, Y, X>{});
     constexpr auto out_nkhw_desc = get_convolution_output_default_4d_tensor_descriptor(
-        in_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});
+        img_nchw_desc, wei_kcyx_desc, ConvStrides{}, ConvDilations{}, LeftPads{}, RightPads{});

     constexpr index_t HO = out_nkhw_desc.GetLengths()[2];
     constexpr index_t WO = out_nkhw_desc.GetLengths()[3];

-    auto in_eb_desc = make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});
+    constexpr auto col_eb_desc =
+        make_native_tensor_descriptor_packed(Sequence<C * Y * X, N * HO * WO>{});

     using FilterSizes = Sequence<Y, X>;
     using OutputSizes = Sequence<HO, WO>;

-    ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
-    ostream_ConstantTensorDescriptor(in_eb_desc, std::cout << "in_eb_desc: ");
+    ostream_ConstantTensorDescriptor(col_eb_desc, std::cout << "col_eb_desc: ");
+    ostream_ConstantTensorDescriptor(img_nchw_desc, std::cout << "img_nchw_desc: ");
     print_sequence("FilterSizes", FilterSizes{});
     print_sequence("OutputSizes", OutputSizes{});
     print_sequence("LeftPads", LeftPads{});
@@ -326,9 +327,9 @@ int main(int argc, char* argv[])
     print_sequence("ConvStrides", ConvStrides{});
     print_sequence("ConvDilations", ConvDilations{});

-    Tensor<float> in_eb(make_TensorDescriptor(in_eb_desc));
-    Tensor<float> in_nchw_host(make_TensorDescriptor(in_nchw_desc));
-    Tensor<float> in_nchw_device(make_TensorDescriptor(in_nchw_desc));
+    Tensor<float> col_eb(make_TensorDescriptor(col_eb_desc));
+    Tensor<float> img_nchw_host(make_TensorDescriptor(img_nchw_desc));
+    Tensor<float> img_nchw_device(make_TensorDescriptor(img_nchw_desc));

     std::size_t num_thread = std::thread::hardware_concurrency();
@@ -339,35 +340,37 @@ int main(int argc, char* argv[])
     }

     bool do_verification = atoi(argv[1]);
-    index_t nrepeat = atoi(argv[2]);
+    std::size_t nrepeat = atoi(argv[2]);

     if(do_verification)
     {
-#if 1
-        in_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+#if 0
+        col_eb.GenerateTensorValue(GeneratorTensor_1{1}, num_thread);
+        img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
 #else
-        in_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        img_nchw_device.GenerateTensorValue(GeneratorTensor_1{0}, num_thread);
 #endif
     }

-#if 0
-    device_col2im(in_eb_desc,
-                  in_eb,
-                  in_nchw_desc,
-                  in_nchw_device,
-                  FilterSizes{},
-                  OutputSizes{},
-                  ConvStrides{},
-                  ConvDilations{},
-                  LeftPads{},
-                  RightPads{},
-                  nrepeat);
+#if 1
+    device_col2im_eb_nchw(col_eb_desc,
+                          col_eb,
+                          img_nchw_desc,
+                          img_nchw_device,
+                          FilterSizes{},
+                          OutputSizes{},
+                          ConvStrides{},
+                          ConvDilations{},
+                          LeftPads{},
+                          RightPads{},
+                          nrepeat);
 #endif

     if(do_verification)
     {
-        host_col2im(in_eb,
-                    in_nchw_host,
+        host_col2im(col_eb,
+                    img_nchw_host,
                     FilterSizes{},
                     OutputSizes{},
                     ConvStrides{},
@@ -375,12 +378,12 @@ int main(int argc, char* argv[])
                     LeftPads{},
                     RightPads{});

-        check_error(in_nchw_host, in_nchw_device);
+        check_error(img_nchw_host, img_nchw_device);

-#if 1
-        LogRange(std::cout << "in_eb : ", in_eb.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_host : ", in_nchw_host.mData, ",") << std::endl;
-        LogRange(std::cout << "in_nchw_device : ", in_nchw_device.mData, ",") << std::endl;
+#if 0
+        LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
+        LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
 #endif
     }
 }
@@ -29,7 +29,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 1
+#if 0
     constexpr index_t N = 8;
     constexpr index_t C = 32;
     constexpr index_t HI = 28;
@@ -393,7 +393,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_deprecated(in_nchw_desc,
                                                                     in_nchw,
                                                                     wei_kcyx_desc,
@@ -403,7 +403,7 @@ int main(int argc, char* argv[])
                                                                     ConvStrides{},
                                                                     ConvDilations{},
                                                                     nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
@@ -445,7 +445,7 @@ int main(int argc, char* argv[])
                                                          ConvStrides{},
                                                          ConvDilations{},
                                                          nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,