Commit c9a8e558 authored by Chao Liu

adding tensor_view

parent 8669e242
@@ -433,7 +433,7 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
 {
     constexpr index_t nsize = Sequence<Xs...>::GetSize();

-    static_assert(nsize <= 10, "wrong!");
+    static_assert(nsize <= 12, "wrong!");

     static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });
@@ -462,6 +462,13 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
     static_if<nsize == 10>{}(
         [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 11>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 12>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...);
+    });
 }

 } // namespace ck
...
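The hunk above raises the maximum printable Sequence length from 10 to 12: each static_if branch forwards the parameter pack Xs... directly into printf, so a branch needs a format string with exactly nsize "%u" specifiers, and the static_assert has to track the largest branch provided. A minimal host-side usage sketch, assuming the ck header that declares Sequence and print_Sequence is available on the include path:

// Hypothetical usage of the extended print_Sequence (illustrative only).
#include "common_header.hpp" // assumption: provides ck::Sequence and ck::print_Sequence

int main()
{
    // A 12-element Sequence is now accepted because the static_assert limit was raised to 12.
    ck::print_Sequence("lengths", ck::Sequence<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12>{});
    // prints: lengths size 12, {1 2 3 4 5 6 7 8 9 10 11 12}
    return 0;
}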
 #ifndef CONV_COMMON_HPP
 #define CONV_COMMON_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"

 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>
...
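conv_common.hpp derives the output tensor descriptor from the input and weight descriptors; its body lies outside this hunk. For reference, the conventional output-length relation such a helper has to encode is sketched below; the function name and signature are illustrative and are not taken from conv_common.hpp:

// Hedged sketch of the standard convolution output-length formula (per spatial dimension).
constexpr ck::index_t conv_out_length(ck::index_t in_len,
                                      ck::index_t filter_len,
                                      ck::index_t left_pad,
                                      ck::index_t right_pad,
                                      ck::index_t stride,
                                      ck::index_t dilation)
{
    return (in_len + left_pad + right_pad - dilation * (filter_len - 1) - 1) / stride + 1;
}
// Example: conv_out_length(28, 3, 1, 1, 1, 1) == 28 (3x3 filter, pad 1, stride 1, dilation 1).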
@@ -9,11 +9,11 @@ using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
-                                                 const Tensor<T>& in,
+                                                 const HostTensor<T>& in,
                                                  WeiDesc,
-                                                 const Tensor<T>& wei,
+                                                 const HostTensor<T>& wei,
                                                  OutDesc,
-                                                 Tensor<T>& out,
+                                                 HostTensor<T>& out,
                                                  index_t nrepeat)
 {
     std::size_t data_sz = sizeof(T);
...
@@ -12,11 +12,11 @@ using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                         WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                         OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                         index_t nrepeat)
 {
     constexpr auto I0 = Number<0>{};
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

     auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
         wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

     auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
         in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
     auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

     std::size_t data_sz = sizeof(T);
     DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
...
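The kernels above consume CHWN/CYXK layouts, so the host first reorders the NCHW input and KCYX weights through small lambdas such as f_reorder_kcyx2cyxk. Based on the make_ParallelTensorFunctor(F f, Xs... xs) factory declared in tensor.hpp and on how the v2 variant further below calls it, the reorder is presumably launched roughly as follows; the trailing dimension arguments and the call operator taking a thread count are assumptions, not lines from this commit:

// Hedged sketch: run the KCYX -> CYXK reorder lambda over all indices on host threads.
// The exact call-operator signature of make_ParallelTensorFunctor is assumed here.
auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
    wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
};

std::size_t num_thread = std::thread::hardware_concurrency();
make_ParallelTensorFunctor(f_reorder_kcyx2cyxk, K, C, Y, X)(num_thread);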
@@ -10,11 +10,11 @@ using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                         WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                         OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                         index_t nrepeat)
 {
     constexpr auto I0 = Number<0>{};
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
     auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

     auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
         wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
...
@@ -10,11 +10,11 @@ using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                         WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                         OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                         index_t nrepeat)
 {
     constexpr auto I0 = Number<0>{};
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
     auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

     make_ParallelTensorFunctor(
         [&](auto n, auto c, auto hi, auto wi) { in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi); },
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
     auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

     make_ParallelTensorFunctor(
         [&](auto k, auto c, auto y, auto x) { wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); },
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
     auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

 #if 0
     // 3x3, 34x34
...
@@ -8,11 +8,11 @@
 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                         WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                         OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                         index_t nrepeat)
 {
     using namespace ck;
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
     auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

     auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
         wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
...
@@ -13,11 +13,11 @@ template <class T,
           class ConvStrides,
           class ConvDilations>
 void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                           WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                           OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                           ConvStrides,
                                                           ConvDilations,
                                                           index_t nrepeat)
...
@@ -14,11 +14,11 @@ template <class T,
           class ConvStrides,
           class ConvDilations>
 void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                           WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                           OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                           ConvStrides,
                                                           ConvDilations,
                                                           index_t nrepeat)
...
@@ -14,11 +14,11 @@ template <class T,
           class ConvStrides,
           class ConvDilations>
 void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                           WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                           OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                           ConvStrides,
                                                           ConvDilations,
                                                           index_t nrepeat)
@@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
     constexpr index_t InBlockCopyDataPerAccess_W2 = 4;

-    using WeiBlockCopySubLengths_E_K = Sequence<2, 2>;
-    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+    using WeiBlockCopySubLengths_E_K = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;

     using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>;            // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>;            // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E = 1;
-    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif

     constexpr index_t N0 = N / (N1 * N2);
...
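The retuned weight-block copy in the second hunk keeps the covered E x K tile and the thread count unchanged and only redistributes the work so that each thread issues 4-wide vectorized reads along E instead of scalar ones; the arithmetic behind the new Sequence values is summarized below.

// Arithmetic behind the WeiBlockCopy retuning above (SubLengths x ClusterLengths per dimension):
//   old: E = 2 * 4 = 8,  K = 2 * 64  = 128,  threads = 4 * 64  = 256,  SrcDataPerRead_E = 1
//   new: E = 4 * 2 = 8,  K = 1 * 128 = 128,  threads = 2 * 128 = 256,  SrcDataPerRead_E = 4
// Same tile and thread count; per-thread K work shrinks so that the E reads can be vectorized.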
@@ -8,11 +8,11 @@ using namespace ck;
 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
-                                                           const Tensor<TInWei>& in_nchw,
+                                                           const HostTensor<TInWei>& in_nchw,
                                                            WeiDesc,
-                                                           const Tensor<TInWei>& wei_kcyx,
+                                                           const HostTensor<TInWei>& wei_kcyx,
                                                            OutDesc,
-                                                           Tensor<TOut>& out_nkhw,
+                                                           HostTensor<TOut>& out_nkhw,
                                                            index_t nrepeat)
 {
     // this suppose in / wei data type is int8x4
@@ -46,7 +46,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
     auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
     ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");

-    Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
+    HostTensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));

     auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
 #if 0
@@ -69,7 +69,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
     auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
     ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");

-    Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
+    HostTensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));

     auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
 #if 0
...
@@ -8,11 +8,11 @@ using namespace ck;
 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
-                                                              const Tensor<T>& in_nchw,
+                                                              const HostTensor<T>& in_nchw,
                                                               WeiDesc,
-                                                              const Tensor<T>& wei_kcyx,
+                                                              const HostTensor<T>& wei_kcyx,
                                                               OutDesc,
-                                                              Tensor<T>& out_nkhw,
+                                                              HostTensor<T>& out_nkhw,
                                                               LowerPads,
                                                               UpperPads,
                                                               index_t nrepeat)
@@ -42,7 +42,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
     auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
     ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

     auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
         wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
@@ -55,7 +55,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
     auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
     ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

     auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
         in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
@@ -68,7 +68,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
     auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
     ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

     std::size_t data_sz = sizeof(T);
     DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());
...
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"

 // this is ugly, only for 4d
 template <class TConstTensorDesc>
@@ -42,7 +42,7 @@ auto make_TensorDescriptor(TConstTensorDesc)
     std::initializer_list<index_t> strides = {
         desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};

-    return TensorDescriptor(lengths, strides);
+    return HostTensorDescriptor(lengths, strides);
 }

 template <class TIn,
@@ -52,9 +52,9 @@ template <class TIn,
           class ConvDilations,
           class LowerPads,
           class UpperPads>
-void host_direct_convolution(const Tensor<TIn>& in_nchw,
-                             const Tensor<TWei>& wei_kcyx,
-                             Tensor<TOut>& out_nkhw,
+void host_direct_convolution(const HostTensor<TIn>& in_nchw,
+                             const HostTensor<TWei>& wei_kcyx,
+                             HostTensor<TOut>& out_nkhw,
                              ConvStrides,
                              ConvDilations,
                              LowerPads,
@@ -99,9 +99,9 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
 }

 template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
-void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
-                                   const Tensor<TWei>& wei_kcyx,
-                                   Tensor<TOut>& out_nkhw,
+void host_winograd_3x3_convolution(const HostTensor<TIn>& in_nchw,
+                                   const HostTensor<TWei>& wei_kcyx,
+                                   HostTensor<TOut>& out_nkhw,
                                    LowerPads,
                                    UpperPads)
 {
@@ -134,11 +134,11 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
     std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
     std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;

-    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
-    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
-    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+    HostTensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    HostTensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    HostTensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    HostTensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    HostTensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});

     auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
         for(int j = 0; j < HiPerTile; ++j)
@@ -339,7 +339,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
 }

 template <class T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+void check_error(const HostTensor<T>& ref, const HostTensor<T>& result)
 {
     float error = 0;
     float max_diff = -1;
...
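host_direct_convolution is the CPU reference that check_error compares device output against; its loop nest sits outside these hunks. The sketch below illustrates what such a reference does per output element, with strides and dilations fixed at 1 and only a left pad handled; the real function receives ConvStrides/ConvDilations/pads as template parameters, so this is an assumption-laden illustration rather than the actual body:

// Hedged sketch of a direct reference convolution over HostTensor (NCHW / KCYX / NKHW).
// Only members visible in this diff are used: operator(), mDesc.GetLengths().
template <class TIn, class TWei, class TOut>
void host_direct_convolution_sketch(const HostTensor<TIn>& in_nchw,
                                    const HostTensor<TWei>& wei_kcyx,
                                    HostTensor<TOut>& out_nkhw,
                                    std::size_t left_pad_h,
                                    std::size_t left_pad_w)
{
    const auto& il = in_nchw.mDesc.GetLengths();  // {N, C, Hi, Wi}
    const auto& wl = wei_kcyx.mDesc.GetLengths(); // {K, C, Y, X}
    const auto& ol = out_nkhw.mDesc.GetLengths(); // {N, K, Ho, Wo}

    for(std::size_t n = 0; n < ol[0]; ++n)
        for(std::size_t k = 0; k < ol[1]; ++k)
            for(std::size_t ho = 0; ho < ol[2]; ++ho)
                for(std::size_t wo = 0; wo < ol[3]; ++wo)
                {
                    double v = 0;
                    for(std::size_t c = 0; c < wl[1]; ++c)
                        for(std::size_t y = 0; y < wl[2]; ++y)
                            for(std::size_t x = 0; x < wl[3]; ++x)
                            {
                                long hi = long(ho + y) - long(left_pad_h); // stride = dilation = 1
                                long wi = long(wo + x) - long(left_pad_w);
                                if(hi >= 0 && hi < long(il[2]) && wi >= 0 && wi < long(il[3]))
                                    v += double(in_nchw(n, c, std::size_t(hi), std::size_t(wi))) *
                                         double(wei_kcyx(k, c, y, x));
                            }
                    out_nkhw(n, k, ho, wo) = TOut(v);
                }
}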
-#ifndef TENSOR_HPP
-#define TENSOR_HPP
+#ifndef HOST_TENSOR_HPP
+#define HOST_TENSOR_HPP

 #include <thread>
 #include <vector>
@@ -65,24 +65,24 @@ auto construct_f_unpack_args(F, T args)
     return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
 }

-struct TensorDescriptor
+struct HostTensorDescriptor
 {
-    TensorDescriptor() = delete;
-    TensorDescriptor(std::initializer_list<std::size_t> lens);
-    TensorDescriptor(std::initializer_list<std::size_t> lens,
+    HostTensorDescriptor() = delete;
+    HostTensorDescriptor(std::initializer_list<std::size_t> lens);
+    HostTensorDescriptor(std::initializer_list<std::size_t> lens,
                          std::initializer_list<std::size_t> strides);
-    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
+    HostTensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);

     void CalculateStrides();

     template <class Range>
-    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
+    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
     {
         this->CalculateStrides();
     }

     template <class Range1, class Range2>
-    TensorDescriptor(const Range1& lens, const Range2& strides)
+    HostTensorDescriptor(const Range1& lens, const Range2& strides)
         : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
     {
     }
@@ -185,25 +185,25 @@ auto make_ParallelTensorFunctor(F f, Xs... xs)
 }

 template <class T>
-struct Tensor
+struct HostTensor
 {
     template <class X>
-    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
     {
     }

     template <class X>
-    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    HostTensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
     {
     }

     template <class X, class Y>
-    Tensor(std::vector<X> lens, std::vector<Y> strides)
+    HostTensor(std::vector<X> lens, std::vector<Y> strides)
         : mDesc(lens, strides), mData(mDesc.GetElementSpace())
     {
     }

-    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
+    HostTensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

     template <class G>
     void GenerateTensorValue(G g, std::size_t num_thread = 1)
@@ -265,7 +265,7 @@ struct Tensor
     typename std::vector<T>::const_iterator end() const { return mData.end(); }

-    TensorDescriptor mDesc;
+    HostTensorDescriptor mDesc;
     std::vector<T> mData;
 };
...
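After the rename, host code builds a HostTensor either straight from lengths or from a compile-time descriptor via make_TensorDescriptor, fills it with one of the generators, and sizes device buffers from the descriptor's element space. A minimal sketch of that flow, using only members visible in this diff (the generator type and concrete sizes are illustrative):

// Hedged usage sketch of the renamed host-side types.
#include "tensor.hpp" // still included under this name elsewhere in the commit

void host_tensor_usage_sketch()
{
    HostTensorDescriptor desc({8, 3, 32, 32}); // packed NCHW lengths; strides derived internally
    HostTensor<float> in_nchw(desc);

    std::size_t num_thread = std::thread::hardware_concurrency();
    in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); // random fill, as in the driver

    // Device buffers in this commit are sized from the descriptor's element space:
    std::size_t bytes = sizeof(float) * in_nchw.mDesc.GetElementSpace();
    (void)bytes;
}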
@@ -4,7 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include "config.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"
 #include "device.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
@@ -473,10 +473,10 @@ int main(int argc, char* argv[])
     using in_data_t = float;
     using out_data_t = float;

-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+    HostTensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    HostTensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    HostTensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    HostTensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));

     std::size_t num_thread = std::thread::hardware_concurrency();
@@ -491,7 +491,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
@@ -503,6 +503,9 @@ int main(int argc, char* argv[])
 #elif 1
         in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
         in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);
...
@@ -3,17 +3,18 @@
 #include "tensor.hpp"

-TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
+HostTensorDescriptor::HostTensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
 {
     this->CalculateStrides();
 }

-TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
+HostTensorDescriptor::HostTensorDescriptor(std::vector<std::size_t> lens,
+                                           std::vector<std::size_t> strides)
     : mLens(lens), mStrides(strides)
 {
 }

-void TensorDescriptor::CalculateStrides()
+void HostTensorDescriptor::CalculateStrides()
 {
     mStrides.clear();
     mStrides.resize(mLens.size(), 0);
@@ -25,21 +26,21 @@ void TensorDescriptor::CalculateStrides()
         mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }
+std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); }

-std::size_t TensorDescriptor::GetElementSize() const
+std::size_t HostTensorDescriptor::GetElementSize() const
 {
     assert(mLens.size() == mStrides.size());
     return std::accumulate(
         mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetElementSpace() const
+std::size_t HostTensorDescriptor::GetElementSpace() const
 {
     auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
     return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
 }

-const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }
-const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; }
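CalculateStrides fills the strides as reverse running products of the lengths (a packed layout), while GetElementSpace returns the offset of the last addressable element plus one, so it also accounts for explicitly supplied non-packed strides. A small worked example with illustrative values:

// Worked example of the descriptor arithmetic above (values are illustrative):
//   lens = {2, 3, 4}
//   packed strides from CalculateStrides(): {12, 4, 1}
//   GetElementSize()  = 2 * 3 * 4 = 24
//   GetElementSpace() = (2-1)*12 + (3-1)*4 + (4-1)*1 + 1 = 24
// With padded strides {16, 4, 1} the element count stays 24 but the required space grows:
//   GetElementSpace() = (2-1)*16 + (3-1)*4 + (4-1)*1 + 1 = 28
HostTensorDescriptor packed({2, 3, 4});
HostTensorDescriptor padded({2, 3, 4}, {16, 4, 1});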