adding tensor_view

c9a8e558 · Chao Liu · 8669e242 · c9a8e558 · c9a8e558 · c9a8e558
Commit c9a8e558 authored Jul 20, 2019 by Chao Liu
16 changed files
--- a/composable_kernel/include/utility/Sequence.hpp
+++ b/composable_kernel/include/utility/Sequence.hpp
@@ -433,7 +433,7 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)
 {
    constexpr index_t nsize = Sequence<Xs...>::GetSize();

-    static_assert(nsize <= 10, "wrong!");
+    static_assert(nsize <= 12, "wrong!");

    static_if<nsize == 0>{}([&](auto) { printf("%s size %u, {}\n", s, nsize, Xs...); });

@@ -462,6 +462,13 @@ __host__ __device__ void print_Sequence(const char* s, Sequence<Xs...>)

    static_if<nsize == 10>{}(
        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 11>{}(
+        [&](auto) { printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...); });
+
+    static_if<nsize == 12>{}([&](auto) {
+        printf("%s size %u, {%u %u %u %u %u %u %u %u %u %u %u %u}\n", s, nsize, Xs...);
+    });
 }

 } // namespace ck

--- a/driver/include/conv_common.hpp
+++ b/driver/include/conv_common.hpp
 #ifndef CONV_COMMON_HPP
 #define CONV_COMMON_HPP

-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"

 // this is ugly, only for 4d
 template <class InDesc, class WeiDesc>

--- a/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_direct_v2_nchw_kcyx_nkhw.hpp
@@ -9,11 +9,11 @@ using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_direct_v2_nchw_kcyx_nkhw(InDesc,
-                                                 const Tensor<T>& in,
+                                                 const HostTensor<T>& in,
                                                 WeiDesc,
-                                                 const Tensor<T>& wei,
+                                                 const HostTensor<T>& wei,
                                                 OutDesc,
-                                                 Tensor<T>& out,
+                                                 HostTensor<T>& out,
                                                 index_t nrepeat)
 {
    std::size_t data_sz = sizeof(T);

--- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp
@@ -12,11 +12,11 @@ using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                        WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                        OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                        index_t nrepeat)
 {
    constexpr auto I0 = Number<0>{};
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
    auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Hi, Wi, N>{});
    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

    auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
        in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
    auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

    std::size_t data_sz = sizeof(T);
    DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());

--- a/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
@@ -10,11 +10,11 @@ using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                        WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                        OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                        index_t nrepeat)
 {
    constexpr auto I0 = Number<0>{};
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);

--- a/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp
@@ -10,11 +10,11 @@ using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                        WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                        OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                        index_t nrepeat)
 {
    constexpr auto I0 = Number<0>{};
@@ -44,7 +44,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
    auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

    make_ParallelTensorFunctor(
        [&](auto n, auto c, auto hi, auto wi) { in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi); },
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    make_ParallelTensorFunctor(
        [&](auto k, auto c, auto y, auto x) { wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x); },
@@ -70,7 +70,7 @@ void device_convolution_implicit_gemm_v2_chwn_cyxk_khwn(InDesc,
    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

 #if 0
    // 3x3, 34x34

--- a/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp
@@ -8,11 +8,11 @@

 template <class T, class InDesc, class WeiDesc, class OutDesc>
 void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
-                                                        const Tensor<T>& in_nchw,
+                                                        const HostTensor<T>& in_nchw,
                                                        WeiDesc,
-                                                        const Tensor<T>& wei_kcyx,
+                                                        const HostTensor<T>& wei_kcyx,
                                                        OutDesc,
-                                                        Tensor<T>& out_nkhw,
+                                                        HostTensor<T>& out_nkhw,
                                                        index_t nrepeat)
 {
    using namespace ck;
@@ -42,7 +42,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc,
    auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);

--- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
@@ -13,11 +13,11 @@ template <class T,
          class ConvStrides,
          class ConvDilations>
 void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                          WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                          OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                          ConvStrides,
                                                          ConvDilations,
                                                          index_t nrepeat)

--- a/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp
@@ -14,11 +14,11 @@ template <class T,
          class ConvStrides,
          class ConvDilations>
 void device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                          WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                          OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                          ConvStrides,
                                                          ConvDilations,
                                                          index_t nrepeat)

--- a/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
@@ -14,11 +14,11 @@ template <class T,
          class ConvStrides,
          class ConvDilations>
 void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
-                                                          const Tensor<T>& in_nchw,
+                                                          const HostTensor<T>& in_nchw,
                                                          WeiDesc,
-                                                          const Tensor<T>& wei_kcyx,
+                                                          const HostTensor<T>& wei_kcyx,
                                                          OutDesc,
-                                                          Tensor<T>& out_nkhw,
+                                                          HostTensor<T>& out_nkhw,
                                                          ConvStrides,
                                                          ConvDilations,
                                                          index_t nrepeat)
@@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,

    constexpr index_t InBlockCopyDataPerAccess_W2 = 4;

-    using WeiBlockCopySubLengths_E_K            = Sequence<2, 2>;
-    using WeiBlockCopyClusterLengths_E_K        = Sequence<4, 64>;
+    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
-    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif

    constexpr index_t N0  = N / (N1 * N2);

--- a/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
+++ b/driver/include/device_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hpp
@@ -8,11 +8,11 @@ using namespace ck;

 template <class TInWei, class TOut, class InDesc, class WeiDesc, class OutDesc>
 void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
-                                                           const Tensor<TInWei>& in_nchw,
+                                                           const HostTensor<TInWei>& in_nchw,
                                                           WeiDesc,
-                                                           const Tensor<TInWei>& wei_kcyx,
+                                                           const HostTensor<TInWei>& wei_kcyx,
                                                           OutDesc,
-                                                           Tensor<TOut>& out_nkhw,
+                                                           HostTensor<TOut>& out_nkhw,
                                                           index_t nrepeat)
 {
    // this assumes the in / wei data type is int8x4
@@ -46,7 +46,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
    auto in_nchw_vec_desc = make_ConstantTensorDescriptor(Sequence<N, C / NVector, Hi, Wi>{});
    ostream_ConstantTensorDescriptor(in_nchw_vec_desc, std::cout << "in_nchw_vec_desc: ");

-    Tensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));
+    HostTensor<vector_mem_t> in_nchw_vec(make_TensorDescriptor(in_nchw_vec_desc));

    auto f_vectorized_nchw = [&](auto n, auto c, auto h, auto w) {
 #if 0
@@ -69,7 +69,7 @@ void device_direct_convolution_2_vectorized_nchw_kcyx_nkhw(InDesc,
    auto wei_kcyx_vec_desc = make_ConstantTensorDescriptor(Sequence<K, C / NVector, Y, X>{});
    ostream_ConstantTensorDescriptor(wei_kcyx_vec_desc, std::cout << "wei_kcyx_vec_desc: ");

-    Tensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));
+    HostTensor<vector_mem_t> wei_kcyx_vec(make_TensorDescriptor(wei_kcyx_vec_desc));

    auto f_vectorized_kcyx = [&](auto k, auto c, auto y, auto x) {
 #if 0

--- a/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
+++ b/driver/include/device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hpp
@@ -8,11 +8,11 @@ using namespace ck;

 template <class T, class InDesc, class WeiDesc, class OutDesc, class LowerPads, class UpperPads>
 void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
-                                                              const Tensor<T>& in_nchw,
+                                                              const HostTensor<T>& in_nchw,
                                                              WeiDesc,
-                                                              const Tensor<T>& wei_kcyx,
+                                                              const HostTensor<T>& wei_kcyx,
                                                              OutDesc,
-                                                              Tensor<T>& out_nkhw,
+                                                              HostTensor<T>& out_nkhw,
                                                              LowerPads,
                                                              UpperPads,
                                                              index_t nrepeat)
@@ -42,7 +42,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
    auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence<C, Y, X, K>{});
    ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: ");

-    Tensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));
+    HostTensor<T> wei_cyxk(make_TensorDescriptor(wei_cyxk_desc));

    auto f_reorder_kcyx2cyxk = [&](auto k, auto c, auto y, auto x) {
        wei_cyxk(c, y, x, k) = wei_kcyx(k, c, y, x);
@@ -55,7 +55,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
    auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence<C, Hi, Wi, N>{});
    ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: ");

-    Tensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));
+    HostTensor<T> in_chwn(make_TensorDescriptor(in_chwn_desc));

    auto f_reorder_nchw2chwn = [&](auto n, auto c, auto hi, auto wi) {
        in_chwn(c, hi, wi, n) = in_nchw(n, c, hi, wi);
@@ -68,7 +68,7 @@ void device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(InDesc,
    auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence<K, Ho, Wo, N>{});
    ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: ");

-    Tensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));
+    HostTensor<T> out_khwn(make_TensorDescriptor(out_khwn_desc));

    std::size_t data_sz = sizeof(T);
    DeviceMem in_chwn_device_buf(data_sz * in_chwn.mDesc.GetElementSpace());

--- a/driver/include/host_conv.hpp
+++ b/driver/include/host_conv.hpp
 #pragma once
 #include "tensor.hpp"
 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"

 // NOTE: this is ugly -- the code below is hard-coded for 4d descriptors only
 template <class TConstTensorDesc>
@@ -42,7 +42,7 @@ auto make_TensorDescriptor(TConstTensorDesc)
    std::initializer_list<index_t> strides = {
        desc.GetStride(I0), desc.GetStride(I1), desc.GetStride(I2), desc.GetStride(I3)};

-    return TensorDescriptor(lengths, strides);
+    return HostTensorDescriptor(lengths, strides);
 }

 template <class TIn,
@@ -52,9 +52,9 @@ template <class TIn,
          class ConvDilations,
          class LowerPads,
          class UpperPads>
-void host_direct_convolution(const Tensor<TIn>& in_nchw,
-                             const Tensor<TWei>& wei_kcyx,
-                             Tensor<TOut>& out_nkhw,
+void host_direct_convolution(const HostTensor<TIn>& in_nchw,
+                             const HostTensor<TWei>& wei_kcyx,
+                             HostTensor<TOut>& out_nkhw,
                             ConvStrides,
                             ConvDilations,
                             LowerPads,
@@ -99,9 +99,9 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
 }

 template <class TIn, class TWei, class TOut, class LowerPads, class UpperPads>
-void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
-                                   const Tensor<TWei>& wei_kcyx,
-                                   Tensor<TOut>& out_nkhw,
+void host_winograd_3x3_convolution(const HostTensor<TIn>& in_nchw,
+                                   const HostTensor<TWei>& wei_kcyx,
+                                   HostTensor<TOut>& out_nkhw,
                                   LowerPads,
                                   UpperPads)
 {
@@ -134,11 +134,11 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
    std::size_t HTile = (HO + HoPerTile - 1) / HoPerTile;
    std::size_t WTile = (WO + WoPerTile - 1) / WoPerTile;

-    Tensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
-    Tensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
-    Tensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
-    Tensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});
+    HostTensor<double> in_hold({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    HostTensor<double> in_transform({N, C, HTile, WTile, HiPerTile, WiPerTile});
+    HostTensor<double> wei_transform({K, C, HiPerTile, WiPerTile});
+    HostTensor<double> out_transform({N, K, HTile, WTile, HiPerTile, HiPerTile});
+    HostTensor<double> out_hold({N, K, HTile, WTile, HoPerTile, WoPerTile});

    auto f_in_hold = [&](auto n, auto c, auto htile, auto wtile) {
        for(int j = 0; j < HiPerTile; ++j)
@@ -339,7 +339,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
 }

 template <class T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
+void check_error(const HostTensor<T>& ref, const HostTensor<T>& result)
 {
    float error     = 0;
    float max_diff  = -1;

--- a/driver/include/tensor.hpp
+++ b/driver/include/tensor.hpp
-#ifndef TENSOR_HPP
-#define TENSOR_HPP
+#ifndef HOST_TENSOR_HPP
+#define HOST_TENSOR_HPP

 #include <thread>
 #include <vector>
@@ -65,24 +65,24 @@ auto construct_f_unpack_args(F, T args)
    return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
 }

-struct TensorDescriptor
+struct HostTensorDescriptor
 {
-    TensorDescriptor() = delete;
-    TensorDescriptor(std::initializer_list<std::size_t> lens);
-    TensorDescriptor(std::initializer_list<std::size_t> lens,
-                     std::initializer_list<std::size_t> strides);
-    TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);
+    HostTensorDescriptor() = delete;
+    HostTensorDescriptor(std::initializer_list<std::size_t> lens);
+    HostTensorDescriptor(std::initializer_list<std::size_t> lens,
+                         std::initializer_list<std::size_t> strides);
+    HostTensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides);

    void CalculateStrides();

    template <class Range>
-    TensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
+    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    template <class Range1, class Range2>
-    TensorDescriptor(const Range1& lens, const Range2& strides)
+    HostTensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }
@@ -185,25 +185,25 @@ auto make_ParallelTensorFunctor(F f, Xs... xs)
 }

 template <class T>
-struct Tensor
+struct HostTensor
 {
    template <class X>
-    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X>
-    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
+    HostTensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

    template <class X, class Y>
-    Tensor(std::vector<X> lens, std::vector<Y> strides)
+    HostTensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
    }

-    Tensor(const TensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
+    HostTensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

    template <class G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
@@ -265,7 +265,7 @@ struct Tensor

    typename std::vector<T>::const_iterator end() const { return mData.end(); }

-    TensorDescriptor mDesc;
+    HostTensorDescriptor mDesc;
    std::vector<T> mData;
 };


--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -4,7 +4,7 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include "config.hpp"
-#include "ConstantTensorDescriptor.hpp"
+#include "constant_tensor_descriptor.hpp"
 #include "device.hpp"
 #include "conv_common.hpp"
 #include "host_conv.hpp"
@@ -473,10 +473,10 @@ int main(int argc, char* argv[])

    using in_data_t  = float;
    using out_data_t = float;
-    Tensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
-    Tensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
-    Tensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
-    Tensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));
+    HostTensor<in_data_t> in_nchw(make_TensorDescriptor(in_nchw_desc));
+    HostTensor<in_data_t> wei_kcyx(make_TensorDescriptor(wei_kcyx_desc));
+    HostTensor<out_data_t> out_nkhw_host(make_TensorDescriptor(out_nkhw_desc));
+    HostTensor<out_data_t> out_nkhw_device(make_TensorDescriptor(out_nkhw_desc));

    std::size_t num_thread = std::thread::hardware_concurrency();

@@ -491,7 +491,7 @@ int main(int argc, char* argv[])

    if(do_verification)
    {
-#if 1
+#if 0
        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
@@ -503,6 +503,9 @@ int main(int argc, char* argv[])
 #elif 1
        in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+#elif 0
+        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
        in_nchw.GenerateTensorValue(GeneratorTensor_2{1, 5}, num_thread);


--- a/driver/src/tensor.cpp
+++ b/driver/src/tensor.cpp
@@ -3,17 +3,18 @@

 #include "tensor.hpp"

-TensorDescriptor::TensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
+HostTensorDescriptor::HostTensorDescriptor(std::initializer_list<std::size_t> lens) : mLens(lens)
 {
    // Only lengths were supplied, so the strides are derived from mLens here.
    this->CalculateStrides();
 }

-TensorDescriptor::TensorDescriptor(std::vector<std::size_t> lens, std::vector<std::size_t> strides)
+HostTensorDescriptor::HostTensorDescriptor(std::vector<std::size_t> lens,
+                                           std::vector<std::size_t> strides)
    : mLens(lens), mStrides(strides)
 {
    // Lengths and strides are stored exactly as given; nothing is computed or
    // checked here (e.g. that both vectors have the same size).
 }

-void TensorDescriptor::CalculateStrides()
+void HostTensorDescriptor::CalculateStrides()
 {
    mStrides.clear();
    mStrides.resize(mLens.size(), 0);
@@ -25,21 +26,21 @@ void TensorDescriptor::CalculateStrides()
        mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetNumOfDimension() const { return mLens.size(); }
+std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); } // number of entries in mLens

-std::size_t TensorDescriptor::GetElementSize() const
+std::size_t HostTensorDescriptor::GetElementSize() const
 {
    // Sanity check (debug builds only): lengths and strides must have the same
    // number of entries.
    assert(mLens.size() == mStrides.size());
    // Product of all lengths; the std::size_t{1} seed means an empty mLens yields 1.
    return std::accumulate(
        mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies<std::size_t>());
 }

-std::size_t TensorDescriptor::GetElementSpace() const
+std::size_t HostTensorDescriptor::GetElementSpace() const
 {
    // Returns 1 + sum_i (mLens[i] - 1) * mStrides[i].
    // Presumably this is the largest linear offset reachable under a
    // sum(index * stride) addressing scheme, plus one -- TODO confirm against
    // the indexing code.
    // NOTE(review): `v - 1` is unsigned arithmetic, so a zero length wraps
    // around to a huge value -- confirm callers never pass a 0-length dimension.
    auto ls = mLens | boost::adaptors::transformed([](std::size_t v) { return v - 1; });
    // std::inner_product walks mStrides in lockstep with ls, so mStrides must
    // hold at least as many entries as mLens.
    return std::inner_product(ls.begin(), ls.end(), mStrides.begin(), std::size_t{0}) + 1;
 }

-const std::vector<std::size_t>& TensorDescriptor::GetLengths() const { return mLens; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetLengths() const { return mLens; } // const reference to mLens, no copy

-const std::vector<std::size_t>& TensorDescriptor::GetStrides() const { return mStrides; }
+const std::vector<std::size_t>& HostTensorDescriptor::GetStrides() const { return mStrides; } // const reference to mStrides, no copy