refactor

28bfc452 · Chao Liu · 40b2fad3 · 28bfc452 · 28bfc452 · 28bfc452
Commit 28bfc452 authored Dec 02, 2021 by Chao Liu
5 changed files
--- a/example/2_gemm_xdl_bias_add/README.md
+++ b/example/2_gemm_xdl_bias_add/README.md
--- a/example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
+++ b/example/2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp
@@ -12,28 +12,70 @@
 #include "host_gemm.hpp"
 #include "device_tensor.hpp"
 #include "device_base.hpp"
-#include "example/2_gemm_xdl_bias_add/include/device_gemm_xdl_two_extra_source_reduce.hpp"
+#include "example/2_gemm_xdl_bias_relu_add/include/device_gemm_xdl_two_extra_source_reduce.hpp"
+
+// C[m, n] = Relu(A[m, k] * B[k, n] + C0[m]) + C1[m, n]
+// assume C0 is contiguous in memory
+//     C0 resides in memory as 1d vector [m], but is represented as 2D matrix [m, n], with stride =
+//     0 in the "n" dimension
+// assume C1 and C have same layout C
+
+// v0 is from A * B
+// v1 is from C0
+// v2 is from C1
+struct BiasReluAdd
+{
+    template <typename T1, typename T2>
+    __host__ constexpr float operator()(float v0, T1 v1, T2 v2) const
+    {
+        float a = v0 + v1;
+        float b = 0.1 * a;
+        float c = b > 0 ? b : 0;
+        float d = c + v2;

-struct PassThrough
+        return d;
+    }
+
+    template <typename T1, typename T2>
+    __device__ constexpr float operator()(float v0, T1 v1, T2 v2) const
+    {
+        constexpr float alpha     = 0.1;
+        constexpr float alpha_inv = 1.0 / alpha;
+
+        float a = v2 * alpha_inv;
+        float b = v1 + v0;
+        float c = max(b, float(0));
+        float d = alpha * (a + c);
+
+        return d;
+    }
+};
+
+struct BiasRelu
 {
-    template <typename T>
-    __host__ __device__ constexpr T operator()(T v) const
+    template <typename T1, typename T2>
+    __host__ constexpr float operator()(float v0, T1 v1, T2) const
    {
-        return v;
+        float a = v0 + v1;
+        float b = 0.1 * a;
+        float c = b > 0 ? b : 0;
+
+        return c;
+    }
+
+    template <typename T1, typename T2>
+    __device__ constexpr float operator()(float v0, T1 v1, T2) const
+    {
+        constexpr float alpha = 0.1;
+
+        float b = v1 + v0;
+        float c = max(b, float(0));
+        float d = alpha * c;
+
+        return d;
    }
 };

-// GEMM Bias Add:
-// C[m, n] = alpha(A[m, k] * B[k, n]) + beta * C0[m, n] + gamma * C1[m]
-// assume C0 has same layout as C
-// assume C1 is contiguous in memory
-// C1 presents in memory as 1d vector, but is represented as 2D matrix C1[m, n], with stride = 0 in
-//    the "n" dimension
-//
-// alpha * v0 + beta * v1 + gamma * v2
-// v0 is from C matrix
-// v1 is from residual matrix
-// v2 is from bias vector
 struct BiasAdd
 {
 #if 1
@@ -88,90 +130,40 @@ struct BiasAdd
 #endif
 };

-template <typename ADataType,
-          typename BDataType,
-          typename CDataType,
-          typename ALayout,
-          typename BLayout,
-          typename CLayout,
-          typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct DeviceGemmInstance;
-
-template <typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct DeviceGemmInstance<ck::half_t,
-                          ck::half_t,
-                          ck::half_t,
-                          ck::tensor_layout::gemm::RowMajor,
-                          ck::tensor_layout::gemm::ColumnMajor,
-                          ck::tensor_layout::gemm::RowMajor,
-                          AElementwiseOperation,
-                          BElementwiseOperation,
-                          CElementwiseOperation>
+struct PassThrough
 {
-    using F16 = ck::half_t;
-    using F32 = float;
-
-    using Row = ck::tensor_layout::gemm::RowMajor;
-    using Col = ck::tensor_layout::gemm::ColumnMajor;
-
-    template <ck::index_t... Is>
-    using S = ck::Sequence<Is...>;
-
-    using AOp = AElementwiseOperation;
-    using BOp = BElementwiseOperation;
-    using COp = CElementwiseOperation;
-
-    // Compilation parameters for NT problem
-    // clang-format off
-    using type =
-        //#################################################################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| AElementwise| BElementwise| CElementwise| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
-        //#################################################################|  Type|  Type|  Type|    Type|        |        |        |    Operation|    Operation|    Operation|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
-        //#################################################################|      |      |      |        |        |        |        |             |             |             |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
-        //#################################################################|      |      |      |        |        |        |        |             |             |             |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
-        ck::tensor_operation::device::DeviceGemmXdl_two_extra_source_reduce<  F16,   F16,   F16,     F32,     Row,     Col,     Row,          AOp,          BOp,          COp,   256,   256,   128,     4,  8,   32,   32,    4,    2,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
-    // clang-format on
+    template <typename T>
+    __host__ __device__ constexpr T operator()(T v) const
+    {
+        return v;
+    }
 };

-template <typename AElementwiseOperation,
-          typename BElementwiseOperation,
-          typename CElementwiseOperation>
-struct DeviceGemmInstance<float,
-                          float,
-                          float,
-                          ck::tensor_layout::gemm::RowMajor,
-                          ck::tensor_layout::gemm::ColumnMajor,
-                          ck::tensor_layout::gemm::RowMajor,
-                          AElementwiseOperation,
-                          BElementwiseOperation,
-                          CElementwiseOperation>
-{
-    using F16 = ck::half_t;
-    using F32 = float;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;

-    using Row = ck::tensor_layout::gemm::RowMajor;
-    using Col = ck::tensor_layout::gemm::ColumnMajor;
+using ADataType   = ck::half_t;
+using BDataType   = ck::half_t;
+using CDataType   = ck::half_t;
+using AccDataType = float;

-    template <ck::index_t... Is>
-    using S = ck::Sequence<Is...>;
+using ALayout = ck::tensor_layout::gemm::RowMajor;
+using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+using CLayout = ck::tensor_layout::gemm::RowMajor;

-    using AOp = AElementwiseOperation;
-    using BOp = BElementwiseOperation;
-    using COp = CElementwiseOperation;
+using AOp = PassThrough;
+using BOp = PassThrough;
+using COp = BiasReluAdd;

-    // Compilation parameters for NT problem
-    // clang-format off
-    using type =
+// Compilation parameters for NT problem
+// clang-format off
+using DeviceGemmInstance =
    //#################################################################|     AData|     BData|     CData|     AccData| ALayout| BLayout| CLayout| AElementwise| BElementwise| CElementwise| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
    //#################################################################|      Type|      Type|      Type|        Type|        |        |        |    Operation|    Operation|    Operation|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
    //#################################################################|          |          |          |            |        |        |        |             |             |             |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
    //#################################################################|          |          |          |            |        |        |        |             |             |             |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
-    ck::tensor_operation::device::DeviceGemmXdl_two_extra_source_reduce<  F32,   F32,   F32,     F32,     Row,     Col,     Row,          AOp,          BOp,          COp,   256,   256,   128,     4,  4,   32,   32,    4,    2,      S<1, 4, 4>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      S<1, 2, 4>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              4,              4,               7,               1,      true,      true>;
-    // clang-format on
-};
+    ck::tensor_operation::device::DeviceGemmXdl_two_extra_source_reduce< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,          AOp,          BOp,          COp,   256,   256,   128,     4,  8,   32,   32,    4,    2,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
+// clang-format on

 template <typename AType,
          typename BType,
@@ -210,7 +202,40 @@ static void host_verify(const Tensor<AType>& a_m_k,

 int main(int argc, char* argv[])
 {
-    if(argc != 10)
+    bool do_verification = 0;
+    int init_method      = 0;
+    int nrepeat          = 5;
+
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    if(argc == 4)
+    {
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+    }
+    else if(argc == 10)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideC = std::stoi(argv[9]);
+    }
+    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
@@ -219,35 +244,6 @@ int main(int argc, char* argv[])
        exit(0);
    }

-    const bool do_verification = std::stoi(argv[1]);
-    const int init_method      = std::stoi(argv[2]);
-    const int nrepeat          = std::stoi(argv[3]);
-
-    // GEMM shape
-    ck::index_t M = std::stoi(argv[4]);
-    ck::index_t N = std::stoi(argv[5]);
-    ck::index_t K = std::stoi(argv[6]);
-
-    ck::index_t StrideA = std::stoi(argv[7]);
-    ck::index_t StrideB = std::stoi(argv[8]);
-    ck::index_t StrideC = std::stoi(argv[9]);
-
-    // matrix data type
-#if 1
-    using ADataType = ck::half_t;
-    using BDataType = ck::half_t;
-    using CDataType = ck::half_t;
-#else
-    using ADataType = float;
-    using BDataType = float;
-    using CDataType = float;
-#endif
-
-    // matrix layout
-    using ALayout = ck::tensor_layout::gemm::RowMajor;
-    using BLayout = ck::tensor_layout::gemm::ColumnMajor;
-    using CLayout = ck::tensor_layout::gemm::RowMajor;
-
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
@@ -267,14 +263,14 @@ int main(int argc, char* argv[])
    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

-    // C0[m ,n]
-    Tensor<BDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-
-    // C1[m]
+    // C0[m]
    Tensor<CDataType> c1_m_n(HostTensorDescriptor(
        std::vector<std::size_t>({static_cast<std::size_t>(M), static_cast<std::size_t>(N)}),
        std::vector<std::size_t>({1, 0})));

+    // C1[m ,n]
+    Tensor<BDataType> c0_m_n(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
+
    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
@@ -309,18 +305,10 @@ int main(int argc, char* argv[])
    c0_m_n_device_buf.ToDevice(c0_m_n.mData.data());
    c1_m_n_device_buf.ToDevice(c1_m_n.mData.data());

-    auto c_element_op = BiasAdd{};
+    auto c_element_op = BiasReluAdd{};

    // do GEMM
-    auto gemm = typename DeviceGemmInstance<ADataType,
-                                            BDataType,
-                                            CDataType,
-                                            ALayout,
-                                            BLayout,
-                                            CLayout,
-                                            PassThrough,
-                                            PassThrough,
-                                            decltype(c_element_op)>::type{};
+    auto gemm = DeviceGemmInstance{};

    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),

--- a/example/2_gemm_xdl_bias_add/include/device_gemm_xdl_two_extra_source_reduce.hpp
+++ b/example/2_gemm_xdl_bias_add/include/device_gemm_xdl_two_extra_source_reduce.hpp
--- a/example/4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp
+++ b/example/4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp
@@ -97,6 +97,31 @@ struct BiasReluAdd
    }
 };

+struct BiasRelu
+{
+    template <typename T1, typename T2>
+    __host__ constexpr float operator()(float v0, T1 v1, T2) const
+    {
+        float a = v0 + v1;
+        float b = 0.1 * a;
+        float c = b > 0 ? b : 0;
+
+        return c;
+    }
+
+    template <typename T1, typename T2>
+    __device__ constexpr float operator()(float v0, T1 v1, T2) const
+    {
+        constexpr float alpha = 0.1;
+
+        float b = v1 + v0;
+        float c = max(b, float(0));
+        float d = alpha * c;
+
+        return d;
+    }
+};
+
 using InDataType  = ck::half_t;
 using WeiDataType = ck::half_t;
 using OutDataType = ck::half_t;
@@ -113,13 +138,13 @@ using InElementOp  = PassThrough;
 using WeiElementOp = PassThrough;
 using OutElementOp = BiasReluAdd;

+// clang-format off
 using DeviceConvFwdInstance =
-    // clang-format off
-//################################################################|    NDim|     InData|     WeiData|     OutData|     AccData|       In|       Wei|       Out|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
-//################################################################| Spatial|       Type|        Type|        Type|        Type|   Layout|    Layout|    Layout| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
-//################################################################|        |           |            |            |            |         |          |          |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
-//################################################################|        |           |            |            |            |         |          |          |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
-ck::tensor_operation::device::DeviceConvFwdXdl_bias_activation_add<       2, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
+    //################################################################|    NDim|     InData|     WeiData|     OutData|     AccData|       In|       Wei|       Out|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
+    //################################################################| Spatial|       Type|        Type|        Type|        Type|   Layout|    Layout|    Layout| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
+    //################################################################|        |           |            |            |            |         |          |          |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
+    //################################################################|        |           |            |            |            |         |          |          |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
+    ck::tensor_operation::device::DeviceConvFwdXdl_bias_activation_add<       2, InDataType, WeiDataType, OutDataType, AccDataType, InLayout, WeiLayout, OutLayout, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
 // clang-format on

 template <typename TIn,

--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -12,16 +12,16 @@ include_directories(BEFORE
 )

 set(GEMM_XDL_SOURCE 1_gemm_xdl/gemm_xdl.cpp)
-set(GEMM_XDL_BIAS_ADD_SOURCE 2_gemm_xdl_bias_add/gemm_xdl_bias_add.cpp)
+set(GEMM_XDL_BIAS_RELU_ADD_SOURCE 2_gemm_xdl_bias_relu_add/gemm_xdl_bias_relu_add.cpp)
 set(CONV_XDL_SOURCE 3_conv_xdl/conv_xdl.cpp)
 set(CONV_XDL_BIAS_RELU_ADD_SOURCE 4_conv_xdl_bias_relu_add/conv_xdl_bias_relu_add.cpp)

 add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
-add_executable(gemm_xdl_bias_add ${GEMM_XDL_BIAS_ADD_SOURCE})
+add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
 add_executable(conv_xdl ${CONV_XDL_SOURCE})
 add_executable(conv_xdl_bias_relu_add ${CONV_XDL_BIAS_RELU_ADD_SOURCE})

 target_link_libraries(gemm_xdl PRIVATE host_tensor)
-target_link_libraries(gemm_xdl_bias_add PRIVATE host_tensor)
+target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
 target_link_libraries(conv_xdl PRIVATE host_tensor)
 target_link_libraries(conv_xdl_bias_relu_add PRIVATE host_tensor)