Unverified Commit 9fcf48c4 authored by bdf's avatar bdf Committed by GitHub
Browse files

[Enhancement] Replace the implementation of deform_roi_pool with mlu-ops (#2598)



* [Feature] Replace the implementation of deform_roi_pool with mlu-ops

* [Feature] Modify code

---------
Co-authored-by: default avatarbudefei <budefei@cambricon.com>
parent 0d1b224f
......@@ -9,254 +9,59 @@
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
// Entry point of the hand-written BANG kernel for the deform_roi_pool
// forward pass (implementation lives in a separate .mlu source file).
// NOTE(review): the launcher below now calls mluOpDeformRoiPoolForward from
// mlu-ops instead — confirm whether this declaration and its kernel source
// are still needed, or remove them together.
void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input, const void *rois,
const void *offset, void *output,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
const float spatial_scale,
const int sampling_ratio, const float gamma);
// Entry point of the hand-written BANG kernel for the deform_roi_pool
// backward pass (implementation lives in a separate .mlu source file).
// NOTE(review): the launcher below now calls mluOpDeformRoiPoolBackward from
// mlu-ops instead — confirm whether this declaration and its kernel source
// are still needed, or remove them together.
void KernelDeformRoIPoolBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma);
// policy function for forward and backward
// Choose the launch dimensions for the legacy BANG kernels: one MLU core per
// output bin, spread over up to all clusters as a UNION1 task.
// @param bin_num  total number of output bins (num_rois * ph * pw).
// @param k_dim    out: task dimensions (x = cores per cluster, y = clusters).
// @param k_type   out: task type, always CNRT_FUNC_TYPE_UNION1.
static void policyFunc(const int bin_num, cnrtDim3_t *k_dim,
                       cnrtFunctionType_t *k_type) {
  const size_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
  const size_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  // Round the bin count up to a whole multiple of the per-cluster core count.
  const size_t bin_num_align = CEIL_ALIGN(bin_num, core_limit);
  k_dim->x = core_limit;
  // One cluster per group of core_limit bins, capped at the device limit.
  k_dim->y = (bin_num_align / core_limit) > cluster_limit
                 ? cluster_limit
                 : (bin_num_align / core_limit);
  k_dim->z = 1;
  *k_type = CNRT_FUNC_TYPE_UNION1;
}
#include "mlu_common_helper.h"
// Deformable RoI pooling forward on MLU via the mlu-ops library.
// @param input          NCHW feature map, Float or Half.
// @param rois           2-D RoI tensor; rois.size(0) is the number of RoIs.
// @param offset         optional deformation offsets; may be undefined/empty,
//                       in which case non-deformable pooling is performed.
// @param output         pre-allocated result, (num_rois, C, ph, pw).
// @param pooled_height/pooled_width  output bin grid.
// @param spatial_scale  scale from input coords to RoI coords, in (0, 1].
// @param sampling_ratio samples per bin (kernel-defined behavior when <= 0).
// @param gamma          offset scaling factor.
void DeformRoIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois,
                                           Tensor offset, Tensor output,
                                           int pooled_height, int pooled_width,
                                           float spatial_scale,
                                           int sampling_ratio, float gamma) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");
  // Check shape.
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
              "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D.");
  if (offset.defined() && offset.numel() > 0) {
    TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
                "offset should have the same type as input");
    TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
                offset.dim(), "D.");
    TORCH_CHECK(
        (offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
        "while rois.size(0)) = ", rois.size(0), ". They should be the same.");
    TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
                "but now offset.size(1) = ", offset.size(1), ".");
    TORCH_CHECK((offset.size(2) == output.size(2)),
                "offset.size(2) = ", offset.size(2),
                "while output.size(2)) = ", output.size(2),
                ". They should be the same.");
    TORCH_CHECK((offset.size(3) == output.size(3)),
                "offset.size(3) = ", offset.size(3),
                "while output.size(3)) = ", output.size(3),
                ". They should be the same.");
  }
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale,
              ".");

  auto channels = input.size(1);
  auto num_rois = output.size(0);
  if (output.numel() == 0) {
    // NOTE(review): `output` is received by value, so this rebinding is only
    // visible inside this function — confirm callers never rely on it.
    output = at::zeros({num_rois, channels, pooled_height, pooled_width},
                       input.options());
    return;
  }

  // Zero element check.
  TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
              input.size(0));
  TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
              rois.numel());
  if (input.numel() == 0 || output.numel() == 0) {
    return;
  }
  // Large tensor check: element counts must fit the descriptor's 32-bit range.
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(input.numel() < max_input_num,
              "input.numel() should be less than 2147483648, got ",
              input.numel());
  TORCH_CHECK(rois.numel() < max_input_num,
              "rois.numel() should be less than 2147483648, got ",
              rois.numel());
  TORCH_CHECK(output.numel() < max_input_num,
              "output.numel() should be less than 2147483648, got ",
              output.numel());
  TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
              "offset.numel() should be less than 2147483648, got ",
              offset.numel());

  // mlu-ops consumes NHWC-contiguous buffers for input/output.
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto output_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(output, memory_format);

  MluOpTensorDescriptor input_desc, rois_desc, offset_desc, output_desc;
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  output_desc.set_with_layout(output_contiguous, MLUOP_LAYOUT_NHWC);

  // Keep the contiguous offset tensor alive until the op has been enqueued:
  // declaring it inside the `if` would drop the last reference to its storage
  // while offset_ptr still points at it.
  at::Tensor offset_contiguous;
  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }

  // Get device pointers of the (contiguous) tensors.
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto output_impl = torch_mlu::getMluTensorImpl(output_contiguous);
  auto output_ptr = output_impl->cnnlMalloc();

  // Launch through the mlu-ops handle bound to the current queue.
  auto handle = mluOpGetCurrentHandle();
  mluOpDeformRoiPoolForward(
      handle, input_desc.desc(), input_ptr, rois_desc.desc(), rois_ptr,
      offset_real_desc, offset_ptr, pooled_height, pooled_width, spatial_scale,
      sampling_ratio, gamma, output_desc.desc(), output_ptr);

  // Propagate the result back into the caller-visible output tensor.
  output.copy_(output_contiguous);
}
// Deformable RoI pooling backward on MLU via the mlu-ops library.
// Computes grad_input (and grad_offset when offsets are used) from
// grad_output. All shape/dtype invariants are validated up front.
// @param grad_output  (num_rois, C, ph, pw) gradient w.r.t. the forward output.
// @param input        NCHW feature map used in the forward pass.
// @param rois         2-D RoI tensor matching grad_output.size(0).
// @param offset       optional deformation offsets (may be undefined/empty).
// @param grad_input   pre-allocated NCHW gradient buffer, written in place.
// @param grad_offset  pre-allocated offset gradient (may be undefined/empty).
void DeformRoIPoolBackwardMLUKernelLauncher(
    Tensor grad_output, Tensor input, Tensor rois, Tensor offset,
    Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width,
    float spatial_scale, int sampling_ratio, float gamma) {
  // Check dtype.
  TORCH_CHECK(
      input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
      "input type should be Float or Half, got ", input.scalar_type());
  TORCH_CHECK(input.scalar_type() == grad_output.scalar_type(),
              "grad_output should have the same type as input");
  TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
              "rois should have the same type as input");
  TORCH_CHECK(input.scalar_type() == grad_input.scalar_type(),
              "grad_input should have the same type as input");
  // Check shape.
  TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
              grad_output.dim(), "D.");
  TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
              "D.");
  TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
              "D.");
  if (offset.defined() && offset.numel() > 0) {
    TORCH_CHECK(input.scalar_type() == offset.scalar_type(),
                "offset should have the same type as input");
    TORCH_CHECK(offset.dim() == 4, "offset should be 4d tensor, got ",
                offset.dim(), "D.");
    TORCH_CHECK(
        (offset.size(0) == rois.size(0)), "offset.size(0) = ", offset.size(0),
        "while rois.size(0)) = ", rois.size(0), ". They should be the same.");
    TORCH_CHECK((offset.size(1) == 2), "offset.size(1) should be 2, ",
                "but now offset.size(1) = ", offset.size(1), ".");
    TORCH_CHECK((offset.size(2) == grad_output.size(2)),
                "offset.size(2) = ", offset.size(2),
                "while grad_output.size(2)) = ", grad_output.size(2),
                ". They should be the same.");
    TORCH_CHECK((offset.size(3) == grad_output.size(3)),
                "offset.size(3) = ", offset.size(3),
                "while grad_output.size(3)) = ", grad_output.size(3),
                ". They should be the same.");
  }
  TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
              "spatial_scale should be within (0, 1], got ", spatial_scale);
  // Check relationship between tensors.
  TORCH_CHECK((grad_output.size(0) == rois.size(0)),
              "grad_output.size(0) = ", grad_output.size(0),
              "while rois.size(0)) = ", rois.size(0),
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(1) == input.size(1)),
              "grad_output.size(1) = ", grad_output.size(1),
              "while input.size(1)) = ", input.size(1),
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(2) == pooled_height),
              "grad_output.size(2) = ", grad_output.size(2),
              "while pooled_height = ", pooled_height,
              ". They should be the same.");
  TORCH_CHECK((grad_output.size(3) == pooled_width),
              "grad_output.size(3) = ", grad_output.size(3),
              "while pooled_width = ", pooled_width,
              ". They should be the same.");

  // Zero element check.
  TORCH_CHECK(input.size(0) != 0, "input.size(0) should not be zero, got ",
              input.size(0));
  TORCH_CHECK(rois.numel() != 0, "rois.numel() should not be zero, got ",
              rois.numel());
  if (input.numel() == 0 || grad_output.numel() == 0) {
    return;
  }
  // Large tensor check: element counts must fit the descriptor's 32-bit range.
  const size_t max_input_num = 2147483648;  // 2^31, 2G num
  TORCH_CHECK(input.numel() < max_input_num,
              "input.numel() should be less than 2147483648, got ",
              input.numel());
  TORCH_CHECK(rois.numel() < max_input_num,
              "rois.numel() should be less than 2147483648, got ",
              rois.numel());
  TORCH_CHECK(grad_output.numel() < max_input_num,
              "grad_output.numel() should be less than 2147483648, got ",
              grad_output.numel());
  TORCH_CHECK(!offset.defined() || offset.numel() < max_input_num,
              "offset.numel() should be less than 2147483648, got ",
              offset.numel());

  // mlu-ops consumes NHWC-contiguous buffers.
  auto memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
  auto grad_output_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
  memory_format =
      torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
  auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
  auto rois_contiguous =
      torch_mlu::cnnl::ops::cnnl_contiguous(rois, rois.suggest_memory_format());
  auto grad_input_ =
      torch_mlu::cnnl::ops::cnnl_contiguous(grad_input, memory_format);

  // Get device pointers of the (contiguous) tensors.
  auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
  auto grad_output_ptr = grad_output_impl->cnnlMalloc();
  auto input_impl = torch_mlu::getMluTensorImpl(input_);
  auto input_ptr = input_impl->cnnlMalloc();
  auto rois_impl = torch_mlu::getMluTensorImpl(rois_contiguous);
  auto rois_ptr = rois_impl->cnnlMalloc();
  auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
  auto grad_input_ptr = grad_input_impl->cnnlMalloc();

  MluOpTensorDescriptor grad_output_desc, input_desc, rois_desc, offset_desc,
      grad_input_desc, grad_offset_desc;
  grad_output_desc.set_with_layout(grad_output_, MLUOP_LAYOUT_NHWC);
  input_desc.set_with_layout(input_, MLUOP_LAYOUT_NHWC);
  rois_desc.set(rois_contiguous);
  grad_input_desc.set_with_layout(grad_input_, MLUOP_LAYOUT_NHWC);

  // Keep the optional contiguous tensors alive until the op has been
  // enqueued: declaring them inside the `if` blocks would drop the last
  // reference to their storage while the raw pointers are still in use.
  at::Tensor offset_contiguous;
  mluOpTensorDescriptor_t offset_real_desc = NULL;
  void *offset_ptr = NULL;
  if (offset.defined() && offset.numel() > 0) {
    offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        offset, offset.suggest_memory_format());
    offset_desc.set(offset_contiguous);
    offset_real_desc = offset_desc.desc();
    auto offset_impl = torch_mlu::getMluTensorImpl(offset_contiguous);
    offset_ptr = offset_impl->cnnlMalloc();
  }
  at::Tensor grad_offset_contiguous;
  mluOpTensorDescriptor_t grad_offset_real_desc = NULL;
  void *grad_offset_ptr = NULL;
  if (grad_offset.defined() && grad_offset.numel() > 0) {
    grad_offset_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
        grad_offset, grad_offset.suggest_memory_format());
    grad_offset_desc.set(grad_offset_contiguous);
    grad_offset_real_desc = grad_offset_desc.desc();
    auto grad_offset_impl = torch_mlu::getMluTensorImpl(grad_offset_contiguous);
    grad_offset_ptr = grad_offset_impl->cnnlMalloc();
  }

  // Launch through the mlu-ops handle bound to the current queue.
  auto handle = mluOpGetCurrentHandle();
  mluOpDeformRoiPoolBackward(
      handle, grad_output_desc.desc(), grad_output_ptr, input_desc.desc(),
      input_ptr, rois_desc.desc(), rois_ptr, offset_real_desc, offset_ptr,
      pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma,
      grad_input_desc.desc(), grad_input_ptr, grad_offset_real_desc,
      grad_offset_ptr);

  grad_input.copy_(grad_input_);
  // NOTE(review): the offset gradient lands in grad_offset_contiguous, which
  // only aliases the caller's grad_offset when grad_offset was already in its
  // suggested memory format — confirm callers always pass it contiguous, or
  // add a grad_offset.copy_(grad_offset_contiguous) here.
}
......
......@@ -72,6 +72,39 @@ void MluOpTensorDescriptor::set(Tensor t) {
set_desc(t, layout, data_type, dim_array);
}
// Bind tensor `t` to this descriptor under an explicit mlu-ops layout,
// translating the torch NCHW-ordered sizes/strides into the order the
// requested layout expects before calling mluOpSetTensorDescriptorEx.
void MluOpTensorDescriptor::set_with_layout(Tensor t,
                                            mluOpTensorLayout_t layout) {
  const mluOpDataType_t dtype = getMluOpDataType(t.dtype());
  const int ndim = t.dim();
  auto dims = checkUpperBoundAndCastTo<int>(t.sizes().vec());
  auto strides = checkUpperBoundAndCastTo<int>(t.strides().vec());

  const bool channels_last_like = layout == MLUOP_LAYOUT_NHWC ||
                                  layout == MLUOP_LAYOUT_NDHWC ||
                                  layout == MLUOP_LAYOUT_NLC;
  if (channels_last_like) {
    // Rotate the channel dim to the innermost position (e.g. NCHW -> NHWC).
    convertShapeAndStride(dims, strides);
  } else if (layout == MLUOP_LAYOUT_HWCN) {
    // Depth-wise conv filter layout: permute sizes NCHW -> HWCN and rebuild
    // strides as if the tensor were HWCN-contiguous.
    const auto src = t.sizes().vec();
    dims[0] = static_cast<int>(src[2]);
    dims[1] = static_cast<int>(src[3]);
    dims[2] = static_cast<int>(src[1]);
    dims[3] = static_cast<int>(src[0]);
    strides[3] = 1;
    strides[2] = dims[3] * strides[3];
    strides[1] = dims[2] * strides[2];
    strides[0] = dims[1] * strides[1];
  }

  const auto status = mluOpSetTensorDescriptorEx(desc_, layout, dtype, ndim,
                                                 dims.data(), strides.data());
  TORCH_CHECK(status == MLUOP_STATUS_SUCCESS,
              "mluOpSetTensorDescriptorEx execution failed.");
}
void MluOpTensorDescriptor::set_desc(const at::Tensor& t,
mluOpTensorLayout_t layout,
mluOpDataType_t dtype,
......
......@@ -30,6 +30,7 @@ class MluOpTensorDescriptor {
~MluOpTensorDescriptor() { mluOpDestroyTensorDescriptor(desc_); }
void set(at::Tensor);
void set_with_layout(at::Tensor, mluOpTensorLayout_t layout);
mluOpTensorDescriptor_t desc() { return desc_; }
private:
......@@ -52,3 +53,47 @@ class MluOpHandle {
void setQueue(cnrtQueue_t queue) { mluOpSetQueue(handle, queue); }
mluOpHandle_t handle;
};
// Modify tensor size and stride order based on channels_first to
// channels_last or channels_last_3d: keep the batch dim, rotate every other
// dim left by one so the channel dim becomes innermost. This mirrors the
// real storage order of a channels_last tensor, which differs from pytorch's
// logical (NCHW) dim order.
// Example: modify channels_last tensor dims to an NHWC tensor desc.
//   sizes   N C H W      --> N H W C
//   strides C*H*W 1 W C  --> C*H*W W C 1
template <typename T>
void convertShapeAndStride(std::vector<T>& shape_info,
                           std::vector<T>& stride_info) {
  TORCH_MLU_CHECK(shape_info.size() == stride_info.size(),
                  "shape size need equal to stride size.");
  const int dim = static_cast<int>(shape_info.size());
  // dim == 0: nothing to permute, and indexing temp[0] below would be out of
  // bounds. dim == 1: only the batch dim exists and it stays in place.
  if (dim <= 1) {
    return;
  }
  std::vector<T> temp_shape_info(dim);
  std::vector<T> temp_stride_info(dim);
  temp_shape_info[0] = shape_info[0];
  temp_stride_info[0] = stride_info[0];
  for (int i = 0; i < dim - 1; ++i) {
    // Source index cycles through 2, 3, ..., dim-1, then 1 (C moves last).
    const int index = (i + 1) % (dim - 1) + 1;
    temp_shape_info[i + 1] = shape_info[index];
    temp_stride_info[i + 1] = stride_info[index];
  }
  shape_info = std::move(temp_shape_info);
  stride_info = std::move(temp_stride_info);
}
// torch tensors provide int64_t shapes and strides, but mlu-ops descriptors
// require type int32. Use this function to ensure a safe CAST — every value
// must fit in DST_T (both bounds are checked, since strides can in principle
// be negative) — or report an error instead of silently truncating.
template <typename DST_T, typename SRC_T>
std::vector<DST_T> checkUpperBoundAndCastTo(const std::vector<SRC_T>& input) {
  std::vector<DST_T> output;
  output.reserve(input.size());
  for (const auto& val : input) {
    TORCH_MLU_CHECK(val <= std::numeric_limits<DST_T>::max(),
                    "Requires dim size not greater than ",
                    std::numeric_limits<DST_T>::max(), ". But got ", val, ".");
    TORCH_MLU_CHECK(val >= std::numeric_limits<DST_T>::lowest(),
                    "Requires dim size not less than ",
                    std::numeric_limits<DST_T>::lowest(), ". But got ", val,
                    ".");
    output.push_back(static_cast<DST_T>(val));
  }
  return output;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment