Commit 546b4279 authored by limm's avatar limm
Browse files

add csrc and mmdeploy module

parent 502f4fb9
Pipeline #2810 canceled with stages
// Copyright (c) OpenMMLab. All rights reserved.
#include "gather.h"
#include "../ncnn_ops_definer.h"
#include "assert.h"
namespace mmdeploy {
using namespace ncnn;
DEFINE_LAYER_CREATOR(Gather)
DEFINE_NCNN_OPS(Gather, Gather)
// Gather consumes two blobs (data + indices) and writes a brand-new output,
// so it is neither a one-input layer nor an in-place layer.
Gather::Gather() {
  support_inplace = false;
  one_blob_only = false;
}
// Fetch the gather axis from the param dict (param id 0, default 0).
int Gather::load_param(const ParamDict &pd) {
  const int configured_axis = pd.get(0, 0);
  axis = configured_axis;
  return 0;
}
// Gather only supports 1-dim indices: data and indices both carry an implicit
// batch dim in ncnn, so higher-rank indices would produce a shape that no
// longer matches the onnx result. With 1-dim indices, eliminating the
// implicit batch keeps the indices 1-dim, leaving a single implicit batch in
// data, which makes the output shape match onnx.
//
// Inputs:  bottom_blobs[0] = data (1/2/3 dims),
//          bottom_blobs[1] = indices (1-dim float blob; each value is
//          rounded to the nearest integer to form the index).
// Output:  top_blobs[0] = data gathered along `axis`.
// Returns: 0 on success, -100 on allocation failure.
int Gather::forward(const std::vector<Mat> &bottom_blobs, std::vector<Mat> &top_blobs,
                    const Option &opt) const {
  const Mat &bottom_blob = bottom_blobs[0];
  const Mat &indices = bottom_blobs[1];
  int dims = bottom_blob.dims;
  int indices_dims = indices.dims;
  size_t elemsize = bottom_blob.elemsize;
  int positive_axis = axis < 0 ? dims + axis : axis;
  Mat &top_blob = top_blobs[0];
  assert(indices.dims == 1);
  // indices arrive as floats; +0.5 rounding recovers the integer index
  const float *indices_ptr = indices;
  if (dims == 1 && indices_dims == 1)  // positive_axis == 0
  {
    int w = indices.w;
    top_blob.create(w, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int i = 0; i < w; i++) {
      float indice = indices_ptr[i];
      outptr[i] = ptr[(int)(indice + 0.5)];
    }
    return 0;
  }
  if (dims == 2 && positive_axis == 0 && indices_dims == 1) {
    // Select whole rows: h -> indices.w, w stays.
    int w = bottom_blob.w;
    top_blob.create(w, indices.w, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    for (int i = 0; i < indices.w; i++) {
      const int selected = (int)(indices_ptr[i] + 0.5);
      memcpy(top_blob.row(i), bottom_blob.row(selected), w * elemsize);
    }
    return 0;
  }
  if (dims == 2 && positive_axis == 1 && indices_dims == 1) {
    // Select columns within each row: w -> indices.w, h stays.
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(indices.w, h, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    const float *ptr = bottom_blob;
    float *outptr = top_blob;
    for (int j = 0; j < h; j++) {
      for (int i = 0; i < indices.w; i++) {
        int selected = (int)(indices_ptr[i] + 0.5);
        outptr[j * indices.w + i] = ptr[j * w + selected];
      }
    }
    return 0;
  }
  if (dims == 3 && positive_axis == 0 && indices_dims == 1) {
    // Select whole channels: c -> indices.w; copy each plane verbatim.
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    top_blob.create(w, h, indices.w, elemsize, opt.blob_allocator);
    if (top_blob.empty()) {
      return -100;
    }
    for (int i = 0; i < indices.w; i++) {
      int selected = (int)(indices_ptr[i] + 0.5);
      const unsigned char *ptr = bottom_blob.channel(selected);
      unsigned char *outptr = top_blob.channel(i);
      memcpy(outptr, ptr, w * h * elemsize);
    }
    return 0;
  }
  if (dims == 3 && positive_axis == 1 && indices_dims == 1) {
    // Select rows inside every channel: h -> indices.w.
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    top_blob.create(w, indices.w, channels, elemsize, opt.blob_allocator);
    // Bug fix: allocation failure was previously not checked in this branch.
    if (top_blob.empty()) {
      return -100;
    }
// parallelize over channels; iterations are independent
#pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < channels; i++) {
      float *outptr = top_blob.channel(i);
      const float *ptr = bottom_blob.channel(i);
      for (int j = 0; j < indices.w; j++) {
        int selected = (int)(indices_ptr[j] + 0.5);
        for (int k = 0; k < w; k++) {
          outptr[j * w + k] = ptr[selected * w + k];
        }
      }
    }
    return 0;
  }
  if (dims == 3 && positive_axis == 2 && indices_dims == 1) {
    // Select columns inside every row of every channel: w -> indices.w.
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    top_blob.create(indices.w, h, channels, elemsize, opt.blob_allocator);
    // Bug fix: allocation failure was previously not checked in this branch.
    if (top_blob.empty()) {
      return -100;
    }
// parallelize over channels; iterations are independent
#pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < channels; i++) {
      float *outptr = top_blob.channel(i);
      const float *ptr = bottom_blob.channel(i);
      for (int j = 0; j < h; j++) {
        for (int k = 0; k < indices.w; k++) {
          int selected = (int)(indices_ptr[k] + 0.5);
          outptr[j * indices.w + k] = ptr[j * w + selected];
        }
      }
    }
    return 0;
  }
  // Unsupported dims/axis combination: leave top_blob untouched.
  return 0;
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef LAYER_GATHER_H
#define LAYER_GATHER_H
#include "layer.h"
namespace mmdeploy {
// ncnn custom layer implementing onnx Gather along one configurable axis.
// Takes two input blobs (data, indices) and produces one gathered output.
class Gather : public ncnn::Layer {
public:
Gather();
// Reads the gather axis from param id 0 (default 0).
virtual int load_param(const ncnn::ParamDict& pd);
// bottom_blobs: [data, indices]; top_blobs: [gathered output].
virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs,
const ncnn::Option& opt) const;
public:
// Gather axis; negative values count back from the last dimension.
int axis;
};
} // namespace mmdeploy
#endif // LAYER_GATHER_H
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef NCNN_OPS_DEFINER_H
#define NCNN_OPS_DEFINER_H
#include <string>
#include "layer.h"
#include "ncnn_ops_register.h"
namespace mmdeploy {
// Registers a custom ncnn layer into the global mmdeploy registries at
// static-initialization time (instantiated via the DEFINE_NCNN_OPS macro).
// The stored _ops_name keeps the c_str() key pointer alive for the lifetime
// of the static definer object.
class NCNNOpsDefiner {
 public:
  NCNNOpsDefiner(const std::string& ops_name, const ncnn::layer_creator_func& creator_func = 0,
                 const ncnn::layer_destroyer_func& destroyer_func = 0)
      : _ops_name(ops_name) {
    get_mmdeploy_layer_creator()[_ops_name.c_str()] = creator_func;
    // Bug fix: destroyer_func was previously accepted but never registered,
    // so register_mmdeploy_custom_layers could never find a destroyer.
    if (destroyer_func) {
      get_mmdeploy_layer_destroyer()[_ops_name.c_str()] = destroyer_func;
    }
  }

 private:
  const std::string _ops_name;
};
#define DEFINE_NCNN_OPS(ops_name, OpsLayer) \
static mmdeploy::NCNNOpsDefiner NCNNOpsDefiner##ops_name{#ops_name, OpsLayer##_layer_creator};
} // namespace mmdeploy
#endif
// Copyright (c) OpenMMLab. All rights reserved.
#include "ncnn_ops_register.h"
#include <iostream>
// Lazily-constructed global registry mapping layer names to creator funcs.
// NOTE(review): the key is a const char* compared by pointer value, not by
// string content — lookups only match the exact pointer used at registration.
std::map<const char *, ncnn::layer_creator_func> &get_mmdeploy_layer_creator() {
  static std::map<const char *, ncnn::layer_creator_func> creator_registry;
  return creator_registry;
}
// Lazily-constructed global registry mapping layer names to destroyer funcs.
// NOTE(review): keyed by const char* (pointer identity, not string content).
std::map<const char *, ncnn::layer_destroyer_func> &get_mmdeploy_layer_destroyer() {
  static std::map<const char *, ncnn::layer_destroyer_func> destroyer_registry;
  return destroyer_registry;
}
// Registers every mmdeploy custom layer (with its destroyer, when one was
// registered) into `net`. Returns the first non-zero error reported by
// Net::register_custom_layer, or 0 when all layers register cleanly.
int register_mmdeploy_custom_layers(ncnn::Net &net) {
  const auto &creators = get_mmdeploy_layer_creator();
  auto &destroyers = get_mmdeploy_layer_destroyer();
  for (const auto &entry : creators) {
    const char *name = entry.first;
    const ncnn::layer_creator_func creator = entry.second;
    ncnn::layer_destroyer_func destroyer = 0;
    const auto found = destroyers.find(name);
    if (found != destroyers.end()) {
      destroyer = found->second;
    }
    const int ret = net.register_custom_layer(name, creator, destroyer);
    if (ret != 0) {
      return ret;
    }
  }
  return 0;
}
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef NCNN_OPS_REGISTER_H
#define NCNN_OPS_REGISTER_H
#include <map>
#include <string>
#include "mmdeploy/core/macro.h"
#include "net.h"
// Global name -> creator registry populated by NCNNOpsDefiner statics.
// NOTE(review): keys are const char* compared by pointer, not string value.
MMDEPLOY_API std::map<const char*, ncnn::layer_creator_func>& get_mmdeploy_layer_creator();
// Global name -> destroyer registry (may have no entry for a given layer).
MMDEPLOY_API std::map<const char*, ncnn::layer_destroyer_func>& get_mmdeploy_layer_destroyer();
// Registers every registry entry into `net`; returns the first non-zero
// error from Net::register_custom_layer, or 0 on success.
MMDEPLOY_API int register_mmdeploy_custom_layers(ncnn::Net& net);
#endif
// Copyright (c) OpenMMLab. All rights reserved.
#include "shape.h"
#include "../ncnn_ops_definer.h"
namespace mmdeploy {
using namespace ncnn;
DEFINE_LAYER_CREATOR(Shape)
DEFINE_NCNN_OPS(Shape, Shape)
// Shape reads one blob and emits a fresh blob describing its extents,
// hence one_blob_only without in-place support.
Shape::Shape() {
  support_inplace = false;
  one_blob_only = true;
}
// Writes the input's shape (with a leading implicit batch of 1) into a 1-D
// float blob of length dims + 1, ordered outermost-first:
//   dims == 1 -> [1, w]; dims == 2 -> [1, h, w]; dims == 3 -> [1, c, h, w].
// Returns 0 on success, -100 on allocation failure or unsupported rank.
int Shape::forward(const Mat &bottom_blob, Mat &top_blob, const Option &opt) const {
  const int dims = bottom_blob.dims;
  const int w = bottom_blob.w;
  // The output always stores the shape as floats, whatever the input dtype.
  const size_t elemsize = sizeof(float);
  top_blob.create(dims + 1, elemsize, opt.blob_allocator);
  if (top_blob.empty()) {
    return -100;
  }
  float *outptr = top_blob;
  if (dims == 1) {
    outptr[0] = 1.0f;
    outptr[1] = w;
  } else if (dims == 2) {
    const int h = bottom_blob.h;
    outptr[0] = 1.0f;
    outptr[1] = h;
    outptr[2] = w;
  } else if (dims == 3) {
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    outptr[0] = 1.0f;
    outptr[1] = channels;
    outptr[2] = h;
    outptr[3] = w;
  } else {
    // Bug fix: report on stderr (consistent with the other custom layers)
    // and fail instead of returning success with an uninitialized blob.
    fprintf(stderr, "Unsupported dims=%d\n", dims);
    return -100;
  }
  return 0;
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef LAYER_SHAPE_H
#define LAYER_SHAPE_H
#include "layer.h"
namespace mmdeploy {
// ncnn custom layer that outputs the shape of its input blob (with a
// leading implicit batch of 1) as a 1-D float blob.
class Shape : public ncnn::Layer {
public:
Shape();
// bottom_blob: any 1/2/3-dim blob; top_blob: 1-D float blob of its extents.
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob,
const ncnn::Option& opt) const;
};
} // namespace mmdeploy
#endif // LAYER_SHAPE_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "tensorslice.h"
#include <math.h>
#include "../ncnn_ops_definer.h"
namespace mmdeploy {
using namespace ncnn;
DEFINE_LAYER_CREATOR(TensorSlice)
DEFINE_NCNN_OPS(TensorSlice, TensorSlice)
// TensorSlice maps a single input blob to a freshly allocated sliced blob,
// so it is one-blob-only and not in-place.
TensorSlice::TensorSlice() {
  support_inplace = false;
  one_blob_only = true;
}
// Reads onnx Slice attributes from the param dict:
//   0: starts, 1: ends, 2: axes, 3: steps (integer arrays stored in Mats).
// Missing axes default to [0 .. starts.w); missing steps default to all 1s.
int TensorSlice::load_param(const ParamDict& pd) {
starts = pd.get(0, Mat());
ends = pd.get(1, Mat());
axes = pd.get(2, Mat());
steps = pd.get(3, Mat());
if (axes.w == 0) {
// No axes given: the slice applies to the leading starts.w axes in order.
axes.create(starts.w);
int* axes_ptr = axes;
for (int i = 0; i < starts.w; i++) {
axes_ptr[i] = i;
}
}
if (steps.w == 0) {
// No steps given: default stride of 1 along every sliced axis.
steps.create(axes.w);
steps.fill(1);
}
return 0;
}
// Returns the extent of `blob` selected by the offset dims - axes:
//   0 -> blob.w, 1 -> blob.h, 2 -> blob.c.
// Any other offset is an error: logs to stderr and returns -1.
static inline int get_shape_by_axes(const Mat& blob, int axes, int dims) {
  switch (dims - axes) {
    case 0:
      return blob.w;
    case 1:
      return blob.h;
    case 2:
      return blob.c;
    default:
      fprintf(stderr, "wrong axes %d!\n", axes);
      return -1;
  }
  // (Dead `return 0;` after the exhaustive switch removed.)
}
// Implements onnx Slice for (effectively) one axis per layer instance, using
// the starts/ends/axes/steps attributes loaded in load_param(). Builds the
// set of selected source indices per dimension, then copies element-wise.
// Returns 0 on success, -100/-1 on error.
int TensorSlice::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const {
  int dims = bottom_blob.dims;
  size_t elemsize = bottom_blob.elemsize;
  const int* start_ptr = starts;
  const int* end_ptr = ends;
  const int* axes_ptr = axes;
  const int* step_ptr = steps;
  if (starts.w > dims || ends.w > dims) {
    fprintf(stderr, "start/end attributes shape error!\n");
    return -100;
  }
  // NOTE(review): this only warns and then continues — multi-axis slices are
  // expected to arrive as chained single-axis slices from pytorch2onnx.
  if (axes.w != 1) {
    fprintf(stderr,
            "axes.w must be 1 because any of multiaxes slice is regarded as "
            "multi-staged onnx slice in pytorch2onnx.");
  }
  if (dims == 1) {
    for (int i = 0; i < axes.w; i++) {
      int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
      int step = step_ptr[i];
      // Collect the selected elements first, then copy into the output.
      std::vector<float> temp_val;
      int start = start_ptr[i];
      int end = end_ptr[i];
      int cur = start;
      if (step > 0) {
        while (cur < end && cur < bottom_blob.w) {
          temp_val.push_back(bottom_blob[cur]);
          cur += step;
        }
      } else if (step < 0) {
        // Negative step walks backwards until (exclusive) `end` or index 0.
        while (cur > end && cur > 0) {
          temp_val.push_back(bottom_blob[cur]);
          cur += step;
        }
      } else {
        fprintf(stderr, "step should not be 0!\n");
        return -100;
      }
      // NOTE(review): allocation failure (top_blob.empty()) is not checked
      // here, and the inner loop variable shadows the outer `i`.
      top_blob.create(temp_val.size(), elemsize, opt.blob_allocator);
      for (int i = 0; i < temp_val.size(); i++) {
        top_blob[i] = temp_val[i];
      }
    }
    return 0;
  }
  if (dims == 2) {
    // active_indice[d] holds the selected source indices for output dim d
    // (0 -> rows/h, 1 -> cols/w): start fully populated, then narrow the
    // sliced dimension below.
    std::vector<std::vector<int> > active_indice;
    std::vector<int> indices;
    for (int i = 0; i < bottom_blob.h; i++) {
      indices.push_back(i);
    }
    active_indice.push_back(indices);
    indices.clear();
    for (int i = 0; i < bottom_blob.w; i++) {
      indices.push_back(i);
    }
    active_indice.push_back(indices);
    for (int i = 0; i < axes.w; i++) {
      int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
      int step = step_ptr[i];
      int start = start_ptr[i];
      int end = end_ptr[i];
      int dim_shape = get_shape_by_axes(bottom_blob, positive_axis, dims);
      // NOTE(review): dim_shape_test is computed but never used.
      int dim_shape_test = get_shape_by_axes(bottom_blob, positive_axis, dims - 1);
      if (dim_shape < 0) {
        return -1;
      }
      // Clamp the end bound to the actual extent of the sliced dimension.
      end = end < dim_shape ? end : dim_shape;
      int cur = start;
      std::vector<int> temp_indice;
      if (step > 0) {
        while (cur < end && cur < dim_shape) {
          temp_indice.push_back(cur);
          cur += step;
        }
      } else if (step < 0) {
        while (cur > end && cur > 0) {
          temp_indice.push_back(cur);
          cur += step;
        }
      } else {
        fprintf(stderr, "step should not be 0!\n");
        return -100;
      }
      // NOTE(review): assumes positive_axis >= 1 (onnx axes count the
      // implicit batch, so ncnn dim d corresponds to onnx axis d + 1);
      // positive_axis == 0 would index active_indice[-1] — confirm the
      // exporter never emits axis 0 here.
      active_indice[positive_axis - 1] = temp_indice;
      active_indice[positive_axis - 1].resize(temp_indice.size());
    }
    top_blob.create((int)active_indice[1].size(), (int)active_indice[0].size(), elemsize,
                    opt.blob_allocator);
    for (int i = 0; i < active_indice[0].size(); i++) {
      for (int j = 0; j < active_indice[1].size(); j++) {
        top_blob.row(i)[j] = bottom_blob.row(active_indice[0][i])[active_indice[1][j]];
      }
    }
    return 0;
  }
  if (dims == 3) {
    // Same scheme with three dims: 0 -> channels/c, 1 -> rows/h, 2 -> cols/w.
    std::vector<std::vector<int> > active_indice;
    std::vector<int> indices;
    for (int i = 0; i < bottom_blob.c; i++) {
      indices.push_back(i);
    }
    active_indice.push_back(indices);
    indices.clear();
    for (int i = 0; i < bottom_blob.h; i++) {
      indices.push_back(i);
    }
    active_indice.push_back(indices);
    indices.clear();
    for (int i = 0; i < bottom_blob.w; i++) {
      indices.push_back(i);
    }
    active_indice.push_back(indices);
    for (int i = 0; i < axes.w; i++) {
      int positive_axis = axes_ptr[i] < 0 ? dims + axes_ptr[i] : axes_ptr[i];
      int step = step_ptr[i];
      int start = start_ptr[i];
      int end = end_ptr[i];
      int cur = start;
      std::vector<int> temp_indice;
      if (step > 0) {
        // NOTE(review): the bound uses bottom_blob.w regardless of which axis
        // is sliced — confirm this is intended when slicing c or h and w is
        // smaller than the sliced extent.
        while (cur < end && cur < bottom_blob.w) {
          temp_indice.push_back(cur);
          cur += step;
        }
      } else if (step < 0) {
        while (cur > end && cur > 0) {
          temp_indice.push_back(cur);
          cur += step;
        }
      } else {
        fprintf(stderr, "step should not be 0!\n");
        return -100;
      }
      // NOTE(review): same positive_axis >= 1 assumption as the 2-dim case.
      active_indice[positive_axis - 1] = temp_indice;
      active_indice[positive_axis - 1].resize(temp_indice.size());
    }
    top_blob.create((int)active_indice[2].size(), (int)active_indice[1].size(),
                    (int)active_indice[0].size(), elemsize, opt.blob_allocator);
    for (int i = 0; i < active_indice[0].size(); i++) {
      for (int j = 0; j < active_indice[1].size(); j++) {
        for (int k = 0; k < active_indice[2].size(); k++) {
          top_blob.channel(i).row(j)[k] = bottom_blob.channel(active_indice[0][i])
                                              .row(active_indice[1][j])[active_indice[2][k]];
        }
      }
    }
    return 0;
  }
  return 0;
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef LAYER_TENSORSLICE_H
#define LAYER_TENSORSLICE_H
#include "layer.h"
namespace mmdeploy {
// ncnn custom layer implementing onnx Slice with starts/ends/axes/steps
// attributes (effectively one sliced axis per layer instance).
class TensorSlice : public ncnn::Layer {
public:
TensorSlice();
// Reads starts (0), ends (1), axes (2) and steps (3) from the param dict;
// missing axes/steps are synthesized with defaults.
virtual int load_param(const ncnn::ParamDict& pd);
// Copies the selected sub-range of bottom_blob into a new top_blob.
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob,
const ncnn::Option& opt) const;
public:
// onnx Slice attributes, stored as integer arrays inside Mats.
ncnn::Mat starts;
ncnn::Mat ends;
ncnn::Mat axes;
ncnn::Mat steps;
};
} // namespace mmdeploy
#endif // LAYER_TENSORSLICE_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "topk.h"
#include <math.h>
#include <functional>
#include "../ncnn_ops_definer.h"
namespace mmdeploy {
using namespace ncnn;
DEFINE_LAYER_CREATOR(TopK)
DEFINE_NCNN_OPS(TopK, TopK)
// TopK may take up to two inputs (data and an optional k blob) and writes
// freshly allocated outputs, so it is neither one-blob-only nor in-place.
TopK::TopK() {
  support_inplace = false;
  one_blob_only = false;
}
// Attributes mirror onnx TopK/ArgMax:
//   0: axis (default -1), 1: largest (default 1),
//   2: sorted (default 1),  3: keep_dims (default 1).
int TopK::load_param(const ParamDict& pd) {
  keep_dims = pd.get(3, 1);
  sorted = pd.get(2, 1);
  largest = pd.get(1, 1);
  axis = pd.get(0, -1);
  return 0;
}
int TopK::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs,
const Option& opt) const {
int dims = bottom_blobs[0].dims;
int positive_axis = axis < 0 ? dims + axis : axis;
int topk;
if (bottom_blobs.size() == 2) {
const Mat& topk_blob = bottom_blobs[1];
topk = (int)(topk_blob[0] + 0.5);
} else if (bottom_blobs.size() == 1) {
topk = 1;
} else {
fprintf(stderr, "topk input blobs should be 1 or 2, but not %ld\n", bottom_blobs.size());
return -103;
}
// To do: Cut the top_val_blob after unit test. And we should change them in
// param files.
// Adaptive outputs. For onnx TopK, we output 2 blobs, for ArgMax, we output
// 1 blob.
Mat& top_val_blob = top_blobs[0];
Mat& top_ind_blob = top_blobs.size() == 2 ? top_blobs[1] : top_val_blob;
if (topk > 1) {
// real topk
if (keep_dims == 0) {
fprintf(stderr, "real topk should not reduce dims!\n");
return -102;
}
if (dims == 1 && positive_axis == 0) {
if (topk > bottom_blobs[0].w) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
const float* ptr = bottom_blobs[0];
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].w);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(ptr[i], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(ptr[i], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
float* valptr = top_val_blob;
float* indptr = top_ind_blob;
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
valptr[i] = vec[i].first;
indptr[i] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
// pair comparison
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0][i] > valtarget) {
valptr[cur] = bottom_blobs[0][i];
indptr[cur] = i;
cur++;
} else if (bottom_blobs[0][i] == valtarget && i <= indtarget) {
valptr[cur] = bottom_blobs[0][i];
indptr[cur] = i;
cur++;
}
}
} else {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0][i] < valtarget) {
valptr[cur] = bottom_blobs[0][i];
indptr[cur] = i;
cur++;
} else if (bottom_blobs[0][i] == valtarget && i <= indtarget) {
valptr[cur] = bottom_blobs[0][i];
indptr[cur] = i;
cur++;
}
}
}
}
}
if (dims == 2 && positive_axis == 0) {
if (topk > bottom_blobs[0].h) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
for (int col = 0; col < bottom_blobs[0].w; col++) {
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].h);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = std::make_pair(bottom_blobs[0].row(i)[col], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
top_val_blob.row(i)[col] = vec[i].first;
top_ind_blob.row(i)[col] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].row(i)[col] > valtarget) {
top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col];
top_ind_blob.row(cur)[col] = i;
cur++;
} else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) {
top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col];
top_ind_blob.row(cur)[col] = i;
cur++;
}
}
} else {
for (int i = 0; i < bottom_blobs[0].h; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].row(i)[col] < valtarget) {
top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col];
top_ind_blob.row(cur)[col] = i;
cur++;
} else if (bottom_blobs[0].row(i)[col] == valtarget && i <= indtarget) {
top_val_blob.row(cur)[col] = bottom_blobs[0].row(i)[col];
top_ind_blob.row(cur)[col] = i;
cur++;
}
}
}
} else {
fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted);
return -100;
}
}
}
if (dims == 2 && positive_axis == 1) {
if (topk > bottom_blobs[0].w) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
for (int r = 0; r < bottom_blobs[0].h; r++) {
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].w);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(bottom_blobs[0].row(r)[i], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
top_val_blob.row(r)[i] = vec[i].first;
top_ind_blob.row(r)[i] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].row(r)[i] > valtarget) {
top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i];
top_ind_blob.row(r)[cur] = i;
cur++;
} else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) {
top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i];
top_ind_blob.row(r)[cur] = i;
cur++;
}
}
} else {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].row(r)[i] < valtarget) {
top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i];
top_ind_blob.row(r)[cur] = i;
cur++;
} else if (bottom_blobs[0].row(r)[i] == valtarget && i <= indtarget) {
top_val_blob.row(r)[cur] = bottom_blobs[0].row(r)[i];
top_ind_blob.row(r)[cur] = i;
cur++;
}
}
}
} else {
fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted);
return -100;
}
}
}
if (dims == 3 && positive_axis == 0) {
if (topk > bottom_blobs[0].c) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
for (int r = 0; r < bottom_blobs[0].h; r++) {
for (int col = 0; col < bottom_blobs[0].w; col++) {
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].c);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].c; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].c; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(i).row(r)[col], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
top_val_blob.channel(i).row(r)[col] = vec[i].first;
top_ind_blob.channel(i).row(r)[col] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].c; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].channel(i).row(r)[col] > valtarget) {
top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col];
top_ind_blob.channel(cur).row(r)[col] = i;
cur++;
} else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) {
top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col];
top_ind_blob.channel(cur).row(r)[col] = i;
cur++;
}
}
} else {
for (int i = 0; i < bottom_blobs[0].c; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].channel(i).row(r)[col] < valtarget) {
top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col];
top_ind_blob.channel(cur).row(r)[col] = i;
cur++;
} else if (bottom_blobs[0].channel(i).row(r)[col] == valtarget && i <= indtarget) {
top_val_blob.channel(cur).row(r)[col] = bottom_blobs[0].channel(i).row(r)[col];
top_ind_blob.channel(cur).row(r)[col] = i;
cur++;
}
}
}
} else {
fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted);
return -100;
}
}
}
}
if (dims == 3 && positive_axis == 1) {
if (topk > bottom_blobs[0].h) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
for (int page = 0; page < bottom_blobs[0].c; page++) {
for (int col = 0; col < bottom_blobs[0].w; col++) {
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].h);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(i)[col], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
top_val_blob.channel(page).row(i)[col] = vec[i].first;
top_ind_blob.channel(page).row(i)[col] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
for (int i = 0; i < bottom_blobs[0].h; i++) {
if (cur >= topk) break;
if (largest == 1) {
if (bottom_blobs[0].channel(page).row(i)[col] > valtarget) {
top_val_blob.channel(page).row(cur)[col] =
bottom_blobs[0].channel(page).row(i)[col];
top_ind_blob.channel(page).row(cur)[col] = i;
cur++;
} else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget &&
i <= indtarget) {
top_val_blob.channel(page).row(cur)[col] =
bottom_blobs[0].channel(page).row(i)[col];
top_ind_blob.channel(page).row(cur)[col] = i;
cur++;
}
} else {
if (bottom_blobs[0].channel(page).row(i)[col] < valtarget) {
top_val_blob.channel(page).row(cur)[col] =
bottom_blobs[0].channel(page).row(i)[col];
top_ind_blob.channel(page).row(cur)[col] = i;
cur++;
} else if (bottom_blobs[0].channel(page).row(i)[col] == valtarget &&
i <= indtarget) {
top_val_blob.channel(page).row(cur)[col] =
bottom_blobs[0].channel(page).row(i)[col];
top_ind_blob.channel(page).row(cur)[col] = i;
cur++;
}
}
}
} else {
fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted);
return -100;
}
}
}
}
if (dims == 3 && positive_axis == 2) {
if (topk > bottom_blobs[0].w) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
for (int page = 0; page < bottom_blobs[0].c; page++) {
for (int r = 0; r < bottom_blobs[0].h; r++) {
std::vector<std::pair<float, int> > vec;
vec.resize(bottom_blobs[0].w);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], -i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::greater<std::pair<float, int> >());
} else if (largest == 0) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = std::make_pair(bottom_blobs[0].channel(page).row(r)[i], i);
}
std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
std::less<std::pair<float, int> >());
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
if (sorted == 1) {
for (int i = 0; i < topk; i++) {
top_val_blob.channel(page).row(r)[i] = vec[i].first;
top_ind_blob.channel(page).row(r)[i] = abs(vec[i].second);
}
} else if (sorted == 0) {
int cur = 0;
float valtarget = vec[topk - 1].first;
int indtarget = (int)(abs(vec[topk - 1].second) + 0.5);
if (largest == 1) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].channel(page).row(r)[i] > valtarget) {
top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i];
top_ind_blob.channel(page).row(r)[cur] = i;
cur++;
} else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) {
top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i];
top_ind_blob.channel(page).row(r)[cur] = i;
cur++;
}
}
} else {
for (int i = 0; i < bottom_blobs[0].w; i++) {
if (cur >= topk) break;
if (bottom_blobs[0].channel(page).row(r)[i] < valtarget) {
top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i];
top_ind_blob.channel(page).row(r)[cur] = i;
cur++;
} else if (bottom_blobs[0].channel(page).row(r)[i] == valtarget && i <= indtarget) {
top_val_blob.channel(page).row(r)[cur] = bottom_blobs[0].channel(page).row(r)[i];
top_ind_blob.channel(page).row(r)[cur] = i;
cur++;
}
}
}
} else {
fprintf(stderr, "sorted attribute should be 0 or 1, but not %d\n", sorted);
return -100;
}
}
}
}
} else {
if (topk <= 0) {
fprintf(stderr, "topk should not <= 0!\n");
return -102;
}
if (dims == 1 && positive_axis == 0) {
if (topk > bottom_blobs[0].w) {
fprintf(stderr, "topk should not greater than total items!\n");
return -100;
}
top_val_blob.create(topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
const float* ptr = bottom_blobs[0];
std::vector<float> vec;
vec.resize(bottom_blobs[0].w);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = ptr[i];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[0] = *index_iter;
if (top_blobs.size() == 2)
indptr[0] = std::distance(vec.begin(), index_iter);
else
valptr[0] = std::distance(vec.begin(), index_iter); // replace with index
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[0] = *index_iter;
if (top_blobs.size() == 2)
indptr[0] = std::distance(vec.begin(), index_iter);
else
valptr[0] = std::distance(vec.begin(), index_iter); // replace with index
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
if (dims == 2 && positive_axis == 0) {
if (keep_dims == 1) {
top_val_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
} else {
top_val_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
}
const float* ptr = bottom_blobs[0];
std::vector<float> vec;
vec.resize(bottom_blobs[0].h);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int col = 0; col < bottom_blobs[0].w; col++) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = ptr[i * bottom_blobs[0].w + col];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[col] = *index_iter;
if (top_blobs.size() == 2)
indptr[col] = std::distance(vec.begin(), index_iter);
else
valptr[col] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[col] = *index_iter;
if (top_blobs.size() == 2)
indptr[col] = std::distance(vec.begin(), index_iter);
else
valptr[col] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
if (dims == 2 && positive_axis == 1) {
if (keep_dims == 1) {
top_val_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(topk, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
} else {
top_val_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
}
const float* ptr = bottom_blobs[0];
std::vector<float> vec;
vec.resize(bottom_blobs[0].w);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int r = 0; r < bottom_blobs[0].h; r++) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = ptr[r * bottom_blobs[0].w + i];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[r] = *index_iter;
if (top_blobs.size() == 2)
indptr[r] = std::distance(vec.begin(), index_iter);
else
valptr[r] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[r] = *index_iter;
if (top_blobs.size() == 2)
indptr[r] = std::distance(vec.begin(), index_iter);
else
valptr[r] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
if (dims == 3 && positive_axis == 0) {
if (keep_dims == 1) {
top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, topk, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
} else {
top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].h, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
}
const float* ptr = bottom_blobs[0];
std::vector<float> vec;
vec.resize(bottom_blobs[0].c);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int r = 0; r < bottom_blobs[0].h; r++) {
for (int col = 0; col < bottom_blobs[0].w; col++) {
for (int i = 0; i < bottom_blobs[0].c; i++) {
ptr = bottom_blobs[0].channel(i);
vec[i] = ptr[r * bottom_blobs[0].w + col];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[r * top_val_blob.w + col] = *index_iter;
if (top_blobs.size() == 2)
indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
else
valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[r * top_val_blob.w + col] = *index_iter;
if (top_blobs.size() == 2)
indptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
else
valptr[r * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
}
if (dims == 3 && positive_axis == 1) {
if (keep_dims == 1) {
top_val_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, topk, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
std::vector<float> vec;
vec.resize(bottom_blobs[0].h);
for (int page = 0; page < bottom_blobs[0].c; page++) {
const float* ptr = bottom_blobs[0].channel(page);
float* valptr = top_val_blob.channel(page);
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page);
for (int col = 0; col < bottom_blobs[0].w; col++) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = ptr[i * bottom_blobs[0].w + col];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[col] = *index_iter;
if (top_blobs.size() == 2)
indptr[col] = std::distance(vec.begin(), index_iter);
else
valptr[col] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[col] = *index_iter;
if (top_blobs.size() == 2)
indptr[col] = std::distance(vec.begin(), index_iter);
else
valptr[col] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
} else {
top_val_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].w, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
std::vector<float> vec;
vec.resize(bottom_blobs[0].h);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int page = 0; page < bottom_blobs[0].c; page++) {
const float* ptr = bottom_blobs[0].channel(page);
for (int col = 0; col < bottom_blobs[0].w; col++) {
for (int i = 0; i < bottom_blobs[0].h; i++) {
vec[i] = ptr[i * bottom_blobs[0].w + col];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[page * top_val_blob.w + col] = *index_iter;
if (top_blobs.size() == 2)
indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
else
valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[page * top_val_blob.w + col] = *index_iter;
if (top_blobs.size() == 2)
indptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
else
valptr[page * top_ind_blob.w + col] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
}
}
if (dims == 3 && positive_axis == 2) {
if (keep_dims == 1) {
top_val_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(topk, bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
std::vector<float> vec;
vec.resize(bottom_blobs[0].w);
for (int page = 0; page < bottom_blobs[0].c; page++) {
const float* ptr = bottom_blobs[0].channel(page);
float* valptr = top_val_blob.channel(page);
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob.channel(page);
for (int r = 0; r < bottom_blobs[0].h; r++) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = ptr[r * bottom_blobs[0].w + i];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[r] = *index_iter;
if (top_blobs.size() == 2)
indptr[r] = std::distance(vec.begin(), index_iter);
else
valptr[r] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[r] = *index_iter;
if (top_blobs.size() == 2)
indptr[r] = std::distance(vec.begin(), index_iter);
else
valptr[r] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
} else {
top_val_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_val_blob.empty()) return -100;
if (top_blobs.size() == 2) {
top_ind_blob.create(bottom_blobs[0].h, bottom_blobs[0].c, 4u, opt.blob_allocator);
if (top_ind_blob.empty()) return -100;
}
std::vector<float> vec;
vec.resize(bottom_blobs[0].w);
float* valptr = top_val_blob;
float* indptr;
if (top_blobs.size() == 2) indptr = top_ind_blob;
for (int page = 0; page < bottom_blobs[0].c; page++) {
const float* ptr = bottom_blobs[0].channel(page);
for (int r = 0; r < bottom_blobs[0].h; r++) {
for (int i = 0; i < bottom_blobs[0].w; i++) {
vec[i] = ptr[r * bottom_blobs[0].w + i];
}
if (largest == 1) {
auto index_iter = std::max_element(vec.begin(), vec.end());
valptr[page * top_val_blob.w + r] = *index_iter;
if (top_blobs.size() == 2)
indptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
else
valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
} else if (largest == 0) {
auto index_iter = std::min_element(vec.begin(), vec.end());
valptr[page * top_val_blob.w + r] = *index_iter;
if (top_blobs.size() == 2)
indptr[page * top_val_blob.w + r] = std::distance(vec.begin(), index_iter);
else
valptr[page * top_ind_blob.w + r] = std::distance(vec.begin(), index_iter);
} else {
fprintf(stderr, "largest attribute should be 0 or 1, but not %d\n", largest);
return -100;
}
}
}
}
}
}
return 0;
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef LAYER_TOPK_H
#define LAYER_TOPK_H
#include "layer.h"
namespace mmdeploy {
// ncnn custom layer implementing the ONNX TopK operator exported by mmdeploy.
class TopK : public ncnn::Layer {
 public:
  TopK();
  // Reads the attributes below from the ncnn param dictionary.
  virtual int load_param(const ncnn::ParamDict& pd);
  // Computes top-k along `axis`. top_blobs[0] receives the selected values;
  // when two outputs are requested, top_blobs[1] receives the indices.
  virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs,
                      const ncnn::Option& opt) const;
 public:
  int axis;       // dimension to reduce along
  int largest;    // 1 = take maxima, 0 = take minima (forward rejects other values)
  int sorted;     // ONNX `sorted` attribute; NOTE(review): not used in the visible forward()
  int keep_dims;  // 1 = keep the reduced dimension (size k), 0 = squeeze it
};
} // namespace mmdeploy
#endif // LAYER_TOPK_H
# Copyright (c) OpenMMLab. All rights reserved.
project(ncnn_ext)
# pybind11: build the bundled copy only if a parent scope has not already
# provided the target.
if (NOT TARGET pybind11)
  add_subdirectory(${CMAKE_SOURCE_DIR}/third_party/pybind11 pybind11)
endif ()
# Python extension exposing ncnn custom-layer registration (ncnn_ext.cpp).
pybind11_add_module(ncnn_ext ncnn_ext.cpp)
target_link_libraries(ncnn_ext PUBLIC mmdeploy_ncnn_ops ncnn)
# Emit the module next to the Python backend package so it is importable
# as mmdeploy.backend.ncnn.ncnn_ext straight from the source tree.
set(_NCNN_EXT_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/backend/ncnn)
set_target_properties(ncnn_ext PROPERTIES
        LIBRARY_OUTPUT_DIRECTORY ${_NCNN_EXT_DIR}
        LIBRARY_OUTPUT_DIRECTORY_DEBUG ${_NCNN_EXT_DIR}
        LIBRARY_OUTPUT_DIRECTORY_RELEASE ${_NCNN_EXT_DIR})
// Copyright (c) OpenMMLab. All rights reserved.
#include <pybind11/pybind11.h>
#include "ncnn_ops_register.h"
#include "net.h"
// Python module exposing registration of mmdeploy's custom ncnn layers on a
// given ncnn::Net (forwards to the C++ helper declared in ncnn_ops_register.h).
PYBIND11_MODULE(ncnn_ext, m) {
  m.def(
      "register_mmdeploy_custom_layers",
      [](ncnn::Net &net) { return register_mmdeploy_custom_layers(net); },
      "register mmdeploy custom ncnn layers.");
}
# Copyright (c) OpenMMLab. All rights reserved.
project(mmdeploy_onnxruntime_ops)
include(${CMAKE_SOURCE_DIR}/cmake/modules/FindONNXRUNTIME.cmake)
include(${CMAKE_SOURCE_DIR}/cmake/MMDeploy.cmake)
# add plugin source: every .cpp under this directory is part of the op library.
file(GLOB_RECURSE ORT_OPS_SRCS *.cpp)
# Object library so the sources compile once and can be reused by the shared lib.
add_library(${PROJECT_NAME}_obj OBJECT "${ORT_OPS_SRCS}")
target_compile_definitions(${PROJECT_NAME}_obj PRIVATE -DMMDEPLOY_API_EXPORTS=1)
# Hide all symbols by default; only MMDEPLOY_API-annotated ones are exported.
target_compile_options(${PROJECT_NAME}_obj PRIVATE
        $<$<COMPILE_LANGUAGE:CXX>:-fvisibility=hidden>)
set_target_properties(${PROJECT_NAME}_obj PROPERTIES POSITION_INDEPENDENT_CODE 1)
mmdeploy_export(${PROJECT_NAME}_obj)
target_include_directories(${PROJECT_NAME}_obj PUBLIC
        $<BUILD_INTERFACE:${ONNXRUNTIME_DIR}/include>
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/common>
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../common>
        $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/csrc>)
target_link_libraries(${PROJECT_NAME}_obj PUBLIC onnxruntime)
# Shared library loaded by ONNX Runtime at session-creation time.
mmdeploy_add_library(${PROJECT_NAME} SHARED EXCLUDE "")
target_link_libraries(${PROJECT_NAME} PUBLIC ${PROJECT_NAME}_obj)
mmdeploy_add_rpath(${PROJECT_NAME})
add_library(mmdeploy::onnxruntime::ops ALIAS ${PROJECT_NAME})
# Install beside the Python package so the backend can locate the plugin.
set(_ORT_OPS_DIR ${CMAKE_SOURCE_DIR}/mmdeploy/lib)
install(TARGETS ${PROJECT_NAME} DESTINATION ${_ORT_OPS_DIR})
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_REGISTER_H
#define ONNXRUNTIME_REGISTER_H
#include <onnxruntime_c_api.h>
#include "mmdeploy/core/macro.h"
#ifdef __cplusplus
extern "C" {
#endif
// Entry point ONNX Runtime calls to register mmdeploy's custom ops with a
// session; exported with C linkage so it can be resolved by name from the
// shared library. Returns an OrtStatus (null means success, per ONNX Runtime
// convention — implementation not in this header).
MMDEPLOY_API OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                                       const OrtApiBase *api);
#ifdef __cplusplus
}
#endif
#endif  // ONNXRUNTIME_REGISTER_H
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
#define ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
// NOTE(review): vendored copy of ONNX Runtime's
// onnxruntime_session_options_config_keys.h — keep in sync with the upstream
// header instead of editing the key strings locally.
/*
 * This file defines SessionOptions Config Keys and format of the Config Values.
 *
 * The Naming Convention for a SessionOptions Config Key,
 * "[Area][.[SubArea1].[SubArea2]...].[Keyname]"
 * Such as "ep.cuda.use_arena"
 * The Config Key cannot be empty
 * The maximum length of the Config Key is 128
 *
 * The string format of a SessionOptions Config Value is defined individually
 * for each Config. The maximum length of the Config Value is 1024
 */
// Key for disable PrePacking,
// If the config value is set to "1" then the prepacking is disabled, otherwise
// prepacking is enabled (default value)
static const char* const kOrtSessionOptionsConfigDisablePrepacking = "session.disable_prepacking";
// A value of "1" means allocators registered in the env will be used. "0" means
// the allocators created in the session will be used. Use this to override the
// usage of env allocators on a per session level.
static const char* const kOrtSessionOptionsConfigUseEnvAllocators = "session.use_env_allocators";
// Set to 'ORT' (case sensitive) to load an ORT format model.
// If unset, model type will default to ONNX unless inferred from filename
// ('.ort' == ORT format) or bytes to be ORT
static const char* const kOrtSessionOptionsConfigLoadModelFormat = "session.load_model_format";
// Set to 'ORT' (case sensitive) to save optimized model in ORT format when
// SessionOptions.optimized_model_path is set. If unset, format will default to
// ONNX unless optimized_model_filepath ends in '.ort'.
static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save_model_format";
#endif  // ONNXRUNTIME_SESSION_OPTIONS_CONFIG_KEYS_H
// Copyright (c) OpenMMLab. All rights reserved.
#include "ort_utils.h"
namespace mmdeploy {
// Accessor for the process-wide table mapping custom-op domain names to the
// ops registered under each domain (populated via REGISTER_ONNXRUNTIME_OPS).
CustomOpsTable& get_mmdeploy_custom_ops() {
  // Function-local static: constructed on first use, which sidesteps
  // static-initialization-order issues across translation units.
  static CustomOpsTable registry;
  return registry;
}
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ORT_MMCV_UTILS_H
#define ORT_MMCV_UTILS_H
#include <onnxruntime_cxx_api.h>
#include <unordered_map>
#include <vector>
namespace mmdeploy {
// Maps a custom-op domain name to the list of ops registered under it.
typedef std::unordered_map<std::string, std::vector<OrtCustomOp*>> CustomOpsTable;
// Convenience wrapper that copies an OrtValue's tensor shape into a
// std::vector<int64_t>, releasing the type-and-shape info before returning.
struct OrtTensorDimensions : std::vector<int64_t> {
  OrtTensorDimensions(Ort::CustomOpApi ort, const OrtValue* value) {
    OrtTensorTypeAndShapeInfo* info = ort.GetTensorTypeAndShape(value);
    std::vector<int64_t>::operator=(ort.GetTensorShape(info));
    ort.ReleaseTensorTypeAndShapeInfo(info);
  }
};
// Accessor for the process-wide custom-op table (defined in ort_utils.cpp).
CustomOpsTable& get_mmdeploy_custom_ops();
// Registers one custom-op instance under `domain`: the registry object's
// constructor runs during static initialization and appends &instance to the
// global table. `domain` must be a char array with static storage duration.
template <char const* domain, typename T>
class OrtOpsRegistry {
 public:
  OrtOpsRegistry() { get_mmdeploy_custom_ops()[domain].push_back(&instance); }
 private:
  T instance{};
};
// Declares the static domain string plus a static OrtOpsRegistry so that
// `name` is registered under `domain` before main() runs.
#define REGISTER_ONNXRUNTIME_OPS(domain, name) \
  static char __domain_##domain##name[] = #domain; \
  static OrtOpsRegistry<__domain_##domain##name, name> ort_ops_registry_##domain##name {}
} // namespace mmdeploy
#endif // ORT_MMCV_UTILS_H
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
#include "grid_sample.h"
#include <cmath>
#include "ort_utils.h"
namespace mmdeploy {
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) < (b)) ? (b) : (a))
// NOTE(review): MIN/MAX/CLIP_COORDINATES are not referenced by the visible
// code in this file — clip_coordinates() below is what is actually used.
#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit - 1), MAX(in, 0))
// Reads the grid_sampler op attributes once, at kernel-construction time.
GridSampleKernel::GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info)
    : ort_(api), info_(info) {
  align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
  interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
  padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
  allocator_ = Ort::AllocatorWithDefaultOptions();
}
// Attribute value encodings; must match what the exporter writes.
enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
// Maps a normalized grid coordinate in [-1, 1] onto pixel space.
// With align_corners, -1/+1 land on the centers of the corner pixels;
// without it, -1/+1 land on the outer edges of the image.
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) {
  return align_corners ? ((coord + 1) / 2) * (size - 1)
                       : ((coord + 1) * size - 1) / 2;
}
// Clips a coordinate into the valid pixel range [0, clip_limit - 1].
template <typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
  const scalar_t lower = static_cast<scalar_t>(0);
  const scalar_t upper = static_cast<scalar_t>(clip_limit - 1);
  // Same std::max-then-std::min order as the original (matters for NaN inputs).
  return std::min(upper, std::max(in, lower));
}
// Reflects a coordinate back and forth until it falls inside [low, high]
// (inclusive). The bounds arrive doubled so half-integer bounds can be
// expressed with integers.
template <typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high) {
  if (twice_low == twice_high) {
    // Zero-width interval: every coordinate collapses onto the same point.
    return static_cast<scalar_t>(0);
  }
  const scalar_t low = static_cast<scalar_t>(twice_low) / 2;
  const scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  // Fold onto the non-negative side first so fmod's result is non-negative.
  const scalar_t offset = std::fabs(in - low);
  const scalar_t remainder = std::fmod(offset, span);
  const int bounces = static_cast<int>(std::floor(offset / span));
  // An even number of full spans means we are travelling "forward" again.
  return (bounces % 2 == 0) ? remainder + low : span - remainder + low;
}
// Applies the padding policy to an unnormalized coordinate: Border clamps,
// Reflection folds then clamps, and Zeros leaves the value untouched
// (out-of-range positions are zero-filled by the caller).
template <typename scalar_t>
static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, int64_t padding_mode,
                                           bool align_corners) {
  switch (padding_mode) {
    case GridSamplerPadding::Border:
      return clip_coordinates(coord, size);
    case GridSamplerPadding::Reflection: {
      // align_corners reflects about pixel centers, otherwise about edges.
      const scalar_t folded = align_corners ? reflect_coordinates(coord, 0, 2 * (size - 1))
                                            : reflect_coordinates(coord, -1, 2 * size - 1);
      return clip_coordinates(folded, size);
    }
    default:  // GridSamplerPadding::Zeros
      return coord;
  }
}
// Computes the pixel source index for a grid coordinate: unnormalizes the
// [-1, 1] grid value into pixel space, then applies the padding policy.
template <typename scalar_t>
static inline scalar_t grid_sampler_compute_source_index(scalar_t coord, int64_t size,
                                                         int64_t padding_mode, bool align_corners) {
  const scalar_t pixel = grid_sampler_unnormalize(coord, size, align_corners);
  return compute_coordinates(pixel, size, padding_mode, align_corners);
}
// True iff (h, w) is a valid pixel position inside an H x W image.
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
  const bool row_in_range = (h >= 0) && (h < H);
  const bool col_in_range = (w >= 0) && (w < W);
  return row_in_range && col_in_range;
}
// Fetches data[y][x] (with strides sH/sW) after applying the padding policy
// to the coordinates; positions still outside the H x W bounds read as 0.
template <typename scalar_t>
static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x, scalar_t y, int64_t W,
                                         int64_t H, int64_t sW, int64_t sH, int64_t padding_mode,
                                         bool align_corners) {
  const scalar_t px = compute_coordinates(x, W, padding_mode, align_corners);
  const scalar_t py = compute_coordinates(y, H, padding_mode, align_corners);
  const int64_t col = static_cast<int64_t>(px);  // truncation, as in the original
  const int64_t row = static_cast<int64_t>(py);
  if (!within_bounds_2d(row, col, H, W)) {
    return static_cast<scalar_t>(0);
  }
  return data[row * sH + col * sW];
}
// Keys cubic kernel, branch for |x| <= 1: ((A+2)x - (A+3))x^2 + 1, Horner form.
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
  const scalar_t inner = (A + 2) * x - (A + 3);
  return inner * x * x + 1;
}
// Keys cubic kernel, branch for 1 < |x| < 2: ((Ax - 5A)x + 8A)x - 4A, Horner form.
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
  const scalar_t inner = (A * x - 5 * A) * x + 8 * A;
  return inner * x - 4 * A;
}
// Fills coeffs[0..3] with the cubic weights for the four taps surrounding a
// sample at fractional offset t in [0, 1].
template <typename scalar_t>
static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4], scalar_t t) {
  const scalar_t A = -0.75;      // sharpness constant used by the original code
  const scalar_t left = t;       // distance to the nearer left-hand tap
  coeffs[0] = cubic_convolution2<scalar_t>(left + 1.0, A);
  coeffs[1] = cubic_convolution1<scalar_t>(left, A);
  // Mirrored distances for the right-hand taps.
  const scalar_t right = 1.0 - t;
  coeffs[2] = cubic_convolution1<scalar_t>(right, A);
  coeffs[3] = cubic_convolution2<scalar_t>(right + 1.0, A);
}
// 1-D cubic interpolation of four consecutive samples, at offset t from x1.
template <typename scalar_t>
static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2, scalar_t x3,
                                      scalar_t t) {
  scalar_t weights[4];
  get_cubic_upsample_coefficients<scalar_t>(weights, t);
  return x0 * weights[0] + x1 * weights[1] + x2 * weights[2] + x3 * weights[3];
}
// CPU implementation of grid_sample, ported from PyTorch's GridSampler.cpp:
// samples `input` (N, C, H, W) at the normalized locations in `grid`
// (N, out_H, out_W, 2) and writes an (N, C, out_H, out_W) float output,
// using the bilinear / nearest / bicubic mode chosen via the op attributes.
void GridSampleKernel::Compute(OrtKernelContext *context) {
  const bool align_corners = align_corners_;
  const int64_t padding_mode = padding_mode_;
  const int64_t interpolation_mode = interpolation_mode_;
  // Inputs are read as raw contiguous float buffers.
  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
  const float *input_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
  const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
  const float *grid_data = reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
  OrtTensorDimensions input_dims(ort_, input);
  OrtTensorDimensions grid_dims(ort_, grid);
  int64_t N = input_dims[0];
  int64_t C = input_dims[1];
  int64_t inp_H = input_dims[2];
  int64_t inp_W = input_dims[3];
  int64_t out_H = grid_dims[1];
  int64_t out_W = grid_dims[2];
  // Output keeps N and C from the input, spatial size from the grid.
  std::vector<int64_t> output_dims = {N, C, out_H, out_W};
  OrtValue *output =
      ort_.KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size());
  float *out_ptr = ort_.GetTensorMutableData<float>(output);
  // Row-major (contiguous) strides for input, grid and output.
  int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
  int64_t inp_sC = input_dims[2] * input_dims[3];
  int64_t inp_sH = input_dims[3];
  int64_t inp_sW = 1;
  int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
  int64_t grid_sH = grid_dims[2] * grid_dims[3];
  int64_t grid_sW = grid_dims[3];
  int64_t grid_sCoor = 1;
  int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
  int64_t out_sC = output_dims[2] * output_dims[3];
  int64_t out_sH = output_dims[3];
  int64_t out_sW = 1;
  // loop over each output pixel
  for (int64_t n = 0; n < N; ++n) {
    const float *grid_ptr_N = grid_data + n * grid_sN;
    const float *inp_ptr_N = input_data + n * inp_sN;
    for (int64_t h = 0; h < out_H; ++h) {
      for (int64_t w = 0; w < out_W; ++w) {
        // The innermost grid dimension holds the (x, y) pair for this pixel.
        const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
        float x = *grid_ptr_NHW;
        float y = grid_ptr_NHW[grid_sCoor];
        // Map normalized coordinates into input pixel space with padding applied.
        float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
        float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
        if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
          // get corner pixel values from (x, y)
          // for 4d, we use north-east-south-west
          int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
          int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
          int64_t ix_ne = ix_nw + 1;
          int64_t iy_ne = iy_nw;
          int64_t ix_sw = ix_nw;
          int64_t iy_sw = iy_nw + 1;
          int64_t ix_se = ix_nw + 1;
          int64_t iy_se = iy_nw + 1;
          // get surfaces to each neighbor:
          float nw = (ix_se - ix) * (iy_se - iy);
          float ne = (ix - ix_sw) * (iy_sw - iy);
          float sw = (ix_ne - ix) * (iy - iy_ne);
          float se = (ix - ix_nw) * (iy - iy_nw);
          // calculate bilinear weighted pixel value and set output pixel
          const float *inp_ptr_NC = inp_ptr_N;
          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
            auto res = static_cast<float>(0);
            // Out-of-bounds corners contribute 0 (zeros padding behavior).
            if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
              res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
            }
            if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
              res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
            }
            if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
              res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
            }
            if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
              res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
            }
            *out_ptr_NCHW = res;
          }
        } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
          int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
          int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
          // assign nearest neighbor pixel value to output pixel
          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
          const float *inp_ptr_NC = inp_ptr_N;
          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
              *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
            } else {
              *out_ptr_NCHW = static_cast<float>(0);
            }
          }
        } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
          // grid_sampler_compute_source_index will "clip the value" of idx
          // depends on the padding,
          // which would cause calculation to be wrong,
          // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix
          // = floor(x) = -1
          // There would be more problem in reflection padding, since the -1 and
          // +1 direction is not fixed in boundary condition
          ix = grid_sampler_unnormalize(x, inp_W, align_corners);
          iy = grid_sampler_unnormalize(y, inp_H, align_corners);
          float ix_nw = std::floor(ix);
          float iy_nw = std::floor(iy);
          const float tx = ix - ix_nw;
          const float ty = iy - iy_nw;
          const float *inp_ptr_NC = inp_ptr_N;
          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
          for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
            float coefficients[4];
            // Interpolate 4 values in the x direction
            for (int64_t i = 0; i < 4; ++i) {
              coefficients[i] = cubic_interp1d<float>(
                  get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H,
                                           inp_sW, inp_sH, padding_mode, align_corners),
                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H,
                                           inp_sW, inp_sH, padding_mode, align_corners),
                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H,
                                           inp_sW, inp_sH, padding_mode, align_corners),
                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H,
                                           inp_sW, inp_sH, padding_mode, align_corners),
                  tx);
            }
            // Interpolate in the y direction
            *out_ptr_NCHW = cubic_interp1d<float>(coefficients[0], coefficients[1], coefficients[2],
                                                  coefficients[3], ty);
          }
        }
      }
    }
  }
}
// Static registration of the op under the "mmdeploy" custom-op domain.
REGISTER_ONNXRUNTIME_OPS(mmdeploy, GridSampleOp);
} // namespace mmdeploy
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef ONNXRUNTIME_GRIDSAMPLE_H
#define ONNXRUNTIME_GRIDSAMPLE_H
#include <onnxruntime_cxx_api.h>
namespace mmdeploy {
// CPU kernel state for the `grid_sampler` custom op; caches the op attributes
// read from OrtKernelInfo at construction time.
struct GridSampleKernel {
  GridSampleKernel(const OrtApi &api, const OrtKernelInfo *info);
  // Runs grid sampling for one inference call (defined in grid_sample.cpp).
  void Compute(OrtKernelContext *context);
 protected:
  Ort::CustomOpApi ort_;
  const OrtKernelInfo *info_;
  Ort::AllocatorWithDefaultOptions allocator_;
  int64_t align_corners_;       // 0/1 flag from the exported attribute
  int64_t interpolation_mode_;  // 0 bilinear, 1 nearest, 2 bicubic
  int64_t padding_mode_;        // 0 zeros, 1 border, 2 reflection
};
// Custom-op descriptor binding GridSampleKernel to ONNX Runtime: two float
// tensor inputs (input, grid), one float tensor output, CPU provider only.
struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
    return new GridSampleKernel(api, info);
  };
  // Op name as it appears in the exported ONNX graph.
  const char *GetName() const { return "grid_sampler"; };
  size_t GetInputTypeCount() const { return 2; };
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };
  size_t GetOutputTypeCount() const { return 1; };
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  };
  const char *GetExecutionProviderType() const { return "CPUExecutionProvider"; };
};
} // namespace mmdeploy
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment