[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0
model-engine-file=../end2end.engine
labelfile-path=../coco_labels.txt
batch-size=1
network-mode=0
num-detected-classes=80
interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseCustomMMYOLO
custom-lib-path=../build/libnvdsparsebbox_mmyolo.so
[class-attrs-all]
pre-cluster-threshold=0.45
topk=100
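For reference, `net-scale-factor=0.0039215697906911373` is 1/255: nvinfer preprocesses each pixel as `y = net-scale-factor * (x - mean)`, where `mean` comes from the `offsets` key (absent here, hence zero). A minimal NumPy sketch of the equivalent transform, for illustration only:

```python
import numpy as np

def nvinfer_preprocess(frame: np.ndarray, scale: float = 1.0 / 255.0,
                       mean: float = 0.0) -> np.ndarray:
    # nvinfer applies y = net-scale-factor * (x - mean) per pixel,
    # then feeds the network an NCHW float tensor.
    x = scale * (frame.astype(np.float32) - mean)
    return x.transpose(2, 0, 1)[np.newaxis]  # HWC -> 1x3xHxW
```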
#include "nvdsinfer_custom_impl.h"
#include <cassert>
#include <iostream>
/**
* Function expected by DeepStream for decoding the MMYOLO output.
*
* C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after
* adding all bounding boxes to the objectList vector.
*
* @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer.
* @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network.
* @param [detectionParams] NvDsInferParseDetectionParams with information about some config params.
* @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must
* be stored.
*
* @return true
*/
// This is just the function prototype. The definition is written at the end of the file.
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
static inline float clamp(float val, float min, float max)
{
    return val > min ? (val < max ? val : max) : min;
}
static std::vector<NvDsInferParseObjectInfo> decodeMMYoloTensor(
const int* num_dets,
const float* bboxes,
const float* scores,
const int* labels,
const float& conf_thres,
const unsigned int& img_w,
const unsigned int& img_h
)
{
std::vector<NvDsInferParseObjectInfo> bboxInfo;
size_t nums = num_dets[0];
for (size_t i = 0; i < nums; i++)
{
float score = scores[i];
        if (score < conf_thres) continue;
float x0 = (bboxes[i * 4]);
float y0 = (bboxes[i * 4 + 1]);
float x1 = (bboxes[i * 4 + 2]);
float y1 = (bboxes[i * 4 + 3]);
x0 = clamp(x0, 0.f, img_w);
y0 = clamp(y0, 0.f, img_h);
x1 = clamp(x1, 0.f, img_w);
y1 = clamp(y1, 0.f, img_h);
NvDsInferParseObjectInfo obj;
obj.left = x0;
obj.top = y0;
obj.width = x1 - x0;
obj.height = y1 - y0;
obj.detectionConfidence = score;
obj.classId = labels[i];
bboxInfo.push_back(obj);
}
return bboxInfo;
}
/* C-linkage to prevent name-mangling */
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
// Some assertions and error checking.
if (outputLayersInfo.empty() || outputLayersInfo.size() != 4)
{
std::cerr << "Could not find output layer in bbox parsing" << std::endl;
return false;
}
// Score threshold of bboxes.
const float conf_thres = detectionParams.perClassThreshold[0];
// Obtaining the output layer.
const NvDsInferLayerInfo& num_dets = outputLayersInfo[0];
const NvDsInferLayerInfo& bboxes = outputLayersInfo[1];
const NvDsInferLayerInfo& scores = outputLayersInfo[2];
const NvDsInferLayerInfo& labels = outputLayersInfo[3];
// num_dets(int) bboxes(float) scores(float) labels(int)
assert (num_dets.dims.numDims == 2);
assert (bboxes.dims.numDims == 3);
assert (scores.dims.numDims == 2);
assert (labels.dims.numDims == 2);
// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format.
std::vector<NvDsInferParseObjectInfo> objects =
decodeMMYoloTensor(
(const int*)(num_dets.buffer),
(const float*)(bboxes.buffer),
(const float*)(scores.buffer),
(const int*)(labels.buffer),
conf_thres,
networkInfo.width,
networkInfo.height
);
objectList.clear();
objectList = objects;
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO);
[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
[tiled-display]
enable=1
rows=1
columns=1
width=1280
height=720
gpu-id=0
nvbuf-memory-type=0
[source0]
enable=1
type=3
uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
num-sources=1
gpu-id=0
cudadec-memtype=0
[sink0]
enable=1
type=2
sync=0
gpu-id=0
nvbuf-memory-type=0
[osd]
enable=1
gpu-id=0
border-width=5
text-size=15
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Serif
show-clock=0
clock-x-offset=800
clock-y-offset=820
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0
[streammux]
gpu-id=0
live-source=0
batch-size=1
batched-push-timeout=40000
width=1920
height=1080
enable-padding=0
nvbuf-memory-type=0
[primary-gie]
enable=1
gpu-id=0
gie-unique-id=1
nvbuf-memory-type=0
config-file=configs/config_infer_rtmdet.txt
[tests]
file-loop=0
# MMYOLO Model ONNX Conversion
## 1. Export an ONNX Model Supported by the Backend
## Environment Dependencies
- [onnx](https://github.com/onnx/onnx)
```shell
pip install onnx
```
- [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (optional, used to simplify the model)
```shell
pip install onnx-simplifier
```
**Please make sure you run the relevant scripts from the `MMYOLO` root directory, otherwise the required packages may not be found.**
## Usage
The [model export script](./projects/easydeploy/tools/export_onnx.py) converts an `MMYOLO` model to `onnx`.
### Parameter Description:
- `config` : the config file used to build the model, e.g. [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py)
- `checkpoint` : the trained checkpoint file, e.g. `yolov5s.pth`
- `--work-dir` : directory where the converted model is saved.
- `--img-size`: input size used when converting the model, e.g. `640 640`
- `--batch-size`: input `batch size` of the converted model
- `--device`: device used for the conversion, defaults to `cuda:0`
- `--simplify`: whether to simplify the exported `onnx` model; requires [onnx-simplifier](https://github.com/daquexian/onnx-simplifier); off by default.
- `--opset`: `opset` version of the exported `onnx`, defaults to `11`
- `--backend`: name of the backend the exported `onnx` targets. `ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`; defaults to `onnxruntime` (`ONNXRuntime`)
- `--pre-topk`: number of candidate boxes kept by post-processing filtering in the exported `onnx`, defaults to `1000`
- `--keep-topk`: number of candidate boxes output by non-maximum suppression in the exported `onnx`, defaults to `100`
- `--iou-threshold`: `iou` threshold used by non-maximum suppression to filter duplicate candidates, defaults to `0.65`
- `--score-threshold`: score threshold used by non-maximum suppression to filter candidates, defaults to `0.25`
- `--model-only`: export only the model backbone + neck without post-processing; off by default.
Example:
```shell
python ./projects/easydeploy/tools/export.py \
configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
yolov5s.pth \
--work-dir work_dir \
--img-size 640 640 \
--batch 1 \
--device cpu \
--simplify \
--opset 11 \
--backend 1 \
--pre-topk 1000 \
--keep-topk 100 \
--iou-threshold 0.65 \
--score-threshold 0.25
```
Then use a tool from the target backend, such as `TensorRT`, to read the `onnx` file and convert it again into the backend's model format, e.g. `.engine`/`.plan`.
`MMYOLO` currently supports end-to-end model conversion for the `TensorRT8`, `TensorRT7`, and `ONNXRuntime` backends. Only static-shape models can be exported and converted at the moment; end-to-end conversion of models with dynamic batch size or dynamic height/width will be supported in the future.
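As a reference, here is a minimal Python sketch of that conversion with the TensorRT 8 API (the `end2end.onnx` path is a placeholder; the `EngineBuilder` script included later in this commit implements the same flow with dynamic-shape support):

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, namespace='')  # register the NMS plugins
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
if not parser.parse_from_file('end2end.onnx'):  # placeholder path
    raise RuntimeError('failed to parse the ONNX file')
config = builder.create_builder_config()
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)
with builder.build_engine(network, config) as engine:
    with open('end2end.engine', 'wb') as f:
        f.write(engine.serialize())
```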
The inputs and outputs of the end-to-end converted `onnx` model are shown in the figure:
<div align=center>
<img src="https://user-images.githubusercontent.com/92794867/232403745-101ca999-2003-46fa-bc5b-6b0eb2b2d41b.png"/>
</div>
Input name: `images`, size 640x640
Output name: `num_dets`, size 1x1, the number of detected objects.
Output name: `boxes`, size 1x100x4, the detection box coordinates in `x1y1x2y2` format.
Output name: `scores`, size 1x100, the detection box scores.
Output name: `labels`, size 1x100, the detection box class ids.
You can truncate `boxes`, `scores` and `labels` with the count in `num_dets`, taking the first `num_dets` of the 100 detection results as the final detections.
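A minimal sketch of that truncation (the helper name is illustrative; output names as in the graph above, batch size 1):

```python
import numpy as np

def take_valid(num_dets: np.ndarray, boxes: np.ndarray,
               scores: np.ndarray, labels: np.ndarray):
    n = int(num_dets[0, 0])  # number of valid detections in image 0
    return boxes[0, :n], scores[0, :n], labels[0, :n]
```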
## 2. Export Model Backbone + Neck Only
When you need to deploy on a platform other than `TensorRT` or `ONNXRuntime` (which support end-to-end deployment), you can use the `--model-only` flag and omit the `--backend` flag. This exports a model containing only the `Backbone` + `Neck`; part of the model's outputs is shown in the figure:
<div align=center>
<img src="https://user-images.githubusercontent.com/92794867/232406169-40eee9fd-bc53-4fdc-bd37-d0e9033826f9.png"/>
</div>
An `ONNX` model exported this way has the following advantages:
- Simple operators: generally it only contains `Conv`, activation functions and other simple operators, so it almost never fails to export correctly and is friendlier to embedded deployment.
- Fair speed comparison between algorithms: since post-processing differs across algorithms, comparing only the `backbone` + `Neck` speed is fairer.
It also has the following drawbacks:
- The post-processing logic has to be implemented separately: extra `decode` + `nms` operations are required (see the sketch after this list).
- Compared with `TensorRT`, which can exploit multiple cores to run post-processing in parallel, a model exported with `--model-only` performs considerably worse.
### Usage
```shell
python ./projects/easydeploy/tools/export.py \
configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
yolov5s.pth \
--work-dir work_dir \
--img-size 640 640 \
--batch 1 \
--device cpu \
--simplify \
--opset 11 \
--model-only
```
## Inference with a `model-only` Exported ONNX
The [model inference script](./projects/easydeploy/examples/main_onnxruntime.py) runs inference on the exported `ONNX` model. The basic dependencies must be installed first:
[`onnxruntime`](https://github.com/microsoft/onnxruntime) and [`opencv-python`](https://github.com/opencv/opencv-python)
```shell
pip install onnxruntime
pip install opencv-python==4.7.0.72 # using the latest opencv is recommended
```
### Parameter Description:
- `img` : path to the image or image directory to detect.
- `onnx` : the exported `model-only` ONNX model.
- `--type` : model name; currently supports `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`.
- `--img-size`: the input size used when converting the model, e.g. `640 640`.
- `--out-dir`: directory where detection results are saved.
- `--show`: whether to visualize the detection results.
- `--score-thr`: confidence score threshold for detection post-processing.
- `--iou-thr`: IOU threshold for detection post-processing.
## Usage
```shell
cd ./projects/easydeploy/examples
python main_onnxruntime.py \
"image_path_to_detect" \
yolov5_s_model-only.onnx \
--out-dir work_dir \
--img-size 640 640 \
--show \
--score-thr 0.3 \
--iou-thr 0.7
```
*Note!*
When using a model trained on a custom dataset, modify `CLASS_NAMES` and `CLASS_COLORS` in [`config.py`](./projects/easydeploy/examples/config.py); for anchor-based models such as `yolov5` or `yolov7`, also modify `YOLOv5_ANCHORS` and `YOLOv7_ANCHORS` accordingly.
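For example, a hypothetical 2-class dataset would reduce the entries to (names and colors below are illustrative only):

```python
CLASS_NAMES = ('cat', 'dog')
CLASS_COLORS = [(220, 20, 60), (0, 0, 142)]
```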
[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) is the `decoder` for all current algorithms implemented purely in `numpy`; if you have higher performance requirements, you can port it to `c/c++` using this code as a reference.
from enum import Enum
class TASK_TYPE(Enum):
DET = 'det'
SEG = 'seg'
POSE = 'pose'
class ModelType(Enum):
YOLOV5 = 'yolov5'
YOLOX = 'yolox'
PPYOLOE = 'ppyoloe'
PPYOLOEP = 'ppyoloep'
YOLOV6 = 'yolov6'
YOLOV7 = 'yolov7'
RTMDET = 'rtmdet'
YOLOV8 = 'yolov8'
CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230),
(106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70),
(0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0),
(175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255),
(0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157),
(110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118),
(255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182),
(0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255),
(78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255),
(134, 134, 103), (145, 148, 174), (255, 208, 186),
(197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255),
(151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105),
(166, 196, 102), (208, 195, 210), (255, 109, 65),
(0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0),
(227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161),
(163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120),
(183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133),
(166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62),
(65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45),
(196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1),
(246, 0, 122), (191, 162, 208)]
YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)],
[(30, 61), (62, 45), (59, 119)],
[(116, 90), (156, 198), (373, 326)]]
YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)],
[(36, 75), (76, 55), (72, 146)],
[(142, 110), (192, 243), (459, 401)]]
from typing import List, Tuple, Union
import cv2
from numpy import ndarray
MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2])
assert MAJOR == 4, 'cv2_nms requires OpenCV 4.x'
def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]],
scores: Union[List[float], Tuple[float]],
labels: Union[List[int], Tuple[int]],
conf_thres: float = 0.25,
iou_thres: float = 0.65) -> Tuple[List, List, List]:
if MINOR >= 7:
indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres,
iou_thres)
elif MINOR == 6:
indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres)
else:
indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres,
iou_thres).flatten()
nmsd_boxes = []
nmsd_scores = []
nmsd_labels = []
for idx in indices:
box = boxes[idx]
# x0y0wh -> x0y0x1y1
box[2:] = box[:2] + box[2:]
score = scores[idx]
label = labels[idx]
nmsd_boxes.append(box)
nmsd_scores.append(score)
nmsd_labels.append(label)
return nmsd_boxes, nmsd_scores, nmsd_labels
import math
import sys
from argparse import ArgumentParser
from pathlib import Path
import cv2
import onnxruntime
from config import (CLASS_COLORS, CLASS_NAMES, ModelType, YOLOv5_ANCHORS,
YOLOv7_ANCHORS)
from cv2_nms import non_max_suppression
from numpy_coder import Decoder
from preprocess import Preprocess
from tqdm import tqdm
# Add this script's directory to sys.path so its sibling modules can be imported
sys.path.append(str(Path(__file__).resolve().parents[0]))
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
'.tiff', '.webp')
def path_to_list(path: str):
path = Path(path)
if path.is_file() and path.suffix in IMG_EXTENSIONS:
res_list = [str(path.absolute())]
elif path.is_dir():
res_list = [
str(p.absolute()) for p in path.iterdir()
if p.suffix in IMG_EXTENSIONS
]
else:
        raise RuntimeError(f'{path} is neither an image file nor a directory')
return res_list
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'img', help='Image path, include image file, dir and URL.')
parser.add_argument('onnx', type=str, help='Onnx file')
parser.add_argument('--type', type=str, help='Model type')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument(
'--out-dir', default='./output', type=str, help='Path to output file')
parser.add_argument(
'--show', action='store_true', help='Show the detection results')
parser.add_argument(
'--score-thr', type=float, default=0.3, help='Bbox score threshold')
parser.add_argument(
'--iou-thr', type=float, default=0.7, help='Bbox iou threshold')
args = parser.parse_args()
return args
def main():
args = parse_args()
out_dir = Path(args.out_dir)
model_type = ModelType(args.type.lower())
if not args.show:
out_dir.mkdir(parents=True, exist_ok=True)
files = path_to_list(args.img)
session = onnxruntime.InferenceSession(
args.onnx, providers=['CPUExecutionProvider'])
preprocessor = Preprocess(model_type)
decoder = Decoder(model_type, model_only=True)
if model_type == ModelType.YOLOV5:
anchors = YOLOv5_ANCHORS
elif model_type == ModelType.YOLOV7:
anchors = YOLOv7_ANCHORS
else:
anchors = None
for file in tqdm(files):
image = cv2.imread(file)
image_h, image_w = image.shape[:2]
img, (ratio_w, ratio_h) = preprocessor(image, args.img_size)
features = session.run(None, {'images': img})
decoder_outputs = decoder(
features,
args.score_thr,
num_labels=len(CLASS_NAMES),
anchors=anchors)
nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression(
*decoder_outputs, args.score_thr, args.iou_thr)
for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels):
x0, y0, x1, y1 = box
x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1))
y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1))
x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1))
y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1))
cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2)
cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}',
(x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
(0, 255, 255), 2)
if args.show:
cv2.imshow('result', image)
cv2.waitKey(0)
else:
cv2.imwrite(f'{out_dir / Path(file).name}', image)
if __name__ == '__main__':
main()
from typing import List, Tuple, Union
import numpy as np
from config import ModelType
from numpy import ndarray
def softmax(x: ndarray, axis: int = -1) -> ndarray:
e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
y = e_x / e_x.sum(axis=axis, keepdims=True)
return y
def sigmoid(x: ndarray) -> ndarray:
return 1. / (1. + np.exp(-x))
class Decoder:
def __init__(self, model_type: ModelType, model_only: bool = False):
self.model_type = model_type
self.model_only = model_only
self.boxes_pro = []
self.scores_pro = []
self.labels_pro = []
self.is_logging = False
def __call__(self,
feats: Union[List, Tuple],
conf_thres: float,
num_labels: int = 80,
**kwargs) -> Tuple:
if not self.is_logging:
            print('Decoder only supports batch size 1')
self.is_logging = True
self.boxes_pro.clear()
self.scores_pro.clear()
self.labels_pro.clear()
if self.model_only:
# transpose channel to last dim for easy decoding
feats = [
np.ascontiguousarray(feat[0].transpose(1, 2, 0))
for feat in feats
]
else:
# ax620a horizonX3 transpose channel to last dim by default
feats = [np.ascontiguousarray(feat) for feat in feats]
if self.model_type == ModelType.YOLOV5:
self.__yolov5_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOX:
self.__yolox_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type in (ModelType.PPYOLOE, ModelType.PPYOLOEP):
self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV6:
self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV7:
self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.RTMDET:
self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV8:
self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs)
else:
raise NotImplementedError
return self.boxes_pro, self.scores_pro, self.labels_pro
def __yolov5_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
anchors: Union[List, Tuple] = kwargs.get(
'anchors',
[[(10, 13), (16, 30),
(33, 23)], [(30, 61), (62, 45),
(59, 119)], [(116, 90), (156, 198), (373, 326)]])
for i, feat in enumerate(feats):
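            # feature maps are ordered P3, P4, P5 -> strides 8, 16, 32 (8 << i)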
stride = 8 << i
feat_h, feat_w, _ = feat.shape
anchor = anchors[i]
feat = sigmoid(feat)
feat = feat.reshape((feat_h, feat_w, len(anchor), -1))
box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1)
hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres)
num_proposal = hIdx.size
if not num_proposal:
continue
score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx,
aIdx]
boxes = box_feat[hIdx, wIdx, aIdx]
labels = score_feat.argmax(-1)
scores = score_feat.max(-1)
indices = np.where(scores > conf_thres)[0]
if len(indices) == 0:
continue
for idx in indices:
a_w, a_h = anchor[aIdx[idx]]
x, y, w, h = boxes[idx]
x = (x * 2.0 - 0.5 + wIdx[idx]) * stride
y = (y * 2.0 - 0.5 + hIdx[idx]) * stride
w = (w * 2.0)**2 * a_w
h = (h * 2.0)**2 * a_h
x0 = x - w / 2
y0 = y - h / 2
self.scores_pro.append(float(scores[idx]))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(labels[idx]))
def __yolox_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat, conf_feat = np.split(
feat, [num_labels, num_labels + 4], -1)
conf_feat = sigmoid(conf_feat)
hIdx, wIdx, _ = np.where(conf_feat > conf_thres)
num_proposal = hIdx.size
if not num_proposal:
continue
score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx,
wIdx]
boxes = box_feat[hIdx, wIdx]
labels = score_feat.argmax(-1)
scores = score_feat.max(-1)
indices = np.where(scores > conf_thres)[0]
if len(indices) == 0:
continue
for idx in indices:
score = scores[idx]
label = labels[idx]
x, y, w, h = boxes[idx]
x = (x + wIdx[idx]) * stride
y = (y + hIdx[idx]) * stride
w = np.exp(w) * stride
h = np.exp(h) * stride
x0 = x - w / 2
y0 = y - h / 2
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __ppyoloe_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
reg_max: int = kwargs.get('reg_max', 17)
dfl = np.arange(0, reg_max, dtype=np.float32)
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max)
boxes = softmax(boxes, -1) @ dfl
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] + 0.5 - x0) * stride
y0 = (hIdx[k] + 0.5 - y0) * stride
x1 = (wIdx[k] + 0.5 + x1) * stride
y1 = (hIdx[k] + 0.5 + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov6_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx]
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] + 0.5 - x0) * stride
y0 = (hIdx[k] + 0.5 - y0) * stride
x1 = (wIdx[k] + 0.5 + x1) * stride
y1 = (hIdx[k] + 0.5 + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov7_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
anchors: Union[List, Tuple] = kwargs.get(
'anchors',
[[(12, 16), (19, 36),
(40, 28)], [(36, 75), (76, 55),
(72, 146)], [(142, 110), (192, 243), (459, 401)]])
self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors)
def __rtmdet_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx]
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] - x0) * stride
y0 = (hIdx[k] - y0) * stride
x1 = (wIdx[k] + x1) * stride
y1 = (hIdx[k] + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov8_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
self.__yolov6_decode(feats, conf_thres, num_labels)
from typing import List, Tuple, Union
import cv2
import numpy as np
from config import ModelType
from numpy import ndarray
class Preprocess:
def __init__(self, model_type: ModelType):
if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7,
ModelType.YOLOV8):
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([255, 255, 255], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.YOLOX:
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([1, 1, 1], dtype=np.float32)
is_rgb = False
elif model_type == ModelType.PPYOLOE:
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.PPYOLOEP:
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([255, 255, 255], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.RTMDET:
mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
            std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
is_rgb = False
else:
raise NotImplementedError
self.mean = mean.reshape((3, 1, 1))
self.std = std.reshape((3, 1, 1))
self.is_rgb = is_rgb
def __call__(self,
image: ndarray,
new_size: Union[List[int], Tuple[int]] = (640, 640),
**kwargs) -> Tuple[ndarray, Tuple[float, float]]:
# new_size: (height, width)
height, width = image.shape[:2]
ratio_h, ratio_w = new_size[0] / height, new_size[1] / width
image = cv2.resize(
image, (0, 0),
fx=ratio_w,
fy=ratio_h,
interpolation=cv2.INTER_LINEAR)
image = np.ascontiguousarray(image.transpose(2, 0, 1))
image = image.astype(np.float32)
image -= self.mean
image /= self.std
return image[np.newaxis], (ratio_w, ratio_h)
onnxruntime
opencv-python==4.7.0.72
# Copyright (c) OpenMMLab. All rights reserved.
from .backend import MMYOLOBackend
from .backendwrapper import ORTWrapper, TRTWrapper
from .model import DeployModel
__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend']
from enum import Enum
import torch
import torch.nn.functional as F
class MMYOLOBackend(Enum):
AX620A = 'ax620a'
COREML = 'coreml'
HORIZONX3 = 'horizonx3'
NCNN = 'ncnn'
ONNXRUNTIME = 'onnxruntime'
OPENVINO = 'openvino'
PPLNN = 'pplnn'
RKNN = 'rknn'
TENSORRT8 = 'tensorrt8'
TENSORRT7 = 'tensorrt7'
TORCHSCRIPT = 'torchscript'
TVM = 'tvm'
def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor:
return F.hardsigmoid(x, inplace=True)
import warnings
from collections import namedtuple
from functools import partial
from pathlib import Path
from typing import List, Optional, Union
import numpy as np
import onnxruntime
try:
import tensorrt as trt
except Exception:
trt = None
import torch
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
class TRTWrapper(torch.nn.Module):
dtype_mapping = {}
def __init__(self, weight: Union[str, Path],
device: Optional[torch.device]):
super().__init__()
weight = Path(weight) if isinstance(weight, str) else weight
assert weight.exists() and weight.suffix in ('.engine', '.plan')
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.weight = weight
self.device = device
self.stream = torch.cuda.Stream(device=device)
self.__update_mapping()
self.__init_engine()
self.__init_bindings()
def __update_mapping(self):
self.dtype_mapping.update({
trt.bool: torch.bool,
trt.int8: torch.int8,
trt.int32: torch.int32,
trt.float16: torch.float16,
trt.float32: torch.float32
})
def __init_engine(self):
logger = trt.Logger(trt.Logger.ERROR)
self.log = partial(logger.log, trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, namespace='')
self.logger = logger
with trt.Runtime(logger) as runtime:
model = runtime.deserialize_cuda_engine(self.weight.read_bytes())
context = model.create_execution_context()
names = [model.get_binding_name(i) for i in range(model.num_bindings)]
num_inputs, num_outputs = 0, 0
for i in range(model.num_bindings):
if model.binding_is_input(i):
num_inputs += 1
else:
num_outputs += 1
self.is_dynamic = -1 in model.get_binding_shape(0)
self.model = model
self.context = context
self.input_names = names[:num_inputs]
self.output_names = names[num_inputs:]
self.num_inputs = num_inputs
self.num_outputs = num_outputs
self.num_bindings = num_inputs + num_outputs
self.bindings: List[int] = [0] * self.num_bindings
def __init_bindings(self):
Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
inputs_info = []
outputs_info = []
for i, name in enumerate(self.input_names):
assert self.model.get_binding_name(i) == name
dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
shape = tuple(self.model.get_binding_shape(i))
inputs_info.append(Binding(name, dtype, shape))
for i, name in enumerate(self.output_names):
i += self.num_inputs
assert self.model.get_binding_name(i) == name
dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
shape = tuple(self.model.get_binding_shape(i))
outputs_info.append(Binding(name, dtype, shape))
self.inputs_info = inputs_info
self.outputs_info = outputs_info
if not self.is_dynamic:
self.output_tensor = [
torch.empty(o.shape, dtype=o.dtype, device=self.device)
for o in outputs_info
]
def forward(self, *inputs):
assert len(inputs) == self.num_inputs
contiguous_inputs: List[torch.Tensor] = [
i.contiguous() for i in inputs
]
for i in range(self.num_inputs):
self.bindings[i] = contiguous_inputs[i].data_ptr()
if self.is_dynamic:
self.context.set_binding_shape(
i, tuple(contiguous_inputs[i].shape))
# create output tensors
outputs: List[torch.Tensor] = []
for i in range(self.num_outputs):
j = i + self.num_inputs
if self.is_dynamic:
shape = tuple(self.context.get_binding_shape(j))
output = torch.empty(
size=shape,
                    dtype=self.outputs_info[i].dtype,
device=self.device)
else:
output = self.output_tensor[i]
outputs.append(output)
self.bindings[j] = output.data_ptr()
self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)
self.stream.synchronize()
return tuple(outputs)
class ORTWrapper(torch.nn.Module):
def __init__(self, weight: Union[str, Path],
device: Optional[torch.device]):
super().__init__()
weight = Path(weight) if isinstance(weight, str) else weight
assert weight.exists() and weight.suffix == '.onnx'
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.weight = weight
self.device = device
self.__init_session()
self.__init_bindings()
def __init_session(self):
providers = ['CPUExecutionProvider']
if 'cuda' in self.device.type:
providers.insert(0, 'CUDAExecutionProvider')
session = onnxruntime.InferenceSession(
str(self.weight), providers=providers)
self.session = session
def __init_bindings(self):
Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
inputs_info = []
outputs_info = []
self.is_dynamic = False
for i, tensor in enumerate(self.session.get_inputs()):
if any(not isinstance(i, int) for i in tensor.shape):
self.is_dynamic = True
inputs_info.append(
Binding(tensor.name, tensor.type, tuple(tensor.shape)))
for i, tensor in enumerate(self.session.get_outputs()):
outputs_info.append(
Binding(tensor.name, tensor.type, tuple(tensor.shape)))
self.inputs_info = inputs_info
self.outputs_info = outputs_info
self.num_inputs = len(inputs_info)
def forward(self, *inputs):
assert len(inputs) == self.num_inputs
contiguous_inputs: List[np.ndarray] = [
i.contiguous().cpu().numpy() for i in inputs
]
if not self.is_dynamic:
# make sure input shape is right for static input shape
for i in range(self.num_inputs):
assert contiguous_inputs[i].shape == self.inputs_info[i].shape
outputs = self.session.run([o.name for o in self.outputs_info], {
j.name: contiguous_inputs[i]
for i, j in enumerate(self.inputs_info)
})
return tuple(torch.from_numpy(o).to(self.device) for o in outputs)
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from functools import partial
from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from mmdet.models.backbones.csp_darknet import Focus
from mmdet.models.layers import ChannelAttention
from mmengine.config import ConfigDict
from torch import Tensor
from mmyolo.models import RepVGGBlock
from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head,
YOLOv7Head, YOLOv8Head, YOLOXHead)
from mmyolo.models.layers import ImplicitA, ImplicitM
from ..backbone import DeployFocus, GConvFocus, NcnnFocus
from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder,
yolox_bbox_decoder)
from ..nms import batched_nms, efficient_nms, onnx_nms
from .backend import MMYOLOBackend
class DeployModel(nn.Module):
transpose = False
def __init__(self,
baseModel: nn.Module,
backend: MMYOLOBackend,
postprocess_cfg: Optional[ConfigDict] = None,
with_nms=True,
without_bbox_decoder=False):
super().__init__()
self.baseModel = baseModel
self.baseHead = baseModel.bbox_head
self.backend = backend
self.with_nms = with_nms
self.without_bbox_decoder = without_bbox_decoder
if postprocess_cfg is None:
self.with_postprocess = False
else:
self.with_postprocess = True
self.__init_sub_attributes()
self.detector_type = type(self.baseHead)
self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000)
self.keep_top_k = postprocess_cfg.get('keep_top_k', 100)
self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65)
self.score_threshold = postprocess_cfg.get('score_threshold', 0.25)
self.__switch_deploy()
def __init_sub_attributes(self):
self.bbox_decoder = self.baseHead.bbox_coder.decode
self.prior_generate = self.baseHead.prior_generator.grid_priors
self.num_base_priors = self.baseHead.num_base_priors
self.featmap_strides = self.baseHead.featmap_strides
self.num_classes = self.baseHead.num_classes
def __switch_deploy(self):
headType = type(self.baseHead)
if not self.with_postprocess:
if headType in (YOLOv5Head, YOLOv7Head):
self.baseHead.head_module.forward_single = self.forward_single
elif headType in (PPYOLOEHead, YOLOv8Head):
self.baseHead.head_module.reg_max = 0
if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN,
MMYOLOBackend.TORCHSCRIPT):
self.transpose = True
for layer in self.baseModel.modules():
if isinstance(layer, RepVGGBlock):
layer.switch_to_deploy()
elif isinstance(layer, ChannelAttention):
layer.global_avgpool.forward = self.forward_gvp
elif isinstance(layer, Focus):
# onnxruntime openvino tensorrt8 tensorrt7
if self.backend in (MMYOLOBackend.ONNXRUNTIME,
MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8,
MMYOLOBackend.TENSORRT7):
self.baseModel.backbone.stem = DeployFocus(layer)
# ncnn
elif self.backend == MMYOLOBackend.NCNN:
self.baseModel.backbone.stem = NcnnFocus(layer)
# switch focus to group conv
else:
self.baseModel.backbone.stem = GConvFocus(layer)
def pred_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
objectnesses: Optional[List[Tensor]] = None,
coeff_preds: Optional[List[Tensor]] = None,
proto_preds: Optional[List[Tensor]] = None,
**kwargs):
assert len(cls_scores) == len(bbox_preds)
dtype = cls_scores[0].dtype
device = cls_scores[0].device
nms_func = self.select_nms()
if self.detector_type in (YOLOv5Head, YOLOv7Head):
bbox_decoder = yolov5_bbox_decoder
elif self.detector_type is RTMDetHead:
bbox_decoder = rtmdet_bbox_decoder
elif self.detector_type is YOLOXHead:
bbox_decoder = yolox_bbox_decoder
else:
bbox_decoder = self.bbox_decoder
num_imgs = cls_scores[0].shape[0]
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
mlvl_priors = self.prior_generate(featmap_sizes,
dtype=dtype,
device=device)
flatten_priors = torch.cat(mlvl_priors)
mlvl_strides = [
flatten_priors.new_full(
(featmap_size[0] * featmap_size[1] * self.num_base_priors, ),
stride) for featmap_size, stride in zip(
featmap_sizes, self.featmap_strides)
]
flatten_stride = torch.cat(mlvl_strides)
text_len = cls_scores[0].shape[1]
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, text_len)
for cls_score in cls_scores
]
cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
for bbox_pred in bbox_preds
]
flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
if objectnesses is not None:
flatten_objectness = [
objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
for objectness in objectnesses
]
flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1))
scores = cls_scores
bboxes = flatten_bbox_preds
if self.without_bbox_decoder:
return scores, bboxes
bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds,
flatten_stride)
if self.with_nms:
return nms_func(bboxes, scores, self.keep_top_k,
self.iou_threshold, self.score_threshold,
self.pre_top_k, self.keep_top_k)
else:
return scores, bboxes
def select_nms(self):
if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO):
nms_func = onnx_nms
elif self.backend == MMYOLOBackend.TENSORRT8:
nms_func = efficient_nms
elif self.backend == MMYOLOBackend.TENSORRT7:
nms_func = batched_nms
else:
raise NotImplementedError
if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead):
nms_func = partial(nms_func, box_coding=1)
return nms_func
def forward(self, inputs: Tensor):
neck_outputs = self.baseModel(inputs)
if self.with_postprocess:
return self.pred_by_feat(*neck_outputs)
else:
outputs = []
if self.transpose:
for feats in zip(*neck_outputs):
if self.backend in (MMYOLOBackend.NCNN,
MMYOLOBackend.TORCHSCRIPT):
outputs.append(
torch.cat(
[feat.permute(0, 2, 3, 1) for feat in feats],
-1))
else:
outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1))
else:
for feats in zip(*neck_outputs):
outputs.append(torch.cat(feats, 1))
return tuple(outputs)
@staticmethod
def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]:
if isinstance(convs, nn.Sequential) and any(
type(m) in (ImplicitA, ImplicitM) for m in convs):
a, c, m = convs
aw = a.implicit.clone()
mw = m.implicit.clone()
c = deepcopy(c)
nw, cw, _, _ = c.weight.shape
na, ca, _, _ = aw.shape
nm, cm, _, _ = mw.shape
c.bias = nn.Parameter(c.bias + (
c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1))
c.bias = nn.Parameter(c.bias * mw.reshape(cm))
c.weight = nn.Parameter(c.weight * mw.transpose(0, 1))
convs = c
feat = convs(x)
return (feat, )
@staticmethod
def forward_gvp(x: Tensor) -> Tensor:
return torch.mean(x, [2, 3], keepdim=True)
# Copyright (c) OpenMMLab. All rights reserved.
from .ort_nms import onnx_nms
from .trt_nms import batched_nms, efficient_nms
__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor
from torchvision.ops import batched_nms
_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0],
[-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]],
dtype=torch.float32)
def sort_nms_index(nms_index, scores, batch_size, keep_top_k=-1):
"""
first sort the nms_index by batch, and then sort by score in every image result, final apply keep_top_k strategy. In the process, we can also get the number of detections for each image: num_dets
"""
# first sort by batch index to make sure that the same batch index is together
device = nms_index.device
nms_index_indices = torch.argsort(nms_index[:, 0], dim=0).to(device)
nms_index = nms_index[nms_index_indices]
scores = scores[nms_index[:, 0], nms_index[:, 1], nms_index[:, 2]]
batch_inds = nms_index[:, 0]
# Get the number of detections for each image
    num_dets = torch.bincount(batch_inds, minlength=batch_size).to(device)
# Calculate the sum from front to back
cumulative_sum = torch.cumsum(num_dets, dim=0).to(device)
# add initial value 0
cumulative_sum = torch.cat((torch.tensor([0]).to(device), cumulative_sum))
for i in range(len(num_dets)):
start = cumulative_sum[i]
end = cumulative_sum[i + 1]
# sort by score in every batch
block_idx = torch.argsort(scores[start:end], descending=True).to(device)
nms_index[start:end] = nms_index[start:end][block_idx]
if keep_top_k > 0 and end - start > keep_top_k:
# delete lines from start+keep_top_k to end to keep only top k
nms_index = torch.cat(
(nms_index[: start + keep_top_k], nms_index[end:]), dim=0
)
num_dets[i] -= end - start - keep_top_k
cumulative_sum -= end - start - keep_top_k
return nms_index, num_dets
def select_nms_index(
scores: Tensor,
boxes: Tensor,
nms_index: Tensor,
batch_size: int,
keep_top_k: int = -1,
):
if nms_index.numel() == 0:
return torch.empty(0), torch.empty(0, 4), torch.empty(0), torch.empty(0)
nms_index, num_dets = sort_nms_index(nms_index, scores, batch_size, keep_top_k)
batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1]
box_inds = nms_index[:, 2]
# according to the nms_index to get the scores,boxes and labels
batched_scores = scores[batch_inds, cls_inds, box_inds]
batched_dets = boxes[batch_inds, box_inds, ...]
batched_labels = cls_inds
return num_dets, batched_dets, batched_scores, batched_labels
def construct_indice(batch_idx, select_bbox_idxs, class_idxs, original_idxs):
num_bbox = len(select_bbox_idxs)
class_idxs = class_idxs[select_bbox_idxs]
indice = torch.zeros((num_bbox, 3), dtype=torch.int32).to(select_bbox_idxs.device)
# batch_idx
indice[:, 0] = batch_idx
# class_idxs
indice[:, 1] = class_idxs
# select_bbox_idxs
indice[:, 2] = original_idxs[select_bbox_idxs]
return indice
def filter_max_boxes_per_class(
select_bbox_idxs, class_idxs, max_output_boxes_per_class
):
class_counts = {} # used to track the count of each class
filtered_select_bbox_idxs = []
filtered_max_class_idxs = []
for bbox_idx, class_idx in zip(select_bbox_idxs, class_idxs):
class_count = class_counts.get(
class_idx.item(), 0
) # Get the count of the current class, or return 0 if it does not exist
if class_count < max_output_boxes_per_class:
filtered_select_bbox_idxs.append(bbox_idx)
filtered_max_class_idxs.append(class_idx)
class_counts[class_idx.item()] = class_count + 1
return torch.tensor(filtered_select_bbox_idxs), torch.tensor(
filtered_max_class_idxs
)
class ONNXNMSop(torch.autograd.Function):
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: Tensor = torch.tensor([100]),
iou_threshold: Tensor = torch.tensor([0.5]),
score_threshold: Tensor = torch.tensor([0.05])
) -> Tensor:
"""
Non-Maximum Suppression (NMS) implementation.
Args:
boxes (Tensor): Bounding boxes of shape (batch_size, num_boxes, 4).
scores (Tensor): Confidence scores of shape (batch_size, num_classes, num_boxes).
max_output_boxes_per_class (Tensor): Maximum number of output boxes per class.
iou_threshold (Tensor): IoU threshold for NMS.
score_threshold (Tensor): Confidence score threshold.
Returns:
            Tensor: Selected indices of shape (num_det, 3). Each row holds
                (batch index, class index, box index).
"""
device = boxes.device
batch_size, num_classes, num_boxes = scores.shape
selected_indices = []
for batch_idx in range(batch_size):
boxes_per_image = boxes[batch_idx]
scores_per_image = scores[batch_idx]
# If no boxes in this image, continue to the next image
if boxes_per_image.numel() == 0:
continue
# for one box, only exist one class,so use torch.max to get the max score and class index
scores_per_image, class_idxs = torch.max(scores_per_image, dim=0)
            # Apply the score threshold before batched_nms because NMS is expensive
keep_idxs = scores_per_image > score_threshold
if not torch.any(keep_idxs):
# If no boxes left after applying score threshold, continue to the next image
continue
boxes_per_image = boxes_per_image[keep_idxs]
scores_per_image = scores_per_image[keep_idxs]
class_idxs = class_idxs[keep_idxs]
            # Keep original_idxs so the returned indices refer to the
            # original input boxes rather than the filtered subset.
original_idxs = torch.arange(num_boxes, device=device)[keep_idxs]
# reference: https://pytorch.org/vision/main/generated/torchvision.ops.batched_nms.html
select_bbox_idxs = batched_nms(
boxes_per_image, scores_per_image, class_idxs, iou_threshold
)
            # Filtering is only needed when the boxes kept across all classes
            # exceed max_output_boxes_per_class.
            if select_bbox_idxs.shape[0] > max_output_boxes_per_class:
select_bbox_idxs, _ = filter_max_boxes_per_class(
select_bbox_idxs,
class_idxs[select_bbox_idxs],
max_output_boxes_per_class,
)
selected_indice = construct_indice(
batch_idx, select_bbox_idxs, class_idxs, original_idxs
)
selected_indices.append(selected_indice)
if len(selected_indices) == 0:
return torch.tensor([], device=device)
selected_indices = torch.cat(selected_indices, dim=0)
return selected_indices
@staticmethod
def symbolic(
g,
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: Tensor = torch.tensor([100]),
iou_threshold: Tensor = torch.tensor([0.5]),
score_threshold: Tensor = torch.tensor([0.05]),
):
return g.op(
'NonMaxSuppression',
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
outputs=1)
def onnx_nms(
boxes: torch.Tensor,
scores: torch.Tensor,
max_output_boxes_per_class: int = 100,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class])
iou_threshold = torch.tensor([iou_threshold]).to(boxes.device)
score_threshold = torch.tensor([score_threshold]).to(boxes.device)
batch_size, _, _ = scores.shape
if box_coding == 1:
boxes = boxes @ (_XYWH2XYXY.to(boxes.device))
scores = scores.transpose(1, 2).contiguous()
selected_indices = ONNXNMSop.apply(boxes, scores,
max_output_boxes_per_class,
iou_threshold, score_threshold)
num_dets, batched_dets, batched_scores, batched_labels = select_nms_index(
scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k)
return num_dets, batched_dets, batched_scores, batched_labels.to(
torch.int32)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor
_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0],
[-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]],
dtype=torch.float32)
class TRTEfficientNMSop(torch.autograd.Function):
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
background_class: int = -1,
box_coding: int = 0,
iou_threshold: float = 0.45,
max_output_boxes: int = 100,
plugin_version: str = '1',
score_activation: int = 0,
score_threshold: float = 0.25,
):
batch_size, _, num_classes = scores.shape
num_det = torch.randint(
0, max_output_boxes, (batch_size, 1), dtype=torch.int32)
det_boxes = torch.randn(batch_size, max_output_boxes, 4)
det_scores = torch.randn(batch_size, max_output_boxes)
det_classes = torch.randint(
0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32)
return num_det, det_boxes, det_scores, det_classes
@staticmethod
def symbolic(g,
boxes: Tensor,
scores: Tensor,
background_class: int = -1,
box_coding: int = 0,
iou_threshold: float = 0.45,
max_output_boxes: int = 100,
plugin_version: str = '1',
score_activation: int = 0,
score_threshold: float = 0.25):
out = g.op(
'TRT::EfficientNMS_TRT',
boxes,
scores,
background_class_i=background_class,
box_coding_i=box_coding,
iou_threshold_f=iou_threshold,
max_output_boxes_i=max_output_boxes,
plugin_version_s=plugin_version,
score_activation_i=score_activation,
score_threshold_f=score_threshold,
outputs=4)
num_det, det_boxes, det_scores, det_classes = out
return num_det, det_boxes, det_scores, det_classes
class TRTbatchedNMSop(torch.autograd.Function):
"""TensorRT NMS operation."""
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
plugin_version: str = '1',
shareLocation: int = 1,
backgroundLabelId: int = -1,
numClasses: int = 80,
topK: int = 1000,
keepTopK: int = 100,
scoreThreshold: float = 0.25,
iouThreshold: float = 0.45,
isNormalized: int = 0,
clipBoxes: int = 0,
scoreBits: int = 16,
caffeSemantics: int = 1,
):
batch_size, _, numClasses = scores.shape
num_det = torch.randint(
0, keepTopK, (batch_size, 1), dtype=torch.int32)
det_boxes = torch.randn(batch_size, keepTopK, 4)
det_scores = torch.randn(batch_size, keepTopK)
det_classes = torch.randint(0, numClasses,
(batch_size, keepTopK)).float()
return num_det, det_boxes, det_scores, det_classes
@staticmethod
def symbolic(
g,
boxes: Tensor,
scores: Tensor,
plugin_version: str = '1',
shareLocation: int = 1,
backgroundLabelId: int = -1,
numClasses: int = 80,
topK: int = 1000,
keepTopK: int = 100,
scoreThreshold: float = 0.25,
iouThreshold: float = 0.45,
isNormalized: int = 0,
clipBoxes: int = 0,
scoreBits: int = 16,
caffeSemantics: int = 1,
):
out = g.op(
'TRT::BatchedNMSDynamic_TRT',
boxes,
scores,
shareLocation_i=shareLocation,
plugin_version_s=plugin_version,
backgroundLabelId_i=backgroundLabelId,
numClasses_i=numClasses,
topK_i=topK,
keepTopK_i=keepTopK,
scoreThreshold_f=scoreThreshold,
iouThreshold_f=iouThreshold,
isNormalized_i=isNormalized,
clipBoxes_i=clipBoxes,
scoreBits_i=scoreBits,
caffeSemantics_i=caffeSemantics,
outputs=4)
num_det, det_boxes, det_scores, det_classes = out
return num_det, det_boxes, det_scores, det_classes
def _efficient_nms(
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: int = 1000,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
"""Wrapper for `efficient_nms` with TensorRT.
Args:
boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
scores (Tensor): The detection scores of shape
[N, num_boxes, num_classes].
max_output_boxes_per_class (int): Maximum number of output
boxes per class of nms. Defaults to 1000.
iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
score_threshold (float): score threshold of nms.
Defaults to 0.05.
pre_top_k (int): Number of top K boxes to keep before nms.
Defaults to -1.
keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to 100.
box_coding (int): Bounding boxes format for nms.
Defaults to 0 means [x1, y1 ,x2, y2].
Set to 1 means [x, y, w, h].
Returns:
tuple[Tensor, Tensor, Tensor, Tensor]:
(num_det, det_boxes, det_scores, det_classes),
`num_det` of shape [N, 1]
`det_boxes` of shape [N, num_det, 4]
`det_scores` of shape [N, num_det]
`det_classes` of shape [N, num_det]
"""
num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply(
boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0,
score_threshold)
return num_det, det_boxes, det_scores, det_classes
def _batched_nms(
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: int = 1000,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
"""Wrapper for `efficient_nms` with TensorRT.
Args:
boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
scores (Tensor): The detection scores of shape
[N, num_boxes, num_classes].
max_output_boxes_per_class (int): Maximum number of output
boxes per class of nms. Defaults to 1000.
iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
score_threshold (float): score threshold of nms.
Defaults to 0.05.
pre_top_k (int): Number of top K boxes to keep before nms.
Defaults to -1.
keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to 100.
box_coding (int): Bounding boxes format for nms.
Defaults to 0 means [x1, y1 ,x2, y2].
Set to 1 means [x, y, w, h].
Returns:
tuple[Tensor, Tensor, Tensor, Tensor]:
(num_det, det_boxes, det_scores, det_classes),
`num_det` of shape [N, 1]
`det_boxes` of shape [N, num_det, 4]
`det_scores` of shape [N, num_det]
`det_classes` of shape [N, num_det]
"""
if box_coding == 1:
boxes = boxes @ (_XYWH2XYXY.to(boxes.device))
boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2)
_, _, numClasses = scores.shape
num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply(
boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096),
keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1)
det_classes = det_classes.int()
return num_det, det_boxes, det_scores, det_classes
def efficient_nms(*args, **kwargs):
"""Wrapper function for `_efficient_nms`."""
return _efficient_nms(*args, **kwargs)
def batched_nms(*args, **kwargs):
"""Wrapper function for `_batched_nms`."""
return _batched_nms(*args, **kwargs)
import argparse
import ast
from pathlib import Path
from typing import List, Optional, Tuple, Union
try:
import tensorrt as trt
except Exception:
trt = None
import warnings
import numpy as np
import torch
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
class EngineBuilder:
def __init__(
self,
checkpoint: Union[str, Path],
opt_shape: Union[Tuple, List] = (1, 3, 640, 640),
device: Optional[Union[str, int, torch.device]] = None) -> None:
checkpoint = Path(checkpoint) if isinstance(checkpoint,
str) else checkpoint
assert checkpoint.exists() and checkpoint.suffix == '.onnx'
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.checkpoint = checkpoint
self.opt_shape = np.array(opt_shape, dtype=np.float32)
self.device = device
def __build_engine(self,
scale: Optional[List[List]] = None,
fp16: bool = True,
with_profiling: bool = True) -> None:
logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, namespace='')
builder = trt.Builder(logger)
config = builder.create_builder_config()
config.max_workspace_size = torch.cuda.get_device_properties(
self.device).total_memory
flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
network = builder.create_network(flag)
parser = trt.OnnxParser(network, logger)
if not parser.parse_from_file(str(self.checkpoint)):
raise RuntimeError(
f'failed to load ONNX file: {str(self.checkpoint)}')
inputs = [network.get_input(i) for i in range(network.num_inputs)]
outputs = [network.get_output(i) for i in range(network.num_outputs)]
profile = None
dshape = -1 in network.get_input(0).shape
if dshape:
profile = builder.create_optimization_profile()
if scale is None:
scale = np.array(
[[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]],
dtype=np.float32)
scale = (self.opt_shape * scale).astype(np.int32)
elif isinstance(scale, List):
scale = np.array(scale, dtype=np.int32)
assert scale.shape[0] == 3, 'Input a wrong scale list'
else:
raise NotImplementedError
for inp in inputs:
logger.log(
trt.Logger.WARNING,
f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
if dshape:
profile.set_shape(inp.name, *scale)
for out in outputs:
logger.log(
trt.Logger.WARNING,
f'output "{out.name}" with shape{out.shape} {out.dtype}')
if fp16 and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
self.weight = self.checkpoint.with_suffix('.engine')
if dshape:
config.add_optimization_profile(profile)
if with_profiling:
config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
with builder.build_engine(network, config) as engine:
self.weight.write_bytes(engine.serialize())
logger.log(
            trt.Logger.WARNING, 'Finished building the TensorRT engine.\n'
            f'Saved to {str(self.weight.absolute())}')
def build(self,
scale: Optional[List[List]] = None,
fp16: bool = True,
with_profiling=True):
self.__build_engine(scale, fp16, with_profiling)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument(
'--device', type=str, default='cuda:0', help='TensorRT builder device')
parser.add_argument(
'--scales',
type=str,
default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]',
help='Input scales for build dynamic input shape engine')
parser.add_argument(
'--fp16', action='store_true', help='Build model with fp16 mode')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def main(args):
img_size = (1, 3, *args.img_size)
try:
        scales = ast.literal_eval(args.scales)
except Exception:
        print('Input scales is not a valid Python literal')
        print('Falling back to the default scales (None)')
scales = None
builder = EngineBuilder(args.checkpoint, img_size, args.device)
builder.build(scales, fp16=args.fp16)
if __name__ == '__main__':
args = parse_args()
main(args)