[property]
gpu-id=0
net-scale-factor=0.0039215697906911373
model-color-format=0
model-engine-file=../end2end.engine
labelfile-path=../coco_labels.txt
batch-size=1
network-mode=0
num-detected-classes=80
interval=0
gie-unique-id=1
process-mode=1
network-type=0
cluster-mode=2
maintain-aspect-ratio=1
parse-bbox-func-name=NvDsInferParseCustomMMYOLO
custom-lib-path=../build/libnvdsparsebbox_mmyolo.so
[class-attrs-all]
pre-cluster-threshold=0.45
topk=100
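For reference, `net-scale-factor=0.0039215697906911373` is 1/255: nvinfer preprocesses each pixel as `y = net-scale-factor * (x - mean)`, where `mean` comes from the `offsets` key (absent here, hence zero). A minimal NumPy sketch of the equivalent transform, for illustration only:

```python
import numpy as np

def nvinfer_preprocess(frame: np.ndarray, scale: float = 1.0 / 255.0,
                       mean: float = 0.0) -> np.ndarray:
    # nvinfer applies y = net-scale-factor * (x - mean) per pixel,
    # then feeds the network an NCHW float tensor.
    x = scale * (frame.astype(np.float32) - mean)
    return x.transpose(2, 0, 1)[np.newaxis]  # HWC -> 1x3xHxW
```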
#include "nvdsinfer_custom_impl.h"
#include <cassert>
#include <iostream>
/**
* Function expected by DeepStream for decoding the MMYOLO output.
*
* C-linkage [extern "C"] was written to prevent name-mangling. This function must return true after
* adding all bounding boxes to the objectList vector.
*
* @param [outputLayersInfo] std::vector of NvDsInferLayerInfo objects with information about the output layer.
* @param [networkInfo] NvDsInferNetworkInfo object with information about the MMYOLO network.
* @param [detectionParams] NvDsInferParseDetectionParams with information about some config params.
* @param [objectList] std::vector of NvDsInferParseObjectInfo objects to which bounding box information must
* be stored.
*
* @return true
*/
// This is just the function prototype. The definition is written at the end of the file.
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList);
static inline float clamp(float val, float min, float max)
{
    return val > min ? (val < max ? val : max) : min;
}
static std::vector<NvDsInferParseObjectInfo> decodeMMYoloTensor(
const int* num_dets,
const float* bboxes,
const float* scores,
const int* labels,
const float& conf_thres,
const unsigned int& img_w,
const unsigned int& img_h
)
{
std::vector<NvDsInferParseObjectInfo> bboxInfo;
size_t nums = num_dets[0];
for (size_t i = 0; i < nums; i++)
{
float score = scores[i];
        if (score < conf_thres) continue;
float x0 = (bboxes[i * 4]);
float y0 = (bboxes[i * 4 + 1]);
float x1 = (bboxes[i * 4 + 2]);
float y1 = (bboxes[i * 4 + 3]);
x0 = clamp(x0, 0.f, img_w);
y0 = clamp(y0, 0.f, img_h);
x1 = clamp(x1, 0.f, img_w);
y1 = clamp(y1, 0.f, img_h);
NvDsInferParseObjectInfo obj;
obj.left = x0;
obj.top = y0;
obj.width = x1 - x0;
obj.height = y1 - y0;
obj.detectionConfidence = score;
obj.classId = labels[i];
bboxInfo.push_back(obj);
}
return bboxInfo;
}
/* C-linkage to prevent name-mangling */
extern "C" bool NvDsInferParseCustomMMYOLO(
std::vector<NvDsInferLayerInfo> const& outputLayersInfo,
NvDsInferNetworkInfo const& networkInfo,
NvDsInferParseDetectionParams const& detectionParams,
std::vector<NvDsInferParseObjectInfo>& objectList)
{
// Some assertions and error checking.
if (outputLayersInfo.empty() || outputLayersInfo.size() != 4)
{
std::cerr << "Could not find output layer in bbox parsing" << std::endl;
return false;
}
// Score threshold of bboxes.
const float conf_thres = detectionParams.perClassThreshold[0];
// Obtaining the output layer.
const NvDsInferLayerInfo& num_dets = outputLayersInfo[0];
const NvDsInferLayerInfo& bboxes = outputLayersInfo[1];
const NvDsInferLayerInfo& scores = outputLayersInfo[2];
const NvDsInferLayerInfo& labels = outputLayersInfo[3];
// num_dets(int) bboxes(float) scores(float) labels(int)
assert (num_dets.dims.numDims == 2);
assert (bboxes.dims.numDims == 3);
assert (scores.dims.numDims == 2);
assert (labels.dims.numDims == 2);
// Decoding the output tensor of MMYOLO to the NvDsInferParseObjectInfo format.
std::vector<NvDsInferParseObjectInfo> objects =
decodeMMYoloTensor(
(const int*)(num_dets.buffer),
(const float*)(bboxes.buffer),
(const float*)(scores.buffer),
(const int*)(labels.buffer),
conf_thres,
networkInfo.width,
networkInfo.height
);
objectList.clear();
objectList = objects;
return true;
}
/* Check that the custom function has been defined correctly */
CHECK_CUSTOM_PARSE_FUNC_PROTOTYPE(NvDsInferParseCustomMMYOLO);
[application]
enable-perf-measurement=1
perf-measurement-interval-sec=5
[tiled-display]
enable=1
rows=1
columns=1
width=1280
height=720
gpu-id=0
nvbuf-memory-type=0
[source0]
enable=1
type=3
uri=file:///opt/nvidia/deepstream/deepstream/samples/streams/sample_1080p_h264.mp4
num-sources=1
gpu-id=0
cudadec-memtype=0
[sink0]
enable=1
type=2
sync=0
gpu-id=0
nvbuf-memory-type=0
[osd]
enable=1
gpu-id=0
border-width=5
text-size=15
text-color=1;1;1;1;
text-bg-color=0.3;0.3;0.3;1
font=Serif
show-clock=0
clock-x-offset=800
clock-y-offset=820
clock-text-size=12
clock-color=1;0;0;0
nvbuf-memory-type=0
[streammux]
gpu-id=0
live-source=0
batch-size=1
batched-push-timeout=40000
width=1920
height=1080
enable-padding=0
nvbuf-memory-type=0
[primary-gie]
enable=1
gpu-id=0
gie-unique-id=1
nvbuf-memory-type=0
config-file=configs/config_infer_rtmdet.txt
[tests]
file-loop=0
# MMYOLO Model ONNX Conversion
## 1. Export an ONNX Model Supported by the Backend
## Environment Dependencies
- [onnx](https://github.com/onnx/onnx)
```shell
pip install onnx
```
- [onnx-simplifier](https://github.com/daquexian/onnx-simplifier) (optional, used to simplify the model)
```shell
pip install onnx-simplifier
```
**Please make sure you run the relevant scripts from the `MMYOLO` root directory, otherwise the required packages may not be found.**
## Usage
The [model export script](./projects/easydeploy/tools/export_onnx.py) converts an `MMYOLO` model to `onnx`.
### Parameter Description:
- `config` : the config file used to build the model, e.g. [`yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py`](./configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py)
- `checkpoint` : the trained checkpoint file, e.g. `yolov5s.pth`
- `--work-dir` : directory where the converted model is saved.
- `--img-size`: input size used when converting the model, e.g. `640 640`
- `--batch-size`: input `batch size` of the converted model
- `--device`: device used for the conversion, defaults to `cuda:0`
- `--simplify`: whether to simplify the exported `onnx` model; requires [onnx-simplifier](https://github.com/daquexian/onnx-simplifier); off by default.
- `--opset`: `opset` version of the exported `onnx`, defaults to `11`
- `--backend`: name of the backend the exported `onnx` targets. `ONNXRuntime`: `onnxruntime`, `TensorRT8`: `tensorrt8`, `TensorRT7`: `tensorrt7`; defaults to `onnxruntime` (`ONNXRuntime`)
- `--pre-topk`: number of candidate boxes kept by post-processing filtering in the exported `onnx`, defaults to `1000`
- `--keep-topk`: number of candidate boxes output by non-maximum suppression in the exported `onnx`, defaults to `100`
- `--iou-threshold`: `iou` threshold used by non-maximum suppression to filter duplicate candidates, defaults to `0.65`
- `--score-threshold`: score threshold used by non-maximum suppression to filter candidates, defaults to `0.25`
- `--model-only`: export only the model backbone + neck without post-processing; off by default.
Example:
```shell
python ./projects/easydeploy/tools/export.py \
configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
yolov5s.pth \
--work-dir work_dir \
--img-size 640 640 \
--batch 1 \
--device cpu \
--simplify \
--opset 11 \
--backend 1 \
--pre-topk 1000 \
--keep-topk 100 \
--iou-threshold 0.65 \
--score-threshold 0.25
```
Then use a tool from the target backend, such as `TensorRT`, to read the `onnx` file and convert it again into the backend's model format, e.g. `.engine`/`.plan`.
`MMYOLO` currently supports end-to-end model conversion for the `TensorRT8`, `TensorRT7`, and `ONNXRuntime` backends. Only static-shape models can be exported and converted at the moment; end-to-end conversion of models with dynamic batch size or dynamic height/width will be supported in the future.
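As a reference, here is a minimal Python sketch of that conversion with the TensorRT 8 API (the `end2end.onnx` path is a placeholder; the `EngineBuilder` script included later in this commit implements the same flow with dynamic-shape support):

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, namespace='')  # register the NMS plugins
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
if not parser.parse_from_file('end2end.onnx'):  # placeholder path
    raise RuntimeError('failed to parse the ONNX file')
config = builder.create_builder_config()
if builder.platform_has_fast_fp16:
    config.set_flag(trt.BuilderFlag.FP16)
with builder.build_engine(network, config) as engine:
    with open('end2end.engine', 'wb') as f:
        f.write(engine.serialize())
```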
The inputs and outputs of the end-to-end converted `onnx` model are shown in the figure:
<div align=center>
<img src="https://user-images.githubusercontent.com/92794867/232403745-101ca999-2003-46fa-bc5b-6b0eb2b2d41b.png"/>
</div>
Input name: `images`, size 640x640
Output name: `num_dets`, size 1x1, the number of detected objects.
Output name: `boxes`, size 1x100x4, the detection box coordinates in `x1y1x2y2` format.
Output name: `scores`, size 1x100, the detection box scores.
Output name: `labels`, size 1x100, the detection box class ids.
You can truncate `boxes`, `scores` and `labels` with the count in `num_dets`, taking the first `num_dets` of the 100 detection results as the final detections.
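A minimal sketch of that truncation (the helper name is illustrative; output names as in the graph above, batch size 1):

```python
import numpy as np

def take_valid(num_dets: np.ndarray, boxes: np.ndarray,
               scores: np.ndarray, labels: np.ndarray):
    n = int(num_dets[0, 0])  # number of valid detections in image 0
    return boxes[0, :n], scores[0, :n], labels[0, :n]
```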
## 2. Export Model Backbone + Neck Only
When you need to deploy on a platform other than `TensorRT` or `ONNXRuntime` (which support end-to-end deployment), you can use the `--model-only` flag and omit the `--backend` flag. This exports a model containing only the `Backbone` + `Neck`; part of the model's outputs is shown in the figure:
<div align=center>
<img src="https://user-images.githubusercontent.com/92794867/232406169-40eee9fd-bc53-4fdc-bd37-d0e9033826f9.png"/>
</div>
An `ONNX` model exported this way has the following advantages:
- Simple operators: generally it only contains `Conv`, activation functions and other simple operators, so it almost never fails to export correctly and is friendlier to embedded deployment.
- Fair speed comparison between algorithms: since post-processing differs across algorithms, comparing only the `backbone` + `Neck` speed is fairer.
It also has the following drawbacks:
- The post-processing logic has to be implemented separately: extra `decode` + `nms` operations are required (see the sketch after this list).
- Compared with `TensorRT`, which can exploit multiple cores to run post-processing in parallel, a model exported with `--model-only` performs considerably worse.
### Usage
```shell
python ./projects/easydeploy/tools/export.py \
configs/yolov5/yolov5_s-v61_syncbn_fast_8xb16-300e_coco.py \
yolov5s.pth \
--work-dir work_dir \
--img-size 640 640 \
--batch 1 \
--device cpu \
--simplify \
--opset 11 \
--model-only
```
## Inference with a `model-only` Exported ONNX
The [model inference script](./projects/easydeploy/examples/main_onnxruntime.py) runs inference on the exported `ONNX` model. The basic dependencies must be installed first:
[`onnxruntime`](https://github.com/microsoft/onnxruntime) and [`opencv-python`](https://github.com/opencv/opencv-python)
```shell
pip install onnxruntime
pip install opencv-python==4.7.0.72 # using the latest opencv is recommended
```
### Parameter Description:
- `img` : path to the image or image directory to detect.
- `onnx` : the exported `model-only` ONNX model.
- `--type` : model name; currently supports `yolov5`, `yolox`, `yolov6`, `ppyoloe`, `ppyoloep`, `yolov7`, `rtmdet`, `yolov8`.
- `--img-size`: the input size used when converting the model, e.g. `640 640`.
- `--out-dir`: directory where detection results are saved.
- `--show`: whether to visualize the detection results.
- `--score-thr`: confidence score threshold for detection post-processing.
- `--iou-thr`: IOU threshold for detection post-processing.
## Usage
```shell
cd ./projects/easydeploy/examples
python main_onnxruntime.py \
"image_path_to_detect" \
yolov5_s_model-only.onnx \
--out-dir work_dir \
--img-size 640 640 \
--show \
--score-thr 0.3 \
--iou-thr 0.7
```
*Note!*
When using a model trained on a custom dataset, modify `CLASS_NAMES` and `CLASS_COLORS` in [`config.py`](./projects/easydeploy/examples/config.py); for anchor-based models such as `yolov5` or `yolov7`, also modify `YOLOv5_ANCHORS` and `YOLOv7_ANCHORS` accordingly.
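For example, a hypothetical 2-class dataset would reduce the entries to (names and colors below are illustrative only):

```python
CLASS_NAMES = ('cat', 'dog')
CLASS_COLORS = [(220, 20, 60), (0, 0, 142)]
```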
[`numpy_coder.py`](./projects/easydeploy/examples/numpy_coder.py) is the `decoder` for all current algorithms implemented purely in `numpy`; if you have higher performance requirements, you can port it to `c/c++` using this code as a reference.
from enum import Enum
class TASK_TYPE(Enum):
DET = 'det'
SEG = 'seg'
POSE = 'pose'
class ModelType(Enum):
YOLOV5 = 'yolov5'
YOLOX = 'yolox'
PPYOLOE = 'ppyoloe'
PPYOLOEP = 'ppyoloep'
YOLOV6 = 'yolov6'
YOLOV7 = 'yolov7'
RTMDET = 'rtmdet'
YOLOV8 = 'yolov8'
CLASS_NAMES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
CLASS_COLORS = [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230),
(106, 0, 228), (0, 60, 100), (0, 80, 100), (0, 0, 70),
(0, 0, 192), (250, 170, 30), (100, 170, 30), (220, 220, 0),
(175, 116, 175), (250, 0, 30), (165, 42, 42), (255, 77, 255),
(0, 226, 252), (182, 182, 255), (0, 82, 0), (120, 166, 157),
(110, 76, 0), (174, 57, 255), (199, 100, 0), (72, 0, 118),
(255, 179, 240), (0, 125, 92), (209, 0, 151), (188, 208, 182),
(0, 220, 176), (255, 99, 164), (92, 0, 73), (133, 129, 255),
(78, 180, 255), (0, 228, 0), (174, 255, 243), (45, 89, 255),
(134, 134, 103), (145, 148, 174), (255, 208, 186),
(197, 226, 255), (171, 134, 1), (109, 63, 54), (207, 138, 255),
(151, 0, 95), (9, 80, 61), (84, 105, 51), (74, 65, 105),
(166, 196, 102), (208, 195, 210), (255, 109, 65),
(0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0),
(227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161),
(163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120),
(183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133),
(166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62),
(65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45),
(196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1),
(246, 0, 122), (191, 162, 208)]
YOLOv5_ANCHORS = [[(10, 13), (16, 30), (33, 23)],
[(30, 61), (62, 45), (59, 119)],
[(116, 90), (156, 198), (373, 326)]]
YOLOv7_ANCHORS = [[(12, 16), (19, 36), (40, 28)],
[(36, 75), (76, 55), (72, 146)],
[(142, 110), (192, 243), (459, 401)]]
from typing import List, Tuple, Union
import cv2
from numpy import ndarray
MAJOR, MINOR = map(int, cv2.__version__.split('.')[:2])
assert MAJOR == 4, 'cv2_nms requires OpenCV 4.x'
def non_max_suppression(boxes: Union[List[ndarray], Tuple[ndarray]],
scores: Union[List[float], Tuple[float]],
labels: Union[List[int], Tuple[int]],
conf_thres: float = 0.25,
iou_thres: float = 0.65) -> Tuple[List, List, List]:
if MINOR >= 7:
indices = cv2.dnn.NMSBoxesBatched(boxes, scores, labels, conf_thres,
iou_thres)
elif MINOR == 6:
indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres, iou_thres)
else:
indices = cv2.dnn.NMSBoxes(boxes, scores, conf_thres,
iou_thres).flatten()
nmsd_boxes = []
nmsd_scores = []
nmsd_labels = []
for idx in indices:
box = boxes[idx]
# x0y0wh -> x0y0x1y1
box[2:] = box[:2] + box[2:]
score = scores[idx]
label = labels[idx]
nmsd_boxes.append(box)
nmsd_scores.append(score)
nmsd_labels.append(label)
return nmsd_boxes, nmsd_scores, nmsd_labels
import math
import sys
from argparse import ArgumentParser
from pathlib import Path
import cv2
import onnxruntime
from config import (CLASS_COLORS, CLASS_NAMES, ModelType, YOLOv5_ANCHORS,
YOLOv7_ANCHORS)
from cv2_nms import non_max_suppression
from numpy_coder import Decoder
from preprocess import Preprocess
from tqdm import tqdm
# Add this script's directory to sys.path so its sibling modules can be imported
sys.path.append(str(Path(__file__).resolve().parents[0]))
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif',
'.tiff', '.webp')
def path_to_list(path: str):
path = Path(path)
if path.is_file() and path.suffix in IMG_EXTENSIONS:
res_list = [str(path.absolute())]
elif path.is_dir():
res_list = [
str(p.absolute()) for p in path.iterdir()
if p.suffix in IMG_EXTENSIONS
]
else:
        raise RuntimeError(f'{path} is neither an image file nor a directory')
return res_list
def parse_args():
parser = ArgumentParser()
parser.add_argument(
'img', help='Image path, include image file, dir and URL.')
parser.add_argument('onnx', type=str, help='Onnx file')
parser.add_argument('--type', type=str, help='Model type')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument(
'--out-dir', default='./output', type=str, help='Path to output file')
parser.add_argument(
'--show', action='store_true', help='Show the detection results')
parser.add_argument(
'--score-thr', type=float, default=0.3, help='Bbox score threshold')
parser.add_argument(
'--iou-thr', type=float, default=0.7, help='Bbox iou threshold')
args = parser.parse_args()
return args
def main():
args = parse_args()
out_dir = Path(args.out_dir)
model_type = ModelType(args.type.lower())
if not args.show:
out_dir.mkdir(parents=True, exist_ok=True)
files = path_to_list(args.img)
session = onnxruntime.InferenceSession(
args.onnx, providers=['CPUExecutionProvider'])
preprocessor = Preprocess(model_type)
decoder = Decoder(model_type, model_only=True)
if model_type == ModelType.YOLOV5:
anchors = YOLOv5_ANCHORS
elif model_type == ModelType.YOLOV7:
anchors = YOLOv7_ANCHORS
else:
anchors = None
for file in tqdm(files):
image = cv2.imread(file)
image_h, image_w = image.shape[:2]
img, (ratio_w, ratio_h) = preprocessor(image, args.img_size)
features = session.run(None, {'images': img})
decoder_outputs = decoder(
features,
args.score_thr,
num_labels=len(CLASS_NAMES),
anchors=anchors)
nmsd_boxes, nmsd_scores, nmsd_labels = non_max_suppression(
*decoder_outputs, args.score_thr, args.iou_thr)
for box, score, label in zip(nmsd_boxes, nmsd_scores, nmsd_labels):
x0, y0, x1, y1 = box
x0 = math.floor(min(max(x0 / ratio_w, 1), image_w - 1))
y0 = math.floor(min(max(y0 / ratio_h, 1), image_h - 1))
x1 = math.ceil(min(max(x1 / ratio_w, 1), image_w - 1))
y1 = math.ceil(min(max(y1 / ratio_h, 1), image_h - 1))
cv2.rectangle(image, (x0, y0), (x1, y1), CLASS_COLORS[label], 2)
cv2.putText(image, f'{CLASS_NAMES[label]}: {score:.2f}',
(x0, y0 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
(0, 255, 255), 2)
if args.show:
cv2.imshow('result', image)
cv2.waitKey(0)
else:
cv2.imwrite(f'{out_dir / Path(file).name}', image)
if __name__ == '__main__':
main()
from typing import List, Tuple, Union
import numpy as np
from config import ModelType
from numpy import ndarray
def softmax(x: ndarray, axis: int = -1) -> ndarray:
e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
y = e_x / e_x.sum(axis=axis, keepdims=True)
return y
def sigmoid(x: ndarray) -> ndarray:
return 1. / (1. + np.exp(-x))
class Decoder:
def __init__(self, model_type: ModelType, model_only: bool = False):
self.model_type = model_type
self.model_only = model_only
self.boxes_pro = []
self.scores_pro = []
self.labels_pro = []
self.is_logging = False
def __call__(self,
feats: Union[List, Tuple],
conf_thres: float,
num_labels: int = 80,
**kwargs) -> Tuple:
if not self.is_logging:
            print('Decoder only supports batch size 1')
self.is_logging = True
self.boxes_pro.clear()
self.scores_pro.clear()
self.labels_pro.clear()
if self.model_only:
# transpose channel to last dim for easy decoding
feats = [
np.ascontiguousarray(feat[0].transpose(1, 2, 0))
for feat in feats
]
else:
# ax620a horizonX3 transpose channel to last dim by default
feats = [np.ascontiguousarray(feat) for feat in feats]
if self.model_type == ModelType.YOLOV5:
self.__yolov5_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOX:
self.__yolox_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type in (ModelType.PPYOLOE, ModelType.PPYOLOEP):
self.__ppyoloe_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV6:
self.__yolov6_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV7:
self.__yolov7_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.RTMDET:
self.__rtmdet_decode(feats, conf_thres, num_labels, **kwargs)
elif self.model_type == ModelType.YOLOV8:
self.__yolov8_decode(feats, conf_thres, num_labels, **kwargs)
else:
raise NotImplementedError
return self.boxes_pro, self.scores_pro, self.labels_pro
def __yolov5_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
anchors: Union[List, Tuple] = kwargs.get(
'anchors',
[[(10, 13), (16, 30),
(33, 23)], [(30, 61), (62, 45),
(59, 119)], [(116, 90), (156, 198), (373, 326)]])
for i, feat in enumerate(feats):
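            # feature maps are ordered P3, P4, P5 -> strides 8, 16, 32 (8 << i)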
stride = 8 << i
feat_h, feat_w, _ = feat.shape
anchor = anchors[i]
feat = sigmoid(feat)
feat = feat.reshape((feat_h, feat_w, len(anchor), -1))
box_feat, conf_feat, score_feat = np.split(feat, [4, 5], -1)
hIdx, wIdx, aIdx, _ = np.where(conf_feat > conf_thres)
num_proposal = hIdx.size
if not num_proposal:
continue
score_feat = score_feat[hIdx, wIdx, aIdx] * conf_feat[hIdx, wIdx,
aIdx]
boxes = box_feat[hIdx, wIdx, aIdx]
labels = score_feat.argmax(-1)
scores = score_feat.max(-1)
indices = np.where(scores > conf_thres)[0]
if len(indices) == 0:
continue
for idx in indices:
a_w, a_h = anchor[aIdx[idx]]
x, y, w, h = boxes[idx]
x = (x * 2.0 - 0.5 + wIdx[idx]) * stride
y = (y * 2.0 - 0.5 + hIdx[idx]) * stride
w = (w * 2.0)**2 * a_w
h = (h * 2.0)**2 * a_h
x0 = x - w / 2
y0 = y - h / 2
self.scores_pro.append(float(scores[idx]))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(labels[idx]))
def __yolox_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat, conf_feat = np.split(
feat, [num_labels, num_labels + 4], -1)
conf_feat = sigmoid(conf_feat)
hIdx, wIdx, _ = np.where(conf_feat > conf_thres)
num_proposal = hIdx.size
if not num_proposal:
continue
score_feat = sigmoid(score_feat[hIdx, wIdx]) * conf_feat[hIdx,
wIdx]
boxes = box_feat[hIdx, wIdx]
labels = score_feat.argmax(-1)
scores = score_feat.max(-1)
indices = np.where(scores > conf_thres)[0]
if len(indices) == 0:
continue
for idx in indices:
score = scores[idx]
label = labels[idx]
x, y, w, h = boxes[idx]
x = (x + wIdx[idx]) * stride
y = (y + hIdx[idx]) * stride
w = np.exp(w) * stride
h = np.exp(h) * stride
x0 = x - w / 2
y0 = y - h / 2
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __ppyoloe_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
reg_max: int = kwargs.get('reg_max', 17)
dfl = np.arange(0, reg_max, dtype=np.float32)
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx].reshape(num_proposal, 4, reg_max)
boxes = softmax(boxes, -1) @ dfl
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] + 0.5 - x0) * stride
y0 = (hIdx[k] + 0.5 - y0) * stride
x1 = (wIdx[k] + 0.5 + x1) * stride
y1 = (hIdx[k] + 0.5 + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov6_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx]
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] + 0.5 - x0) * stride
y0 = (hIdx[k] + 0.5 - y0) * stride
x1 = (wIdx[k] + 0.5 + x1) * stride
y1 = (hIdx[k] + 0.5 + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov7_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
anchors: Union[List, Tuple] = kwargs.get(
'anchors',
[[(12, 16), (19, 36),
(40, 28)], [(36, 75), (76, 55),
(72, 146)], [(142, 110), (192, 243), (459, 401)]])
self.__yolov5_decode(feats, conf_thres, num_labels, anchors=anchors)
def __rtmdet_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
for i, feat in enumerate(feats):
stride = 8 << i
score_feat, box_feat = np.split(feat, [
num_labels,
], -1)
score_feat = sigmoid(score_feat)
_argmax = score_feat.argmax(-1)
_max = score_feat.max(-1)
indices = np.where(_max > conf_thres)
hIdx, wIdx = indices
num_proposal = hIdx.size
if not num_proposal:
continue
scores = _max[hIdx, wIdx]
boxes = box_feat[hIdx, wIdx]
labels = _argmax[hIdx, wIdx]
for k in range(num_proposal):
score = scores[k]
label = labels[k]
x0, y0, x1, y1 = boxes[k]
x0 = (wIdx[k] - x0) * stride
y0 = (hIdx[k] - y0) * stride
x1 = (wIdx[k] + x1) * stride
y1 = (hIdx[k] + y1) * stride
w = x1 - x0
h = y1 - y0
self.scores_pro.append(float(score))
self.boxes_pro.append(
np.array([x0, y0, w, h], dtype=np.float32))
self.labels_pro.append(int(label))
def __yolov8_decode(self,
feats: List[ndarray],
conf_thres: float,
num_labels: int = 80,
**kwargs):
self.__yolov6_decode(feats, conf_thres, num_labels)
from typing import List, Tuple, Union
import cv2
import numpy as np
from config import ModelType
from numpy import ndarray
class Preprocess:
def __init__(self, model_type: ModelType):
if model_type in (ModelType.YOLOV5, ModelType.YOLOV6, ModelType.YOLOV7,
ModelType.YOLOV8):
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([255, 255, 255], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.YOLOX:
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([1, 1, 1], dtype=np.float32)
is_rgb = False
elif model_type == ModelType.PPYOLOE:
mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.PPYOLOEP:
mean = np.array([0, 0, 0], dtype=np.float32)
std = np.array([255, 255, 255], dtype=np.float32)
is_rgb = True
elif model_type == ModelType.RTMDET:
mean = np.array([103.53, 116.28, 123.675], dtype=np.float32)
            std = np.array([57.375, 57.12, 58.395], dtype=np.float32)
is_rgb = False
else:
raise NotImplementedError
self.mean = mean.reshape((3, 1, 1))
self.std = std.reshape((3, 1, 1))
self.is_rgb = is_rgb
def __call__(self,
image: ndarray,
new_size: Union[List[int], Tuple[int]] = (640, 640),
**kwargs) -> Tuple[ndarray, Tuple[float, float]]:
# new_size: (height, width)
height, width = image.shape[:2]
ratio_h, ratio_w = new_size[0] / height, new_size[1] / width
image = cv2.resize(
image, (0, 0),
fx=ratio_w,
fy=ratio_h,
interpolation=cv2.INTER_LINEAR)
image = np.ascontiguousarray(image.transpose(2, 0, 1))
image = image.astype(np.float32)
image -= self.mean
image /= self.std
return image[np.newaxis], (ratio_w, ratio_h)
onnxruntime
opencv-python==4.7.0.72
# Copyright (c) OpenMMLab. All rights reserved.
from .backend import MMYOLOBackend
from .backendwrapper import ORTWrapper, TRTWrapper
from .model import DeployModel
__all__ = ['DeployModel', 'TRTWrapper', 'ORTWrapper', 'MMYOLOBackend']
from enum import Enum
import torch
import torch.nn.functional as F
class MMYOLOBackend(Enum):
AX620A = 'ax620a'
COREML = 'coreml'
HORIZONX3 = 'horizonx3'
NCNN = 'ncnn'
ONNXRUNTIME = 'onnxruntime'
OPENVINO = 'openvino'
PPLNN = 'pplnn'
RKNN = 'rknn'
TENSORRT8 = 'tensorrt8'
TENSORRT7 = 'tensorrt7'
TORCHSCRIPT = 'torchscript'
TVM = 'tvm'
def HSigmoid__forward(self, x: torch.Tensor) -> torch.Tensor:
return F.hardsigmoid(x, inplace=True)
import warnings
from collections import namedtuple
from functools import partial
from pathlib import Path
from typing import List, Optional, Union
import numpy as np
import onnxruntime
try:
import tensorrt as trt
except Exception:
trt = None
import torch
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
class TRTWrapper(torch.nn.Module):
dtype_mapping = {}
def __init__(self, weight: Union[str, Path],
device: Optional[torch.device]):
super().__init__()
weight = Path(weight) if isinstance(weight, str) else weight
assert weight.exists() and weight.suffix in ('.engine', '.plan')
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.weight = weight
self.device = device
self.stream = torch.cuda.Stream(device=device)
self.__update_mapping()
self.__init_engine()
self.__init_bindings()
def __update_mapping(self):
self.dtype_mapping.update({
trt.bool: torch.bool,
trt.int8: torch.int8,
trt.int32: torch.int32,
trt.float16: torch.float16,
trt.float32: torch.float32
})
def __init_engine(self):
logger = trt.Logger(trt.Logger.ERROR)
self.log = partial(logger.log, trt.Logger.ERROR)
trt.init_libnvinfer_plugins(logger, namespace='')
self.logger = logger
with trt.Runtime(logger) as runtime:
model = runtime.deserialize_cuda_engine(self.weight.read_bytes())
context = model.create_execution_context()
names = [model.get_binding_name(i) for i in range(model.num_bindings)]
num_inputs, num_outputs = 0, 0
for i in range(model.num_bindings):
if model.binding_is_input(i):
num_inputs += 1
else:
num_outputs += 1
self.is_dynamic = -1 in model.get_binding_shape(0)
self.model = model
self.context = context
self.input_names = names[:num_inputs]
self.output_names = names[num_inputs:]
self.num_inputs = num_inputs
self.num_outputs = num_outputs
self.num_bindings = num_inputs + num_outputs
self.bindings: List[int] = [0] * self.num_bindings
def __init_bindings(self):
Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
inputs_info = []
outputs_info = []
for i, name in enumerate(self.input_names):
assert self.model.get_binding_name(i) == name
dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
shape = tuple(self.model.get_binding_shape(i))
inputs_info.append(Binding(name, dtype, shape))
for i, name in enumerate(self.output_names):
i += self.num_inputs
assert self.model.get_binding_name(i) == name
dtype = self.dtype_mapping[self.model.get_binding_dtype(i)]
shape = tuple(self.model.get_binding_shape(i))
outputs_info.append(Binding(name, dtype, shape))
self.inputs_info = inputs_info
self.outputs_info = outputs_info
if not self.is_dynamic:
self.output_tensor = [
torch.empty(o.shape, dtype=o.dtype, device=self.device)
for o in outputs_info
]
def forward(self, *inputs):
assert len(inputs) == self.num_inputs
contiguous_inputs: List[torch.Tensor] = [
i.contiguous() for i in inputs
]
for i in range(self.num_inputs):
self.bindings[i] = contiguous_inputs[i].data_ptr()
if self.is_dynamic:
self.context.set_binding_shape(
i, tuple(contiguous_inputs[i].shape))
# create output tensors
outputs: List[torch.Tensor] = []
for i in range(self.num_outputs):
j = i + self.num_inputs
if self.is_dynamic:
shape = tuple(self.context.get_binding_shape(j))
output = torch.empty(
size=shape,
                    dtype=self.outputs_info[i].dtype,
device=self.device)
else:
output = self.output_tensor[i]
outputs.append(output)
self.bindings[j] = output.data_ptr()
self.context.execute_async_v2(self.bindings, self.stream.cuda_stream)
self.stream.synchronize()
return tuple(outputs)
class ORTWrapper(torch.nn.Module):
def __init__(self, weight: Union[str, Path],
device: Optional[torch.device]):
super().__init__()
weight = Path(weight) if isinstance(weight, str) else weight
assert weight.exists() and weight.suffix == '.onnx'
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.weight = weight
self.device = device
self.__init_session()
self.__init_bindings()
def __init_session(self):
providers = ['CPUExecutionProvider']
if 'cuda' in self.device.type:
providers.insert(0, 'CUDAExecutionProvider')
session = onnxruntime.InferenceSession(
str(self.weight), providers=providers)
self.session = session
def __init_bindings(self):
Binding = namedtuple('Binding', ('name', 'dtype', 'shape'))
inputs_info = []
outputs_info = []
self.is_dynamic = False
for i, tensor in enumerate(self.session.get_inputs()):
if any(not isinstance(i, int) for i in tensor.shape):
self.is_dynamic = True
inputs_info.append(
Binding(tensor.name, tensor.type, tuple(tensor.shape)))
for i, tensor in enumerate(self.session.get_outputs()):
outputs_info.append(
Binding(tensor.name, tensor.type, tuple(tensor.shape)))
self.inputs_info = inputs_info
self.outputs_info = outputs_info
self.num_inputs = len(inputs_info)
def forward(self, *inputs):
assert len(inputs) == self.num_inputs
contiguous_inputs: List[np.ndarray] = [
i.contiguous().cpu().numpy() for i in inputs
]
if not self.is_dynamic:
# make sure input shape is right for static input shape
for i in range(self.num_inputs):
assert contiguous_inputs[i].shape == self.inputs_info[i].shape
outputs = self.session.run([o.name for o in self.outputs_info], {
j.name: contiguous_inputs[i]
for i, j in enumerate(self.inputs_info)
})
return tuple(torch.from_numpy(o).to(self.device) for o in outputs)
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from functools import partial
from typing import List, Optional, Tuple
import torch
import torch.nn as nn
from mmdet.models.backbones.csp_darknet import Focus
from mmdet.models.layers import ChannelAttention
from mmengine.config import ConfigDict
from torch import Tensor
from mmyolo.models import RepVGGBlock
from mmyolo.models.dense_heads import (PPYOLOEHead, RTMDetHead, YOLOv5Head,
YOLOv7Head, YOLOv8Head, YOLOXHead)
from mmyolo.models.layers import ImplicitA, ImplicitM
from ..backbone import DeployFocus, GConvFocus, NcnnFocus
from ..bbox_code import (rtmdet_bbox_decoder, yolov5_bbox_decoder,
yolox_bbox_decoder)
from ..nms import batched_nms, efficient_nms, onnx_nms
from .backend import MMYOLOBackend
class DeployModel(nn.Module):
transpose = False
def __init__(self,
baseModel: nn.Module,
backend: MMYOLOBackend,
postprocess_cfg: Optional[ConfigDict] = None,
with_nms=True,
without_bbox_decoder=False):
super().__init__()
self.baseModel = baseModel
self.baseHead = baseModel.bbox_head
self.backend = backend
self.with_nms = with_nms
self.without_bbox_decoder = without_bbox_decoder
if postprocess_cfg is None:
self.with_postprocess = False
else:
self.with_postprocess = True
self.__init_sub_attributes()
self.detector_type = type(self.baseHead)
self.pre_top_k = postprocess_cfg.get('pre_top_k', 1000)
self.keep_top_k = postprocess_cfg.get('keep_top_k', 100)
self.iou_threshold = postprocess_cfg.get('iou_threshold', 0.65)
self.score_threshold = postprocess_cfg.get('score_threshold', 0.25)
self.__switch_deploy()
def __init_sub_attributes(self):
self.bbox_decoder = self.baseHead.bbox_coder.decode
self.prior_generate = self.baseHead.prior_generator.grid_priors
self.num_base_priors = self.baseHead.num_base_priors
self.featmap_strides = self.baseHead.featmap_strides
self.num_classes = self.baseHead.num_classes
def __switch_deploy(self):
headType = type(self.baseHead)
if not self.with_postprocess:
if headType in (YOLOv5Head, YOLOv7Head):
self.baseHead.head_module.forward_single = self.forward_single
elif headType in (PPYOLOEHead, YOLOv8Head):
self.baseHead.head_module.reg_max = 0
if self.backend in (MMYOLOBackend.HORIZONX3, MMYOLOBackend.NCNN,
MMYOLOBackend.TORCHSCRIPT):
self.transpose = True
for layer in self.baseModel.modules():
if isinstance(layer, RepVGGBlock):
layer.switch_to_deploy()
elif isinstance(layer, ChannelAttention):
layer.global_avgpool.forward = self.forward_gvp
elif isinstance(layer, Focus):
# onnxruntime openvino tensorrt8 tensorrt7
if self.backend in (MMYOLOBackend.ONNXRUNTIME,
MMYOLOBackend.OPENVINO,
MMYOLOBackend.TENSORRT8,
MMYOLOBackend.TENSORRT7):
self.baseModel.backbone.stem = DeployFocus(layer)
# ncnn
elif self.backend == MMYOLOBackend.NCNN:
self.baseModel.backbone.stem = NcnnFocus(layer)
# switch focus to group conv
else:
self.baseModel.backbone.stem = GConvFocus(layer)
def pred_by_feat(self,
cls_scores: List[Tensor],
bbox_preds: List[Tensor],
objectnesses: Optional[List[Tensor]] = None,
coeff_preds: Optional[List[Tensor]] = None,
proto_preds: Optional[List[Tensor]] = None,
**kwargs):
assert len(cls_scores) == len(bbox_preds)
dtype = cls_scores[0].dtype
device = cls_scores[0].device
nms_func = self.select_nms()
if self.detector_type in (YOLOv5Head, YOLOv7Head):
bbox_decoder = yolov5_bbox_decoder
elif self.detector_type is RTMDetHead:
bbox_decoder = rtmdet_bbox_decoder
elif self.detector_type is YOLOXHead:
bbox_decoder = yolox_bbox_decoder
else:
bbox_decoder = self.bbox_decoder
num_imgs = cls_scores[0].shape[0]
featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores]
mlvl_priors = self.prior_generate(featmap_sizes,
dtype=dtype,
device=device)
flatten_priors = torch.cat(mlvl_priors)
mlvl_strides = [
flatten_priors.new_full(
(featmap_size[0] * featmap_size[1] * self.num_base_priors, ),
stride) for featmap_size, stride in zip(
featmap_sizes, self.featmap_strides)
]
flatten_stride = torch.cat(mlvl_strides)
text_len = cls_scores[0].shape[1]
flatten_cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, text_len)
for cls_score in cls_scores
]
cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid()
flatten_bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
for bbox_pred in bbox_preds
]
flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1)
if objectnesses is not None:
flatten_objectness = [
objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1)
for objectness in objectnesses
]
flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid()
cls_scores = cls_scores * (flatten_objectness.unsqueeze(-1))
scores = cls_scores
bboxes = flatten_bbox_preds
if self.without_bbox_decoder:
return scores, bboxes
bboxes = bbox_decoder(flatten_priors[None], flatten_bbox_preds,
flatten_stride)
if self.with_nms:
return nms_func(bboxes, scores, self.keep_top_k,
self.iou_threshold, self.score_threshold,
self.pre_top_k, self.keep_top_k)
else:
return scores, bboxes
def select_nms(self):
if self.backend in (MMYOLOBackend.ONNXRUNTIME, MMYOLOBackend.OPENVINO):
nms_func = onnx_nms
elif self.backend == MMYOLOBackend.TENSORRT8:
nms_func = efficient_nms
elif self.backend == MMYOLOBackend.TENSORRT7:
nms_func = batched_nms
else:
raise NotImplementedError
if type(self.baseHead) in (YOLOv5Head, YOLOv7Head, YOLOXHead):
nms_func = partial(nms_func, box_coding=1)
return nms_func
def forward(self, inputs: Tensor):
neck_outputs = self.baseModel(inputs)
if self.with_postprocess:
return self.pred_by_feat(*neck_outputs)
else:
outputs = []
if self.transpose:
for feats in zip(*neck_outputs):
if self.backend in (MMYOLOBackend.NCNN,
MMYOLOBackend.TORCHSCRIPT):
outputs.append(
torch.cat(
[feat.permute(0, 2, 3, 1) for feat in feats],
-1))
else:
outputs.append(torch.cat(feats, 1).permute(0, 2, 3, 1))
else:
for feats in zip(*neck_outputs):
outputs.append(torch.cat(feats, 1))
return tuple(outputs)
@staticmethod
def forward_single(x: Tensor, convs: nn.Module) -> Tuple[Tensor]:
if isinstance(convs, nn.Sequential) and any(
type(m) in (ImplicitA, ImplicitM) for m in convs):
a, c, m = convs
aw = a.implicit.clone()
mw = m.implicit.clone()
c = deepcopy(c)
nw, cw, _, _ = c.weight.shape
na, ca, _, _ = aw.shape
nm, cm, _, _ = mw.shape
c.bias = nn.Parameter(c.bias + (
c.weight.reshape(nw, cw) @ aw.reshape(ca, na)).squeeze(1))
c.bias = nn.Parameter(c.bias * mw.reshape(cm))
c.weight = nn.Parameter(c.weight * mw.transpose(0, 1))
convs = c
feat = convs(x)
return (feat, )
@staticmethod
def forward_gvp(x: Tensor) -> Tensor:
return torch.mean(x, [2, 3], keepdim=True)
# Copyright (c) OpenMMLab. All rights reserved.
from .ort_nms import onnx_nms
from .trt_nms import batched_nms, efficient_nms
__all__ = ['efficient_nms', 'batched_nms', 'onnx_nms']
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor
from torchvision.ops import batched_nms
_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0],
[-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]],
dtype=torch.float32)
def sort_nms_index(nms_index, scores, batch_size, keep_top_k=-1):
"""
first sort the nms_index by batch, and then sort by score in every image result, final apply keep_top_k strategy. In the process, we can also get the number of detections for each image: num_dets
"""
# first sort by batch index to make sure that the same batch index is together
device = nms_index.device
nms_index_indices = torch.argsort(nms_index[:, 0], dim=0).to(device)
nms_index = nms_index[nms_index_indices]
scores = scores[nms_index[:, 0], nms_index[:, 1], nms_index[:, 2]]
batch_inds = nms_index[:, 0]
# Get the number of detections for each image
    num_dets = torch.bincount(batch_inds, minlength=batch_size).to(device)
# Calculate the sum from front to back
cumulative_sum = torch.cumsum(num_dets, dim=0).to(device)
# add initial value 0
cumulative_sum = torch.cat((torch.tensor([0]).to(device), cumulative_sum))
for i in range(len(num_dets)):
start = cumulative_sum[i]
end = cumulative_sum[i + 1]
# sort by score in every batch
block_idx = torch.argsort(scores[start:end], descending=True).to(device)
nms_index[start:end] = nms_index[start:end][block_idx]
if keep_top_k > 0 and end - start > keep_top_k:
# delete lines from start+keep_top_k to end to keep only top k
nms_index = torch.cat(
(nms_index[: start + keep_top_k], nms_index[end:]), dim=0
)
num_dets[i] -= end - start - keep_top_k
cumulative_sum -= end - start - keep_top_k
return nms_index, num_dets
def select_nms_index(
scores: Tensor,
boxes: Tensor,
nms_index: Tensor,
batch_size: int,
keep_top_k: int = -1,
):
if nms_index.numel() == 0:
return torch.empty(0), torch.empty(0, 4), torch.empty(0), torch.empty(0)
nms_index, num_dets = sort_nms_index(nms_index, scores, batch_size, keep_top_k)
batch_inds, cls_inds = nms_index[:, 0], nms_index[:, 1]
box_inds = nms_index[:, 2]
# according to the nms_index to get the scores,boxes and labels
batched_scores = scores[batch_inds, cls_inds, box_inds]
batched_dets = boxes[batch_inds, box_inds, ...]
batched_labels = cls_inds
return num_dets, batched_dets, batched_scores, batched_labels
def construct_indice(batch_idx, select_bbox_idxs, class_idxs, original_idxs):
num_bbox = len(select_bbox_idxs)
class_idxs = class_idxs[select_bbox_idxs]
indice = torch.zeros((num_bbox, 3), dtype=torch.int32).to(select_bbox_idxs.device)
# batch_idx
indice[:, 0] = batch_idx
# class_idxs
indice[:, 1] = class_idxs
# select_bbox_idxs
indice[:, 2] = original_idxs[select_bbox_idxs]
return indice
def filter_max_boxes_per_class(
select_bbox_idxs, class_idxs, max_output_boxes_per_class
):
class_counts = {} # used to track the count of each class
filtered_select_bbox_idxs = []
filtered_max_class_idxs = []
for bbox_idx, class_idx in zip(select_bbox_idxs, class_idxs):
class_count = class_counts.get(
class_idx.item(), 0
) # Get the count of the current class, or return 0 if it does not exist
if class_count < max_output_boxes_per_class:
filtered_select_bbox_idxs.append(bbox_idx)
filtered_max_class_idxs.append(class_idx)
class_counts[class_idx.item()] = class_count + 1
return torch.tensor(filtered_select_bbox_idxs), torch.tensor(
filtered_max_class_idxs
)
class ONNXNMSop(torch.autograd.Function):
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: Tensor = torch.tensor([100]),
iou_threshold: Tensor = torch.tensor([0.5]),
score_threshold: Tensor = torch.tensor([0.05])
) -> Tensor:
"""
Non-Maximum Suppression (NMS) implementation.
Args:
boxes (Tensor): Bounding boxes of shape (batch_size, num_boxes, 4).
scores (Tensor): Confidence scores of shape (batch_size, num_classes, num_boxes).
max_output_boxes_per_class (Tensor): Maximum number of output boxes per class.
iou_threshold (Tensor): IoU threshold for NMS.
score_threshold (Tensor): Confidence score threshold.
Returns:
            Tensor: Selected indices of shape (num_det, 3). Each row holds
                (batch index, class index, box index).
"""
device = boxes.device
batch_size, num_classes, num_boxes = scores.shape
selected_indices = []
for batch_idx in range(batch_size):
boxes_per_image = boxes[batch_idx]
scores_per_image = scores[batch_idx]
# If no boxes in this image, continue to the next image
if boxes_per_image.numel() == 0:
continue
# for one box, only exist one class,so use torch.max to get the max score and class index
scores_per_image, class_idxs = torch.max(scores_per_image, dim=0)
            # Apply the score threshold before batched_nms because NMS is expensive
keep_idxs = scores_per_image > score_threshold
if not torch.any(keep_idxs):
# If no boxes left after applying score threshold, continue to the next image
continue
boxes_per_image = boxes_per_image[keep_idxs]
scores_per_image = scores_per_image[keep_idxs]
class_idxs = class_idxs[keep_idxs]
            # Keep original_idxs so the returned indices refer to the
            # original input boxes rather than the filtered subset.
original_idxs = torch.arange(num_boxes, device=device)[keep_idxs]
# reference: https://pytorch.org/vision/main/generated/torchvision.ops.batched_nms.html
select_bbox_idxs = batched_nms(
boxes_per_image, scores_per_image, class_idxs, iou_threshold
)
            # Filtering is only needed when the boxes kept across all classes
            # exceed max_output_boxes_per_class.
            if select_bbox_idxs.shape[0] > max_output_boxes_per_class:
select_bbox_idxs, _ = filter_max_boxes_per_class(
select_bbox_idxs,
class_idxs[select_bbox_idxs],
max_output_boxes_per_class,
)
selected_indice = construct_indice(
batch_idx, select_bbox_idxs, class_idxs, original_idxs
)
selected_indices.append(selected_indice)
if len(selected_indices) == 0:
return torch.tensor([], device=device)
selected_indices = torch.cat(selected_indices, dim=0)
return selected_indices
@staticmethod
def symbolic(
g,
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: Tensor = torch.tensor([100]),
iou_threshold: Tensor = torch.tensor([0.5]),
score_threshold: Tensor = torch.tensor([0.05]),
):
return g.op(
'NonMaxSuppression',
boxes,
scores,
max_output_boxes_per_class,
iou_threshold,
score_threshold,
outputs=1)
def onnx_nms(
boxes: torch.Tensor,
scores: torch.Tensor,
max_output_boxes_per_class: int = 100,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
max_output_boxes_per_class = torch.tensor([max_output_boxes_per_class])
iou_threshold = torch.tensor([iou_threshold]).to(boxes.device)
score_threshold = torch.tensor([score_threshold]).to(boxes.device)
batch_size, _, _ = scores.shape
if box_coding == 1:
boxes = boxes @ (_XYWH2XYXY.to(boxes.device))
scores = scores.transpose(1, 2).contiguous()
selected_indices = ONNXNMSop.apply(boxes, scores,
max_output_boxes_per_class,
iou_threshold, score_threshold)
num_dets, batched_dets, batched_scores, batched_labels = select_nms_index(
scores, boxes, selected_indices, batch_size, keep_top_k=keep_top_k)
return num_dets, batched_dets, batched_scores, batched_labels.to(
torch.int32)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor
_XYWH2XYXY = torch.tensor([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0],
[-0.5, 0.0, 0.5, 0.0], [0.0, -0.5, 0.0, 0.5]],
dtype=torch.float32)
class TRTEfficientNMSop(torch.autograd.Function):
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
background_class: int = -1,
box_coding: int = 0,
iou_threshold: float = 0.45,
max_output_boxes: int = 100,
plugin_version: str = '1',
score_activation: int = 0,
score_threshold: float = 0.25,
):
batch_size, _, num_classes = scores.shape
num_det = torch.randint(
0, max_output_boxes, (batch_size, 1), dtype=torch.int32)
det_boxes = torch.randn(batch_size, max_output_boxes, 4)
det_scores = torch.randn(batch_size, max_output_boxes)
det_classes = torch.randint(
0, num_classes, (batch_size, max_output_boxes), dtype=torch.int32)
return num_det, det_boxes, det_scores, det_classes
@staticmethod
def symbolic(g,
boxes: Tensor,
scores: Tensor,
background_class: int = -1,
box_coding: int = 0,
iou_threshold: float = 0.45,
max_output_boxes: int = 100,
plugin_version: str = '1',
score_activation: int = 0,
score_threshold: float = 0.25):
out = g.op(
'TRT::EfficientNMS_TRT',
boxes,
scores,
background_class_i=background_class,
box_coding_i=box_coding,
iou_threshold_f=iou_threshold,
max_output_boxes_i=max_output_boxes,
plugin_version_s=plugin_version,
score_activation_i=score_activation,
score_threshold_f=score_threshold,
outputs=4)
num_det, det_boxes, det_scores, det_classes = out
return num_det, det_boxes, det_scores, det_classes
class TRTbatchedNMSop(torch.autograd.Function):
"""TensorRT NMS operation."""
@staticmethod
def forward(
ctx,
boxes: Tensor,
scores: Tensor,
plugin_version: str = '1',
shareLocation: int = 1,
backgroundLabelId: int = -1,
numClasses: int = 80,
topK: int = 1000,
keepTopK: int = 100,
scoreThreshold: float = 0.25,
iouThreshold: float = 0.45,
isNormalized: int = 0,
clipBoxes: int = 0,
scoreBits: int = 16,
caffeSemantics: int = 1,
):
batch_size, _, numClasses = scores.shape
num_det = torch.randint(
0, keepTopK, (batch_size, 1), dtype=torch.int32)
det_boxes = torch.randn(batch_size, keepTopK, 4)
det_scores = torch.randn(batch_size, keepTopK)
det_classes = torch.randint(0, numClasses,
(batch_size, keepTopK)).float()
return num_det, det_boxes, det_scores, det_classes
@staticmethod
def symbolic(
g,
boxes: Tensor,
scores: Tensor,
plugin_version: str = '1',
shareLocation: int = 1,
backgroundLabelId: int = -1,
numClasses: int = 80,
topK: int = 1000,
keepTopK: int = 100,
scoreThreshold: float = 0.25,
iouThreshold: float = 0.45,
isNormalized: int = 0,
clipBoxes: int = 0,
scoreBits: int = 16,
caffeSemantics: int = 1,
):
out = g.op(
'TRT::BatchedNMSDynamic_TRT',
boxes,
scores,
shareLocation_i=shareLocation,
plugin_version_s=plugin_version,
backgroundLabelId_i=backgroundLabelId,
numClasses_i=numClasses,
topK_i=topK,
keepTopK_i=keepTopK,
scoreThreshold_f=scoreThreshold,
iouThreshold_f=iouThreshold,
isNormalized_i=isNormalized,
clipBoxes_i=clipBoxes,
scoreBits_i=scoreBits,
caffeSemantics_i=caffeSemantics,
outputs=4)
num_det, det_boxes, det_scores, det_classes = out
return num_det, det_boxes, det_scores, det_classes
def _efficient_nms(
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: int = 1000,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
"""Wrapper for `efficient_nms` with TensorRT.
Args:
boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
scores (Tensor): The detection scores of shape
[N, num_boxes, num_classes].
max_output_boxes_per_class (int): Maximum number of output
boxes per class of nms. Defaults to 1000.
iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
score_threshold (float): score threshold of nms.
Defaults to 0.05.
pre_top_k (int): Number of top K boxes to keep before nms.
Defaults to -1.
keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to 100.
box_coding (int): Bounding boxes format for nms.
Defaults to 0 means [x1, y1 ,x2, y2].
Set to 1 means [x, y, w, h].
Returns:
tuple[Tensor, Tensor, Tensor, Tensor]:
(num_det, det_boxes, det_scores, det_classes),
`num_det` of shape [N, 1]
`det_boxes` of shape [N, num_det, 4]
`det_scores` of shape [N, num_det]
`det_classes` of shape [N, num_det]
"""
num_det, det_boxes, det_scores, det_classes = TRTEfficientNMSop.apply(
boxes, scores, -1, box_coding, iou_threshold, keep_top_k, '1', 0,
score_threshold)
return num_det, det_boxes, det_scores, det_classes
def _batched_nms(
boxes: Tensor,
scores: Tensor,
max_output_boxes_per_class: int = 1000,
iou_threshold: float = 0.5,
score_threshold: float = 0.05,
pre_top_k: int = -1,
keep_top_k: int = 100,
box_coding: int = 0,
):
"""Wrapper for `efficient_nms` with TensorRT.
Args:
boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4].
scores (Tensor): The detection scores of shape
[N, num_boxes, num_classes].
max_output_boxes_per_class (int): Maximum number of output
boxes per class of nms. Defaults to 1000.
iou_threshold (float): IOU threshold of nms. Defaults to 0.5.
score_threshold (float): score threshold of nms.
Defaults to 0.05.
pre_top_k (int): Number of top K boxes to keep before nms.
Defaults to -1.
keep_top_k (int): Number of top K boxes to keep after nms.
            Defaults to 100.
box_coding (int): Bounding boxes format for nms.
Defaults to 0 means [x1, y1 ,x2, y2].
Set to 1 means [x, y, w, h].
Returns:
tuple[Tensor, Tensor, Tensor, Tensor]:
(num_det, det_boxes, det_scores, det_classes),
`num_det` of shape [N, 1]
`det_boxes` of shape [N, num_det, 4]
`det_scores` of shape [N, num_det]
`det_classes` of shape [N, num_det]
"""
if box_coding == 1:
boxes = boxes @ (_XYWH2XYXY.to(boxes.device))
boxes = boxes if boxes.dim() == 4 else boxes.unsqueeze(2)
_, _, numClasses = scores.shape
num_det, det_boxes, det_scores, det_classes = TRTbatchedNMSop.apply(
boxes, scores, '1', 1, -1, int(numClasses), min(pre_top_k, 4096),
keep_top_k, score_threshold, iou_threshold, 0, 0, 16, 1)
det_classes = det_classes.int()
return num_det, det_boxes, det_scores, det_classes
def efficient_nms(*args, **kwargs):
"""Wrapper function for `_efficient_nms`."""
return _efficient_nms(*args, **kwargs)
def batched_nms(*args, **kwargs):
"""Wrapper function for `_batched_nms`."""
return _batched_nms(*args, **kwargs)
import argparse
import ast
from pathlib import Path
from typing import List, Optional, Tuple, Union
try:
import tensorrt as trt
except Exception:
trt = None
import warnings
import numpy as np
import torch
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
class EngineBuilder:
def __init__(
self,
checkpoint: Union[str, Path],
opt_shape: Union[Tuple, List] = (1, 3, 640, 640),
device: Optional[Union[str, int, torch.device]] = None) -> None:
checkpoint = Path(checkpoint) if isinstance(checkpoint,
str) else checkpoint
assert checkpoint.exists() and checkpoint.suffix == '.onnx'
if isinstance(device, str):
device = torch.device(device)
elif isinstance(device, int):
device = torch.device(f'cuda:{device}')
self.checkpoint = checkpoint
self.opt_shape = np.array(opt_shape, dtype=np.float32)
self.device = device
def __build_engine(self,
scale: Optional[List[List]] = None,
fp16: bool = True,
with_profiling: bool = True) -> None:
logger = trt.Logger(trt.Logger.WARNING)
trt.init_libnvinfer_plugins(logger, namespace='')
builder = trt.Builder(logger)
config = builder.create_builder_config()
config.max_workspace_size = torch.cuda.get_device_properties(
self.device).total_memory
flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
network = builder.create_network(flag)
parser = trt.OnnxParser(network, logger)
if not parser.parse_from_file(str(self.checkpoint)):
raise RuntimeError(
f'failed to load ONNX file: {str(self.checkpoint)}')
inputs = [network.get_input(i) for i in range(network.num_inputs)]
outputs = [network.get_output(i) for i in range(network.num_outputs)]
profile = None
dshape = -1 in network.get_input(0).shape
if dshape:
profile = builder.create_optimization_profile()
if scale is None:
scale = np.array(
[[1, 1, 0.5, 0.5], [1, 1, 1, 1], [4, 1, 1.5, 1.5]],
dtype=np.float32)
scale = (self.opt_shape * scale).astype(np.int32)
elif isinstance(scale, List):
scale = np.array(scale, dtype=np.int32)
assert scale.shape[0] == 3, 'Input a wrong scale list'
else:
raise NotImplementedError
for inp in inputs:
logger.log(
trt.Logger.WARNING,
f'input "{inp.name}" with shape{inp.shape} {inp.dtype}')
if dshape:
profile.set_shape(inp.name, *scale)
for out in outputs:
logger.log(
trt.Logger.WARNING,
f'output "{out.name}" with shape{out.shape} {out.dtype}')
if fp16 and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
self.weight = self.checkpoint.with_suffix('.engine')
if dshape:
config.add_optimization_profile(profile)
if with_profiling:
config.profiling_verbosity = trt.ProfilingVerbosity.DETAILED
with builder.build_engine(network, config) as engine:
self.weight.write_bytes(engine.serialize())
logger.log(
            trt.Logger.WARNING, 'Finished building the TensorRT engine.\n'
            f'Saved to {str(self.weight.absolute())}')
def build(self,
scale: Optional[List[List]] = None,
fp16: bool = True,
with_profiling=True):
self.__build_engine(scale, fp16, with_profiling)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--img-size',
nargs='+',
type=int,
default=[640, 640],
help='Image size of height and width')
parser.add_argument(
'--device', type=str, default='cuda:0', help='TensorRT builder device')
parser.add_argument(
'--scales',
type=str,
default='[[1,3,640,640],[1,3,640,640],[1,3,640,640]]',
help='Input scales for build dynamic input shape engine')
parser.add_argument(
'--fp16', action='store_true', help='Build model with fp16 mode')
args = parser.parse_args()
args.img_size *= 2 if len(args.img_size) == 1 else 1
return args
def main(args):
img_size = (1, 3, *args.img_size)
try:
        scales = ast.literal_eval(args.scales)
except Exception:
        print('Input scales is not a valid Python literal')
        print('Falling back to the default scales (None)')
scales = None
builder = EngineBuilder(args.checkpoint, img_size, args.device)
builder.build(scales, fp16=args.fp16)
if __name__ == '__main__':
args = parse_args()
main(args)