Commit c6a27e0b authored by panhb's avatar panhb
Browse files

first init

parent e4b993b1
Pipeline #2192 canceled with stages
use_gpu: true
use_xpu: false
use_mlu: false
use_npu: false
log_iter: 20
save_dir: output
snapshot_epoch: 1
print_flops: false
print_params: false
# Exporting the model
export:
post_process: True # Whether post-processing is included in the network when export model.
nms: True # Whether NMS is included in the network when export model.
benchmark: False # It is used to testing model performance, if set `True`, post-process and NMS will not be exported.
fuse_conv_bn: False
# Vision Transformer Detection
## Introduction
- [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
- [Benchmarking Detection Transfer Learning with Vision Transformers](https://arxiv.org/pdf/2111.11429.pdf)
Object detection is a central downstream task used to
test if pre-trained network parameters confer benefits, such
as improved accuracy or training speed. The complexity
of object detection methods can make this benchmarking
non-trivial when new architectures, such as Vision Transformer (ViT) models, arrive.
## Model Zoo
| Model | Backbone | Pretrained | Scheduler | Images/GPU | Box AP | Mask AP | Config | Download |
|:------:|:--------:|:--------------:|:--------------:|:--------------:|:--------------:|:------:|:------:|:--------:|
| PP-YOLOE | ViT-base | CAE | 36e | 2 | 52.2 | - | [config](./ppyoloe_vit_base_csppan_cae_36e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_vit_base_csppan_cae_36e_coco.pdparams) |
**Notes:**
- Model is trained on COCO train2017 dataset and evaluated on val2017 results of `mAP(IoU=0.5:0.95)
- Base model is trained on 8x32G V100 GPU, large model on 8x80G A100
## Citations
```
@article{chen2022context,
title={Context autoencoder for self-supervised representation learning},
author={Chen, Xiaokang and Ding, Mingyu and Wang, Xiaodi and Xin, Ying and Mo, Shentong and Wang, Yunhao and Han, Shumin and Luo, Ping and Zeng, Gang and Wang, Jingdong},
journal={arXiv preprint arXiv:2202.03026},
year={2022}
}
@article{DBLP:journals/corr/abs-2111-11429,
author = {Yanghao Li and
Saining Xie and
Xinlei Chen and
Piotr Doll{\'{a}}r and
Kaiming He and
Ross B. Girshick},
title = {Benchmarking Detection Transfer Learning with Vision Transformers},
journal = {CoRR},
volume = {abs/2111.11429},
year = {2021},
url = {https://arxiv.org/abs/2111.11429},
eprinttype = {arXiv},
eprint = {2111.11429},
timestamp = {Fri, 26 Nov 2021 13:48:43 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2111-11429.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Cai_2019,
title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation},
ISSN={1939-3539},
url={http://dx.doi.org/10.1109/tpami.2019.2956516},
DOI={10.1109/tpami.2019.2956516},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
author={Cai, Zhaowei and Vasconcelos, Nuno},
year={2019},
pages={1–1}
}
```
epoch: 36
LearningRate:
base_lr: 0.0001
schedulers:
- !CosineDecay
max_epochs: 36
min_lr_ratio: 0.1
- !LinearWarmup
start_factor: 0.001
epochs: 1
OptimizerBuilder:
clip_grad_by_norm: 0.1
regularizer: false
optimizer:
type: AdamW
weight_decay: 0.0001
worker_num: 4
eval_height: &eval_height 640
eval_width: &eval_width 640
eval_size: &eval_size [*eval_height, *eval_width]
TrainReader:
sample_transforms:
- Decode: {}
- RandomDistort: {}
- RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
- RandomCrop: {}
- RandomFlip: {}
batch_transforms:
- BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768], random_size: True, random_interp: True, keep_ratio: False}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
- PadGT: {}
batch_size: 2
shuffle: true
drop_last: true
use_shared_memory: true
collate_batch: true
EvalReader:
sample_transforms:
- Decode: {}
- Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 2
TestReader:
inputs_def:
image_shape: [3, *eval_height, *eval_width]
sample_transforms:
- Decode: {}
- Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
_BASE_: [
'../datasets/coco_detection.yml',
'../runtime.yml',
'./_base_/ppyoloe_reader.yml',
'./_base_/optimizer_base_36e.yml'
]
weights: output/ppyoloe_vit_base_csppan_cae_36e_coco/model_final
snapshot_epoch: 2
log_iter: 100
use_ema: true
ema_decay: 0.9999
ema_skip_names: ['yolo_head.proj_conv.weight', 'backbone.pos_embed']
custom_black_list: ['reduce_mean']
use_fused_allreduce_gradients: &use_checkpoint False
architecture: YOLOv3
norm_type: sync_bn
YOLOv3:
backbone: VisionTransformer
neck: YOLOCSPPAN
yolo_head: PPYOLOEHead
post_process: ~
VisionTransformer:
patch_size: 16
embed_dim: 768
depth: 12
num_heads: 12
mlp_ratio: 4
qkv_bias: True
drop_rate: 0.0
drop_path_rate: 0.2
init_values: 0.1
final_norm: False
use_rel_pos_bias: False
use_sincos_pos_emb: True
epsilon: 0.000001 # 1e-6
out_indices: [11, ]
with_fpn: True
num_fpn_levels: 3
out_with_norm: False
use_checkpoint: *use_checkpoint
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
YOLOCSPPAN:
in_channels: [768, 768, 768]
act: 'silu'
PPYOLOEHead:
fpn_strides: [8, 16, 32]
in_channels: [768, 768, 768]
static_assigner_epoch: -1
grid_cell_scale: 5.0
grid_cell_offset: 0.5
use_varifocal_loss: True
loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5}
static_assigner:
name: ATSSAssigner
topk: 9
assigner:
name: TaskAlignedAssigner
topk: 13
alpha: 1.0
beta: 6.0
nms:
name: MultiClassNMS
nms_top_k: 1000
keep_top_k: 300
score_threshold: 0.01
nms_threshold: 0.7
# YOLO on VOC
## 模型库
| 网络模型 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP(0.50,11point) | Params(M) | FLOPs(G) | 下载链接 | 配置文件 |
| :-----------: | :-------: | :-------: | :------: | :------------: | :---------------: | :------------------: |:-----------------: | :------: | :------: |
| YOLOv5-s | 640 | 16 | 60e | 3.2 | 80.3 | 7.24 | 16.54 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov5_s_60e_voc.pdparams) | [配置文件](./yolov5_s_60e_voc.yml) |
| YOLOv7-tiny | 640 | 32 | 60e | 2.6 | 80.2 | 6.23 | 6.90 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov7_tiny_60e_voc.pdparams) | [配置文件](./yolov7_tiny_60e_voc.yml) |
| YOLOX-s | 640 | 8 | 40e | 3.0 | 82.9 | 9.0 | 26.8 | [下载链接](https://paddledet.bj.bcebos.com/models/yolox_s_40e_voc.pdparams) | [配置文件](./yolox_s_40e_voc.yml) |
| PP-YOLOE+_s | 640 | 8 | 30e | 2.9 | 86.7 | 7.93 | 17.36 | [下载链接](https://paddledet.bj.bcebos.com/models/ppyoloe_plus_crn_s_30e_voc.pdparams) | [配置文件](./ppyoloe_plus_crn_s_30e_voc.yml) |
**注意:**
- 所有YOLO模型均使用VOC数据集训练,mAP为`mAP(IoU=0.5)`的结果,且评估未使用`multi_label`等trick;
- 所有YOLO模型均加载各自模型的COCO权重作为预训练,各个配置文件的配置均为默认使用8卡GPU,可作为自定义数据集设置参考,具体精度会因数据集而异;
- YOLO检测模型建议**总`batch_size`至少大于`64`**去训练,如果资源不够请**换小模型****减小模型的输入尺度**,为了保障较高检测精度,**尽量不要尝试单卡训和总`batch_size`小于`64`训**
- Params(M)和FLOPs(G)均为训练时所测,YOLOv7没有s模型,故选用tiny模型;
- TRT-FP16-Latency(ms)测速相关请查看各YOLO模型的config的主页;
## 使用教程
### 下载数据集:
下载PaddleDetection团队整理的VOC数据,并放置于`PaddleDetection/dataset/voc`
```
wget https://bj.bcebos.com/v1/paddledet/data/voc.zip
```
### 训练评估预测:
```
model_name=voc
job_name=ppyoloe_plus_crn_s_30e_voc # 可修改,如 yolov7_tiny_60e_voc
config=configs/${model_name}/${job_name}.yml
log_dir=log_dir/${job_name}
# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams
weights=output/${job_name}/model_final.pdparams
# 1.训练(单卡/多卡)
# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp
python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp
# 2.评估
CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise
# 3.预测
CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5
```
_BASE_: [
'../ppyoloe/ppyoloe_plus_crn_s_80e_coco.yml',
'../datasets/voc.yml',
]
log_iter: 50
snapshot_epoch: 5
weights: output/ppyoloe_plus_crn_s_30e_voc/model_final
pretrain_weights: https://bj.bcebos.com/v1/paddledet/models/ppyoloe_plus_crn_s_80e_coco.pdparams
depth_mult: 0.33
width_mult: 0.50
TrainReader:
batch_size: 8 # default 8 gpus, total bs = 64
EvalReader:
batch_size: 4
epoch: 30
LearningRate:
base_lr: 0.001
schedulers:
- !CosineDecay
max_epochs: 36
- !LinearWarmup
start_factor: 0.
epochs: 1
PPYOLOEHead:
static_assigner_epoch: -1
nms:
name: MultiClassNMS
nms_top_k: 1000
keep_top_k: 300
score_threshold: 0.01
nms_threshold: 0.7
_BASE_: [
'../yolov5/yolov5_s_300e_coco.yml',
'../datasets/voc.yml',
]
log_iter: 50
snapshot_epoch: 5
weights: output/yolov5_s_60e_voc/model_final
pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov5_s_300e_coco.pdparams
depth_mult: 0.33
width_mult: 0.50
TrainReader:
batch_size: 16 # default 8 gpus, total bs = 128
EvalReader:
batch_size: 4
epoch: 60
LearningRate:
base_lr: 0.001
schedulers:
- !YOLOv5LRDecay
max_epochs: 60
min_lr_ratio: 0.01
- !ExpWarmup
epochs: 1
_BASE_: [
'../yolov7/yolov7_tiny_300e_coco.yml',
'../datasets/voc.yml',
]
log_iter: 50
snapshot_epoch: 5
weights: output/yolov7_tiny_60e_voc/model_final
pretrain_weights: https://paddledet.bj.bcebos.com/models/yolov7_tiny_300e_coco.pdparams
arch: tiny
act: LeakyReLU
TrainReader:
batch_size: 32 # default 8 gpus, total bs = 256
EvalReader:
batch_size: 4
epoch: 60
LearningRate:
base_lr: 0.001
schedulers:
- !YOLOv5LRDecay
max_epochs: 60
min_lr_ratio: 0.1
- !ExpWarmup
epochs: 1
_BASE_: [
'../yolox/yolox_s_300e_coco.yml',
'../datasets/voc.yml',
]
log_iter: 50
snapshot_epoch: 5
weights: output/yolox_s_40e_voc/model_final
pretrain_weights: https://paddledet.bj.bcebos.com/models/yolox_s_300e_coco.pdparams
depth_mult: 0.33
width_mult: 0.50
TrainReader:
batch_size: 8 # default 8 gpus, total bs = 64
EvalReader:
batch_size: 4
epoch: 40
LearningRate:
base_lr: 0.001
schedulers:
- !CosineDecay
max_epochs: 40
min_lr_ratio: 0.05
last_plateau_epochs: 4
- !ExpWarmup
epochs: 1
# YOLOv8
## 内容
- [模型库](#模型库)
- [使用教程](#使用教程)
- [FastDeploy多硬件快速部署](#FastDeploy多硬件快速部署)
- [引用](#引用)
## 模型库
### 基础检测模型
| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP<sup>val<br>0.5:0.95 | mAP<sup>val<br>0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 |
| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: |
| *YOLOv8-n | 640 | 16 | 500e | 1.8 | 37.3 | 53.0 | 3.16 | 8.7 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_n_500e_coco.pdparams) | [配置文件](./yolov8_n_500e_coco.yml) |
| *YOLOv8-s | 640 | 16 | 500e | 3.4 | 44.9 | 61.8 | 11.17 | 28.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams) | [配置文件](./yolov8_s_500e_coco.yml) |
| *YOLOv8-m | 640 | 16 | 500e | 6.5 | 50.2 | 67.3 | 25.90 | 78.9 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_m_500e_coco.pdparams) | [配置文件](./yolov8_m_500e_coco.yml) |
| *YOLOv8-l | 640 | 16 | 500e | 10.0 | 52.8 | 69.6 | 43.69 | 165.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_l_500e_coco.pdparams) | [配置文件](./yolov8_l_500e_coco.yml) |
| *YOLOv8-x | 640 | 16 | 500e | 15.1 | 53.8 | 70.6 | 68.23 | 257.8 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_x_500e_coco.pdparams) | [配置文件](./yolov8_x_500e_coco.yml) |
### Open Images v7 大规模600类检测模型
| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP<sup>val<br>0.5:0.95 | mAP<sup>val<br>0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 |
| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: |
| *YOLOv8-n | 640 | 16 | 100e | 1.8 | - | - | 3.16 | 8.7 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_n_100e_oiv7.pdparams) | [配置文件](openimagev7/yolov8_n_100e_oiv7.yml) |
| *YOLOv8-s | 640 | 16 | 100e | 3.4 | - | - | 11.17 | 28.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_s_100e_oiv7.pdparams) | [配置文件](openimagev7/yolov8_s_100e_oiv7.yml) |
| *YOLOv8-m | 640 | 16 | 100e | 6.5 | - | - | 25.90 | 78.9 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_m_100e_oiv7.pdparams) | [配置文件](openimagev7/yolov8_m_100e_oiv7.yml) |
| *YOLOv8-l | 640 | 16 | 100e | 10.0 | - | - | 43.69 | 165.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_l_100e_oiv7.pdparams) | [配置文件](openimagev7/yolov8_l_100e_oiv7.yml) |
| *YOLOv8-x | 640 | 16 | 100e | 15.1 | - | - | 68.23 | 257.8 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_x_100e_oiv7.pdparams) | [配置文件](openimagev7/yolov8_x_100e_oiv7.yml) |
### 实例分割模型
| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | box AP | mask AP | Params(M) | FLOPs(G) | 下载链接 | 配置文件 |
| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: |
| *YOLOv8-n | 640 | 16 | 500e | - | 36.6 | - | 3.4 | 12.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_seg_n_500e_coco.pdparams) | [配置文件](../yolov8_seg/yolov8_seg_n_500e_coco.yml) |
| *YOLOv8-s | 640 | 16 | 500e | - | 44.6 | - | 11.8 | 42.6 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_seg_s_500e_coco.pdparams) | [配置文件](../yolov8_seg/yolov8_seg_s_500e_coco.yml) |
| *YOLOv8-m | 640 | 16 | 500e | - | 49.7 | - | 27.3 | 110.2 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_seg_m_500e_coco.pdparams) | [配置文件](../yolov8_seg/yolov8_seg_m_500e_coco.yml) |
| *YOLOv8-l | 640 | 16 | 500e | - | 52.1 | - | 46.0 | 220.5 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_seg_l_500e_coco.pdparams) | [配置文件](../yolov8_seg/yolov8_seg_l_500e_coco.yml) |
| *YOLOv8-x | 640 | 16 | 500e | - | 53.4 | - | 71.8 | 344.1 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8_seg_x_500e_coco.pdparams) | [配置文件](../yolov8_seg/yolov8_seg_x_500e_coco.yml) |
### P6大尺度模型
| 网络网络 | 输入尺寸 | 图片数/GPU | 学习率策略 | TRT-FP16-Latency(ms) | mAP<sup>val<br>0.5:0.95 | mAP<sup>val<br>0.5 | Params(M) | FLOPs(G) | 下载链接 | 配置文件 |
| :------------- | :------- | :-------: | :------: | :------------: | :---------------------: | :----------------: |:---------: | :------: |:---------------: |:-----: |
| *YOLOv8-P6-x | 1280 | 16 | 500e | 55.0 | - | - | 97.42 | 522.93 | [下载链接](https://paddledet.bj.bcebos.com/models/yolov8p6_x_500e_coco.pdparams) | [配置文件](./yolov8p6_x_500e_coco.yml) |
**注意:**
- YOLOv8模型mAP为部署权重在COCO val2017上的`mAP(IoU=0.5:0.95)`结果,且评估未使用`multi_label`等trick;
- YOLOv8模型训练使用COCO train2017作为训练集,Box AP为在COCO val2017上的`mAP(IoU=0.5:0.95)`结果;
- YOLOv8模型训练过程中默认使用8 GPUs进行混合精度训练,默认lr为0.01为8卡总batch_size的设置,如果**GPU卡数**或者每卡**batch size**发生改动,也不需要改动学习率,但为了保证高精度最好使用**总batch size大于64**的配置去训练;
- TRT-FP16-Latency(ms)模型推理耗时为TensorRT-FP16下测试的耗时,不包含数据预处理和模型输出后处理(NMS)的耗时。测试采用**单卡Tesla T4 GPU**,batch size=1,测试环境为**paddlepaddle-2.3.2**, **CUDA 11.2**, **CUDNN 8.2**, **GCC-8.2**, **TensorRT 8.0.3.4**
- 如果你设置了`--run_benchmark=True`, 你首先需要安装以下依赖`pip install pynvml psutil GPUtil`
### 部署模型
| 网络模型 | 输入尺寸 | 导出后的权重(带nms) | 导出后的权重(exclude_nms)| ONNX(exclude_post_process) |
| :-------- | :----: | :---------------: | :--------------------: | :-------------------------: |
| YOLOv8-n | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_n_500e_coco.onnx) |
| YOLOv8-s | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_s_500e_coco.onnx) |
| YOLOv8-m | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_m_500e_coco.onnx) |
| YOLOv8-l | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_l_500e_coco.onnx) |
| YOLOv8-x | 640 | [(w_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_w_nms.zip) | [(wo_nms)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco_wo_nms.zip) | [(onnx)](https://paddledet.bj.bcebos.com/deploy/paddleyolo/yolov8/yolov8_x_500e_coco.onnx) |
**注意:**
- 带nms的导出权重为普通导出方式,加trt表示用于trt加速,对NMS和silu激活函数提速明显。运行命令为:
```CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} trt=True```
- `exclude_nms`导出的权重表示去除NMS导出,返回2个Tensor,是缩放回原图后的坐标和分类置信度。运行命令为:
```CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True trt=True```
- `exclude_post_process`导出表示去除后处理导出,返回和YOLOv5导出ONNX时相同格式的concat后的1个Tensor,是未缩放回原图的坐标和分类置信度。运行命令为:
```CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True trt=True ```
## 使用教程
### 0. **一键运行全流程**
将以下命令写在一个脚本文件里如```run.sh```,一键运行命令为:```sh run.sh```,也可命令行一句句去运行。
```bash
model_name=yolov8 # 可修改,如 ppyoloe
job_name=yolov8_s_500e_coco # 可修改,如 ppyoloe_plus_crn_s_80e_coco
config=configs/${model_name}/${job_name}.yml
log_dir=log_dir/${job_name}
# weights=https://bj.bcebos.com/v1/paddledet/models/${job_name}.pdparams
weights=output/${job_name}/model_final.pdparams
# 1.训练(单卡/多卡),加 --eval 表示边训边评估,加 --amp 表示混合精度训练
# CUDA_VISIBLE_DEVICES=0 python tools/train.py -c ${config} --eval --amp
python -m paddle.distributed.launch --log_dir=${log_dir} --gpus 0,1,2,3,4,5,6,7 tools/train.py -c ${config} --eval --amp
# 2.评估,加 --classwise 表示输出每一类mAP
CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c ${config} -o weights=${weights} --classwise
# 3.预测 (单张图/图片文件夹)
CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_img=demo/000000014439_640x640.jpg --draw_threshold=0.5
# CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c ${config} -o weights=${weights} --infer_dir=demo/ --draw_threshold=0.5
# 4.导出模型,以下3种模式选一种
## 普通导出,加trt表示用于trt加速,对NMS和silu激活函数提速明显
CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} # trt=True
## exclude_post_process去除后处理导出,返回和YOLOv5导出ONNX时相同格式的concat后的1个Tensor,是未缩放回原图的坐标+分类置信度
# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_post_process=True # trt=True
## exclude_nms去除NMS导出,返回2个Tensor,是缩放回原图后的坐标和分类置信度
# CUDA_VISIBLE_DEVICES=0 python tools/export_model.py -c ${config} -o weights=${weights} exclude_nms=True # trt=True
# 5.部署预测,注意不能使用 去除后处理 或 去除NMS 导出后的模型去预测
CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU
# 6.部署测速,加 “--run_mode=trt_fp16” 表示在TensorRT FP16模式下测速,注意如需用到 trt_fp16 则必须为加 trt=True 导出的模型
CUDA_VISIBLE_DEVICES=0 python deploy/python/infer.py --model_dir=output_inference/${job_name} --image_file=demo/000000014439_640x640.jpg --device=GPU --run_benchmark=True # --run_mode=trt_fp16
# 7.onnx导出,一般结合 exclude_post_process去除后处理导出的模型
paddle2onnx --model_dir output_inference/${job_name} --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 12 --save_file ${job_name}.onnx
# 8.onnx trt测速
/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp16
/usr/local/TensorRT-8.0.3.4/bin/trtexec --onnx=${job_name}.onnx --workspace=4096 --avgRuns=10 --shapes=input:1x3x640x640 --fp32
```
### 1. 训练
执行以下指令使用混合精度训练YOLOv8
```bash
python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/yolov8/yolov8_s_500e_coco.yml --amp --eval
```
**注意:**
- `--amp`表示开启混合精度训练以避免显存溢出,`--eval`表示边训边验证。
### 2. 评估
执行以下命令在单个GPU上评估COCO val2017数据集
```bash
CUDA_VISIBLE_DEVICES=0 python tools/eval.py -c configs/yolov8/yolov8_s_500e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams
```
### 3. 推理
使用以下命令在单张GPU上预测图片,使用`--infer_img`推理单张图片以及使用`--infer_dir`推理文件中的所有图片。
```bash
# 推理单张图片
CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/yolov8/yolov8_s_500e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams --infer_img=demo/000000014439_640x640.jpg
# 推理文件中的所有图片
CUDA_VISIBLE_DEVICES=0 python tools/infer.py -c configs/yolov8/yolov8_s_500e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams --infer_dir=demo
```
### 4.导出模型
YOLOv8在GPU上推理部署或benchmark测速等需要通过`tools/export_model.py`导出模型。
当你**使用Paddle Inference但不使用TensorRT**时,运行以下的命令导出模型
```bash
python tools/export_model.py -c configs/yolov8/yolov8_s_500e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams
```
当你**使用Paddle Inference且使用TensorRT**时,需要指定`-o trt=True`来导出模型。
```bash
python tools/export_model.py -c configs/yolov8/yolov8_s_500e_coco.yml -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams trt=True
```
如果你想将YOLOv8模型导出为**ONNX格式**,参考
[PaddleDetection模型导出为ONNX格式教程](../../deploy/EXPORT_ONNX_MODEL.md),运行以下命令:
```bash
# 导出推理模型
python tools/export_model.py -c configs/yolov8/yolov8_s_500e_coco.yml --output_dir=output_inference -o weights=https://paddledet.bj.bcebos.com/models/yolov8_s_500e_coco.pdparams
# 安装paddle2onnx
pip install paddle2onnx
# 转换成onnx格式
paddle2onnx --model_dir output_inference/yolov8_s_500e_coco --model_filename model.pdmodel --params_filename model.pdiparams --opset_version 11 --save_file yolov8_s_500e_coco.onnx
```
**注意:** ONNX模型目前只支持batch_size=1
## FastDeploy多硬件快速部署
FastDeploy是飞桨推出的统一部署工具,支持云边端部署。目前在YOLO系列支持的部署能力如下所示。具体部署示例,可以前往[FastDeploy仓库](https://github.com/PaddlePaddle/FastDeploy)使用。
| | [YOLOv5](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv6](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv7](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [YOLOv8](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | [PP-YOLOE+](https://github.com/PaddlePaddle/FastDeploy/tree/develop/examples/vision/detection/paddledetection) | 部署特色 |
| ------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | ------------------------------------- |
| [Intel CPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩压缩,实现极致性能 |
| [NVIDIA GPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、CUDA预处理加速,实现极致性能 |
| [飞腾 CPU]() | 支持 | 支持 | 支持 | 支持 | 支持 | X86 CPU与ARM CPU无缝切换 |
| [昆仑芯 R200*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/kunlunxin.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 无缝部署Paddle模型 |
| [昇腾310*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/kunlunxin.md) | 支持 | 即将支持 | 即将支持 | 即将支持 | 支持 | 无缝部署Paddle模型 |
| [算能SC7-FP300*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/sophgo.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 充分发挥硬件工具链特性,实现模型快速部署 |
| [Jetson](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/jetson.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、CUDA预处理加速,实现极致性能 |
| [ARM CPU](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/download_prebuilt_libraries.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 集成PaddleSlim一键压缩工具、预处理加速库FlyCV,实现极致性能 |
| [RK3588*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rknpu2.md) | 支持 | 支持 | 支持 | 支持 | 支持 | 充分发挥硬件工具链特性,实现模型快速部署 |
| [RV1126*](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/cn/build_and_install/rv1126.md) | 支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 联合全量化实现模型端到端的优化 |
| [服务化部署](https://github.com/PaddlePaddle/FastDeploy/tree/develop/serving) | 支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 实现企业级高并发需求 |
| [视频流部署](https://github.com/PaddlePaddle/FastDeploy/tree/develop/streamer) | 暂不支持 | 暂不支持 | 暂不支持 | 暂不支持 | 支持 | 调用硬解码核,实现数据零拷贝,充分利用硬件资源 |
备注:
*表示:FastDeploy目前在该型号硬件上测试。通常同类型硬件上使用的是相同的软件栈,该部署能力可以延伸到同软件架栈的硬件。譬如RK3588与RK3566、RK3568相同的软件栈。
「硬件列-纵轴」链接到部署预编译包安装或部署示例,「横轴」跳转到具体部署示例。
## 引用
```
```
epoch: 500
LearningRate:
base_lr: 0.01
schedulers:
- !YOLOv5LRDecay
max_epochs: 500
min_lr_ratio: 0.01
- !ExpWarmup
epochs: 5 #3
OptimizerBuilder:
optimizer:
type: Momentum
momentum: 0.937
use_nesterov: True
regularizer:
factor: 0.0005
type: L2
clip_grad_by_value: 10.
epoch: 500
LearningRate:
base_lr: 0.01
schedulers:
- !YOLOv5LRDecay
max_epochs: 500
min_lr_ratio: 0.1 #
- !ExpWarmup
epochs: 5 #3
OptimizerBuilder:
optimizer:
type: Momentum
momentum: 0.937
use_nesterov: True
regularizer:
factor: 0.0005
type: L2
clip_grad_by_value: 10.
architecture: YOLOv8
norm_type: sync_bn
use_ema: True
ema_decay: 0.9999
ema_decay_type: "exponential"
act: silu
find_unused_parameters: True
depth_mult: 1.0 # default: L version
width_mult: 1.0
YOLOv8:
backbone: YOLOv8CSPDarkNet
neck: YOLOv8CSPPAN
yolo_head: YOLOv8Head
post_process: ~
YOLOv8CSPDarkNet:
arch: 'P5'
return_idx: [2, 3, 4]
last_stage_ch: 1024
last2_stage_ch: 512
# use default config
# YOLOv8CSPPAN:
YOLOv8Head:
fpn_strides: [8, 16, 32]
loss_weight: {class: 0.5, iou: 7.5, dfl: 1.5}
assigner:
name: TaskAlignedAssigner
topk: 10
alpha: 0.5
beta: 6.0
nms:
name: MultiClassNMS
nms_top_k: 3000
keep_top_k: 300
score_threshold: 0.001
nms_threshold: 0.7
input_height: &input_height 640
input_width: &input_width 640
input_size: &input_size [*input_height, *input_width]
mosaic_epoch: &mosaic_epoch 490 # last 10 epochs close mosaic, totally 500 epochs as default
worker_num: 4
TrainReader:
sample_transforms:
- Decode: {}
- MosaicPerspective: {mosaic_prob: 1.0, boxes_normed: False, target_size: *input_size}
- RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4}
- RandomFlip: {}
batch_transforms:
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
- PadGT: {}
batch_size: 8
shuffle: True
drop_last: False
use_shared_memory: True
collate_batch: True
mosaic_epoch: *mosaic_epoch
EvalReader:
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 8
TestReader:
inputs_def:
image_shape: [3, 640, 640]
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
fuse_normalize: False
input_height: &input_height 640
input_width: &input_width 640
input_size: &input_size [*input_height, *input_width]
mosaic_epoch: &mosaic_epoch 490 # last 10 epochs close mosaic, totally 500 epochs as default
worker_num: 4
TrainReader:
sample_transforms:
- Decode: {}
- MosaicPerspective: {mosaic_prob: 1.0, boxes_normed: False, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1}
- RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4}
- RandomFlip: {}
batch_transforms:
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
- PadGT: {}
batch_size: 8
shuffle: True
drop_last: False
use_shared_memory: True
collate_batch: True
mosaic_epoch: *mosaic_epoch
EvalReader:
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
TestReader:
inputs_def:
image_shape: [3, 640, 640]
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
fuse_normalize: False
architecture: YOLOv8
norm_type: sync_bn
use_ema: True
ema_decay: 0.9999
ema_decay_type: "exponential"
act: silu
find_unused_parameters: True
depth_mult: 1.0 # default: L version
width_mult: 1.0
YOLOv8:
backbone: YOLOv8CSPDarkNet
neck: YOLOv8CSPPANP6
yolo_head: YOLOv8Head
post_process: ~
YOLOv8CSPDarkNet:
arch: 'P6'
return_idx: [2, 3, 4, 5]
last_stage_ch: 1024
last2_stage_ch: 768
# use default config
# YOLOv8CSPPANP6:
YOLOv8Head:
fpn_strides: [8, 16, 32, 64]
loss_weight: {class: 0.5, iou: 7.5, dfl: 1.5}
assigner:
name: TaskAlignedAssigner
topk: 10
alpha: 0.5
beta: 6.0
nms:
name: MultiClassNMS
nms_top_k: 1000
keep_top_k: 300
score_threshold: 0.001
nms_threshold: 0.7
input_height: &input_height 1280
input_width: &input_width 1280
input_size: &input_size [*input_height, *input_width]
mosaic_epoch: &mosaic_epoch 490 # last 10 epochs close mosaic, totally 500 epochs as default
worker_num: 4
TrainReader:
sample_transforms:
- Decode: {}
- MosaicPerspective: {mosaic_prob: 1.0, boxes_normed: False, target_size: *input_size, scale: 0.9, mixup_prob: 0.1, copy_paste_prob: 0.1}
- RandomHSV: {hgain: 0.015, sgain: 0.7, vgain: 0.4}
- RandomFlip: {}
batch_transforms:
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
- PadGT: {}
batch_size: 8
shuffle: True
drop_last: False
use_shared_memory: True
collate_batch: True
mosaic_epoch: *mosaic_epoch
EvalReader:
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
TestReader:
inputs_def:
image_shape: [3, 1280, 1280]
sample_transforms:
- Decode: {}
- Resize: {target_size: *input_size, keep_ratio: True, interp: 1}
- Pad: {size: *input_size, fill_value: [114., 114., 114.]}
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
- Permute: {}
batch_size: 1
fuse_normalize: False
_BASE_: [
'../../datasets/openimagesv7_detection.yml',
'../../runtime.yml',
'../_base_/optimizer_100e.yml',
'../_base_/yolov8_cspdarknet.yml',
'../_base_/yolov8_reader_high_aug.yml',
]
depth_mult: 1.0
width_mult: 1.0
log_iter: 50
snapshot_epoch: 10
weights: output/yolov8_l_100e_oiv7/model_final
YOLOv8CSPDarkNet:
last_stage_ch: 512 # The actual channel is int(512 * width_mult), not int(1024 * width_mult) as in YOLOv5
YOLOv8Head:
customized_c3: 256
TrainReader:
batch_size: 16 # default 8 gpus, total bs = 128
_BASE_: [
'../../datasets/openimagesv7_detection.yml',
'../../runtime.yml',
'../_base_/optimizer_100e.yml',
'../_base_/yolov8_cspdarknet.yml',
'../_base_/yolov8_reader_high_aug.yml',
]
depth_mult: 0.67
width_mult: 0.75
log_iter: 50
snapshot_epoch: 10
weights: output/yolov8_m_100e_oiv7/model_final
YOLOv8CSPDarkNet:
last_stage_ch: 768 # The actual channel is int(768 * width_mult), not int(1024 * width_mult) as in YOLOv5
YOLOv8Head:
customized_c3: 192
TrainReader:
batch_size: 16 # default 8 gpus, total bs = 128
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment