Commit 1345fab2 authored by luopl

Initial commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Project related directories
data/
checkpoints/
visualization/
# Default ignored files
/shelf/
/workspace.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/salience_detr_pytorch.iml" filepath="$PROJECT_DIR$/.idea/salience_detr_pytorch.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="GOOGLE" />
<option name="myDocStringFormat" value="Google" />
</component>
</module>
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10-py38
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Salience-DETR
## Paper
`Salience DETR: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement`
- https://arxiv.org/abs/2403.16131
## Model Architecture
Salience DETR adopts a high-performance two-stage pipeline.
The main architectural difference between Salience DETR and mainstream two-stage DETR-like methods lies in the transformer encoder and query refinement.
Given multi-scale features from the backbone, the encoder only updates the queries selected by hierarchical query filtering under salience-guided supervision.
The query refinement module then alleviates semantic misalignment between queries.
<div align=center>
<img src="./docs/The architecture overview of Salience DETR.png"/>
</div>
## Algorithm
The main architectural differences between Salience DETR and mainstream two-stage DETR-like methods lie in the detection transformer encoder and query refinement. The method consists of the following parts:
(1) Salience-guided supervision
Based on the predicted confidence, query filtering updates only the most informative queries, achieving comparable performance with a lower computational burden. Drawing inspiration from Focus DETR, supervision is provided for the queries at every level of the multi-scale features. Instead of constructing discrete {0, 1} labels that only separate foreground from background, a scale-independent salience is constructed as the supervision target to overcome scale bias.
(2) Revisiting query filtering in Focus DETR
Focus DETR introduces an extra branch that predicts foreground confidence through top-down score modulation over the multi-scale features.
(3) Hierarchical query filtering
In general, high-level tokens incur less computational burden while preserving more informative semantics. Therefore, beyond conventional level-wise filtering, a natural motivation is to introduce layer-wise filtering for the multi-scale features. The paper introduces two sets of filtering ratios: for the t-th encoder layer and the l-th feature level, only the top v<sub>t</sub>·w<sub>l</sub> queries undergo attention encoding, while the other queries remain unchanged (a small sketch is given after the figure below).
(4) Cross-level token fusion
To address the semantic misalignment between queries of different levels caused by level-specific filtering ratios, the paper proposes a token fusion module that uses a path-aggregation structure to handle cross-level information interaction. In this module, adjacent tokens are fused by the proposed RepVGGPluXBlock, as shown in the figure.
<div align=center>
<img src="./docs/Cross-level token fusion.png"/>
</div>
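The sketch below illustrates the hierarchical query filtering idea in isolation; it is not the repository implementation. The function name, tensor shapes, and per-level token counts are assumptions for illustration, while the filtering ratios mirror the `level_filter_ratio` and `layer_filter_ratio` values used in the provided configs.
```python
import torch

def select_salient_tokens(salience, level_sizes, level_ratios, layer_ratio):
    """Return indices of the tokens that are updated by attention in one encoder layer.

    salience: (N,) predicted salience scores for all flattened multi-scale tokens
    level_sizes: number of tokens per feature level
    level_ratios[l] * layer_ratio: fraction of tokens kept at level l for this layer
    """
    kept, start = [], 0
    for size, level_ratio in zip(level_sizes, level_ratios):
        scores = salience[start:start + size]
        k = max(1, int(size * level_ratio * layer_ratio))
        kept.append(torch.topk(scores, k).indices + start)
        start += size
    return torch.cat(kept)

# ratios similar to the provided configs; token counts per level are hypothetical
level_ratios = (0.4, 0.8, 1.0, 1.0)            # per feature level
layer_ratios = (1.0, 0.8, 0.6, 0.6, 0.4, 0.2)  # per encoder layer
level_sizes = (100, 50, 25, 13)
salience = torch.rand(sum(level_sizes))
keep_idx = select_salient_tokens(salience, level_sizes, level_ratios, layer_ratios[0])
```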
## Environment Setup
### Docker (Option 1)
The address and steps for pulling the Docker image from [SourceFind](https://www.sourcefind.cn/#/service-details) are provided here, along with the download address for deep learning libraries in the [HPC developer community](https://developer.hpccube.com/tool/).
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10-py38
docker run -it --shm-size=128G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name salience_detr_pytorch <your IMAGE ID> bash # replace <your IMAGE ID> with the ID of the image pulled above; for this image it is ffa1f63239fc
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile (Option 2)
Steps for building and running with the Dockerfile:
```
docker build --no-cache -t salience_detr:latest .
docker run -it --shm-size=128G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name salience_detr_pytorch salience_detr bash
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Anaconda (Option 3)
Detailed steps for local setup and compilation are provided here.
The DCU-specific deep learning libraries required by this project can be downloaded from the [HPC developer community](https://developer.hpccube.com/tool/).
```
# DTK driver: dtk23.10
# python:python3.8
# torch: 2.1.0
# torchvision: 0.16.0
conda create -n salience_detr python=3.8
conda activate salience_detr
pip install torch-2.1.0a0+git793d2b5.abi0.dtk2310-cp38-cp38-manylinux2014_x86_64.whl
pip install torchvision-0.16.0+git267eff6.abi0.dtk2310.torch2.1.0-cp38-cp38-manylinux2014_x86_64.whl
```
`Tips: the versions of the DTK driver, python, torch and other DCU-related tools above must strictly match one another.`
Install the remaining dependencies as follows:
```
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
## Dataset
Please download [COCO 2017](https://cocodataset.org/#home) or prepare your own dataset under data/, organized as follows. You can use tools/visualize_datasets.py to visualize the dataset annotations and verify their correctness.
```
coco/
├── train2017/
├── val2017/
└── annotations/
├── instances_train2017.json
└── instances_val2017.json
```
## Training
The accelerate package natively handles multi-DCU training; use HIP_VISIBLE_DEVICES to specify the DCUs. If not specified, the script trains on all DCUs available on the node.
Modify configs/train_config.py before training.
### Single node, single card
```
HIP_VISIBLE_DEVICES=0 accelerate launch main.py
```
### Single node, multiple cards
```
HIP_VISIBLE_DEVICES=0,1,2,3 accelerate launch main.py
```
## Inference
Salience DETR was trained with ResNet50 and Swin-L backbones under various settings. The corresponding configs and checkpoints for COCO 2017 are provided below.
### 12-epoch training
| Model | Backbone | Download |
| ------------- | ----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_1x.pth) |
| Salience DETR | ConvNeXt-L | [config](configs/salience_detr/salience_detr_convnext_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) |
| Salience DETR | Swin-L<sub>(IN-22K) | [config](configs/salience_detr/salience_detr_swin_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) |
| Salience DETR | FocalNet-L<sub>(IN-22K) | [config](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) |
### 24-epoch training
| Model | Backbone | Download |
| ------------- | ----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_2x.pth) |
### Single-card inference
Note that --model-config and --checkpoint must correspond to each other.
Inference:
```
# python inference.py --image-dir /path/to/images --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth --show-dir /path/to/dir
python inference.py --image-dir ./images --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth --show-dir results
```
Evaluation/Test:
```
# CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
HIP_VISIBLE_DEVICES=0 accelerate launch test.py --coco-path ./data/coco --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth
```
### Multi-card inference
```
CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
HIP_VISIBLE_DEVICES=0,1,2,3 accelerate launch test.py --coco-path ./data/coco --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth
```
## Results
Original image:
<div align=center>
<img src="./docs/000000000139.jpg"/>
</div>
The inference visualization result is shown below:
<div align=center>
<img src="./docs/000000000139_detr.jpg"/>
</div>
### Accuracy
Inference was performed with four DCU K100 cards.
12-epoch results:

| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: |
| Salience DETR | ResNet50 | 50.0 | 67.7 | 54.2 | 33.3 | 54.4 | 64.4 |
| Salience DETR | ConvNeXt-L | 54.2 | 72.4 | 59.1 | 38.8 | 58.3 | 69.6 |
| Salience DETR | Swin-L<sub>(IN-22K) | 56.5 | 75.0 | 61.5 | 40.2 | 61.2 | 72.8 |
| Salience DETR | FocalNet-L<sub>(IN-22K) | 57.3 | 75.5 | 62.3 | 40.9 | 61.8 | 74.5 |

24-epoch results:

| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: |
| Salience DETR | ResNet50 | 51.2 | 68.9 | 55.7 | 33.9 | 55.5 | 65.6 |
## Application Scenarios
### Algorithm Category
`Object detection`
### Key Application Industries
`Research, manufacturing, healthcare, smart home, education`
## Source Repository and Issue Feedback
- https://developer.hpccube.com/codes/modelzoo/salience_detr_pytorch
## References
- https://github.com/xiuqhou/Salience-DETR
Simplified Chinese | [English](README.md)
**Salience DETR**: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement
===
By [Xiuquan Hou](https://github.com/xiuqhou), [Meiqin Liu](https://scholar.google.com/citations?user=T07OWMkAAAAJ&hl=zh-CN&oi=ao), Senlin Zhang, [Ping Wei](https://scholar.google.com/citations?user=1OQBtdcAAAAJ&hl=zh-CN&oi=ao), [Badong Chen](https://scholar.google.com/citations?user=mq6tPX4AAAAJ&hl=zh-CN&oi=ao).
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/salience-detr-enhancing-detection-transformer-1/object-detection-on-coco-2017-val)](https://paperswithcode.com/sota/object-detection-on-coco-2017-val?p=salience-detr-enhancing-detection-transformer-1)
[![arXiv](https://img.shields.io/badge/arXiv-2403.16131-b31b1b.svg)](https://arxiv.org/abs/2403.16131)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](https://makeapullrequest.com)
[![GitHub license](https://img.shields.io/github/license/xiuqhou/Salience-DETR.svg?color=blue)](https://github.com/xiuqhou/Salience-DETR/blob/master/LICENSE)
![GitHub stars](https://img.shields.io/github/stars/xiuqhou/Salience-DETR)
![GitHub forks](https://img.shields.io/github/forks/xiuqhou/Salience-DETR)
This repository is the official implementation of the **CVPR 2024** paper Salience DETR (review scores **553**).
## ✨Highlights:
1. We provide an in-depth analysis of the [scale bias and query redundancy](#id_1) issues in two-stage DETR-like methods.
2. We propose a hierarchical filtering mechanism that reduces computational complexity under salience supervision; the proposed supervision can even capture [fine-grained object contours](#id_2) using only bounding-box annotations.
3. Salience DETR achieves **+4.0%**, **+0.2%** and **+4.4%** AP improvements on three challenging defect detection tasks, and comparable accuracy on COCO 2017 with only about **70%** FLOPs.
<div align="center">
<img src="images/Salience-DETR.svg">
</div>
<details>
<summary>🔎Visualization</summary>
- Queries selected by the two-stage pipeline of existing DETR methods are usually **redundant** and exhibit **scale bias** (left).
- For both defect detection and object detection tasks, **salience supervision** helps capture **object contours** using only bounding-box annotations (right).
<h3 align="center">
<a id="id_1"><img src="images/query_visualization.svg" width="335"></a>
<a id="id_2"><img src="images/salience_visualization.svg" width="462"></a>
</h3>
</details>
## News
`2024-04-19`: With [FocalNet-Large](https://github.com/microsoft/FocalNet) as the backbone, Salience DETR achieves **56.8 AP** on COCO val2017; the [**config**](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) have been updated!
`2024-04-08`: Released the [**config**](configs/salience_detr/salience_detr_convnext_l_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) of Salience DETR with a ConvNeXt-L backbone trained for 12 epochs on COCO 2017.
`2024-04-01`: With a Swin-L backbone, Salience DETR achieves **56.5** AP on COCO 2017 (12-epoch training). The [**config**](configs/salience_detr/salience_detr_swin_l_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) have been released.
`2024-03-26`: We released the code of Salience DETR and pretrained weights on COCO 2017 with a ResNet50 backbone.
`2024-02-29`: Salience DETR was accepted by CVPR 2024, and the code will be released in this repository. Stay tuned!
## Model Zoo
After acceptance at **CVPR 2024**, we retrained **Salience DETR** with **ResNet50** and **Swin-L** backbones under various settings. The corresponding configs and weights for the [**COCO 2017**](https://cocodataset.org/#home) dataset are provided below.
### 12-epoch training
| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L | Download |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | 50.0 | 67.7 | 54.2 | 33.3 | 54.4 | 64.4 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_1x.pth) |
| Salience DETR | ConvNeXt-L | 54.2 | 72.4 | 59.1 | 38.8 | 58.3 | 69.6 | [config](configs/salience_detr/salience_detr_convnext_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) |
| Salience DETR | Swin-L<sub>(IN-22K) | 56.5 | 75.0 | 61.5 | 40.2 | 61.2 | 72.8 | [config](configs/salience_detr/salience_detr_swin_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) |
| Salience DETR | FocalNet-L<sub>(IN-22K) | 57.3 | 75.5 | 62.3 | 40.9 | 61.8 | 74.5 | [config](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) |
### 24-epoch training
| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L | Download |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | 51.2 | 68.9 | 55.7 | 33.9 | 55.5 | 65.6 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_2x.pth) |
## 🔧Installation
1. Clone this repository:
```shell
git clone https://github.com/xiuqhou/Salience-DETR.git
cd Salience-DETR/
```
2. Create and activate a conda environment:
```shell
conda create -n salience_detr python=3.8
conda activate salience_detr
```
3. Install pytorch following the official instructions at [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). The code requires `python>=3.8, torch>=1.11.0, torchvision>=0.12.0`.
```shell
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
```
4. Install the other dependencies:
```shell
conda install --file requirements.txt -c conda-forge
```
You do not need to compile the CUDA operators manually; they are compiled and loaded automatically the first time the code runs.
## 📁Prepare Datasets
Please download the [COCO 2017](https://cocodataset.org/) dataset or prepare your own dataset in the following format, and place it under the `data/` directory. You can use [`tools/visualize_datasets.py`](tools/visualize_datasets.py) to visualize the dataset and verify its correctness.
```shell
coco/
├── train2017/
├── val2017/
└── annotations/
├── instances_train2017.json
└── instances_val2017.json
```
<details>
<summary>Visualization example</summary>
```shell
python tools/visualize_datasets.py \
--coco-img data/coco/val2017 \
--coco-ann data/coco/annotations/instances_val2017.json \
--show-dir visualize_dataset/
```
</details>
## 📚︎Train a Model
We use the `accelerate` package to natively handle multi-GPU training; you only need to set `CUDA_VISIBLE_DEVICES` to specify the GPU/GPUs to train on. If not specified, the script automatically uses all GPUs available on the machine.
```shell
CUDA_VISIBLE_DEVICES=0 accelerate launch main.py    # train with 1 GPU
CUDA_VISIBLE_DEVICES=0,1 accelerate launch main.py  # train with 2 GPUs
```
Please adjust the parameters in [`configs/train_config.py`](configs/train_config.py) before training.
<details>
<summary>Example training config</summary>
```python
from torch import optim
from datasets.coco import CocoDetection
from transforms import presets
from optimizer import param_dict
# Commonly changed training configurations
num_epochs = 12  # number of training epochs
batch_size = 2  # total_batch_size = #GPUs x batch_size
num_workers = 4  # number of workers for the pytorch DataLoader
pin_memory = True  # whether to use pin_memory in the pytorch DataLoader
print_freq = 50  # frequency of log printing
starting_epoch = 0
max_norm = 0.1  # gradient clipping norm
output_dir = None  # path to save checkpoints; None defaults to checkpoints/{model_name}
find_unused_parameters = False  # useful for debugging distributed training
# define the dataset used for training
coco_path = "data/coco"  # path to the dataset
train_transform = presets.detr  # choose a data augmentation from transforms/presets.py
train_dataset = CocoDetection(
    img_folder=f"{coco_path}/train2017",
    ann_file=f"{coco_path}/annotations/instances_train2017.json",
    transforms=train_transform,
    train=True,
)
test_dataset = CocoDetection(
    img_folder=f"{coco_path}/val2017",
    ann_file=f"{coco_path}/annotations/instances_val2017.json",
    transforms=None,  # the eval_transform is integrated into the model forward pass
)
# model config file
model_path = "configs/salience_detr/salience_detr_resnet50_800_1333.py"
# specify a checkpoint folder to resume training, or a ".pth" file to finetune, for example:
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50/best_ap.pth
resume_from_checkpoint = None
learning_rate = 1e-4  # initial learning rate
optimizer = optim.AdamW(lr=learning_rate, weight_decay=1e-4, betas=(0.9, 0.999))
lr_scheduler = optim.lr_scheduler.MultiStepLR(milestones=[10], gamma=0.1)
# define parameter groups with different learning rates
param_dicts = param_dict.finetune_backbone_and_linear_projection(lr=learning_rate)
```
</details>
## 📈Evaluation and Test
To evaluate a model with one or more GPUs, specify `CUDA_VISIBLE_DEVICES`, `dataset`, `model`, and `checkpoint`.
```shell
CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
```
The following parameters are optional; see [test.py](test.py) for more.
- `--show-dir`: path of the directory to save visualization results.
- `--result`: path of the file to save detection results; it must end with `.json`.
<details>
<summary>Example of model evaluation</summary>
For example, to evaluate `salience_detr_resnet50_800_1333` on `coco` using 8 GPUs, save the detection results to `result.json`, and save the visualizations to the `visualization/` directory, run:
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch test.py \
--coco-path data/coco \
--model-config configs/salience_detr/salience_detr_resnet50_800_1333.py \
--checkpoint checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-21_29_56/best_ap.pth \
--result result.json \
--show-dir visualization/
```
</details>
<details>
<summary>Evaluate a saved json result file</summary>
To evaluate a detection result file saved as json in the step above, specify `--result` but do not specify `--model`.
```shell
CUDA_VISIBLE_DEVICES=0 accelerate launch test.py --coco-path /path/to/coco --result /path/to/result.json
```
The following parameter is optional; see [test.py](test.py) for the full list.
- `--show-dir`: path of the directory to save visualization results.
</details>
## ▶︎Inference
Use [`inference.py`](inference.py) to run inference on images; use `--image-dir` to specify the directory containing the images.
```shell
python inference.py --image-dir /path/to/images --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth --show-dir /path/to/dir
```
<details>
<summary>Example of image inference</summary>
For example, run the following command to run inference on images under the `images/` directory and save the visualizations to the `visualization/` directory.
```shell
python inference.py \
--image-dir images/ \
--model-config configs/salience_detr/salience_detr_resnet50_800_1333.py \
--checkpoint checkpoint.pth \
--show-dir visualization/
```
</details>
Alternatively, use [`inference.ipynb`](inference.ipynb) to run inference and visualization on a single image.
## 🔁Benchmark Speed, Memory, and Parameters
Use `tools/benchmark_model.py` to benchmark the inference speed, memory usage, and number of parameters of a model.
```shell
python tools/benchmark_model.py --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py
```
## 📍Train on Custom Datasets
Before training on your own dataset, perform the following steps:
1. Prepare your dataset in COCO annotation format and set the `coco_path` parameter in [`configs/train_config.py`](configs/train_config.py) to the dataset path accordingly.
2. Open the model config under [`configs/salience_detr`](configs/salience_detr) and set `num_classes` to at least the `maximum category id + 1` of your dataset (a small sketch for computing this is given after this list). Taking COCO as an example, its `instances_val2017.json` annotation file shows a maximum category id of `90`, so we set `num_classes = 91`:
```json
{"supercategory": "indoor","id": 90,"name": "toothbrush"}
```
If you are not sure what `num_classes` should be, you can simply set it to a sufficiently large number. (For example, `num_classes = 92` or `num_classes = 365` both work for the COCO dataset.)
3. Adjust the other model parameters under [`configs/salience_detr`](configs/salience_detr/) and the training parameters in [`train_config.py`](train_config.py) as needed.
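If you prefer to compute this value programmatically, the following is a small, hypothetical helper (not a script shipped with the repository) that reads a COCO-style annotation file with `pycocotools` and returns the maximum category id plus one:
```python
# Hypothetical helper: num_classes must be at least max category id + 1.
# pycocotools is already required for COCO-style training.
from pycocotools.coco import COCO

def infer_num_classes(ann_file):
    coco = COCO(ann_file)
    return max(coco.getCatIds()) + 1

print(infer_num_classes("data/coco/annotations/instances_val2017.json"))  # 91 for COCO 2017
```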
## 📥Export an ONNX Model
For advanced users who want to deploy our model, we provide a script to export an ONNX file.
```shell
python tools/pytorch2onnx.py \
--model-config /path/to/model.py \
--checkpoint /path/to/checkpoint.pth \
--save-file /path/to/save.onnx \
--simplify \  # use onnxsim to simplify the exported ONNX file
--verify  # verify the error between the exported ONNX model and the original pytorch model
```
Refer to the `ONNXDetector` class in [`tools/pytorch2onnx.py`](tools/pytorch2onnx.py) for ONNX model inference.
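As a rough reference, the snippet below shows what loading the exported file with `onnxruntime` might look like; the input layout, preprocessing, and output shapes are assumptions for illustration, and the repository's actual interface is the `ONNXDetector` class mentioned above.
```python
# Minimal onnxruntime sketch (assumed NCHW float32 input; real usage should apply
# the same resizing/normalization as the pytorch model).
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("/path/to/save.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
image = np.random.rand(1, 3, 800, 1333).astype(np.float32)  # dummy image tensor
outputs = session.run(None, {input_name: image})
print([o.shape for o in outputs])
```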
## Citation
If our work is helpful for your research, please consider citing our paper or giving this repository a star ⭐.
```bibtex
@InProceedings{Hou_2024_CVPR,
author = {Hou, Xiuquan and Liu, Meiqin and Zhang, Senlin and Wei, Ping and Chen, Badong},
title = {Salience DETR: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2024},
pages = {17574-17583}
}
```
from torch import nn
from models.backbones.convnext import ConvNeXtBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = ConvNeXtBackbone("conv_l", return_indices=(1, 2, 3), freeze_indices=(0,))
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
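# fraction of tokens kept per feature level / per encoder layer for hierarchical query filtering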
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
from models.backbones.focalnet import FocalNetBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(
embed_dim // 2, temperature=10000, normalize=True, offset=-0.5
)
backbone = FocalNetBackbone(
"focalnet_large_lrf_fl4", return_indices=(1, 2, 3), freeze_indices=(0,)
)
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(
cost_class=2, cost_bbox=5, cost_giou=2, focal_alpha=0.25, focal_gamma=2.0
)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(
num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0
)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
# from torchvision.ops import FrozenBatchNorm2d
from models.backbones.resnet import ResNetBackbone
from models.bricks.misc import FrozenBatchNorm2d
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = ResNetBackbone(
"resnet50", norm_layer=FrozenBatchNorm2d, return_indices=(1, 2, 3), freeze_indices=(0,)
)
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
from models.backbones.swin import SwinTransformerBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = SwinTransformerBackbone("swin_l", return_indices=(1, 2, 3), freeze_indices=(0,))
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import optim
from datasets.coco import CocoDetection
from transforms import presets
from optimizer import param_dict
# Commonly changed training configurations
num_epochs = 12 # train epochs
batch_size = 2 # total_batch_size = #GPU x batch_size
num_workers = 4 # workers for pytorch DataLoader
pin_memory = True # whether pin_memory for pytorch DataLoader
print_freq = 50 # frequency to print logs
starting_epoch = 0
max_norm = 0.1 # clip gradient norm
output_dir = None # path to save checkpoints, default for None: checkpoints/{model_name}
find_unused_parameters = False # useful for debugging distributed training
# define dataset for train
coco_path = "data/coco" # /PATH/TO/YOUR/COCODIR
train_transform = presets.detr # see transforms/presets to choose a transform
train_dataset = CocoDetection(
img_folder=f"{coco_path}/train2017",
ann_file=f"{coco_path}/annotations/instances_train2017.json",
transforms=train_transform,
train=True,
)
test_dataset = CocoDetection(
img_folder=f"{coco_path}/val2017",
ann_file=f"{coco_path}/annotations/instances_val2017.json",
transforms=None, # the eval_transform is integrated in the model
)
# model config to train
model_path = "configs/salience_detr/salience_detr_resnet50_800_1333.py"
# specify a checkpoint folder to resume, or a pretrained ".pth" to finetune, for example:
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50/best_ap.pth
resume_from_checkpoint = None
learning_rate = 1e-4 # initial learning rate
optimizer = optim.AdamW(lr=learning_rate, weight_decay=1e-4, betas=(0.9, 0.999))
lr_scheduler = optim.lr_scheduler.MultiStepLR(milestones=[10], gamma=0.1)
# This define parameter groups with different learning rate
param_dicts = param_dict.finetune_backbone_and_linear_projection(lr=learning_rate)
import os
import albumentations as A
import cv2
import numpy as np
import torchvision
from transforms import v2 as T
from transforms.convert_coco_polys_to_mask import ConvertCocoPolysToMask
from util import datapoints
from util.misc import deepcopy
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(
self,
img_folder,
ann_file,
transforms=None,
train=False,
):
super(CocoDetection, self).__init__(img_folder, ann_file)
self.prepare = ConvertCocoPolysToMask()
self._transforms = transforms
self._transforms = self.update_dataset(self._transforms)
self.train = train
if train:
self._coco_remove_images_without_annotations()
def update_dataset(self, transform):
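# recursively walk composed transforms and give any transform that defines
# update_dataset a reference to this dataset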
if isinstance(transform, (T.Compose, A.Compose)):
processed_transforms = []
for trans in transform.transforms:
trans = self.update_dataset(trans)
processed_transforms.append(trans)
return type(transform)(processed_transforms)
if hasattr(transform, "update_dataset"):
transform.update_dataset(self)
return transform
def load_image(self, image_name):
# after comparing the speed of PIL, torchvision and cv2,
# cv2 is chosen as the default backend to load images,
# uncomment the following code to switch among them.
# image = Image.open(os.path.join(self.root, path)).convert('RGB')
# image = torchvision.io.read_image(os.path.join(self.root, path))
# To avoid deadlock between DataLoader and OpenCV
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)
# image = cv2.imread(os.path.join(self.root, image_name))
image = cv2.imdecode(np.fromfile(os.path.join(self.root, image_name), dtype=np.uint8), -1)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).transpose(2, 0, 1)
return image
def get_image_id(self, item: int):
if hasattr(self, "indices"):
item = self.indices[item]
image_id = self.ids[item]
return image_id
def load_image_and_target(self, item: int):
image_id = self.get_image_id(item)
# load images and annotations
image_name = self.coco.loadImgs([image_id])[0]["file_name"]
image = self.load_image(image_name)
target = self.coco.loadAnns(self.coco.getAnnIds([image_id]))
target = dict(image_id=image_id, annotations=target)
image, target = self.prepare((image, target))
return image, target
def data_augmentation(self, image, target):
# preprocess
image = datapoints.Image(image)
bounding_boxes = datapoints.BoundingBox(
target["boxes"],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=image.shape[-2:],
)
labels = target["labels"]
if self._transforms is not None:
image, bounding_boxes, labels = self._transforms(image, bounding_boxes, labels)
return image.data, bounding_boxes.data, labels
def __getitem__(self, item):
image, target = self.load_image_and_target(item)
image, target["boxes"], target["labels"] = self.data_augmentation(image, target)
return deepcopy(image), deepcopy(target)
def __len__(self):
return len(self.indices) if hasattr(self, "indices") else len(self.ids)
def _coco_remove_images_without_annotations(self, cat_list=None):
def _has_only_empty_bbox(anno):
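# a box counts as empty when its width or height is at most 1 pixel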
return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
def _count_visible_keypoints(anno):
return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
min_keypoints_per_image = 10
def _has_valid_annotation(anno):
# if it's empty, there is no annotation
if len(anno) == 0:
return False
# if all boxes have close to zero area, there is no annotation
if _has_only_empty_bbox(anno):
return False
# keypoint tasks have slightly different criteria for considering
# whether an annotation is valid
if "keypoints" not in anno[0]:
return True
# for keypoint detection tasks, only consider valid images those
# containing at least min_keypoints_per_image
if _count_visible_keypoints(anno) >= min_keypoints_per_image:
return True
return False
ids = []
for ds_idx, img_id in enumerate(self.ids):
ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if cat_list:
anno = [obj for obj in anno if obj["category_id"] in cat_list]
if _has_valid_annotation(anno):
ids.append(ds_idx)
self.indices = ids
class Object365Detection(CocoDetection):
def load_image_and_target(self, item: int):
image_id = self.get_image_id(item)
# load images and annotations
image_name = self.coco.loadImgs([image_id])[0]["file_name"]
# NOTE: Only for object 365
image_name = os.path.join(*image_name.split(os.sep)[-2:])
if self.train:
image_name = os.path.join("images/train", image_name)
else:
image_name = os.path.join("images/val", image_name)
image = self.load_image(image_name)
target = self.coco.loadAnns(self.coco.getAnnIds([image_id]))
target = dict(image_id=image_id, annotations=target)
image, target = self.prepare((image, target))
return image, target
def __getitem__(self, item):
try:
image, target = self.load_image_and_target(item)
except Exception:
# fall back to the next sample if the current one fails to load
item += 1
image, target = self.load_image_and_target(item)
image, target["boxes"], target["labels"] = self.data_augmentation(image, target)
return deepcopy(image), deepcopy(target)