Commit 1345fab2 authored by luopl

Initial commit
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Project related directories
data/
checkpoints/
visualization/
# Default ignored files
/shelf/
/workspace.xml
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/salience_detr_pytorch.iml" filepath="$PROJECT_DIR$/.idea/salience_detr_pytorch.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="GOOGLE" />
<option name="myDocStringFormat" value="Google" />
</component>
</module>
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10-py38
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Salience-DETR
## Paper
`Salience DETR: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement`
- https://arxiv.org/abs/2403.16131
## Model Architecture
Salience DETR adopts a high-performance two-stage pipeline.
The main architectural difference between Salience DETR and mainstream two-stage DETR-like methods lies in the transformer encoder and query refinement.
Given multi-scale features from the backbone, the encoder only updates the queries selected by hierarchical query filtering under salience-guided supervision.
The query refinement module then alleviates semantic misalignment between queries.
<div align=center>
<img src="./docs/The architecture overview of Salience DETR.png"/>
</div>
## Algorithm
The main architectural differences between Salience DETR and mainstream two-stage DETR-like methods lie in the detection transformer encoder and query refinement. The method consists of the following parts:
(1) Salience-guided supervision
Based on the predicted confidence, query filtering updates only the most informative queries, achieving comparable performance with a lower computational burden. Drawing inspiration from Focus DETR, supervision is provided for the queries at every level of the multi-scale features. Instead of constructing discrete {0, 1} labels that only separate foreground from background, a scale-independent salience is constructed as the supervision target to overcome scale bias.
(2) Revisiting query filtering in Focus DETR
Focus DETR introduces an extra branch that predicts foreground confidence through top-down score modulation over the multi-scale features.
(3) Hierarchical query filtering
In general, high-level tokens incur less computational burden while preserving more informative semantics. Therefore, beyond conventional level-wise filtering, a natural motivation is to introduce layer-wise filtering for the multi-scale features. The paper introduces two sets of filtering ratios: for the t-th encoder layer and the l-th feature level, only the top v<sub>t</sub>·w<sub>l</sub> queries undergo attention encoding, while the other queries remain unchanged (a small sketch is given after the figure below).
(4) Cross-level token fusion
To address the semantic misalignment between queries of different levels caused by level-specific filtering ratios, the paper proposes a token fusion module that uses a path-aggregation structure to handle cross-level information interaction. In this module, adjacent tokens are fused by the proposed RepVGGPluXBlock, as shown in the figure.
<div align=center>
<img src="./docs/Cross-level token fusion.png"/>
</div>
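The sketch below illustrates the hierarchical query filtering idea in isolation; it is not the repository implementation. The function name, tensor shapes, and per-level token counts are assumptions for illustration, while the filtering ratios mirror the `level_filter_ratio` and `layer_filter_ratio` values used in the provided configs.
```python
import torch

def select_salient_tokens(salience, level_sizes, level_ratios, layer_ratio):
    """Return indices of the tokens that are updated by attention in one encoder layer.

    salience: (N,) predicted salience scores for all flattened multi-scale tokens
    level_sizes: number of tokens per feature level
    level_ratios[l] * layer_ratio: fraction of tokens kept at level l for this layer
    """
    kept, start = [], 0
    for size, level_ratio in zip(level_sizes, level_ratios):
        scores = salience[start:start + size]
        k = max(1, int(size * level_ratio * layer_ratio))
        kept.append(torch.topk(scores, k).indices + start)
        start += size
    return torch.cat(kept)

# ratios similar to the provided configs; token counts per level are hypothetical
level_ratios = (0.4, 0.8, 1.0, 1.0)            # per feature level
layer_ratios = (1.0, 0.8, 0.6, 0.6, 0.4, 0.2)  # per encoder layer
level_sizes = (100, 50, 25, 13)
salience = torch.rand(sum(level_sizes))
keep_idx = select_salient_tokens(salience, level_sizes, level_ratios, layer_ratios[0])
```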
## Environment Setup
### Docker (Option 1)
The address and steps for pulling the Docker image from [SourceFind](https://www.sourcefind.cn/#/service-details) are provided here, along with the download address for deep learning libraries in the [HPC developer community](https://developer.hpccube.com/tool/).
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10-py38
docker run -it --shm-size=128G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name salience_detr_pytorch <your IMAGE ID> bash # replace <your IMAGE ID> with the ID of the image pulled above; for this image it is ffa1f63239fc
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Dockerfile (Option 2)
Steps for building and running with the Dockerfile:
```
docker build --no-cache -t salience_detr:latest .
docker run -it --shm-size=128G -v /path/your_code_data/:/path/your_code_data/ -v /opt/hyhal:/opt/hyhal:ro --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name salience_detr_pytorch salience_detr bash
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Anaconda (Option 3)
Detailed steps for local setup and compilation are provided here.
The DCU-specific deep learning libraries required by this project can be downloaded from the [HPC developer community](https://developer.hpccube.com/tool/).
```
# DTK driver: dtk23.10
# python:python3.8
# torch: 2.1.0
# torchvision: 0.16.0
conda create -n salience_detr python=3.8
conda activate salience_detr
pip install torch-2.1.0a0+git793d2b5.abi0.dtk2310-cp38-cp38-manylinux2014_x86_64.whl
pip install torchvision-0.16.0+git267eff6.abi0.dtk2310.torch2.1.0-cp38-cp38-manylinux2014_x86_64.whl
```
`Tips: the versions of the DTK driver, python, torch and other DCU-related tools above must strictly match one another.`
Install the remaining dependencies as follows:
```
cd /path/your_code_data/salience_detr_pytorch
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
## Dataset
Please download [COCO 2017](https://cocodataset.org/#home) or prepare your own dataset under data/, organized as follows. You can use tools/visualize_datasets.py to visualize the dataset annotations and verify their correctness.
```
coco/
├── train2017/
├── val2017/
└── annotations/
├── instances_train2017.json
└── instances_val2017.json
```
## Training
The accelerate package natively handles multi-DCU training; use HIP_VISIBLE_DEVICES to specify the DCUs. If not specified, the script trains on all DCUs available on the node.
Modify configs/train_config.py before training.
### Single node, single card
```
HIP_VISIBLE_DEVICES=0 accelerate launch main.py
```
### Single node, multiple cards
```
HIP_VISIBLE_DEVICES=0,1,2,3 accelerate launch main.py
```
## Inference
Salience DETR was trained with ResNet50 and Swin-L backbones under various settings. The corresponding configs and checkpoints for COCO 2017 are provided below.
### 12-epoch training
| Model | Backbone | Download |
| ------------- | ----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_1x.pth) |
| Salience DETR | ConvNeXt-L | [config](configs/salience_detr/salience_detr_convnext_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) |
| Salience DETR | Swin-L<sub>(IN-22K) | [config](configs/salience_detr/salience_detr_swin_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) |
| Salience DETR | FocalNet-L<sub>(IN-22K) | [config](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) |
### 24-epoch training
| Model | Backbone | Download |
| ------------- | ----------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_2x.pth) |
### Single-card inference
Note that --model-config and --checkpoint must correspond to each other.
Inference:
```
# python inference.py --image-dir /path/to/images --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth --show-dir /path/to/dir
python inference.py --image-dir ./images --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth --show-dir results
```
Evaluation/Test:
```
# CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
HIP_VISIBLE_DEVICES=0 accelerate launch test.py --coco-path ./data/coco --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth
```
### Multi-card inference
```
CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
HIP_VISIBLE_DEVICES=0,1,2,3 accelerate launch test.py --coco-path ./data/coco --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py --checkpoint checkpoint/salience_detr_resnet50_800_1333_coco_2x.pth
```
## Results
Original image:
<div align=center>
<img src="./docs/000000000139.jpg"/>
</div>
The inference visualization result is shown below:
<div align=center>
<img src="./docs/000000000139_detr.jpg"/>
</div>
### Accuracy
Inference was performed with four DCU K100 cards.
12-epoch results:

| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: |
| Salience DETR | ResNet50 | 50.0 | 67.7 | 54.2 | 33.3 | 54.4 | 64.4 |
| Salience DETR | ConvNeXt-L | 54.2 | 72.4 | 59.1 | 38.8 | 58.3 | 69.6 |
| Salience DETR | Swin-L<sub>(IN-22K) | 56.5 | 75.0 | 61.5 | 40.2 | 61.2 | 72.8 |
| Salience DETR | FocalNet-L<sub>(IN-22K) | 57.3 | 75.5 | 62.3 | 40.9 | 61.8 | 74.5 |

24-epoch results:

| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: |
| Salience DETR | ResNet50 | 51.2 | 68.9 | 55.7 | 33.9 | 55.5 | 65.6 |
## Application Scenarios
### Algorithm Category
`Object detection`
### Key Application Industries
`Research, manufacturing, healthcare, smart home, education`
## Source Repository and Issue Feedback
- https://developer.hpccube.com/codes/modelzoo/salience_detr_pytorch
## References
- https://github.com/xiuqhou/Salience-DETR
Simplified Chinese | [English](README.md)
**Salience DETR**: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement
===
By [Xiuquan Hou](https://github.com/xiuqhou), [Meiqin Liu](https://scholar.google.com/citations?user=T07OWMkAAAAJ&hl=zh-CN&oi=ao), Senlin Zhang, [Ping Wei](https://scholar.google.com/citations?user=1OQBtdcAAAAJ&hl=zh-CN&oi=ao), [Badong Chen](https://scholar.google.com/citations?user=mq6tPX4AAAAJ&hl=zh-CN&oi=ao).
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/salience-detr-enhancing-detection-transformer-1/object-detection-on-coco-2017-val)](https://paperswithcode.com/sota/object-detection-on-coco-2017-val?p=salience-detr-enhancing-detection-transformer-1)
[![arXiv](https://img.shields.io/badge/arXiv-2403.16131-b31b1b.svg)](https://arxiv.org/abs/2403.16131)
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](https://makeapullrequest.com)
[![GitHub license](https://img.shields.io/github/license/xiuqhou/Salience-DETR.svg?color=blue)](https://github.com/xiuqhou/Salience-DETR/blob/master/LICENSE)
![GitHub stars](https://img.shields.io/github/stars/xiuqhou/Salience-DETR)
![GitHub forks](https://img.shields.io/github/forks/xiuqhou/Salience-DETR)
This repository is the official implementation of the **CVPR 2024** paper Salience DETR (review scores **553**).
## ✨Highlights:
1. We provide an in-depth analysis of the [scale bias and query redundancy](#id_1) issues in two-stage DETR-like methods.
2. We propose a hierarchical filtering mechanism that reduces computational complexity under salience supervision; the proposed supervision can even capture [fine-grained object contours](#id_2) using only bounding-box annotations.
3. Salience DETR achieves **+4.0%**, **+0.2%** and **+4.4%** AP improvements on three challenging defect detection tasks, and comparable accuracy on COCO 2017 with only about **70%** FLOPs.
<div align="center">
<img src="images/Salience-DETR.svg">
</div>
<details>
<summary>🔎Visualization</summary>
- Queries selected by the two-stage pipeline of existing DETR methods are usually **redundant** and exhibit **scale bias** (left).
- For both defect detection and object detection tasks, **salience supervision** helps capture **object contours** using only bounding-box annotations (right).
<h3 align="center">
<a id="id_1"><img src="images/query_visualization.svg" width="335"></a>
<a id="id_2"><img src="images/salience_visualization.svg" width="462"></a>
</h3>
</details>
## News
`2024-04-19`: With [FocalNet-Large](https://github.com/microsoft/FocalNet) as the backbone, Salience DETR achieves **56.8 AP** on COCO val2017; the [**config**](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) have been updated!
`2024-04-08`: Released the [**config**](configs/salience_detr/salience_detr_convnext_l_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) of Salience DETR with a ConvNeXt-L backbone trained for 12 epochs on COCO 2017.
`2024-04-01`: With a Swin-L backbone, Salience DETR achieves **56.5** AP on COCO 2017 (12-epoch training). The [**config**](configs/salience_detr/salience_detr_swin_l_800_1333.py) and [**weights**](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) have been released.
`2024-03-26`: We released the code of Salience DETR and pretrained weights on COCO 2017 with a ResNet50 backbone.
`2024-02-29`: Salience DETR was accepted by CVPR 2024, and the code will be released in this repository. Stay tuned!
## Model Zoo
After acceptance at **CVPR 2024**, we retrained **Salience DETR** with **ResNet50** and **Swin-L** backbones under various settings. The corresponding configs and weights for the [**COCO 2017**](https://cocodataset.org/#home) dataset are provided below.
### 12-epoch training
| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L | Download |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | 50.0 | 67.7 | 54.2 | 33.3 | 54.4 | 64.4 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_1x.pth) |
| Salience DETR | ConvNeXt-L | 54.2 | 72.4 | 59.1 | 38.8 | 58.3 | 69.6 | [config](configs/salience_detr/salience_detr_convnext_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_convnext_l_800_1333_coco_1x.pth) |
| Salience DETR | Swin-L<sub>(IN-22K) | 56.5 | 75.0 | 61.5 | 40.2 | 61.2 | 72.8 | [config](configs/salience_detr/salience_detr_swin_l_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_swin_l_800_1333_coco_1x.pth) |
| Salience DETR | FocalNet-L<sub>(IN-22K) | 57.3 | 75.5 | 62.3 | 40.9 | 61.8 | 74.5 | [config](configs/salience_detr/salience_detr_focalnet_large_lrf_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_focalnet_large_lrf_800_1333_coco_1x.pth) |
### 24-epoch training
| Model | Backbone | AP | AP<sub>50 | AP<sub>75 | AP<sub>S | AP<sub>M | AP<sub>L | Download |
| ------------- | ----------------------- | :---: | :-------: | :-------: | :------: | :------: | :------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Salience DETR | ResNet50 | 51.2 | 68.9 | 55.7 | 33.9 | 55.5 | 65.6 | [config](configs/salience_detr/salience_detr_resnet50_800_1333.py) / [weights](https://github.com/xiuqhou/Salience-DETR/releases/download/v1.0.0/salience_detr_resnet50_800_1333_coco_2x.pth) |
## 🔧Installation
1. Clone this repository:
```shell
git clone https://github.com/xiuqhou/Salience-DETR.git
cd Salience-DETR/
```
2. Create and activate a conda environment:
```shell
conda create -n salience_detr python=3.8
conda activate salience_detr
```
3. Install pytorch following the official instructions at [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/). The code requires `python>=3.8, torch>=1.11.0, torchvision>=0.12.0`.
```shell
conda install pytorch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 cudatoolkit=11.3 -c pytorch
```
4. Install the other dependencies:
```shell
conda install --file requirements.txt -c conda-forge
```
You do not need to compile the CUDA operators manually; they are compiled and loaded automatically the first time the code runs.
## 📁Prepare Datasets
Please download the [COCO 2017](https://cocodataset.org/) dataset or prepare your own dataset in the following format, and place it under the `data/` directory. You can use [`tools/visualize_datasets.py`](tools/visualize_datasets.py) to visualize the dataset and verify its correctness.
```shell
coco/
├── train2017/
├── val2017/
└── annotations/
├── instances_train2017.json
└── instances_val2017.json
```
<details>
<summary>Visualization example</summary>
```shell
python tools/visualize_datasets.py \
--coco-img data/coco/val2017 \
--coco-ann data/coco/annotations/instances_val2017.json \
--show-dir visualize_dataset/
```
</details>
## 📚︎Train a Model
We use the `accelerate` package to natively handle multi-GPU training; you only need to set `CUDA_VISIBLE_DEVICES` to specify the GPU/GPUs to train on. If not specified, the script automatically uses all GPUs available on the machine.
```shell
CUDA_VISIBLE_DEVICES=0 accelerate launch main.py    # train with 1 GPU
CUDA_VISIBLE_DEVICES=0,1 accelerate launch main.py  # train with 2 GPUs
```
Please adjust the parameters in [`configs/train_config.py`](configs/train_config.py) before training.
<details>
<summary>Example training config</summary>
```python
from torch import optim
from datasets.coco import CocoDetection
from transforms import presets
from optimizer import param_dict
# Commonly changed training configurations
num_epochs = 12  # number of training epochs
batch_size = 2  # total_batch_size = #GPUs x batch_size
num_workers = 4  # number of workers for the pytorch DataLoader
pin_memory = True  # whether to use pin_memory in the pytorch DataLoader
print_freq = 50  # frequency of log printing
starting_epoch = 0
max_norm = 0.1  # gradient clipping norm
output_dir = None  # path to save checkpoints; None defaults to checkpoints/{model_name}
find_unused_parameters = False  # useful for debugging distributed training
# define the dataset used for training
coco_path = "data/coco"  # path to the dataset
train_transform = presets.detr  # choose a data augmentation from transforms/presets.py
train_dataset = CocoDetection(
    img_folder=f"{coco_path}/train2017",
    ann_file=f"{coco_path}/annotations/instances_train2017.json",
    transforms=train_transform,
    train=True,
)
test_dataset = CocoDetection(
    img_folder=f"{coco_path}/val2017",
    ann_file=f"{coco_path}/annotations/instances_val2017.json",
    transforms=None,  # the eval_transform is integrated into the model forward pass
)
# model config file
model_path = "configs/salience_detr/salience_detr_resnet50_800_1333.py"
# specify a checkpoint folder to resume training, or a ".pth" file to finetune, for example:
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50/best_ap.pth
resume_from_checkpoint = None
learning_rate = 1e-4  # initial learning rate
optimizer = optim.AdamW(lr=learning_rate, weight_decay=1e-4, betas=(0.9, 0.999))
lr_scheduler = optim.lr_scheduler.MultiStepLR(milestones=[10], gamma=0.1)
# define parameter groups with different learning rates
param_dicts = param_dict.finetune_backbone_and_linear_projection(lr=learning_rate)
```
</details>
## 📈Evaluation and Test
To evaluate a model with one or more GPUs, specify `CUDA_VISIBLE_DEVICES`, `dataset`, `model`, and `checkpoint`.
```shell
CUDA_VISIBLE_DEVICES=<gpu_ids> accelerate launch test.py --coco-path /path/to/coco --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth
```
The following parameters are optional; see [test.py](test.py) for more.
- `--show-dir`: path of the directory to save visualization results.
- `--result`: path of the file to save detection results; it must end with `.json`.
<details>
<summary>Example of model evaluation</summary>
For example, to evaluate `salience_detr_resnet50_800_1333` on `coco` using 8 GPUs, save the detection results to `result.json`, and save the visualizations to the `visualization/` directory, run:
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 accelerate launch test.py \
--coco-path data/coco \
--model-config configs/salience_detr/salience_detr_resnet50_800_1333.py \
--checkpoint checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-21_29_56/best_ap.pth \
--result result.json \
--show-dir visualization/
```
</details>
<details>
<summary>Evaluate a saved json result file</summary>
To evaluate a detection result file saved as json in the step above, specify `--result` but do not specify `--model`.
```shell
CUDA_VISIBLE_DEVICES=0 accelerate launch test.py --coco-path /path/to/coco --result /path/to/result.json
```
The following parameter is optional; see [test.py](test.py) for the full list.
- `--show-dir`: path of the directory to save visualization results.
</details>
## ▶︎Inference
Use [`inference.py`](inference.py) to run inference on images; use `--image-dir` to specify the directory containing the images.
```shell
python inference.py --image-dir /path/to/images --model-config /path/to/model.py --checkpoint /path/to/checkpoint.pth --show-dir /path/to/dir
```
<details>
<summary>Example of image inference</summary>
For example, run the following command to run inference on images under the `images/` directory and save the visualizations to the `visualization/` directory.
```shell
python inference.py \
--image-dir images/ \
--model-config configs/salience_detr/salience_detr_resnet50_800_1333.py \
--checkpoint checkpoint.pth \
--show-dir visualization/
```
</details>
Alternatively, use [`inference.ipynb`](inference.ipynb) to run inference and visualization on a single image.
## 🔁Benchmark Speed, Memory, and Parameters
Use `tools/benchmark_model.py` to benchmark the inference speed, memory usage, and number of parameters of a model.
```shell
python tools/benchmark_model.py --model-config configs/salience_detr/salience_detr_resnet50_800_1333.py
```
## 📍Train on Custom Datasets
Before training on your own dataset, perform the following steps:
1. Prepare your dataset in COCO annotation format and set the `coco_path` parameter in [`configs/train_config.py`](configs/train_config.py) to the dataset path accordingly.
2. Open the model config under [`configs/salience_detr`](configs/salience_detr) and set `num_classes` to at least the `maximum category id + 1` of your dataset (a small sketch for computing this is given after this list). Taking COCO as an example, its `instances_val2017.json` annotation file shows a maximum category id of `90`, so we set `num_classes = 91`:
```json
{"supercategory": "indoor","id": 90,"name": "toothbrush"}
```
If you are not sure what `num_classes` should be, you can simply set it to a sufficiently large number. (For example, `num_classes = 92` or `num_classes = 365` both work for the COCO dataset.)
3. Adjust the other model parameters under [`configs/salience_detr`](configs/salience_detr/) and the training parameters in [`train_config.py`](train_config.py) as needed.
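If you prefer to compute this value programmatically, the following is a small, hypothetical helper (not a script shipped with the repository) that reads a COCO-style annotation file with `pycocotools` and returns the maximum category id plus one:
```python
# Hypothetical helper: num_classes must be at least max category id + 1.
# pycocotools is already required for COCO-style training.
from pycocotools.coco import COCO

def infer_num_classes(ann_file):
    coco = COCO(ann_file)
    return max(coco.getCatIds()) + 1

print(infer_num_classes("data/coco/annotations/instances_val2017.json"))  # 91 for COCO 2017
```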
## 📥Export an ONNX Model
For advanced users who want to deploy our model, we provide a script to export an ONNX file.
```shell
python tools/pytorch2onnx.py \
--model-config /path/to/model.py \
--checkpoint /path/to/checkpoint.pth \
--save-file /path/to/save.onnx \
--simplify \  # use onnxsim to simplify the exported ONNX file
--verify  # verify the error between the exported ONNX model and the original pytorch model
```
Refer to the `ONNXDetector` class in [`tools/pytorch2onnx.py`](tools/pytorch2onnx.py) for ONNX model inference.
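As a rough reference, the snippet below shows what loading the exported file with `onnxruntime` might look like; the input layout, preprocessing, and output shapes are assumptions for illustration, and the repository's actual interface is the `ONNXDetector` class mentioned above.
```python
# Minimal onnxruntime sketch (assumed NCHW float32 input; real usage should apply
# the same resizing/normalization as the pytorch model).
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("/path/to/save.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
image = np.random.rand(1, 3, 800, 1333).astype(np.float32)  # dummy image tensor
outputs = session.run(None, {input_name: image})
print([o.shape for o in outputs])
```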
## Citation
If our work is helpful for your research, please consider citing our paper or giving this repository a star ⭐.
```bibtex
@InProceedings{Hou_2024_CVPR,
author = {Hou, Xiuquan and Liu, Meiqin and Zhang, Senlin and Wei, Ping and Chen, Badong},
title = {Salience DETR: Enhancing Detection Transformer with Hierarchical Salience Filtering Refinement},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2024},
pages = {17574-17583}
}
```
from torch import nn
from models.backbones.convnext import ConvNeXtBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = ConvNeXtBackbone("conv_l", return_indices=(1, 2, 3), freeze_indices=(0,))
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
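# fraction of tokens kept per feature level / per encoder layer for hierarchical query filtering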
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
from models.backbones.focalnet import FocalNetBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(
embed_dim // 2, temperature=10000, normalize=True, offset=-0.5
)
backbone = FocalNetBackbone(
"focalnet_large_lrf_fl4", return_indices=(1, 2, 3), freeze_indices=(0,)
)
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(
cost_class=2, cost_bbox=5, cost_giou=2, focal_alpha=0.25, focal_gamma=2.0
)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(
num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0
)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
# from torchvision.ops import FrozenBatchNorm2d
from models.backbones.resnet import ResNetBackbone
from models.bricks.misc import FrozenBatchNorm2d
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = ResNetBackbone(
"resnet50", norm_layer=FrozenBatchNorm2d, return_indices=(1, 2, 3), freeze_indices=(0,)
)
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import nn
from models.backbones.swin import SwinTransformerBackbone
from models.bricks.position_encoding import PositionEmbeddingSine
from models.bricks.post_process import PostProcess
from models.bricks.salience_transformer import (
SalienceTransformer,
SalienceTransformerDecoder,
SalienceTransformerDecoderLayer,
SalienceTransformerEncoder,
SalienceTransformerEncoderLayer,
)
from models.bricks.set_criterion import HybridSetCriterion
from models.detectors.salience_detr import SalienceCriterion, SalienceDETR
from models.matcher.hungarian_matcher import HungarianMatcher
from models.necks.channel_mapper import ChannelMapper
from models.necks.repnet import RepVGGPluXNetwork
# mostly changed parameters
embed_dim = 256
num_classes = 91
num_queries = 900
num_feature_levels = 4
transformer_enc_layers = 6
transformer_dec_layers = 6
num_heads = 8
dim_feedforward = 2048
# instantiate model components
position_embedding = PositionEmbeddingSine(embed_dim // 2, temperature=10000, normalize=True, offset=-0.5)
backbone = SwinTransformerBackbone("swin_l", return_indices=(1, 2, 3), freeze_indices=(0,))
neck = ChannelMapper(
in_channels=backbone.num_channels,
out_channels=embed_dim,
num_outs=num_feature_levels,
)
transformer = SalienceTransformer(
encoder=SalienceTransformerEncoder(
encoder_layer=SalienceTransformerEncoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_enc_layers,
),
neck=RepVGGPluXNetwork(
in_channels_list=neck.num_channels,
out_channels_list=neck.num_channels,
norm_layer=nn.BatchNorm2d,
activation=nn.SiLU,
groups=4,
),
decoder=SalienceTransformerDecoder(
decoder_layer=SalienceTransformerDecoderLayer(
embed_dim=embed_dim,
n_heads=num_heads,
dropout=0.0,
activation=nn.ReLU(inplace=True),
n_levels=num_feature_levels,
n_points=4,
d_ffn=dim_feedforward,
),
num_layers=transformer_dec_layers,
num_classes=num_classes,
),
num_classes=num_classes,
num_feature_levels=num_feature_levels,
two_stage_num_proposals=num_queries,
level_filter_ratio=(0.4, 0.8, 1.0, 1.0),
layer_filter_ratio=(1.0, 0.8, 0.6, 0.6, 0.4, 0.2),
)
matcher = HungarianMatcher(cost_class=2, cost_bbox=5, cost_giou=2)
weight_dict = {"loss_class": 1, "loss_bbox": 5, "loss_giou": 2}
weight_dict.update({"loss_class_dn": 1, "loss_bbox_dn": 5, "loss_giou_dn": 2})
weight_dict.update({
k + f"_{i}": v
for i in range(transformer_dec_layers - 1)
for k, v in weight_dict.items()
})
weight_dict.update({"loss_class_enc": 1, "loss_bbox_enc": 5, "loss_giou_enc": 2})
weight_dict.update({"loss_salience": 2})
criterion = HybridSetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, alpha=0.25, gamma=2.0)
foreground_criterion = SalienceCriterion(noise_scale=0.0, alpha=0.25, gamma=2.0)
postprocessor = PostProcess(select_box_nums_for_evaluation=300)
# combine above components to instantiate the model
model = SalienceDETR(
backbone=backbone,
neck=neck,
position_embedding=position_embedding,
transformer=transformer,
criterion=criterion,
focus_criterion=foreground_criterion,
postprocessor=postprocessor,
num_classes=num_classes,
num_queries=num_queries,
aux_loss=True,
min_size=800,
max_size=1333,
)
from torch import optim
from datasets.coco import CocoDetection
from transforms import presets
from optimizer import param_dict
# Commonly changed training configurations
num_epochs = 12 # train epochs
batch_size = 2 # total_batch_size = #GPU x batch_size
num_workers = 4 # workers for pytorch DataLoader
pin_memory = True # whether pin_memory for pytorch DataLoader
print_freq = 50 # frequency to print logs
starting_epoch = 0
max_norm = 0.1 # clip gradient norm
output_dir = None # path to save checkpoints, default for None: checkpoints/{model_name}
find_unused_parameters = False # useful for debugging distributed training
# define dataset for train
coco_path = "data/coco" # /PATH/TO/YOUR/COCODIR
train_transform = presets.detr # see transforms/presets to choose a transform
train_dataset = CocoDetection(
img_folder=f"{coco_path}/train2017",
ann_file=f"{coco_path}/annotations/instances_train2017.json",
transforms=train_transform,
train=True,
)
test_dataset = CocoDetection(
img_folder=f"{coco_path}/val2017",
ann_file=f"{coco_path}/annotations/instances_val2017.json",
transforms=None, # the eval_transform is integrated in the model
)
# model config to train
model_path = "configs/salience_detr/salience_detr_resnet50_800_1333.py"
# specify a checkpoint folder to resume, or a pretrained ".pth" to finetune, for example:
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50
# checkpoints/salience_detr_resnet50_800_1333/train/2024-03-22-09_38_50/best_ap.pth
resume_from_checkpoint = None
learning_rate = 1e-4 # initial learning rate
optimizer = optim.AdamW(lr=learning_rate, weight_decay=1e-4, betas=(0.9, 0.999))
lr_scheduler = optim.lr_scheduler.MultiStepLR(milestones=[10], gamma=0.1)
# This define parameter groups with different learning rate
param_dicts = param_dict.finetune_backbone_and_linear_projection(lr=learning_rate)
import os
import albumentations as A
import cv2
import numpy as np
import torchvision
from transforms import v2 as T
from transforms.convert_coco_polys_to_mask import ConvertCocoPolysToMask
from util import datapoints
from util.misc import deepcopy
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(
self,
img_folder,
ann_file,
transforms=None,
train=False,
):
super(CocoDetection, self).__init__(img_folder, ann_file)
self.prepare = ConvertCocoPolysToMask()
self._transforms = transforms
self._transforms = self.update_dataset(self._transforms)
self.train = train
if train:
self._coco_remove_images_without_annotations()
def update_dataset(self, transform):
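# recursively walk composed transforms and give any transform that defines
# update_dataset a reference to this dataset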
if isinstance(transform, (T.Compose, A.Compose)):
processed_transforms = []
for trans in transform.transforms:
trans = self.update_dataset(trans)
processed_transforms.append(trans)
return type(transform)(processed_transforms)
if hasattr(transform, "update_dataset"):
transform.update_dataset(self)
return transform
def load_image(self, image_name):
# after comparing the speed of PIL, torchvision and cv2,
# cv2 is chosen as the default backend to load images,
# uncomment the following code to switch among them.
# image = Image.open(os.path.join(self.root, path)).convert('RGB')
# image = torchvision.io.read_image(os.path.join(self.root, path))
# To avoid deadlock between DataLoader and OpenCV
cv2.setNumThreads(0)
cv2.ocl.setUseOpenCL(False)
# image = cv2.imread(os.path.join(self.root, image_name))
image = cv2.imdecode(np.fromfile(os.path.join(self.root, image_name), dtype=np.uint8), -1)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).transpose(2, 0, 1)
return image
def get_image_id(self, item: int):
if hasattr(self, "indices"):
item = self.indices[item]
image_id = self.ids[item]
return image_id
def load_image_and_target(self, item: int):
image_id = self.get_image_id(item)
# load images and annotations
image_name = self.coco.loadImgs([image_id])[0]["file_name"]
image = self.load_image(image_name)
target = self.coco.loadAnns(self.coco.getAnnIds([image_id]))
target = dict(image_id=image_id, annotations=target)
image, target = self.prepare((image, target))
return image, target
def data_augmentation(self, image, target):
# preprocess
image = datapoints.Image(image)
bounding_boxes = datapoints.BoundingBox(
target["boxes"],
format=datapoints.BoundingBoxFormat.XYXY,
spatial_size=image.shape[-2:],
)
labels = target["labels"]
if self._transforms is not None:
image, bounding_boxes, labels = self._transforms(image, bounding_boxes, labels)
return image.data, bounding_boxes.data, labels
def __getitem__(self, item):
image, target = self.load_image_and_target(item)
image, target["boxes"], target["labels"] = self.data_augmentation(image, target)
return deepcopy(image), deepcopy(target)
def __len__(self):
return len(self.indices) if hasattr(self, "indices") else len(self.ids)
def _coco_remove_images_without_annotations(self, cat_list=None):
def _has_only_empty_bbox(anno):
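# a box counts as empty when its width or height is at most 1 pixel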
return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno)
def _count_visible_keypoints(anno):
return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno)
min_keypoints_per_image = 10
def _has_valid_annotation(anno):
# if it's empty, there is no annotation
if len(anno) == 0:
return False
# if all boxes have close to zero area, there is no annotation
if _has_only_empty_bbox(anno):
return False
# keypoint tasks have slightly different criteria for considering
# whether an annotation is valid
if "keypoints" not in anno[0]:
return True
# for keypoint detection tasks, only consider valid images those
# containing at least min_keypoints_per_image
if _count_visible_keypoints(anno) >= min_keypoints_per_image:
return True
return False
ids = []
for ds_idx, img_id in enumerate(self.ids):
ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None)
anno = self.coco.loadAnns(ann_ids)
if cat_list:
anno = [obj for obj in anno if obj["category_id"] in cat_list]
if _has_valid_annotation(anno):
ids.append(ds_idx)
self.indices = ids
class Object365Detection(CocoDetection):
def load_image_and_target(self, item: int):
image_id = self.get_image_id(item)
# load images and annotations
image_name = self.coco.loadImgs([image_id])[0]["file_name"]
# NOTE: Only for object 365
image_name = os.path.join(*image_name.split(os.sep)[-2:])
if self.train:
image_name = os.path.join("images/train", image_name)
else:
image_name = os.path.join("images/val", image_name)
image = self.load_image(image_name)
target = self.coco.loadAnns(self.coco.getAnnIds([image_id]))
target = dict(image_id=image_id, annotations=target)
image, target = self.prepare((image, target))
return image, target
def __getitem__(self, item):
try:
image, target = self.load_image_and_target(item)
except Exception:
# fall back to the next sample if the current one fails to load
item += 1
image, target = self.load_image_and_target(item)
image, target["boxes"], target["labels"] = self.data_augmentation(image, target)
return deepcopy(image), deepcopy(target)