Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dcnv3
Commits
bdd98bcb
Unverified
Commit
bdd98bcb
authored
Apr 14, 2023
by
Zhe Chen
Committed by
GitHub
Apr 14, 2023
Browse files
Release DINO model with InternImage-T and -L (#99)
parent
1e6e309d
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
2742 additions
and
1 deletion
+2742
-1
detection/configs/coco/README.md
detection/configs/coco/README.md
+9
-0
detection/configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
...oco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
+178
-0
detection/configs/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
...s/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
+177
-0
detection/configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
...s/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
+177
-0
detection/mmdet_custom/models/__init__.py
detection/mmdet_custom/models/__init__.py
+4
-1
detection/mmdet_custom/models/dense_heads/__init__.py
detection/mmdet_custom/models/dense_heads/__init__.py
+11
-0
detection/mmdet_custom/models/dense_heads/deformable_detr_head.py
...n/mmdet_custom/models/dense_heads/deformable_detr_head.py
+332
-0
detection/mmdet_custom/models/dense_heads/detr_head.py
detection/mmdet_custom/models/dense_heads/detr_head.py
+954
-0
detection/mmdet_custom/models/dense_heads/dino_head.py
detection/mmdet_custom/models/dense_heads/dino_head.py
+364
-0
detection/mmdet_custom/models/detectors/__init__.py
detection/mmdet_custom/models/detectors/__init__.py
+9
-0
detection/mmdet_custom/models/detectors/dino.py
detection/mmdet_custom/models/detectors/dino.py
+10
-0
detection/mmdet_custom/models/utils/__init__.py
detection/mmdet_custom/models/utils/__init__.py
+5
-0
detection/mmdet_custom/models/utils/query_denoising.py
detection/mmdet_custom/models/utils/query_denoising.py
+234
-0
detection/mmdet_custom/models/utils/transformer.py
detection/mmdet_custom/models/utils/transformer.py
+278
-0
No files found.
detection/configs/coco/README.md
View file @
bdd98bcb
...
...
@@ -41,3 +41,12 @@ Based on community feedback, in 2017 the training/validation split was changed f
-
Training speed is measured with A100 GPUs using current code and may be faster than the speed in logs.
-
Some logs are our recent newly trained ones. There might be slight differences between the results in logs and our paper.
-
Please set
`with_cp=True`
to save memory if you meet
`out-of-memory`
issues.
### DINO + InternImage

|   backbone    |     lr type      |   pretrain   | schd | box mAP | train time | #param | Config | Download |
| :-----------: | :--------------: | :----------: | :--: | :-----: | :--------: | :----: | :----: | :------: |
| InternImage-T |  layer-wise lr   | ImageNet-1K  |  1x  |  53.9   |    9.5h    |  49M   | [config](./dino_4scale_internimage_t_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_t_1x_coco.json) |
| InternImage-L |  layer-wise lr   | ImageNet-22K |  1x  |  57.5   |    18h     |  241M  | [config](./dino_4scale_internimage_l_1x_coco_layer_wise_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_layer_wise_lr.log.json) |
| InternImage-L | 0.1x backbone lr | ImageNet-22K |  1x  |  57.6   |    18h     |  241M  | [config](./dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py) | [ckpt](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.pth) \| [log](https://huggingface.co/OpenGVLab/InternImage/resolve/main/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.log.json) |
detection/configs/coco/dino_4scale_internimage_l_1x_coco_0.1x_backbone_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-L on COCO, 1x schedule, backbone trained at 0.1x lr.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[320, 640, 1280],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: backbone parameters are trained with 0.1x the base lr.
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.05,
    paramwise_cfg=dict(custom_keys={
        'backbone': dict(lr_mult=0.1),
    }))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/configs/coco/dino_4scale_internimage_l_1x_coco_layer_wise_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-L on COCO, 1x schedule, layer-wise lr decay.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_l_22k_192to384.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=160,
        depths=[5, 5, 22, 5],
        groups=[10, 20, 40, 80],
        mlp_ratio=4.,
        drop_path_rate=0.4,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=2.0,
        post_norm=True,
        with_cp=False,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[320, 640, 1280],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=300))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: per-layer lr decay over the 37 backbone layers (0.90 per layer).
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=37, layer_decay_rate=0.90, depths=[5, 5, 22, 5]))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/configs/coco/dino_4scale_internimage_t_1x_coco_layer_wise_lr.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# DINO + InternImage-T on COCO, 1x schedule, layer-wise lr decay.
_base_ = [
    '../_base_/datasets/coco_detection.py',
    '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_1x.py',
]
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_t_1k_224.pth'
model = dict(
    type='DINO',
    backbone=dict(
        type='InternImage',
        core_op='DCNv3',
        channels=64,
        depths=[4, 4, 18, 4],
        groups=[4, 8, 16, 32],
        mlp_ratio=4.,
        drop_path_rate=0.2,
        norm_layer='LN',
        layer_scale=1.0,
        offset_scale=1.0,
        post_norm=False,
        with_cp=True,
        out_indices=(1, 2, 3),
        init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
    neck=dict(
        type='ChannelMapper',
        in_channels=[128, 256, 512],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=4),
    bbox_head=dict(
        type='DINOHead',
        num_query=900,
        num_classes=80,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='DinoTransformer',
            two_stage_num_proposals=900,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,  # 0.1 for DeformDETR
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            dropout=0.0),
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    # training and testing settings
    train_cfg=dict(
        assigner=dict(
            type='HungarianAssigner',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
            iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
    test_cfg=dict(max_per_img=100))
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
# from the default setting in mmdet.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            [
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    keep_ratio=True)
            ],
            [
                dict(
                    type='Resize',
                    img_scale=[(400, 4200), (500, 4200), (600, 4200)],
                    multiscale_mode='value',
                    keep_ratio=True),
                dict(
                    type='RandomCrop',
                    crop_type='absolute_range',
                    crop_size=(384, 600),
                    allow_negative_crop=False),
                dict(
                    type='Resize',
                    img_scale=[(480, 1333), (512, 1333), (544, 1333),
                               (576, 1333), (608, 1333), (640, 1333),
                               (672, 1333), (704, 1333), (736, 1333),
                               (768, 1333), (800, 1333)],
                    multiscale_mode='value',
                    override=True,
                    keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# By default, models are trained on 8 GPUs with 2 images per GPU
data = dict(samples_per_gpu=2, train=dict(pipeline=train_pipeline))
# optimizer: per-layer lr decay over the 30 backbone layers (0.9 per layer).
optimizer = dict(
    _delete_=True,
    type='AdamW',
    lr=0.0001,
    weight_decay=0.0001,
    constructor='CustomLayerDecayOptimizerConstructor',
    paramwise_cfg=dict(
        num_layers=30, layer_decay_rate=0.9, depths=[4, 4, 18, 4]))
optimizer_config = dict(
    _delete_=True, grad_clip=dict(max_norm=0.1, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[11])
evaluation = dict(save_best='auto')
checkpoint_config = dict(
    interval=1,
    max_keep_ckpts=3,
    save_last=True,
)
detection/mmdet_custom/models/__init__.py
View file @
bdd98bcb
...
...
@@ -4,4 +4,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# Re-export all custom model components so mmdet's registry picks them up
# when `mmdet_custom.models` is imported.
from .backbones import *  # noqa: F401,F403
from .dense_heads import *  # noqa: F401,F403
from .detectors import *  # noqa: F401,F403
from .utils import *  # noqa: F401,F403
detection/mmdet_custom/models/dense_heads/__init__.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from .deformable_detr_head import DeformableDETRHead
from .detr_head import DETRHead
from .dino_head import DINOHead

__all__ = ['DeformableDETRHead', 'DETRHead', 'DINOHead']
detection/mmdet_custom/models/dense_heads/deformable_detr_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
copy
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
Linear
,
bias_init_with_prob
,
constant_init
from
mmcv.runner
import
force_fp32
from
mmdet.core
import
multi_apply
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models.builder
import
HEADS
from
.detr_head
import
DETRHead
@HEADS.register_module(force=True)
class DeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
    End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 use_2fc_cls_branch=False,
                 **kwargs):
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        # When True, the classification branch built in `_init_layers` is a
        # Linear -> LayerNorm -> GELU -> Linear stack instead of one Linear.
        self.use_2fc_cls_branch = use_2fc_cls_branch
        if self.as_two_stage:
            # Propagate the flag into the transformer config before the base
            # class builds the transformer from it.
            transformer['as_two_stage'] = self.as_two_stage

        super(DeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)

    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        if not self.use_2fc_cls_branch:
            fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        else:
            # Wider two-layer classification head (hidden size 1.5x embed_dims).
            fc_cls = nn.Sequential(*[
                Linear(self.embed_dims, int(self.embed_dims * 1.5)),
                nn.LayerNorm(int(self.embed_dims * 1.5)),
                nn.GELU(),
                Linear(int(self.embed_dims * 1.5), self.cls_out_channels),
            ])
            # Mimic the `out_features` attribute of a plain Linear so callers
            # that inspect it keep working.
            fc_cls.out_features = self.cls_out_channels

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            # Independent deep copies so each decoder layer trains its own
            # branch when box refinement is enabled.
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            # Without refinement, every decoder layer shares the same modules.
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            # 2x embed_dims: query embedding and query positional part packed
            # in one table, split later by the transformer.
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)

    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            if not self.use_2fc_cls_branch:
                # Focal-loss style prior-probability bias init; skipped for
                # the Sequential variant (no single `.bias` to set).
                for m in self.cls_branches:
                    nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)

    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should includes background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on encode \
                feature map, has shape (N, h*w, num_class). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
            enc_outputs_coord (Tensor): The proposal generate from the \
                encode feature map, has shape (N, h*w, 4). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        # Padding mask: 1 on padded pixels, 0 on valid image area.
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            # Downsample the pixel mask to each feature level's resolution.
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            # Predictions are offsets in logit space relative to the
            # (un-sigmoided) reference points.
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        if self.as_two_stage:
            return outputs_classes, outputs_coords, \
                enc_outputs_class, \
                enc_outputs_coord.sigmoid()
        else:
            return outputs_classes, outputs_coords, \
                None, None

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_cls_scores,
             enc_bbox_preds,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """"Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        # Replicate targets per decoder layer so each layer is supervised.
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # Encoder proposals are class-agnostic: all gt labels collapse
            # to class 0.
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict

    @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
    def get_bboxes(self,
                   all_cls_scores,
                   all_bbox_preds,
                   enc_cls_scores,
                   enc_bbox_preds,
                   img_metas,
                   rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map , has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
                The first item is an (n, 5) tensor, where the first 4 columns \
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
                5-th column is a score between 0 and 1. The second item is a \
                (n,) tensor where each item is the predicted class label of \
                the corresponding box.
        """
        # Only the last decoder layer's predictions are used at test time.
        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
detection/mmdet_custom/models/dense_heads/detr_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Conv2d, Linear, build_activation_layer
from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding
from mmcv.runner import force_fp32
from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        build_assigner, build_sampler, multi_apply,
                        reduce_mean)
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead
from mmdet.models.utils import build_transformer
@
HEADS
.
register_module
(
force
=
True
)
class
DETRHead
(
AnchorFreeHead
):
"""Implements the DETR transformer head.
See `paper: End-to-End Object Detection with Transformers
<https://arxiv.org/pdf/2005.12872>`_ for details.
Args:
num_classes (int): Number of categories excluding the background.
in_channels (int): Number of channels in the input feature map.
num_query (int): Number of query in Transformer.
num_reg_fcs (int, optional): Number of fully-connected layers used in
`FFN`, which is then used for the regression head. Default 2.
transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
Default: None.
sync_cls_avg_factor (bool): Whether to sync the avg_factor of
all ranks. Default to False.
positional_encoding (obj:`mmcv.ConfigDict`|dict):
Config for position encoding.
loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
classification loss. Default `CrossEntropyLoss`.
loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
regression loss. Default `L1Loss`.
loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
regression iou loss. Default `GIoULoss`.
tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
transformer head.
test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
transformer head.
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
    # state_dict layout version of this head; `_load_from_state_dict`
    # migrates checkpoints saved with version < 2 (legacy attention/FFN
    # key names) to the current layout.
    _version = 2
def
__init__
(
self
,
num_classes
,
in_channels
,
num_query
=
100
,
num_reg_fcs
=
2
,
transformer
=
None
,
sync_cls_avg_factor
=
False
,
positional_encoding
=
dict
(
type
=
'SinePositionalEncoding'
,
num_feats
=
128
,
normalize
=
True
),
loss_cls
=
dict
(
type
=
'CrossEntropyLoss'
,
bg_cls_weight
=
0.1
,
use_sigmoid
=
False
,
loss_weight
=
1.0
,
class_weight
=
1.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
5.0
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
2.0
),
train_cfg
=
dict
(
assigner
=
dict
(
type
=
'HungarianAssigner'
,
cls_cost
=
dict
(
type
=
'ClassificationCost'
,
weight
=
1.
),
reg_cost
=
dict
(
type
=
'BBoxL1Cost'
,
weight
=
5.0
),
iou_cost
=
dict
(
type
=
'IoUCost'
,
iou_mode
=
'giou'
,
weight
=
2.0
))),
test_cfg
=
dict
(
max_per_img
=
100
),
init_cfg
=
None
,
**
kwargs
):
# NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
# since it brings inconvenience when the initialization of
# `AnchorFreeHead` is called.
super
(
AnchorFreeHead
,
self
).
__init__
(
init_cfg
)
self
.
bg_cls_weight
=
0
self
.
sync_cls_avg_factor
=
sync_cls_avg_factor
class_weight
=
loss_cls
.
get
(
'class_weight'
,
None
)
if
class_weight
is
not
None
and
(
self
.
__class__
is
DETRHead
):
# assert isinstance(class_weight, float), 'Expected ' \
# 'class_weight to have type float. Found ' \
# f'{type(class_weight)}.'
# NOTE following the official DETR rep0, bg_cls_weight means
# relative classification weight of the no-object class.
bg_cls_weight
=
loss_cls
.
get
(
'bg_cls_weight'
,
class_weight
)
assert
isinstance
(
bg_cls_weight
,
float
),
'Expected '
\
'bg_cls_weight to have type float. Found '
\
f
'
{
type
(
bg_cls_weight
)
}
.'
if
isinstance
(
class_weight
,
list
):
class_weight
.
append
(
bg_cls_weight
)
class_weight
=
np
.
array
(
class_weight
)
class_weight
=
torch
.
from_numpy
(
class_weight
)
class_weight
=
torch
.
ones
(
num_classes
+
1
)
*
class_weight
elif
isinstance
(
class_weight
,
float
):
class_weight
=
torch
.
ones
(
num_classes
+
1
)
*
class_weight
# set background class as the last indice
class_weight
[
num_classes
]
=
bg_cls_weight
loss_cls
.
update
({
'class_weight'
:
class_weight
})
if
'bg_cls_weight'
in
loss_cls
:
loss_cls
.
pop
(
'bg_cls_weight'
)
self
.
bg_cls_weight
=
bg_cls_weight
if
train_cfg
:
assert
'assigner'
in
train_cfg
,
'assigner should be provided '
\
'when train_cfg is set.'
assigner
=
train_cfg
[
'assigner'
]
# assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'],
# 'The classification weight for loss and matcher should be' \
# 'exactly the same.'
# assert loss_bbox['loss_weight'] == assigner['reg_cost'][
# 'weight'], 'The regression L1 weight for loss and matcher '\
# 'should be exactly the same.'
# assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'],
# 'The regression iou weight for loss and matcher should be' \
# 'exactly the same.'
self
.
assigner
=
build_assigner
(
assigner
)
# DETR sampling=False, so use PseudoSampler
sampler_cfg
=
dict
(
type
=
'PseudoSampler'
)
self
.
sampler
=
build_sampler
(
sampler_cfg
,
context
=
self
)
self
.
num_query
=
num_query
self
.
num_classes
=
num_classes
self
.
in_channels
=
in_channels
self
.
num_reg_fcs
=
num_reg_fcs
self
.
train_cfg
=
train_cfg
self
.
test_cfg
=
test_cfg
self
.
fp16_enabled
=
False
self
.
loss_cls
=
build_loss
(
loss_cls
)
self
.
loss_bbox
=
build_loss
(
loss_bbox
)
self
.
loss_iou
=
build_loss
(
loss_iou
)
if
self
.
loss_cls
.
use_sigmoid
:
self
.
cls_out_channels
=
num_classes
else
:
self
.
cls_out_channels
=
num_classes
+
1
self
.
act_cfg
=
transformer
.
get
(
'act_cfg'
,
dict
(
type
=
'ReLU'
,
inplace
=
True
))
self
.
activate
=
build_activation_layer
(
self
.
act_cfg
)
self
.
positional_encoding
=
build_positional_encoding
(
positional_encoding
)
self
.
transformer
=
build_transformer
(
transformer
)
self
.
embed_dims
=
self
.
transformer
.
embed_dims
assert
'num_feats'
in
positional_encoding
num_feats
=
positional_encoding
[
'num_feats'
]
assert
num_feats
*
2
==
self
.
embed_dims
,
'embed_dims should'
\
f
' be exactly 2 times of num_feats. Found
{
self
.
embed_dims
}
'
\
f
' and
{
num_feats
}
.'
self
.
_init_layers
()
def
_init_layers
(
self
):
"""Initialize layers of the transformer head."""
self
.
input_proj
=
Conv2d
(
self
.
in_channels
,
self
.
embed_dims
,
kernel_size
=
1
)
self
.
fc_cls
=
Linear
(
self
.
embed_dims
,
self
.
cls_out_channels
)
self
.
reg_ffn
=
FFN
(
self
.
embed_dims
,
self
.
embed_dims
,
self
.
num_reg_fcs
,
self
.
act_cfg
,
dropout
=
0.0
,
add_residual
=
False
)
self
.
fc_reg
=
Linear
(
self
.
embed_dims
,
4
)
self
.
query_embedding
=
nn
.
Embedding
(
self
.
num_query
,
self
.
embed_dims
)
    def init_weights(self):
        """Initialize weights of the transformer head."""
        # The initialization for transformer is important; delegate to the
        # transformer module's own init_weights() instead of generic init.
        self.transformer.init_weights()
    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        """Load checkpoints, migrating legacy (version < 2) key names."""
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since `AnchorFreeHead._load_from_state_dict` should not be
        # called here. Invoking the default `Module._load_from_state_dict`
        # is enough.
        # Names of some parameters have been changed between versions;
        # old checkpoints are rewritten in place before loading.
        version = local_metadata.get('version', None)
        if (version is None or version < 2) and self.__class__ is DETRHead:
            # Map legacy attention/FFN/norm key fragments to the current
            # mmcv transformer naming scheme.
            convert_dict = {
                '.self_attn.': '.attentions.0.',
                '.ffn.': '.ffns.0.',
                '.multihead_attn.': '.attentions.1.',
                '.decoder.norm.': '.decoder.post_norm.'
            }
            # Snapshot the keys first: the dict is mutated in the loop.
            state_dict_keys = list(state_dict.keys())
            for k in state_dict_keys:
                for ori_key, convert_key in convert_dict.items():
                    if ori_key in k:
                        convert_key = k.replace(ori_key, convert_key)
                        state_dict[convert_key] = state_dict[k]
                        del state_dict[k]
        super(AnchorFreeHead, self)._load_from_state_dict(
            state_dict, prefix, local_metadata, strict, missing_keys,
            unexpected_keys, error_msgs)
def
forward
(
self
,
feats
,
img_metas
):
"""Forward function.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores
\
for each scale level. Each is a 4D-tensor with shape
\
[nb_dec, bs, num_query, cls_out_channels]. Note
\
`cls_out_channels` should includes background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression
\
outputs for each scale level. Each is a 4D-tensor with
\
normalized coordinate format (cx, cy, w, h) and shape
\
[nb_dec, bs, num_query, 4].
"""
num_levels
=
len
(
feats
)
img_metas_list
=
[
img_metas
for
_
in
range
(
num_levels
)]
return
multi_apply
(
self
.
forward_single
,
feats
,
img_metas_list
)
def
forward_single
(
self
,
x
,
img_metas
):
""""Forward function for a single feature level.
Args:
x (Tensor): Input feature from backbone's single stage, shape
[bs, c, h, w].
img_metas (list[dict]): List of image information.
Returns:
all_cls_scores (Tensor): Outputs from the classification head,
shape [nb_dec, bs, num_query, cls_out_channels]. Note
cls_out_channels should includes background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression
head with normalized coordinate format (cx, cy, w, h).
Shape [nb_dec, bs, num_query, 4].
"""
# construct binary masks which used for the transformer.
# NOTE following the official DETR repo, non-zero values representing
# ignored positions, while zero values means valid positions.
batch_size
=
x
.
size
(
0
)
input_img_h
,
input_img_w
=
img_metas
[
0
][
'batch_input_shape'
]
masks
=
x
.
new_ones
((
batch_size
,
input_img_h
,
input_img_w
))
for
img_id
in
range
(
batch_size
):
img_h
,
img_w
,
_
=
img_metas
[
img_id
][
'img_shape'
]
masks
[
img_id
,
:
img_h
,
:
img_w
]
=
0
x
=
self
.
input_proj
(
x
)
# interpolate masks to have the same spatial shape with x
masks
=
F
.
interpolate
(
masks
.
unsqueeze
(
1
),
size
=
x
.
shape
[
-
2
:]).
to
(
torch
.
bool
).
squeeze
(
1
)
# position encoding
pos_embed
=
self
.
positional_encoding
(
masks
)
# [bs, embed_dim, h, w]
# outs_dec: [nb_dec, bs, num_query, embed_dim]
outs_dec
,
_
=
self
.
transformer
(
x
,
masks
,
self
.
query_embedding
.
weight
,
pos_embed
)
all_cls_scores
=
self
.
fc_cls
(
outs_dec
)
all_bbox_preds
=
self
.
fc_reg
(
self
.
activate
(
self
.
reg_ffn
(
outs_dec
))).
sigmoid
()
return
all_cls_scores
,
all_bbox_preds
@
force_fp32
(
apply_to
=
(
'all_cls_scores_list'
,
'all_bbox_preds_list'
))
def
loss
(
self
,
all_cls_scores_list
,
all_bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore
=
None
):
""""Loss function.
Only outputs from the last feature level are used for computing
losses by default.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
# NOTE defaultly only the outputs from the last feature scale is used.
all_cls_scores
=
all_cls_scores_list
[
-
1
]
all_bbox_preds
=
all_bbox_preds_list
[
-
1
]
assert
gt_bboxes_ignore
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
num_dec_layers
=
len
(
all_cls_scores
)
all_gt_bboxes_list
=
[
gt_bboxes_list
for
_
in
range
(
num_dec_layers
)]
all_gt_labels_list
=
[
gt_labels_list
for
_
in
range
(
num_dec_layers
)]
all_gt_bboxes_ignore_list
=
[
gt_bboxes_ignore
for
_
in
range
(
num_dec_layers
)
]
img_metas_list
=
[
img_metas
for
_
in
range
(
num_dec_layers
)]
losses_cls
,
losses_bbox
,
losses_iou
=
multi_apply
(
self
.
loss_single
,
all_cls_scores
,
all_bbox_preds
,
all_gt_bboxes_list
,
all_gt_labels_list
,
img_metas_list
,
all_gt_bboxes_ignore_list
)
loss_dict
=
dict
()
# loss from the last decoder layer
loss_dict
[
'loss_cls'
]
=
losses_cls
[
-
1
]
loss_dict
[
'loss_bbox'
]
=
losses_bbox
[
-
1
]
loss_dict
[
'loss_iou'
]
=
losses_iou
[
-
1
]
# loss from other decoder layers
num_dec_layer
=
0
for
loss_cls_i
,
loss_bbox_i
,
loss_iou_i
in
zip
(
losses_cls
[:
-
1
],
losses_bbox
[:
-
1
],
losses_iou
[:
-
1
]):
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_cls'
]
=
loss_cls_i
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_bbox'
]
=
loss_bbox_i
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_iou'
]
=
loss_iou_i
num_dec_layer
+=
1
return
loss_dict
def
get_fed_loss_classes
(
self
,
gt_classes
,
num_fed_loss_classes
,
num_classes
,
weight
):
"""
Args:
gt_classes: a long tensor of shape R that contains the gt class label of each proposal.
num_fed_loss_classes: minimum number of classes to keep when calculating federated loss.
Will sample negative classes if number of unique gt_classes is smaller than this value.
num_classes: number of foreground classes
weight: probabilities used to sample negative classes
Returns:
Tensor:
classes to keep when calculating the federated loss, including both unique gt
classes and sampled negative classes.
"""
unique_gt_classes
=
torch
.
unique
(
gt_classes
)
prob
=
unique_gt_classes
.
new_ones
(
num_classes
+
1
).
float
()
prob
[
-
1
]
=
0
if
len
(
unique_gt_classes
)
<
num_fed_loss_classes
:
prob
[:
num_classes
]
=
weight
.
float
().
clone
()
prob
[
unique_gt_classes
]
=
0
sampled_negative_classes
=
torch
.
multinomial
(
prob
,
num_fed_loss_classes
-
len
(
unique_gt_classes
),
replacement
=
False
)
fed_loss_classes
=
torch
.
cat
([
unique_gt_classes
,
sampled_negative_classes
])
else
:
fed_loss_classes
=
unique_gt_classes
return
fed_loss_classes
    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder
                layer for all images, [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): Sigmoid outputs from a single decoder
                layer for all images, normalized (cx, cy, w, h),
                [bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes per image,
                (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices per
                image, (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Must be None.

        Returns:
            tuple[Tensor]: (loss_cls, loss_bbox, loss_iou) for this layer.
        """
        num_imgs = cls_scores.size(0)
        # Split batched predictions into per-image lists for get_targets.
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
        # Flatten per-image targets to match the flattened predictions.
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            # Average the normalizer across ranks for consistent scaling.
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            # One (w, h, w, h) row per query of this image.
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss (on the normalized cxcywh parameterization)
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou
def
get_targets
(
self
,
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore_list
=
None
):
""""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(cx, cy, w, h) and shape [num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
img_metas (list[dict]): List of image meta information.
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all
\
images.
- bbox_targets_list (list[Tensor]): BBox targets for all
\
images.
- bbox_weights_list (list[Tensor]): BBox weights for all
\
images.
- num_total_pos (int): Number of positive samples in all
\
images.
- num_total_neg (int): Number of negative samples in all
\
images.
"""
assert
gt_bboxes_ignore_list
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
num_imgs
=
len
(
cls_scores_list
)
gt_bboxes_ignore_list
=
[
gt_bboxes_ignore_list
for
_
in
range
(
num_imgs
)
]
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
pos_inds_list
,
neg_inds_list
)
=
multi_apply
(
self
.
_get_target_single
,
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
img_metas
,
gt_bboxes_ignore_list
)
num_total_pos
=
sum
((
inds
.
numel
()
for
inds
in
pos_inds_list
))
num_total_neg
=
sum
((
inds
.
numel
()
for
inds
in
neg_inds_list
))
return
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
num_total_pos
,
num_total_neg
)
def
_get_area_thr
(
self
,
img_shape
,
type
):
MIN_V
=
0
MAX_V
=
1e10
short_edge
=
min
(
img_shape
[
0
],
img_shape
[
1
])
if
type
==
'v1'
:
DELTA
=
4
if
short_edge
<=
600
:
min_edge
=
128
-
DELTA
max_edge
=
MAX_V
elif
600
<
short_edge
<=
800
:
min_edge
=
96
-
DELTA
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
-
DELTA
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1200
:
min_edge
=
32
-
DELTA
max_edge
=
MAX_V
elif
1200
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
else
:
min_edge
=
MIN_V
max_edge
=
2
+
DELTA
elif
type
==
'v2'
:
if
short_edge
<=
1000
:
min_edge
=
112
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
32
max_edge
=
160
elif
short_edge
>
1400
:
min_edge
=
0
max_edge
=
80
elif
type
==
'v3'
:
if
short_edge
<=
800
:
min_edge
=
96
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
elif
1400
<
short_edge
<=
1600
:
min_edge
=
MIN_V
max_edge
=
96
elif
short_edge
>
1600
:
min_edge
=
MIN_V
max_edge
=
64
elif
type
==
'v4'
:
DELTA
=
4
if
short_edge
<=
800
:
min_edge
=
96
-
DELTA
max_edge
=
MAX_V
elif
800
<
short_edge
<=
1000
:
min_edge
=
64
-
DELTA
max_edge
=
MAX_V
elif
1000
<
short_edge
<=
1400
:
min_edge
=
MIN_V
max_edge
=
MAX_V
elif
1400
<
short_edge
<=
1600
:
min_edge
=
MIN_V
max_edge
=
64
+
DELTA
elif
short_edge
>
1600
:
min_edge
=
MIN_V
max_edge
=
32
+
DELTA
return
min_edge
**
2
,
max_edge
**
2
def
_get_target_single
(
self
,
cls_score
,
bbox_pred
,
gt_bboxes
,
gt_labels
,
img_meta
,
gt_bboxes_ignore
=
None
):
""""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (cx, cy, w, h) and
shape [num_query, 4].
gt_bboxes (Tensor): Ground truth bboxes for one image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
img_meta (dict): Meta information for one image.
gt_bboxes_ignore (Tensor, optional): Bounding boxes
which can be ignored. Default None.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
- label_weights (Tensor]): Label weights of each image.
- bbox_targets (Tensor): BBox targets of each image.
- bbox_weights (Tensor): BBox weights of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes
=
bbox_pred
.
size
(
0
)
# assigner and sampler
assign_result
=
self
.
assigner
.
assign
(
bbox_pred
,
cls_score
,
gt_bboxes
,
gt_labels
,
img_meta
,
gt_bboxes_ignore
)
sampling_result
=
self
.
sampler
.
sample
(
assign_result
,
bbox_pred
,
gt_bboxes
)
pos_inds
=
sampling_result
.
pos_inds
neg_inds
=
sampling_result
.
neg_inds
# label targets
labels
=
gt_bboxes
.
new_full
((
num_bboxes
,
),
self
.
num_classes
,
dtype
=
torch
.
long
)
labels
[
pos_inds
]
=
gt_labels
[
sampling_result
.
pos_assigned_gt_inds
]
label_weights
=
gt_bboxes
.
new_ones
(
num_bboxes
)
# bbox targets
bbox_targets
=
torch
.
zeros_like
(
bbox_pred
)
bbox_weights
=
torch
.
zeros_like
(
bbox_pred
)
bbox_weights
[
pos_inds
]
=
1.0
img_h
,
img_w
,
_
=
img_meta
[
'img_shape'
]
# DETR regress the relative position of boxes (cxcywh) in the image.
# Thus the learning target should be normalized by the image size, also
# the box format should be converted from defaultly x1y1x2y2 to cxcywh.
factor
=
bbox_pred
.
new_tensor
([
img_w
,
img_h
,
img_w
,
img_h
]).
unsqueeze
(
0
)
pos_gt_bboxes_normalized
=
sampling_result
.
pos_gt_bboxes
/
factor
pos_gt_bboxes_targets
=
bbox_xyxy_to_cxcywh
(
pos_gt_bboxes_normalized
)
bbox_targets
[
pos_inds
]
=
pos_gt_bboxes_targets
return
(
labels
,
label_weights
,
bbox_targets
,
bbox_weights
,
pos_inds
,
neg_inds
)
# over-write because img_metas are needed as inputs for bbox_head.
def
forward_train
(
self
,
x
,
img_metas
,
gt_bboxes
,
gt_labels
=
None
,
gt_bboxes_ignore
=
None
,
proposal_cfg
=
None
,
**
kwargs
):
"""Forward function for training mode.
Args:
x (list[Tensor]): Features from backbone.
img_metas (list[dict]): Meta information of each image, e.g.,
image size, scaling factor, etc.
gt_bboxes (Tensor): Ground truth bboxes of the image,
shape (num_gts, 4).
gt_labels (Tensor): Ground truth labels of each box,
shape (num_gts,).
gt_bboxes_ignore (Tensor): Ground truth bboxes to be
ignored, shape (num_ignored_gts, 4).
proposal_cfg (mmcv.Config): Test / postprocessing configuration,
if None, test_cfg would be used.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert
proposal_cfg
is
None
,
'"proposal_cfg" must be None'
outs
=
self
(
x
,
img_metas
)
if
gt_labels
is
None
:
loss_inputs
=
outs
+
(
gt_bboxes
,
img_metas
)
else
:
loss_inputs
=
outs
+
(
gt_bboxes
,
gt_labels
,
img_metas
)
losses
=
self
.
loss
(
*
loss_inputs
,
gt_bboxes_ignore
=
gt_bboxes_ignore
)
return
losses
@
force_fp32
(
apply_to
=
(
'all_cls_scores_list'
,
'all_bbox_preds_list'
))
def
get_bboxes
(
self
,
all_cls_scores_list
,
all_bbox_preds_list
,
img_metas
,
rescale
=
False
):
"""Transform network outputs for a batch into bbox predictions.
Args:
all_cls_scores_list (list[Tensor]): Classification outputs
for each feature level. Each is a 4D-tensor with shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds_list (list[Tensor]): Sigmoid regression
outputs for each feature level. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
img_metas (list[dict]): Meta information of each image.
rescale (bool, optional): If True, return boxes in original
image space. Default False.
Returns:
list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple.
\
The first item is an (n, 5) tensor, where the first 4 columns
\
are bounding box positions (tl_x, tl_y, br_x, br_y) and the
\
5-th column is a score between 0 and 1. The second item is a
\
(n,) tensor where each item is the predicted class label of
\
the corresponding box.
"""
# NOTE defaultly only using outputs from the last feature level,
# and only the outputs from the last decoder layer is used.
cls_scores
=
all_cls_scores_list
[
-
1
][
-
1
]
bbox_preds
=
all_bbox_preds_list
[
-
1
][
-
1
]
result_list
=
[]
for
img_id
in
range
(
len
(
img_metas
)):
cls_score
=
cls_scores
[
img_id
]
bbox_pred
=
bbox_preds
[
img_id
]
img_shape
=
img_metas
[
img_id
][
'img_shape'
]
scale_factor
=
img_metas
[
img_id
][
'scale_factor'
]
proposals
=
self
.
_get_bboxes_single
(
cls_score
,
bbox_pred
,
img_shape
,
scale_factor
,
rescale
)
result_list
.
append
(
proposals
)
return
result_list
    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (cx, cy, w, h) and
                shape [num_query, 4].
            img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image arange
                as (w_scale, h_scale, w_scale, h_scale).
            rescale (bool, optional): If True, return boxes in original image
                space. Default False.

        Returns:
            tuple[Tensor]: Results of detected bboxes and labels.

                - det_bboxes: Predicted bboxes with shape [num_query, 5],
                  where the first 4 columns are bounding box positions
                  (tl_x, tl_y, br_x, br_y) and the 5-th column are scores
                  between 0 and 1.
                - det_labels: Predicted labels of the corresponding box with
                  shape [num_query].
        """
        assert len(cls_score) == len(bbox_pred)
        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
        # exclude background
        if self.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
            # Flatten (num_query, num_classes) so one topk ranks all
            # query/class pairs; query and class indices are recovered
            # below with // and %.
            scores, indexes = cls_score.view(-1).topk(max_per_img)
            det_labels = indexes % self.num_classes
            bbox_index = indexes // self.num_classes
            bbox_pred = bbox_pred[bbox_index]
        else:
            # Softmax scores; [..., :-1] drops the background column before
            # taking the per-query best class.
            scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
            scores, bbox_index = scores.topk(max_per_img)
            bbox_pred = bbox_pred[bbox_index]
            det_labels = det_labels[bbox_index]

        # Normalized (cx, cy, w, h) -> (x1, y1, x2, y2) in input-image pixels.
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        # Clip in place to the input-image boundary.
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            # Undo the resize augmentation to return boxes in the original
            # image space.
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        # Append the score as the 5-th column.
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)

        return det_bboxes, det_labels
def
simple_test_bboxes
(
self
,
feats
,
img_metas
,
rescale
=
False
):
"""Test det bboxes without test-time augmentation.
Args:
feats (tuple[torch.Tensor]): Multi-level features from the
upstream network, each is a 4D-tensor.
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
Returns:
list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
The first item is ``bboxes`` with shape (n, 5),
where 5 represent (tl_x, tl_y, br_x, br_y, score).
The shape of the second tensor in the tuple is ``labels``
with shape (n,)
"""
# forward of this head requires img_metas
outs
=
self
.
forward
(
feats
,
img_metas
)
results_list
=
self
.
get_bboxes
(
*
outs
,
img_metas
,
rescale
=
rescale
)
return
results_list
def
forward_onnx
(
self
,
feats
,
img_metas
):
"""Forward function for exporting to ONNX.
Over-write `forward` because: `masks` is directly created with
zero (valid position tag) and has the same spatial size as `x`.
Thus the construction of `masks` is different from that in `forward`.
Args:
feats (tuple[Tensor]): Features from the upstream network, each is
a 4D-tensor.
img_metas (list[dict]): List of image information.
Returns:
tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
- all_cls_scores_list (list[Tensor]): Classification scores
\
for each scale level. Each is a 4D-tensor with shape
\
[nb_dec, bs, num_query, cls_out_channels]. Note
\
`cls_out_channels` should includes background.
- all_bbox_preds_list (list[Tensor]): Sigmoid regression
\
outputs for each scale level. Each is a 4D-tensor with
\
normalized coordinate format (cx, cy, w, h) and shape
\
[nb_dec, bs, num_query, 4].
"""
num_levels
=
len
(
feats
)
img_metas_list
=
[
img_metas
for
_
in
range
(
num_levels
)]
return
multi_apply
(
self
.
forward_single_onnx
,
feats
,
img_metas_list
)
    def forward_single_onnx(self, x, img_metas):
        """"Forward function for a single feature level with ONNX exportation.

        Args:
            x (Tensor): Input feature from backbone's single stage, shape
                [bs, c, h, w].
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head,
                shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should includes background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression
                head with normalized coordinate format (cx, cy, w, h).
                Shape [nb_dec, bs, num_query, 4].
        """
        # Note `img_shape` is not dynamically traceable to ONNX,
        # since the related augmentation was done with numpy under
        # CPU. Thus `masks` is directly created with zeros (valid tag)
        # and the same spatial shape as `x`.
        # The difference between torch and exported ONNX model may be
        # ignored, since the same performance is achieved (e.g.
        # 40.1 vs 40.1 for DETR)
        batch_size = x.size(0)
        h, w = x.size()[-2:]
        # All-zero mask == every position valid (no padding assumed here).
        masks = x.new_zeros((batch_size, h, w))  # [B,h,w]

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(
            masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1)
        pos_embed = self.positional_encoding(masks)
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        # Regression FFN + activation, then sigmoid to get normalized
        # (cx, cy, w, h) in [0, 1].
        all_bbox_preds = self.fc_reg(self.activate(
            self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds
    def onnx_export(self, all_cls_scores_list, all_bbox_preds_list,
                    img_metas):
        """Transform network outputs into bbox predictions, with ONNX
        exportation.

        Args:
            all_cls_scores_list (list[Tensor]): Classification outputs
                for each feature level. Each is a 4D-tensor with shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds_list (list[Tensor]): Sigmoid regression
                outputs for each feature level. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            img_metas (list[dict]): Meta information of each image.

        Returns:
            tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
                and class labels of shape [N, num_det].
        """
        assert len(img_metas) == 1, \
            'Only support one input image while in exporting to ONNX'
        # Only the last feature level and the last decoder layer are used.
        cls_scores = all_cls_scores_list[-1][-1]
        bbox_preds = all_bbox_preds_list[-1][-1]
        # Note `img_shape` is not dynamically traceable to ONNX,
        # here `img_shape_for_onnx` (padded shape of image tensor)
        # is used.
        img_shape = img_metas[0]['img_shape_for_onnx']
        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
        batch_size = cls_scores.size(0)
        # `batch_index_offset` is used for the gather of concatenated tensor
        batch_index_offset = torch.arange(batch_size).to(
            cls_scores.device) * max_per_img
        batch_index_offset = batch_index_offset.unsqueeze(1).expand(
            batch_size, max_per_img)

        # supports dynamical batch inference
        if self.loss_cls.use_sigmoid:
            cls_scores = cls_scores.sigmoid()
            # topk over flattened (query, class) pairs per image; class and
            # query indices are recovered with % and // below.
            scores, indexes = cls_scores.view(batch_size, -1).topk(
                max_per_img, dim=1)
            det_labels = indexes % self.num_classes
            bbox_index = indexes // self.num_classes
            # Offset per-image indices into the flattened batch so a single
            # gather works on the concatenated predictions.
            bbox_index = (bbox_index + batch_index_offset).view(-1)
            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
            bbox_preds = bbox_preds.view(batch_size, -1, 4)
        else:
            # Drop the background column ([..., :-1]) before the per-query
            # best class, then keep the topk queries per image.
            scores, det_labels = F.softmax(
                cls_scores, dim=-1)[..., :-1].max(-1)
            scores, bbox_index = scores.topk(max_per_img, dim=1)
            bbox_index = (bbox_index + batch_index_offset).view(-1)
            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
            det_labels = det_labels.view(-1)[bbox_index]
            bbox_preds = bbox_preds.view(batch_size, -1, 4)
            det_labels = det_labels.view(batch_size, -1)

        det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
        # use `img_shape_tensor` for dynamically exporting to ONNX
        img_shape_tensor = img_shape.flip(0).repeat(2)  # [w,h,w,h]
        img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
            batch_size, det_bboxes.size(1), 4)
        det_bboxes = det_bboxes * img_shape_tensor
        # dynamically clip bboxes
        x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
        # Imported locally: only needed on the ONNX export path.
        from mmdet.core.export import dynamic_clip_for_onnx
        x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
        det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
        # Append scores as the 5-th column.
        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
        return det_bboxes, det_labels
detection/mmdet_custom/models/dense_heads/dino_head.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmdet.core
import
(
bbox_cxcywh_to_xyxy
,
bbox_xyxy_to_cxcywh
,
multi_apply
,
reduce_mean
)
from
..utils
import
build_dn_generator
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models.builder
import
HEADS
from
.deformable_detr_head
import
DeformableDETRHead
from
mmcv.runner
import
force_fp32
@HEADS.register_module()
class DINOHead(DeformableDETRHead):
    """Head for DINO (DETR with Improved deNoising anchor boxes).

    Extends ``DeformableDETRHead`` with contrastive query denoising: during
    training, noised copies of the ground-truth boxes/labels are appended as
    extra queries and supervised separately from the ordinary matching
    queries.

    Args:
        dn_cfg (dict, optional): Config of the denoising query generator.
            ``num_classes``, ``num_queries`` and ``hidden_dim`` are filled in
            from the head itself (see :meth:`init_denoising`). ``None``
            disables denoising.
    """

    def __init__(self, *args, dn_cfg=None, **kwargs):
        super(DINOHead, self).__init__(*args, **kwargs)
        self._init_layers()
        self.init_denoising(dn_cfg)
        # The forward/loss below assume two-stage encoder proposals and
        # iterative box refinement are both enabled.
        assert self.as_two_stage, \
            'as_two_stage must be True for DINO'
        assert self.with_box_refine, \
            'with_box_refine must be True for DINO'

    def _init_layers(self):
        """Initialize layers, adding the denoising label embedding."""
        super()._init_layers()
        # NOTE The original repo of DINO set the num_embeddings 92 for coco,
        # 91 (0~90) of which represents target classes and the 92 (91)
        # indicates [Unknown] class. However, the embedding of unknown class
        # is not used in the original DINO
        self.label_embedding = nn.Embedding(self.cls_out_channels,
                                            self.embed_dims)

    def init_denoising(self, dn_cfg):
        """Build the denoising query generator from ``dn_cfg``.

        The head fills in the fields that must agree with its own config.
        """
        if dn_cfg is not None:
            dn_cfg['num_classes'] = self.num_classes
            dn_cfg['num_queries'] = self.num_query
            dn_cfg['hidden_dim'] = self.embed_dims
        self.dn_generator = build_dn_generator(dn_cfg)

    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
        """Forward and compute losses, with denoising queries injected."""
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        assert self.dn_generator is not None, '"dn_cfg" must be set'
        # Generate noised GT queries plus the attention mask that keeps
        # matching queries from attending to them.
        dn_label_query, dn_bbox_query, attn_mask, dn_meta = \
            self.dn_generator(gt_bboxes, gt_labels,
                              self.label_embedding, img_metas)
        outs = self(x, img_metas, dn_label_query, dn_bbox_query, attn_mask)
        if gt_labels is None:
            loss_inputs = outs + (gt_bboxes, img_metas, dn_meta)
        else:
            loss_inputs = outs + (gt_bboxes, gt_labels, img_metas, dn_meta)
        losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
        return losses

    def forward(self,
                mlvl_feats,
                img_metas,
                dn_label_query=None,
                dn_bbox_query=None,
                attn_mask=None):
        """Forward pass over multi-level features.

        Returns:
            tuple: (outputs_classes, outputs_coords, topk_score, topk_anchor)
                stacked over decoder layers; the topk outputs come from the
                two-stage encoder proposals.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        # Padding mask: 1 marks padded pixels, 0 marks valid image content.
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        hs, inter_references, topk_score, topk_anchor = \
            self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                dn_label_query,
                dn_bbox_query,
                attn_mask,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)

        if dn_label_query is not None and dn_label_query.size(1) == 0:
            # NOTE: If there is no target in the image, the parameters of
            # label_embedding won't be used in producing loss, which raises
            # RuntimeError when using distributed mode.
            hs[0] += self.label_embedding.weight[0, 0] * 0.0

        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            reference = inter_references[lvl]
            reference = inverse_sigmoid(reference, eps=1e-3)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            # Refine: add the regression delta to the reference in logit
            # space, then squash back to [0, 1].
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        return outputs_classes, outputs_coords, topk_score, topk_anchor

    @force_fp32(apply_to=('all_cls_scores', 'all_bbox_preds'))
    def loss(self,
             all_cls_scores,
             all_bbox_preds,
             enc_topk_scores,
             enc_topk_anchors,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             dn_meta=None,
             gt_bboxes_ignore=None):
        """Compute encoder, per-decoder-layer and denoising losses.

        Returns:
            dict[str, Tensor]: ``interm_*`` (encoder), ``loss_*`` /
                ``d{i}.loss_*`` (decoder layers) and ``dn_loss_*`` /
                ``d{i}.dn_loss_*`` (denoising) entries.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'
        loss_dict = dict()

        # extract denoising and matching part of outputs
        all_cls_scores, all_bbox_preds, dn_cls_scores, dn_bbox_preds = \
            self.extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta)

        if enc_topk_scores is not None:
            # calculate loss from encode feature maps
            # NOTE The DeformDETR calculate binary cls loss
            # for all encoder embeddings, while DINO calculate
            # multi-class loss for topk embeddings.
            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
                self.loss_single(enc_topk_scores, enc_topk_anchors,
                                 gt_bboxes_list, gt_labels_list,
                                 img_metas, gt_bboxes_ignore)

            # collate loss from encode feature maps
            loss_dict['interm_loss_cls'] = enc_loss_cls
            loss_dict['interm_loss_bbox'] = enc_losses_bbox
            loss_dict['interm_loss_iou'] = enc_losses_iou

        # calculate loss from all decoder layers
        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        losses_cls, losses_bbox, losses_iou = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        # collate loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]

        # collate loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1

        if dn_cls_scores is not None:
            # calculate denoising loss from all decoder layers
            dn_meta = [dn_meta for _ in img_metas]
            dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn(
                dn_cls_scores, dn_bbox_preds, gt_bboxes_list, gt_labels_list,
                img_metas, dn_meta)
            # collate denoising loss
            loss_dict['dn_loss_cls'] = dn_losses_cls[-1]
            loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1]
            loss_dict['dn_loss_iou'] = dn_losses_iou[-1]
            num_dec_layer = 0
            for loss_cls_i, loss_bbox_i, loss_iou_i in zip(
                    dn_losses_cls[:-1], dn_losses_bbox[:-1],
                    dn_losses_iou[:-1]):
                loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i
                loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i
                loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i
                num_dec_layer += 1
        return loss_dict

    def loss_dn(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                gt_labels_list, img_metas, dn_meta):
        """Compute denoising losses for every decoder layer."""
        num_dec_layers = len(dn_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]
        dn_meta_list = [dn_meta for _ in range(num_dec_layers)]
        return multi_apply(self.loss_dn_single, dn_cls_scores, dn_bbox_preds,
                           all_gt_bboxes_list, all_gt_labels_list,
                           img_metas_list, dn_meta_list)

    def loss_dn_single(self, dn_cls_scores, dn_bbox_preds, gt_bboxes_list,
                       gt_labels_list, img_metas, dn_meta):
        """Compute denoising cls/bbox/iou losses for a single decoder layer."""
        num_imgs = dn_cls_scores.size(0)
        bbox_preds_list = [dn_bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_dn_target(bbox_preds_list, gt_bboxes_list,
                                             gt_labels_list, img_metas,
                                             dn_meta)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = \
            num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        if len(cls_scores) > 0:
            loss_cls = self.loss_cls(
                cls_scores, labels, label_weights, avg_factor=cls_avg_factor)
        else:
            # TODO: How to better return zero loss
            loss_cls = torch.zeros(
                1, dtype=cls_scores.dtype, device=cls_scores.device)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, dn_bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = dn_bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, defaultly GIoU loss
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou

    def get_dn_target(self, dn_bbox_preds_list, gt_bboxes_list, gt_labels_list,
                      img_metas, dn_meta):
        """Compute denoising targets for every image in the batch."""
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
             self._get_dn_target_single, dn_bbox_preds_list, gt_bboxes_list,
             gt_labels_list, img_metas, dn_meta)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg)

    def _get_dn_target_single(self, dn_bbox_pred, gt_bboxes, gt_labels,
                              img_meta, dn_meta):
        """Compute denoising targets for a single image.

        Positive denoising queries occupy the first ``len(gt_labels)`` slots
        of each group; the matching negatives sit ``single_pad // 2`` slots
        later (the layout produced by the query generator).
        """
        num_groups = dn_meta['num_dn_group']
        pad_size = dn_meta['pad_size']
        assert pad_size % num_groups == 0
        single_pad = pad_size // num_groups
        num_bboxes = dn_bbox_pred.size(0)
        device = dn_bbox_pred.device
        if len(gt_labels) > 0:
            # Fix: the original used the deprecated `torch.range(...)` and a
            # hard-coded `.cuda()`; `torch.arange` on the prediction's device
            # yields the same indices and also works on CPU-only setups.
            t = torch.arange(len(gt_labels), dtype=torch.long, device=device)
            t = t.unsqueeze(0).repeat(num_groups, 1)
            pos_assigned_gt_inds = t.flatten()
            pos_inds = (torch.arange(num_groups, device=device) *
                        single_pad).unsqueeze(1) + t
            pos_inds = pos_inds.flatten()
        else:
            pos_inds = pos_assigned_gt_inds = torch.zeros(
                0, dtype=torch.long, device=device)
        neg_inds = pos_inds + single_pad // 2

        # label targets: default to background (index == num_classes)
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets: only positive queries are supervised for regression
        bbox_targets = torch.zeros_like(dn_bbox_pred)
        bbox_weights = torch.zeros_like(dn_bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regress the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size,
        # also the box format should be converted from defaultly x1y1x2y2 to
        # cxcywh.
        factor = dn_bbox_pred.new_tensor([img_w, img_h, img_w,
                                          img_h]).unsqueeze(0)
        gt_bboxes_normalized = gt_bboxes / factor
        gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized)
        bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1])

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds)

    @staticmethod
    def extract_dn_outputs(all_cls_scores, all_bbox_preds, dn_meta):
        """Split head outputs into matching and denoising parts.

        The first ``dn_meta['pad_size']`` queries are denoising queries; the
        rest are ordinary matching queries.

        Returns:
            tuple: (matching_cls_scores, matching_bbox_preds,
                denoising_cls_scores, denoising_bbox_preds). The denoising
                entries are ``None`` when ``dn_meta`` is ``None``.
        """
        # if dn_meta and dn_meta['pad_size'] > 0:
        if dn_meta is not None:
            denoising_cls_scores = \
                all_cls_scores[:, :, :dn_meta['pad_size'], :]
            denoising_bbox_preds = \
                all_bbox_preds[:, :, :dn_meta['pad_size'], :]
            matching_cls_scores = all_cls_scores[:, :, dn_meta['pad_size']:, :]
            matching_bbox_preds = all_bbox_preds[:, :, dn_meta['pad_size']:, :]
        else:
            denoising_cls_scores = None
            denoising_bbox_preds = None
            matching_cls_scores = all_cls_scores
            matching_bbox_preds = all_bbox_preds
        return (matching_cls_scores, matching_bbox_preds,
                denoising_cls_scores, denoising_bbox_preds)
detection/mmdet_custom/models/detectors/__init__.py
0 → 100644
View file @
bdd98bcb
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
.dino
import
DINO
__all__
=
[
'DINO'
]
\ No newline at end of file
detection/mmdet_custom/models/detectors/dino.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
from
mmdet.models.builder
import
DETECTORS
from
mmdet.models.detectors.detr
import
DETR
@DETECTORS.register_module()
class DINO(DETR):
    """DINO detector registry wrapper.

    Reuses the DETR detector pipeline unchanged; all DINO-specific behavior
    lives in the custom head and transformer modules.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): `super(DETR, self)` skips ``DETR.__init__`` and runs
        # the grandparent initializer directly — presumably intentional (to
        # bypass DETR-specific setup); confirm against mmdet's DETR class.
        super(DETR, self).__init__(*args, **kwargs)
\ No newline at end of file
detection/mmdet_custom/models/utils/__init__.py
0 → 100644
View file @
bdd98bcb
from
.query_denoising
import
build_dn_generator
from
.transformer
import
(
DinoTransformer
,
DinoTransformerDecoder
)
__all__
=
[
'build_dn_generator'
,
'DinoTransformer'
,
'DinoTransformerDecoder'
]
\ No newline at end of file
detection/mmdet_custom/models/utils/query_denoising.py
0 → 100644
View file @
bdd98bcb
# Copyright (c) OpenMMLab. All rights reserved.
import
torch
from
mmcv.runner
import
BaseModule
from
mmdet.core
import
bbox_xyxy_to_cxcywh
from
mmdet.models.utils.transformer
import
inverse_sigmoid
class DnQueryGenerator(BaseModule):
    """Generator of noised ground-truth ("denoising") queries for DINO.

    At train time it builds, for each image, ``2 * num_groups`` copies of the
    ground-truth labels/boxes (a positive and a negative copy per group),
    perturbs them, and returns them together with an attention mask that
    isolates the denoising queries from the matching queries and from the
    other groups.

    NOTE(review): this module hard-codes ``.cuda()`` / ``.to('cuda')`` in
    several places, so it requires a CUDA device at train time.

    Args:
        num_queries (int): Number of ordinary matching queries.
        hidden_dim (int): Embedding dimension of the query features.
        num_classes (int): Number of object classes.
        noise_scale (dict): ``label`` is the label-flip probability scale,
            ``box`` scales the box jitter magnitude.
        group_cfg (dict): Either ``dynamic=True`` with ``num_dn_queries``
            (groups are derived from the max GT count per batch) or
            ``dynamic=False`` with a fixed ``num_groups``.
    """

    def __init__(self,
                 num_queries,
                 hidden_dim,
                 num_classes,
                 noise_scale=dict(label=0.5, box=0.4),
                 group_cfg=dict(dynamic=True,
                                num_groups=None,
                                num_dn_queries=None)):
        super(DnQueryGenerator, self).__init__()
        self.num_queries = num_queries
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.label_noise_scale = noise_scale['label']
        self.box_noise_scale = noise_scale['box']
        self.dynamic_dn_groups = group_cfg.get('dynamic', False)
        if self.dynamic_dn_groups:
            assert 'num_dn_queries' in group_cfg, \
                'num_dn_queries should be set when using ' \
                'dynamic dn groups'
            self.num_dn = group_cfg['num_dn_queries']
        else:
            assert 'num_groups' in group_cfg, \
                'num_groups should be set when using ' \
                'static dn groups'
            self.num_dn = group_cfg['num_groups']
        assert isinstance(self.num_dn, int) and self.num_dn >= 1, \
            f'Expected the num in group_cfg to have type int. ' \
            f'Found {type(self.num_dn)} '

    def get_num_groups(self, group_queries=None):
        """Return the number of denoising groups to use.

        Args:
            group_queries (int): Number of dn queries in one group.
        """
        if self.dynamic_dn_groups:
            assert group_queries is not None, \
                'group_queries should be provided when using ' \
                'dynamic dn groups'
            if group_queries == 0:
                num_groups = 1
            else:
                # Fit as many groups as the dn-query budget allows.
                num_groups = self.num_dn // group_queries
        else:
            num_groups = self.num_dn
        if num_groups < 1:
            # avoid num_groups < 1 in query generator
            num_groups = 1
        return int(num_groups)

    def forward(self,
                gt_bboxes,
                gt_labels=None,
                label_enc=None,
                img_metas=None):
        """Generate denoising queries, attention mask and meta info.

        Args:
            gt_bboxes (List[Tensor]): List of ground truth bboxes
                of the image, shape of each (num_gts, 4).
            gt_labels (List[Tensor]): List of ground truth labels
                of the image, shape of each (num_gts,), if None,
                TODO:noisy_label would be None.
            label_enc (nn.Embedding): Label embedding used to encode the
                (possibly noised) class indices as query features.
            img_metas (list[dict]): Image meta info (used for ``img_shape``).

        Returns:
            tuple: (input_query_label, input_query_bbox, attn_mask, dn_meta);
                all four are ``None`` when not in training mode.
        """
        # TODO: temp only support for CDN
        # TODO: temp assert gt_labels is not None and label_enc is not None
        if self.training:
            if gt_labels is not None:
                assert len(gt_bboxes) == len(gt_labels), \
                    f'the length of provided gt_labels ' \
                    f'{len(gt_labels)} should be equal to' \
                    f' that of gt_bboxes {len(gt_bboxes)}'
            assert gt_labels is not None \
                   and label_enc is not None \
                   and img_metas is not None  # TODO: adjust args
            batch_size = len(gt_bboxes)

            # convert bbox: x1y1x2y2 -> normalized cxcywh per image
            gt_bboxes_list = []
            for img_meta, bboxes in zip(img_metas, gt_bboxes):
                img_h, img_w, _ = img_meta['img_shape']
                factor = bboxes.new_tensor([img_w, img_h, img_w,
                                            img_h]).unsqueeze(0)
                bboxes_normalized = bbox_xyxy_to_cxcywh(bboxes) / factor
                gt_bboxes_list.append(bboxes_normalized)
            gt_bboxes = gt_bboxes_list

            known = [torch.ones_like(labels) for labels in gt_labels]
            known_num = [sum(k) for k in known]

            # Group count is driven by the largest GT count in the batch.
            num_groups = self.get_num_groups(int(max(known_num)))

            unmask_bbox = unmask_label = torch.cat(known)
            labels = torch.cat(gt_labels)
            boxes = torch.cat(gt_bboxes)
            # Which image each concatenated GT belongs to.
            batch_idx = torch.cat([
                torch.full_like(t.long(), i) for i, t in enumerate(gt_labels)
            ])

            known_indice = torch.nonzero(unmask_label + unmask_bbox)
            known_indice = known_indice.view(-1)

            # Replicate every GT 2*num_groups times (positive + negative
            # copy per group).
            known_indice = known_indice.repeat(2 * num_groups, 1).view(-1)
            known_labels = labels.repeat(2 * num_groups, 1).view(-1)
            known_bid = batch_idx.repeat(2 * num_groups, 1).view(-1)
            known_bboxs = boxes.repeat(2 * num_groups, 1)
            known_labels_expand = known_labels.clone()
            known_bbox_expand = known_bboxs.clone()

            if self.label_noise_scale > 0:
                # Flip a random subset of labels to random classes.
                p = torch.rand_like(known_labels_expand.float())
                chosen_indice = torch.nonzero(
                    p < (self.label_noise_scale * 0.5)).view(-1)
                new_label = torch.randint_like(chosen_indice, 0,
                                               self.num_classes)
                known_labels_expand.scatter_(0, chosen_indice, new_label)
            single_pad = int(max(known_num))  # TODO

            pad_size = int(single_pad * 2 * num_groups)
            # Indices of the positive copies inside the replicated layout;
            # negatives follow each positive block by len(boxes).
            positive_idx = torch.tensor(range(
                len(boxes))).long().cuda().unsqueeze(0).repeat(num_groups, 1)
            positive_idx += (torch.tensor(range(num_groups)) * len(boxes) *
                             2).long().cuda().unsqueeze(1)
            positive_idx = positive_idx.flatten()
            negative_idx = positive_idx + len(boxes)
            if self.box_noise_scale > 0:
                # Work in corner (x1y1x2y2) space for the jitter.
                known_bbox_ = torch.zeros_like(known_bboxs)
                known_bbox_[:, :2] = \
                    known_bboxs[:, :2] - known_bboxs[:, 2:] / 2
                known_bbox_[:, 2:] = \
                    known_bboxs[:, :2] + known_bboxs[:, 2:] / 2

                # Max per-coordinate displacement: half the box size.
                diff = torch.zeros_like(known_bboxs)
                diff[:, :2] = known_bboxs[:, 2:] / 2
                diff[:, 2:] = known_bboxs[:, 2:] / 2

                rand_sign = torch.randint_like(
                    known_bboxs, low=0, high=2, dtype=torch.float32)
                rand_sign = rand_sign * 2.0 - 1.0
                rand_part = torch.rand_like(known_bboxs)
                # Negative copies get a strictly larger perturbation
                # (shifted into [1, 2)) so they fall off the GT box.
                rand_part[negative_idx] += 1.0
                rand_part *= rand_sign
                known_bbox_ += \
                    torch.mul(rand_part, diff).cuda() * self.box_noise_scale
                known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
                # Back to cxcywh.
                known_bbox_expand[:, :2] = \
                    (known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
                known_bbox_expand[:, 2:] = \
                    known_bbox_[:, 2:] - known_bbox_[:, :2]

            m = known_labels_expand.long().to('cuda')
            input_label_embed = label_enc(m)
            # Box queries live in logit space (decoder applies sigmoid).
            input_bbox_embed = inverse_sigmoid(known_bbox_expand, eps=1e-3)

            padding_label = torch.zeros(pad_size, self.hidden_dim).cuda()
            padding_bbox = torch.zeros(pad_size, 4).cuda()

            input_query_label = padding_label.repeat(batch_size, 1, 1)
            input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)

            # Scatter the per-GT queries into the padded per-image layout.
            map_known_indice = torch.tensor([]).to('cuda')
            if len(known_num):
                map_known_indice = torch.cat(
                    [torch.tensor(range(num)) for num in known_num])
                map_known_indice = torch.cat([
                    map_known_indice + single_pad * i
                    for i in range(2 * num_groups)
                ]).long()
            if len(known_bid):
                input_query_label[(known_bid.long(),
                                   map_known_indice)] = input_label_embed
                input_query_bbox[(known_bid.long(),
                                  map_known_indice)] = input_bbox_embed

            tgt_size = pad_size + self.num_queries
            attn_mask = torch.ones(tgt_size, tgt_size).to('cuda') < 0
            # match query cannot see the reconstruct
            attn_mask[pad_size:, :pad_size] = True
            # reconstruct cannot see each other
            for i in range(num_groups):
                if i == 0:
                    attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
                              single_pad * 2 * (i + 1):pad_size] = True
                if i == num_groups - 1:
                    attn_mask[single_pad * 2 * i:single_pad * 2 *
                              (i + 1), :single_pad * i * 2] = True
                else:
                    attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1),
                              single_pad * 2 * (i + 1):pad_size] = True
                    attn_mask[single_pad * 2 * i:single_pad * 2 *
                              (i + 1), :single_pad * 2 * i] = True

            dn_meta = {
                'pad_size': pad_size,
                'num_dn_group': num_groups,
            }
        else:
            # Inference: no denoising queries are injected.
            input_query_label = None
            input_query_bbox = None
            attn_mask = None
            dn_meta = None
        return input_query_label, input_query_bbox, attn_mask, dn_meta
class CdnQueryGenerator(DnQueryGenerator):
    """Contrastive denoising (CDN) query generator.

    Currently identical in behavior to :class:`DnQueryGenerator`; kept as a
    distinct class so configs can name it explicitly.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
def build_dn_generator(dn_args):
    """Build a denoising query generator from a config dict.

    Args:
        dn_args (dict | None): Config with a ``'type'`` key naming the
            generator class; all remaining keys are forwarded as keyword
            arguments to the constructor. ``None`` disables denoising.

    Returns:
        DnQueryGenerator | None: The constructed generator, or ``None`` when
        ``dn_args`` is ``None``.

    Raises:
        NotImplementedError: If the requested type is not supported.
    """
    if dn_args is None:
        return None
    # Copy so the caller's config dict is not mutated by the pop below, and
    # use a local name that does not shadow the builtin ``type`` (the
    # original popped into a variable literally named `type`).
    dn_args = dict(dn_args)
    generator_type = dn_args.pop('type')
    if generator_type == 'DnQueryGenerator':
        return DnQueryGenerator(**dn_args)
    elif generator_type == 'CdnQueryGenerator':
        return CdnQueryGenerator(**dn_args)
    else:
        raise NotImplementedError(f'{generator_type} is not supported yet')
\ No newline at end of file
detection/mmdet_custom/models/utils/transformer.py
0 → 100644
View file @
bdd98bcb
import
math
import
torch
import
torch.nn
as
nn
from
mmdet.models.utils.builder
import
TRANSFORMER
from
mmcv.cnn.bricks.registry
import
(
TRANSFORMER_LAYER_SEQUENCE
,
FEEDFORWARD_NETWORK
,
DROPOUT_LAYERS
)
from
mmdet.models.utils.transformer
import
(
inverse_sigmoid
,
DeformableDetrTransformerDecoder
,
DeformableDetrTransformer
)
def build_MLP(input_dim, hidden_dim, output_dim, num_layers):
    """Build a plain MLP: (Linear, ReLU) * (num_layers - 1), then Linear.

    The first linear maps ``input_dim -> hidden_dim``, intermediate ones map
    ``hidden_dim -> hidden_dim``, and the final (activation-free) linear maps
    ``hidden_dim -> output_dim``. The ReLUs are not in-place, matching the
    original DETR repo (mmdet's FFN defaults to ``inplace=True``).
    """
    # TODO: It can be implemented by add an out_channel arg of
    # mmcv.cnn.bricks.transformer.FFN
    assert num_layers > 1, \
        f'num_layers should be greater than 1 but got {num_layers}'
    modules = []
    in_features = input_dim
    for _ in range(num_layers - 1):
        modules.append(nn.Linear(in_features, hidden_dim))
        modules.append(nn.ReLU())
        in_features = hidden_dim
    modules.append(nn.Linear(hidden_dim, output_dim))
    return nn.Sequential(*modules)
@
TRANSFORMER_LAYER_SEQUENCE
.
register_module
()
class
DinoTransformerDecoder
(
DeformableDetrTransformerDecoder
):
    def __init__(self, *args, with_rp_noise=False, **kwargs):
        """Initialize the DINO transformer decoder.

        Args:
            with_rp_noise (bool): If True, add small uniform noise to the
                reference points during training (see ``forward``).
                Default: False.
        """
        super(DinoTransformerDecoder, self).__init__(*args, **kwargs)
        self.with_rp_noise = with_rp_noise
        self._init_layers()
    def _init_layers(self):
        """Build the reference-point head MLP and the output LayerNorm.

        The MLP maps the (2 * embed_dims)-dim sine positional embedding of
        the reference points down to embed_dims.
        """
        self.ref_point_head = build_MLP(self.embed_dims * 2, self.embed_dims,
                                        self.embed_dims, 2)
        self.norm = nn.LayerNorm(self.embed_dims)
    # @staticmethod
    def gen_sineembed_for_position(self, pos_tensor):
        """Build sine/cosine positional embeddings for reference points.

        Args:
            pos_tensor (Tensor): Reference points with last dimension 2
                (cx, cy) or 4 (cx, cy, w, h), values in [0, 1].

        Returns:
            Tensor: Embedding with last dimension ``embed_dims`` (2-d input)
                or ``2 * embed_dims`` (4-d input); each coordinate gets
                ``embed_dims // 2`` channels, concatenated (y, x[, w, h]).
        """
        # n_query, bs, _ = pos_tensor.size()
        # sineembed_tensor = torch.zeros(n_query, bs, 256)
        scale = 2 * math.pi
        # Standard transformer frequency ladder over embed_dims // 2 channels.
        dim_t = torch.arange(
            self.embed_dims // 2,
            dtype=torch.float32,
            device=pos_tensor.device)
        dim_t = 10000**(2 * (dim_t // 2) / (self.embed_dims // 2))
        x_embed = pos_tensor[:, :, 0] * scale
        y_embed = pos_tensor[:, :, 1] * scale
        pos_x = x_embed[:, :, None] / dim_t
        pos_y = y_embed[:, :, None] / dim_t
        # Interleave sin on even channels and cos on odd channels.
        pos_x = torch.stack(
            (pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()),
            dim=3).flatten(2)
        pos_y = torch.stack(
            (pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()),
            dim=3).flatten(2)
        if pos_tensor.size(-1) == 2:
            pos = torch.cat((pos_y, pos_x), dim=2)
        elif pos_tensor.size(-1) == 4:
            # Also embed the box width and height for 4-d reference points.
            w_embed = pos_tensor[:, :, 2] * scale
            pos_w = w_embed[:, :, None] / dim_t
            pos_w = torch.stack(
                (pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()),
                dim=3).flatten(2)

            h_embed = pos_tensor[:, :, 3] * scale
            pos_h = h_embed[:, :, None] / dim_t
            pos_h = torch.stack(
                (pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()),
                dim=3).flatten(2)

            pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2)
        else:
            raise ValueError('Unknown pos_tensor shape(-1):{}'.format(
                pos_tensor.size(-1)))
        return pos
def
forward
(
self
,
query
,
*
args
,
reference_points
=
None
,
valid_ratios
=
None
,
reg_branches
=
None
,
**
kwargs
):
output
=
query
intermediate
=
[]
intermediate_reference_points
=
[
reference_points
]
for
lid
,
layer
in
enumerate
(
self
.
layers
):
if
reference_points
.
shape
[
-
1
]
==
4
:
reference_points_input
=
\
reference_points
[:,
:,
None
]
*
torch
.
cat
(
[
valid_ratios
,
valid_ratios
],
-
1
)[:,
None
]
else
:
assert
reference_points
.
shape
[
-
1
]
==
2
reference_points_input
=
\
reference_points
[:,
:,
None
]
*
valid_ratios
[:,
None
]
if
self
.
with_rp_noise
and
self
.
training
:
device
=
reference_points
.
device
b
,
n
,
d
=
reference_points
.
size
()
noise
=
torch
.
rand
(
b
,
n
,
d
).
to
(
device
)
*
0.02
-
0.01
reference_points
=
(
reference_points
+
noise
).
clamp
(
0
,
1
)
query_sine_embed
=
self
.
gen_sineembed_for_position
(
reference_points_input
[:,
:,
0
,
:])
query_pos
=
self
.
ref_point_head
(
query_sine_embed
)
query_pos
=
query_pos
.
permute
(
1
,
0
,
2
)
output
=
layer
(
output
,
*
args
,
query_pos
=
query_pos
,
reference_points
=
reference_points_input
,
**
kwargs
)
output
=
output
.
permute
(
1
,
0
,
2
)
if
reg_branches
is
not
None
:
tmp
=
reg_branches
[
lid
](
output
)
assert
reference_points
.
shape
[
-
1
]
==
4
new_reference_points
=
tmp
+
inverse_sigmoid
(
reference_points
,
eps
=
1e-3
)
new_reference_points
=
new_reference_points
.
sigmoid
()
reference_points
=
new_reference_points
.
detach
()
output
=
output
.
permute
(
1
,
0
,
2
)
if
self
.
return_intermediate
:
intermediate
.
append
(
self
.
norm
(
output
))
intermediate_reference_points
.
append
(
new_reference_points
)
# NOTE this is for the "Look Forward Twice" module,
# in the DeformDETR, reference_points was appended.
if
self
.
return_intermediate
:
return
torch
.
stack
(
intermediate
),
torch
.
stack
(
intermediate_reference_points
)
return
output
,
reference_points
@TRANSFORMER.register_module()
class DinoTransformer(DeformableDetrTransformer):
    """Transformer for DINO.

    Two-stage only: the encoder output is scored by the extra
    classification/regression branches to pick top-k proposals, which seed
    the decoder reference points; learned content queries come from
    ``self.query_embed``, optionally prefixed by denoising queries.
    """

    def __init__(self, *args, **kwargs):
        super(DinoTransformer, self).__init__(*args, **kwargs)

    def init_layers(self):
        """Initialize layers of the DinoTransformer."""
        # Per-level embedding added to the positional encodings.
        self.level_embeds = nn.Parameter(
            torch.Tensor(self.num_feature_levels, self.embed_dims))
        # NOTE(review): enc_output/enc_output_norm are presumably consumed by
        # the inherited gen_encoder_output_proposals — confirm in the parent.
        self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)
        self.enc_output_norm = nn.LayerNorm(self.embed_dims)
        # Learned content queries for the two-stage matching part.
        self.query_embed = nn.Embedding(self.two_stage_num_proposals,
                                        self.embed_dims)

    def init_weights(self):
        """Initialize weights, then re-init the content query embedding."""
        super().init_weights()
        nn.init.normal_(self.query_embed.weight.data)

    def forward(self,
                mlvl_feats,
                mlvl_masks,
                query_embed,
                mlvl_pos_embeds,
                dn_label_query,
                dn_bbox_query,
                attn_mask,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Forward pass of the DINO transformer.

        Args:
            mlvl_feats (list[Tensor]): Multi-level features, each of shape
                (bs, c, h, w).
            mlvl_masks (list[Tensor]): Padding masks per level.
            query_embed: Must be None; DINO uses its own learned queries.
            mlvl_pos_embeds (list[Tensor]): Positional encodings per level.
            dn_label_query (Tensor | None): Denoising label queries,
                concatenated in front of the matching queries when given.
            dn_bbox_query (Tensor | None): Denoising box queries (unact),
                concatenated in front of the top-k proposal boxes when given.
            attn_mask (Tensor | None): Self-attention mask separating
                denoising and matching parts.
            reg_branches (nn.ModuleList): Regression heads; index
                ``self.decoder.num_layers`` is the encoder-stage head.
            cls_branches (nn.ModuleList): Classification heads; same
                indexing convention.

        Returns:
            tuple: (inter_states, inter_references_out, topk_score,
            topk_anchor) — decoder states and reference points per layer,
            plus the encoder-stage top-k scores and sigmoid anchors.
        """
        assert self.as_two_stage and query_embed is None, \
            'as_two_stage must be True for DINO'

        # Flatten each level to (bs, h*w, c) and collect masks/pos embeds.
        feat_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        for lvl, (feat, mask, pos_embed) in enumerate(
                zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
            bs, c, h, w = feat.shape
            spatial_shape = (h, w)
            spatial_shapes.append(spatial_shape)
            feat = feat.flatten(2).transpose(1, 2)
            mask = mask.flatten(1)
            pos_embed = pos_embed.flatten(2).transpose(1, 2)
            # Add the learned per-level embedding to the positional encoding.
            lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1)
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            feat_flatten.append(feat)
            mask_flatten.append(mask)
        feat_flatten = torch.cat(feat_flatten, 1)
        mask_flatten = torch.cat(mask_flatten, 1)
        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=feat_flatten.device)
        # Start offset of each level within the flattened sequence.
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
        valid_ratios = torch.stack(
            [self.get_valid_ratio(m) for m in mlvl_masks], 1)

        reference_points = self.get_reference_points(
            spatial_shapes, valid_ratios, device=feat.device)

        feat_flatten = feat_flatten.permute(1, 0, 2)
        # (H*W, bs, embed_dims)
        lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(1, 0, 2)
        # (H*W, bs, embed_dims)
        memory = self.encoder(
            query=feat_flatten,
            key=None,
            value=None,
            query_pos=lvl_pos_embed_flatten,
            query_key_padding_mask=mask_flatten,
            spatial_shapes=spatial_shapes,
            reference_points=reference_points,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            **kwargs)
        # Back to (bs, H*W, embed_dims) for proposal generation.
        memory = memory.permute(1, 0, 2)
        bs, _, c = memory.shape
        output_memory, output_proposals = self.gen_encoder_output_proposals(
            memory, mask_flatten, spatial_shapes)
        # Index ``num_layers`` selects the extra encoder-stage branch.
        enc_outputs_class = cls_branches[self.decoder.num_layers](
            output_memory)
        enc_outputs_coord_unact = reg_branches[self.decoder.num_layers](
            output_memory) + output_proposals
        cls_out_features = cls_branches[self.decoder.num_layers].out_features
        topk = self.two_stage_num_proposals
        # NOTE In DeformDETR, enc_outputs_class[..., 0] is used for topk TODO
        # Here the max class score per location ranks the proposals.
        topk_indices = torch.topk(
            enc_outputs_class.max(-1)[0], topk, dim=1)[1]

        topk_score = torch.gather(
            enc_outputs_class, 1,
            topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features))
        topk_coords_unact = torch.gather(
            enc_outputs_coord_unact, 1,
            topk_indices.unsqueeze(-1).repeat(1, 1, 4))
        # Anchors (for the encoder loss) keep gradients ...
        topk_anchor = topk_coords_unact.sigmoid()
        # NOTE In the original DeformDETR, init_reference_out is obtained
        # from detached topk_coords_unact, which is different with DINO. TODO
        # ... while the decoder's initial reference points are detached.
        topk_coords_unact = topk_coords_unact.detach()

        # Learned content queries, tiled over the batch: (bs, topk, c).
        query = self.query_embed.weight[:, None, :].repeat(
            1, bs, 1).transpose(0, 1)
        # Denoising queries/boxes go in front of the matching part.
        if dn_label_query is not None:
            query = torch.cat([dn_label_query, query], dim=1)
        if dn_bbox_query is not None:
            reference_points = torch.cat([dn_bbox_query, topk_coords_unact],
                                         dim=1)
        else:
            reference_points = topk_coords_unact
        reference_points = reference_points.sigmoid()
        # decoder
        query = query.permute(1, 0, 2)
        memory = memory.permute(1, 0, 2)
        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=memory,
            attn_masks=attn_mask,
            key_padding_mask=mask_flatten,
            reference_points=reference_points,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            valid_ratios=valid_ratios,
            reg_branches=reg_branches,
            **kwargs)

        inter_references_out = inter_references
        return inter_states, inter_references_out, \
            topk_score, topk_anchor
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment