Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
TS-MODELS-OPT
training
Autonomous-Driving-models
Commits
19472568
Commit
19472568
authored
Apr 08, 2026
by
雍大凯
Browse files
将子模块转换为普通目录
parent
51e55208
Changes
233
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3398 additions
and
0 deletions
+3398
-0
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_nusc_r50_24ep_w_centerline.py
...cts/configs/maptrv2/maptrv2_nusc_r50_24ep_w_centerline.py
+337
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/__init__.py
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/__init__.py
+14
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py
...pTRv2/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py
+6
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py
.../MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py
+3
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
...pTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
+280
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
...projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
+202
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py
...TRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py
+164
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py
...Rv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py
+67
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
...projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
+1
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
...ts/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
+523
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py
...R/projects/mmdet3d_plugin/bevformer/detectors/__init__.py
+2
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py
.../projects/mmdet3d_plugin/bevformer/detectors/bevformer.py
+289
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
...ects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
+89
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
...MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
+1
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
...R/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
+14
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
...pTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
+7
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
...plugin/bevformer/modules/custom_base_transformer_layer.py
+487
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py
...apTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py
+345
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py
...apTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py
+404
-0
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py
...bevformer/modules/multi_scale_deformable_attn_function.py
+163
-0
No files found.
docker-hub/MapTRv2/MapTR/projects/configs/maptrv2/maptrv2_nusc_r50_24ep_w_centerline.py
0 → 100644
View file @
19472568
_base_
=
[
'../datasets/custom_nus-3d.py'
,
'../_base_/default_runtime.py'
]
#
plugin
=
True
plugin_dir
=
'projects/mmdet3d_plugin/'
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
point_cloud_range
=
[
-
15.0
,
-
30.0
,
-
10.0
,
15.0
,
30.0
,
10.0
]
voxel_size
=
[
0.15
,
0.15
,
20.0
]
dbound
=
[
1.0
,
35.0
,
0.5
]
grid_config
=
{
'x'
:
[
-
30.0
,
-
30.0
,
0.15
],
# useless
'y'
:
[
-
15.0
,
-
15.0
,
0.15
],
# useless
'z'
:
[
-
10
,
10
,
20
],
# useless
'depth'
:
[
1.0
,
35.0
,
0.5
],
# useful
}
img_norm_cfg
=
dict
(
mean
=
[
123.675
,
116.28
,
103.53
],
std
=
[
58.395
,
57.12
,
57.375
],
to_rgb
=
True
)
# For nuScenes we usually do 10-class detection
class_names
=
[
'car'
,
'truck'
,
'construction_vehicle'
,
'bus'
,
'trailer'
,
'barrier'
,
'motorcycle'
,
'bicycle'
,
'pedestrian'
,
'traffic_cone'
]
# map has classes: divider, ped_crossing, boundary
map_classes
=
[
'divider'
,
'ped_crossing'
,
'boundary'
,
'centerline'
]
# fixed_ptsnum_per_line = 20
# map_classes = ['divider',]
num_vec
=
70
fixed_ptsnum_per_gt_line
=
20
# now only support fixed_pts > 0
fixed_ptsnum_per_pred_line
=
20
eval_use_same_gt_sample_num_flag
=
True
num_map_classes
=
len
(
map_classes
)
input_modality
=
dict
(
use_lidar
=
False
,
use_camera
=
True
,
use_radar
=
False
,
use_map
=
False
,
use_external
=
True
)
_dim_
=
256
_pos_dim_
=
_dim_
//
2
_ffn_dim_
=
_dim_
*
2
_num_levels_
=
1
# bev_h_ = 50
# bev_w_ = 50
bev_h_
=
200
bev_w_
=
100
queue_length
=
1
# each sequence contains `queue_length` frames.
aux_seg_cfg
=
dict
(
use_aux_seg
=
True
,
bev_seg
=
True
,
pv_seg
=
True
,
seg_classes
=
1
,
feat_down_sample
=
32
,
pv_thickness
=
1
,
)
model
=
dict
(
type
=
'MapTRv2'
,
use_grid_mask
=
True
,
video_test_mode
=
False
,
pretrained
=
dict
(
img
=
'ckpts/resnet50-19c8e357.pth'
),
img_backbone
=
dict
(
type
=
'ResNet'
,
depth
=
50
,
num_stages
=
4
,
out_indices
=
(
3
,),
frozen_stages
=
1
,
norm_cfg
=
dict
(
type
=
'BN'
,
requires_grad
=
False
),
norm_eval
=
True
,
style
=
'pytorch'
),
img_neck
=
dict
(
type
=
'FPN'
,
in_channels
=
[
2048
],
out_channels
=
_dim_
,
start_level
=
0
,
add_extra_convs
=
'on_output'
,
num_outs
=
_num_levels_
,
relu_before_extra_convs
=
True
),
pts_bbox_head
=
dict
(
type
=
'MapTRv2Head'
,
bev_h
=
bev_h_
,
bev_w
=
bev_w_
,
num_query
=
900
,
num_vec_one2one
=
num_vec
,
num_vec_one2many
=
300
,
k_one2many
=
6
,
num_pts_per_vec
=
fixed_ptsnum_per_pred_line
,
# one bbox
num_pts_per_gt_vec
=
fixed_ptsnum_per_gt_line
,
dir_interval
=
1
,
query_embed_type
=
'instance_pts'
,
transform_method
=
'minmax'
,
gt_shift_pts_pattern
=
'v2'
,
num_classes
=
num_map_classes
,
in_channels
=
_dim_
,
sync_cls_avg_factor
=
True
,
with_box_refine
=
True
,
as_two_stage
=
False
,
code_size
=
2
,
code_weights
=
[
1.0
,
1.0
,
1.0
,
1.0
],
aux_seg
=
aux_seg_cfg
,
# z_cfg=z_cfg,
transformer
=
dict
(
type
=
'MapTRPerceptionTransformer'
,
rotate_prev_bev
=
True
,
use_shift
=
True
,
use_can_bus
=
True
,
embed_dims
=
_dim_
,
encoder
=
dict
(
type
=
'LSSTransform'
,
in_channels
=
_dim_
,
out_channels
=
_dim_
,
feat_down_sample
=
32
,
pc_range
=
point_cloud_range
,
voxel_size
=
voxel_size
,
dbound
=
dbound
,
downsample
=
2
,
loss_depth_weight
=
3.0
,
depthnet_cfg
=
dict
(
use_dcn
=
False
,
with_cp
=
False
,
aspp_mid_channels
=
96
),
grid_config
=
grid_config
,),
decoder
=
dict
(
type
=
'MapTRDecoder'
,
num_layers
=
6
,
return_intermediate
=
True
,
transformerlayers
=
dict
(
type
=
'DecoupledDetrTransformerDecoderLayer'
,
num_vec
=
num_vec
,
num_pts_per_vec
=
fixed_ptsnum_per_pred_line
,
attn_cfgs
=
[
dict
(
type
=
'MultiheadAttention'
,
embed_dims
=
_dim_
,
num_heads
=
8
,
dropout
=
0.1
),
dict
(
type
=
'MultiheadAttention'
,
embed_dims
=
_dim_
,
num_heads
=
8
,
dropout
=
0.1
),
dict
(
type
=
'CustomMSDeformableAttention'
,
embed_dims
=
_dim_
,
num_levels
=
1
),
],
feedforward_channels
=
_ffn_dim_
,
ffn_dropout
=
0.1
,
operation_order
=
(
'self_attn'
,
'norm'
,
'self_attn'
,
'norm'
,
'cross_attn'
,
'norm'
,
'ffn'
,
'norm'
)))),
bbox_coder
=
dict
(
type
=
'MapTRNMSFreeCoder'
,
# post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
post_center_range
=
[
-
20
,
-
35
,
-
20
,
-
35
,
20
,
35
,
20
,
35
],
pc_range
=
point_cloud_range
,
max_num
=
50
,
voxel_size
=
voxel_size
,
num_classes
=
num_map_classes
),
positional_encoding
=
dict
(
type
=
'LearnedPositionalEncoding'
,
num_feats
=
_pos_dim_
,
row_num_embed
=
bev_h_
,
col_num_embed
=
bev_w_
,
),
loss_cls
=
dict
(
type
=
'FocalLoss'
,
use_sigmoid
=
True
,
gamma
=
2.0
,
alpha
=
0.25
,
loss_weight
=
2.0
),
loss_bbox
=
dict
(
type
=
'L1Loss'
,
loss_weight
=
0.0
),
loss_iou
=
dict
(
type
=
'GIoULoss'
,
loss_weight
=
0.0
),
loss_pts
=
dict
(
type
=
'PtsL1Loss'
,
loss_weight
=
5.0
),
loss_dir
=
dict
(
type
=
'PtsDirCosLoss'
,
loss_weight
=
0.005
),
loss_seg
=
dict
(
type
=
'SimpleLoss'
,
pos_weight
=
4.0
,
loss_weight
=
1.0
),
loss_pv_seg
=
dict
(
type
=
'SimpleLoss'
,
pos_weight
=
1.0
,
loss_weight
=
2.0
),),
# model training and testing settings
train_cfg
=
dict
(
pts
=
dict
(
grid_size
=
[
512
,
512
,
1
],
voxel_size
=
voxel_size
,
point_cloud_range
=
point_cloud_range
,
out_size_factor
=
4
,
assigner
=
dict
(
type
=
'MapTRAssigner'
,
cls_cost
=
dict
(
type
=
'FocalLossCost'
,
weight
=
2.0
),
reg_cost
=
dict
(
type
=
'BBoxL1Cost'
,
weight
=
0.0
,
box_format
=
'xywh'
),
# reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
# iou_cost=dict(type='IoUCost', weight=1.0), # Fake cost. This is just to make it compatible with DETR head.
iou_cost
=
dict
(
type
=
'IoUCost'
,
iou_mode
=
'giou'
,
weight
=
0.0
),
pts_cost
=
dict
(
type
=
'OrderedPtsL1Cost'
,
weight
=
5
),
pc_range
=
point_cloud_range
))))
dataset_type
=
'CustomNuScenesOfflineLocalMapDataset'
data_root
=
'data/nuscenes/'
file_client_args
=
dict
(
backend
=
'disk'
)
train_pipeline
=
[
dict
(
type
=
'LoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'RandomScaleImageMultiViewImage'
,
scales
=
[
0.5
]),
dict
(
type
=
'PhotoMetricDistortionMultiViewImage'
),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'LoadPointsFromFile'
,
coord_type
=
'LIDAR'
,
load_dim
=
5
,
use_dim
=
5
,
file_client_args
=
file_client_args
),
dict
(
type
=
'CustomPointToMultiViewDepth'
,
downsample
=
1
,
grid_config
=
grid_config
),
dict
(
type
=
'PadMultiViewImageDepth'
,
size_divisor
=
32
),
dict
(
type
=
'DefaultFormatBundle3D'
,
with_gt
=
False
,
with_label
=
False
,
class_names
=
map_classes
),
dict
(
type
=
'CustomCollect3D'
,
keys
=
[
'img'
,
'gt_depth'
])
]
test_pipeline
=
[
dict
(
type
=
'LoadMultiViewImageFromFiles'
,
to_float32
=
True
),
dict
(
type
=
'RandomScaleImageMultiViewImage'
,
scales
=
[
0.5
]),
dict
(
type
=
'NormalizeMultiviewImage'
,
**
img_norm_cfg
),
dict
(
type
=
'MultiScaleFlipAug3D'
,
img_scale
=
(
1600
,
900
),
pts_scale_ratio
=
1
,
flip
=
False
,
transforms
=
[
dict
(
type
=
'PadMultiViewImage'
,
size_divisor
=
32
),
dict
(
type
=
'DefaultFormatBundle3D'
,
with_gt
=
False
,
with_label
=
False
,
class_names
=
map_classes
),
dict
(
type
=
'CustomCollect3D'
,
keys
=
[
'img'
])
])
]
data
=
dict
(
samples_per_gpu
=
4
,
workers_per_gpu
=
4
,
# TODO
train
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
ann_file
=
data_root
+
'nuscenes_map_infos_temporal_train.pkl'
,
pipeline
=
train_pipeline
,
classes
=
class_names
,
modality
=
input_modality
,
aux_seg
=
aux_seg_cfg
,
test_mode
=
False
,
use_valid_flag
=
True
,
bev_size
=
(
bev_h_
,
bev_w_
),
pc_range
=
point_cloud_range
,
fixed_ptsnum_per_line
=
fixed_ptsnum_per_gt_line
,
eval_use_same_gt_sample_num_flag
=
eval_use_same_gt_sample_num_flag
,
padding_value
=-
10000
,
map_classes
=
map_classes
,
queue_length
=
queue_length
,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d
=
'LiDAR'
),
val
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
ann_file
=
data_root
+
'nuscenes_map_infos_temporal_val.pkl'
,
map_ann_file
=
data_root
+
'nuscenes_map_anns_val.json'
,
pipeline
=
test_pipeline
,
bev_size
=
(
bev_h_
,
bev_w_
),
pc_range
=
point_cloud_range
,
fixed_ptsnum_per_line
=
fixed_ptsnum_per_gt_line
,
eval_use_same_gt_sample_num_flag
=
eval_use_same_gt_sample_num_flag
,
padding_value
=-
10000
,
map_classes
=
map_classes
,
classes
=
class_names
,
modality
=
input_modality
,
samples_per_gpu
=
1
),
test
=
dict
(
type
=
dataset_type
,
data_root
=
data_root
,
ann_file
=
data_root
+
'nuscenes_map_infos_temporal_val.pkl'
,
map_ann_file
=
data_root
+
'nuscenes_map_anns_val.json'
,
pipeline
=
test_pipeline
,
bev_size
=
(
bev_h_
,
bev_w_
),
pc_range
=
point_cloud_range
,
fixed_ptsnum_per_line
=
fixed_ptsnum_per_gt_line
,
eval_use_same_gt_sample_num_flag
=
eval_use_same_gt_sample_num_flag
,
padding_value
=-
10000
,
map_classes
=
map_classes
,
classes
=
class_names
,
modality
=
input_modality
),
shuffler_sampler
=
dict
(
type
=
'DistributedGroupSampler'
),
nonshuffler_sampler
=
dict
(
type
=
'DistributedSampler'
)
)
optimizer
=
dict
(
type
=
'AdamW'
,
lr
=
6e-4
,
paramwise_cfg
=
dict
(
custom_keys
=
{
'img_backbone'
:
dict
(
lr_mult
=
0.1
),
}),
weight_decay
=
0.01
)
optimizer_config
=
dict
(
grad_clip
=
dict
(
max_norm
=
35
,
norm_type
=
2
))
# learning policy
lr_config
=
dict
(
policy
=
'CosineAnnealing'
,
warmup
=
'linear'
,
warmup_iters
=
500
,
warmup_ratio
=
1.0
/
3
,
min_lr_ratio
=
1e-3
)
total_epochs
=
24
evaluation
=
dict
(
interval
=
2
,
pipeline
=
test_pipeline
,
metric
=
'chamfer'
,
save_best
=
'NuscMap_chamfer/mAP'
,
rule
=
'greater'
)
# total_epochs = 50
# evaluation = dict(interval=1, pipeline=test_pipeline)
runner
=
dict
(
type
=
'EpochBasedRunner'
,
max_epochs
=
total_epochs
)
log_config
=
dict
(
interval
=
50
,
hooks
=
[
dict
(
type
=
'TextLoggerHook'
),
dict
(
type
=
'TensorboardLoggerHook'
)
])
fp16
=
dict
(
loss_scale
=
512.
)
checkpoint_config
=
dict
(
max_keep_ckpts
=
1
,
interval
=
2
)
find_unused_parameters
=
True
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/__init__.py
0 → 100644
View file @
19472568
from
.core.bbox.assigners.hungarian_assigner_3d
import
HungarianAssigner3D
from
.core.bbox.coders.nms_free_coder
import
NMSFreeCoder
from
.core.bbox.match_costs
import
BBox3DL1Cost
from
.core.evaluation.eval_hooks
import
CustomDistEvalHook
from
.datasets.pipelines
import
(
PhotoMetricDistortionMultiViewImage
,
PadMultiViewImage
,
NormalizeMultiviewImage
,
CustomCollect3D
)
from
.models.backbones.vovnet
import
VoVNet
from
.models.utils
import
*
from
.models.opt.adamw
import
AdamW2
from
.bevformer
import
*
from
.maptr
import
*
from
.models.backbones.efficientnet
import
EfficientNet
#from .models.opt.miopen_adam import Miopen_AdamW
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/__init__.py
0 → 100644
View file @
19472568
from
.dense_heads
import
*
from
.detectors
import
*
from
.modules
import
*
from
.runner
import
*
from
.hooks
import
*
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/__init__.py
0 → 100644
View file @
19472568
from
.train
import
custom_train_model
from
.mmdet_train
import
custom_train_detector
# from .test import custom_multi_gpu_test
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
random
import
warnings
import
numpy
as
np
import
torch
import
torch.distributed
as
dist
from
mmcv.parallel
import
MMDataParallel
,
MMDistributedDataParallel
from
mmcv.runner
import
(
HOOKS
,
DistSamplerSeedHook
,
EpochBasedRunner
,
Fp16OptimizerHook
,
OptimizerHook
,
build_optimizer
,
build_runner
,
get_dist_info
)
from
mmcv.utils
import
build_from_cfg
from
mmdet.core
import
EvalHook
from
mmdet.datasets
import
(
build_dataset
,
replace_ImageToTensor
)
from
mmdet.utils
import
get_root_logger
import
time
import
os.path
as
osp
from
projects.mmdet3d_plugin.datasets.builder
import
build_dataloader
from
projects.mmdet3d_plugin.core.evaluation.eval_hooks
import
CustomDistEvalHook
from
projects.mmdet3d_plugin.datasets
import
custom_build_dataset
from
mmcv.runner
import
Hook
class
ProfilerHook
(
Hook
):
def
__init__
(
self
,
profiler
,
total_steps
):
self
.
profiler
=
profiler
self
.
total_steps
=
total_steps
# 总步数 (wait + warmup + active) * repeat
self
.
stopped
=
False
def
after_train_iter
(
self
,
runner
):
if
self
.
profiler
.
step_num
==
self
.
total_steps
and
not
self
.
stopped
:
# 停止Profiler
self
.
profiler
.
stop
()
self
.
stopped
=
True
# 只在rank 0上打印结果
rank
,
_
=
get_dist_info
()
if
rank
==
0
:
# 获取并打印关键指标
# table = self.profiler.key_averages().table(
# sort_by="self_cuda_time_total",
# row_limit=10
# )
# runner.logger.info(f"Profiler results after {self.total_steps} steps:\n{table}")
# table = self.profiler.key_averages().table(
# sort_by="self_cpu_time_total",
# row_limit=10
# )
# runner.logger.info(f"Profiler results after {self.total_steps} steps:\n{table}")
results
=
self
.
profiler
.
key_averages
().
table
(
sort_by
=
"cuda_time_total"
)
log_file
=
"/workspace/MapTR/profiler_logs/BW_log_step{}.txt"
.
format
(
self
.
total_steps
)
with
open
(
log_file
,
mode
=
'w'
)
as
file
:
file
.
write
(
str
(
results
))
# self.profiler.start()
if
not
self
.
stopped
:
self
.
profiler
.
step
()
# 检测是否完成所有schedule步骤
def
custom_train_detector
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
eval_model
=
None
,
meta
=
None
):
logger
=
get_root_logger
(
cfg
.
log_level
)
# prepare data loaders
dataset
=
dataset
if
isinstance
(
dataset
,
(
list
,
tuple
))
else
[
dataset
]
#assert len(dataset)==1s
if
'imgs_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
'"imgs_per_gpu" is deprecated in MMDet V2.0. '
'Please use "samples_per_gpu" instead'
)
if
'samples_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
f
'Got "imgs_per_gpu"=
{
cfg
.
data
.
imgs_per_gpu
}
and '
f
'"samples_per_gpu"=
{
cfg
.
data
.
samples_per_gpu
}
, "imgs_per_gpu"'
f
'=
{
cfg
.
data
.
imgs_per_gpu
}
is used in this experiments'
)
else
:
logger
.
warning
(
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f
'
{
cfg
.
data
.
imgs_per_gpu
}
in this experiments'
)
cfg
.
data
.
samples_per_gpu
=
cfg
.
data
.
imgs_per_gpu
data_loaders
=
[
build_dataloader
(
ds
,
cfg
.
data
.
samples_per_gpu
,
cfg
.
data
.
workers_per_gpu
,
# cfg.gpus will be ignored if distributed
len
(
cfg
.
gpu_ids
),
dist
=
distributed
,
seed
=
cfg
.
seed
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
# dict(type='DistributedGroupSampler'),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
# dict(type='DistributedSampler'),
)
for
ds
in
dataset
]
# put model on gpus
if
distributed
:
find_unused_parameters
=
cfg
.
get
(
'find_unused_parameters'
,
False
)
# Sets the `find_unused_parameters` parameter in
# torch.nn.parallel.DistributedDataParallel
model
=
MMDistributedDataParallel
(
model
.
cuda
().
to
(
memory_format
=
torch
.
channels_last
),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
if
eval_model
is
not
None
:
eval_model
=
MMDistributedDataParallel
(
eval_model
.
cuda
(),
device_ids
=
[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
else
:
model
=
MMDataParallel
(
model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
if
eval_model
is
not
None
:
eval_model
=
MMDataParallel
(
eval_model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
# build runner
optimizer
=
build_optimizer
(
model
,
cfg
.
optimizer
)
if
'runner'
not
in
cfg
:
cfg
.
runner
=
{
'type'
:
'EpochBasedRunner'
,
'max_epochs'
:
cfg
.
total_epochs
}
warnings
.
warn
(
'config is now expected to have a `runner` section, '
'please set `runner` in your config.'
,
UserWarning
)
else
:
if
'total_epochs'
in
cfg
:
assert
cfg
.
total_epochs
==
cfg
.
runner
.
max_epochs
if
eval_model
is
not
None
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
eval_model
=
eval_model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
else
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
# an ugly workaround to make .log and .log.json filenames the same
runner
.
timestamp
=
timestamp
# fp16 setting
fp16_cfg
=
cfg
.
get
(
'fp16'
,
None
)
if
fp16_cfg
is
not
None
:
optimizer_config
=
Fp16OptimizerHook
(
**
cfg
.
optimizer_config
,
**
fp16_cfg
,
distributed
=
distributed
)
elif
distributed
and
'type'
not
in
cfg
.
optimizer_config
:
optimizer_config
=
OptimizerHook
(
**
cfg
.
optimizer_config
)
else
:
optimizer_config
=
cfg
.
optimizer_config
# register hooks
runner
.
register_training_hooks
(
cfg
.
lr_config
,
optimizer_config
,
cfg
.
checkpoint_config
,
cfg
.
log_config
,
cfg
.
get
(
'momentum_config'
,
None
))
# register profiler hook
#trace_config = dict(type='tb_trace', dir_name='work_dir')
#profiler_config = dict(on_trace_ready=trace_config)
#runner.register_profiler_hook(profiler_config)
if
distributed
:
if
isinstance
(
runner
,
EpochBasedRunner
):
runner
.
register_hook
(
DistSamplerSeedHook
())
# register eval hooks
if
validate
:
# Support batch_size > 1 in validation
val_samples_per_gpu
=
cfg
.
data
.
val
.
pop
(
'samples_per_gpu'
,
1
)
if
val_samples_per_gpu
>
1
:
assert
False
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg
.
data
.
val
.
pipeline
=
replace_ImageToTensor
(
cfg
.
data
.
val
.
pipeline
)
val_dataset
=
custom_build_dataset
(
cfg
.
data
.
val
,
dict
(
test_mode
=
True
))
val_dataloader
=
build_dataloader
(
val_dataset
,
samples_per_gpu
=
val_samples_per_gpu
,
workers_per_gpu
=
cfg
.
data
.
workers_per_gpu
,
dist
=
distributed
,
shuffle
=
False
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
# dict(type='DistributedGroupSampler'),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
# dict(type='DistributedSampler'),
)
eval_cfg
=
cfg
.
get
(
'evaluation'
,
{})
eval_cfg
[
'by_epoch'
]
=
cfg
.
runner
[
'type'
]
!=
'IterBasedRunner'
eval_cfg
[
'jsonfile_prefix'
]
=
osp
.
join
(
'val'
,
cfg
.
work_dir
,
time
.
ctime
().
replace
(
' '
,
'_'
).
replace
(
':'
,
'_'
))
eval_hook
=
CustomDistEvalHook
if
distributed
else
EvalHook
runner
.
register_hook
(
eval_hook
(
val_dataloader
,
**
eval_cfg
))
# user-defined hooks
if
cfg
.
get
(
'custom_hooks'
,
None
):
custom_hooks
=
cfg
.
custom_hooks
assert
isinstance
(
custom_hooks
,
list
),
\
f
'custom_hooks expect list type, but got
{
type
(
custom_hooks
)
}
'
for
hook_cfg
in
cfg
.
custom_hooks
:
assert
isinstance
(
hook_cfg
,
dict
),
\
'Each item in custom_hooks expects dict type, but got '
\
f
'
{
type
(
hook_cfg
)
}
'
hook_cfg
=
hook_cfg
.
copy
()
priority
=
hook_cfg
.
pop
(
'priority'
,
'NORMAL'
)
hook
=
build_from_cfg
(
hook_cfg
,
HOOKS
)
runner
.
register_hook
(
hook
,
priority
=
priority
)
if
cfg
.
resume_from
:
runner
.
resume
(
cfg
.
resume_from
)
elif
cfg
.
load_from
:
runner
.
load_checkpoint
(
cfg
.
load_from
)
if
False
:
# 创建profiler配置
total_steps
=
(
1
+
20
+
1
)
*
1
# 22 steps
profiler
=
torch
.
profiler
.
profile
(
activities
=
[
torch
.
profiler
.
ProfilerActivity
.
CPU
,
torch
.
profiler
.
ProfilerActivity
.
CUDA
],
schedule
=
torch
.
profiler
.
schedule
(
wait
=
1
,
# 跳过前1个step
warmup
=
20
,
# 预热1个step(不计入结果)
active
=
1
,
# 分析3个step
repeat
=
1
# 只执行一轮
),
on_trace_ready
=
torch
.
profiler
.
tensorboard_trace_handler
(
# f"{cfg.work_dir}/profiler_logs" # 输出目录
"/workspace/MapTR/profiler_logs"
# "./profiler_logs"
),
with_stack
=
True
,
# 收集调用栈信息
profile_memory
=
False
,
# 分析内存使用
record_shapes
=
False
# 记录张量形状
)
# 创建并注册ProfilerHook
# profiler_hook = ProfilerHook(profiler)
profiler_hook
=
ProfilerHook
(
profiler
,
total_steps
)
runner
.
register_hook
(
profiler_hook
)
# 启动profiler
profiler
.
start
()
print
(
"==================================== profiler.start()==================================================================="
)
try
:
# 运行训练
runner
.
run
(
data_loaders
,
cfg
.
workflow
)
finally
:
# 确保profiler停止
profiler
.
stop
()
else
:
# 正常训练
runner
.
run
(
data_loaders
,
cfg
.
workflow
)
#runner.run(data_loaders, cfg.workflow)
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py_old
0 → 100644
View file @
19472568
#
---------------------------------------------
#
Copyright
(
c
)
OpenMMLab
.
All
rights
reserved
.
#
---------------------------------------------
#
Modified
by
Zhiqi
Li
#
---------------------------------------------
import
random
import
warnings
import
numpy
as
np
import
torch
import
torch
.
distributed
as
dist
from
mmcv
.
parallel
import
MMDataParallel
,
MMDistributedDataParallel
from
mmcv
.
runner
import
(
HOOKS
,
DistSamplerSeedHook
,
EpochBasedRunner
,
Fp16OptimizerHook
,
OptimizerHook
,
build_optimizer
,
build_runner
,
get_dist_info
)
from
mmcv
.
utils
import
build_from_cfg
from
mmdet
.
core
import
EvalHook
from
mmdet
.
datasets
import
(
build_dataset
,
replace_ImageToTensor
)
from
mmdet
.
utils
import
get_root_logger
import
time
import
os
.
path
as
osp
from
projects
.
mmdet3d_plugin
.
datasets
.
builder
import
build_dataloader
from
projects
.
mmdet3d_plugin
.
core
.
evaluation
.
eval_hooks
import
CustomDistEvalHook
from
projects
.
mmdet3d_plugin
.
datasets
import
custom_build_dataset
def
custom_train_detector
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
eval_model
=
None
,
meta
=
None
):
logger
=
get_root_logger
(
cfg
.
log_level
)
#
prepare
data
loaders
dataset
=
dataset
if
isinstance
(
dataset
,
(
list
,
tuple
))
else
[
dataset
]
#
assert
len
(
dataset
)==
1
s
if
'imgs_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
'"imgs_per_gpu" is deprecated in MMDet V2.0. '
'Please use "samples_per_gpu" instead'
)
if
'samples_per_gpu'
in
cfg
.
data
:
logger
.
warning
(
f
'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
f
'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
f
'={cfg.data.imgs_per_gpu} is used in this experiments'
)
else
:
logger
.
warning
(
'Automatically set "samples_per_gpu"="imgs_per_gpu"='
f
'{cfg.data.imgs_per_gpu} in this experiments'
)
cfg
.
data
.
samples_per_gpu
=
cfg
.
data
.
imgs_per_gpu
data_loaders
=
[
build_dataloader
(
ds
,
cfg
.
data
.
samples_per_gpu
,
cfg
.
data
.
workers_per_gpu
,
#
cfg
.
gpus
will
be
ignored
if
distributed
len
(
cfg
.
gpu_ids
),
dist
=
distributed
,
seed
=
cfg
.
seed
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
#
dict
(
type
=
'DistributedGroupSampler'
),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
#
dict
(
type
=
'DistributedSampler'
),
)
for
ds
in
dataset
]
#
put
model
on
gpus
if
distributed
:
find_unused_parameters
=
cfg
.
get
(
'find_unused_parameters'
,
False
)
#
Sets
the
`
find_unused_parameters
`
parameter
in
#
torch
.
nn
.
parallel
.
DistributedDataParallel
model
=
MMDistributedDataParallel
(
model
.
cuda
().
to
(
memory_format
=
torch
.
channels_last
),
device_ids
=[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
if
eval_model
is
not
None
:
eval_model
=
MMDistributedDataParallel
(
eval_model
.
cuda
(),
device_ids
=[
torch
.
cuda
.
current_device
()],
broadcast_buffers
=
False
,
find_unused_parameters
=
find_unused_parameters
)
else
:
model
=
MMDataParallel
(
model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
if
eval_model
is
not
None
:
eval_model
=
MMDataParallel
(
eval_model
.
cuda
(
cfg
.
gpu_ids
[
0
]),
device_ids
=
cfg
.
gpu_ids
)
#
build
runner
optimizer
=
build_optimizer
(
model
,
cfg
.
optimizer
)
if
'runner'
not
in
cfg
:
cfg
.
runner
=
{
'type'
:
'EpochBasedRunner'
,
'max_epochs'
:
cfg
.
total_epochs
}
warnings
.
warn
(
'config is now expected to have a `runner` section, '
'please set `runner` in your config.'
,
UserWarning
)
else
:
if
'total_epochs'
in
cfg
:
assert
cfg
.
total_epochs
==
cfg
.
runner
.
max_epochs
if
eval_model
is
not
None
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
eval_model
=
eval_model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
else
:
runner
=
build_runner
(
cfg
.
runner
,
default_args
=
dict
(
model
=
model
,
optimizer
=
optimizer
,
work_dir
=
cfg
.
work_dir
,
logger
=
logger
,
meta
=
meta
))
#
an
ugly
workaround
to
make
.
log
and
.
log
.
json
filenames
the
same
runner
.
timestamp
=
timestamp
#
fp16
setting
fp16_cfg
=
cfg
.
get
(
'fp16'
,
None
)
if
fp16_cfg
is
not
None
:
optimizer_config
=
Fp16OptimizerHook
(
**
cfg
.
optimizer_config
,
**
fp16_cfg
,
distributed
=
distributed
)
elif
distributed
and
'type'
not
in
cfg
.
optimizer_config
:
optimizer_config
=
OptimizerHook
(**
cfg
.
optimizer_config
)
else
:
optimizer_config
=
cfg
.
optimizer_config
#
register
hooks
runner
.
register_training_hooks
(
cfg
.
lr_config
,
optimizer_config
,
cfg
.
checkpoint_config
,
cfg
.
log_config
,
cfg
.
get
(
'momentum_config'
,
None
))
#
register
profiler
hook
#
trace_config
=
dict
(
type
=
'tb_trace'
,
dir_name
=
'work_dir'
)
#
profiler_config
=
dict
(
on_trace_ready
=
trace_config
)
#
runner
.
register_profiler_hook
(
profiler_config
)
if
distributed
:
if
isinstance
(
runner
,
EpochBasedRunner
):
runner
.
register_hook
(
DistSamplerSeedHook
())
#
register
eval
hooks
if
validate
:
#
Support
batch_size
>
1
in
validation
val_samples_per_gpu
=
cfg
.
data
.
val
.
pop
(
'samples_per_gpu'
,
1
)
if
val_samples_per_gpu
>
1
:
assert
False
#
Replace
'ImageToTensor'
to
'DefaultFormatBundle'
cfg
.
data
.
val
.
pipeline
=
replace_ImageToTensor
(
cfg
.
data
.
val
.
pipeline
)
val_dataset
=
custom_build_dataset
(
cfg
.
data
.
val
,
dict
(
test_mode
=
True
))
val_dataloader
=
build_dataloader
(
val_dataset
,
samples_per_gpu
=
val_samples_per_gpu
,
workers_per_gpu
=
cfg
.
data
.
workers_per_gpu
,
dist
=
distributed
,
shuffle
=
False
,
shuffler_sampler
=
cfg
.
data
.
shuffler_sampler
,
#
dict
(
type
=
'DistributedGroupSampler'
),
nonshuffler_sampler
=
cfg
.
data
.
nonshuffler_sampler
,
#
dict
(
type
=
'DistributedSampler'
),
)
eval_cfg
=
cfg
.
get
(
'evaluation'
,
{})
eval_cfg
[
'by_epoch'
]
=
cfg
.
runner
[
'type'
]
!= 'IterBasedRunner'
eval_cfg
[
'jsonfile_prefix'
]
=
osp
.
join
(
'val'
,
cfg
.
work_dir
,
time
.
ctime
().
replace
(
' '
,
'_'
).
replace
(
':'
,
'_'
))
eval_hook
=
CustomDistEvalHook
if
distributed
else
EvalHook
runner
.
register_hook
(
eval_hook
(
val_dataloader
,
**
eval_cfg
))
#
user
-
defined
hooks
if
cfg
.
get
(
'custom_hooks'
,
None
):
custom_hooks
=
cfg
.
custom_hooks
assert
isinstance
(
custom_hooks
,
list
),
\
f
'custom_hooks expect list type, but got {type(custom_hooks)}'
for
hook_cfg
in
cfg
.
custom_hooks
:
assert
isinstance
(
hook_cfg
,
dict
),
\
'Each item in custom_hooks expects dict type, but got '
\
f
'{type(hook_cfg)}'
hook_cfg
=
hook_cfg
.
copy
()
priority
=
hook_cfg
.
pop
(
'priority'
,
'NORMAL'
)
hook
=
build_from_cfg
(
hook_cfg
,
HOOKS
)
runner
.
register_hook
(
hook
,
priority
=
priority
)
if
cfg
.
resume_from
:
runner
.
resume
(
cfg
.
resume_from
)
elif
cfg
.
load_from
:
runner
.
load_checkpoint
(
cfg
.
load_from
)
runner
.
run
(
data_loaders
,
cfg
.
workflow
)
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/test.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
os.path
as
osp
import
pickle
import
shutil
import
tempfile
import
time
import
mmcv
import
torch
import
torch.distributed
as
dist
from
mmcv.image
import
tensor2imgs
from
mmcv.runner
import
get_dist_info
from
mmdet.core
import
encode_mask_results
import
mmcv
import
numpy
as
np
import
pycocotools.mask
as
mask_util
def
custom_encode_mask_results
(
mask_results
):
"""Encode bitmap mask to RLE code. Semantic Masks only
Args:
mask_results (list | tuple[list]): bitmap mask results.
In mask scoring rcnn, mask_results is a tuple of (segm_results,
segm_cls_score).
Returns:
list | tuple: RLE encoded mask.
"""
cls_segms
=
mask_results
num_classes
=
len
(
cls_segms
)
encoded_mask_results
=
[]
for
i
in
range
(
len
(
cls_segms
)):
encoded_mask_results
.
append
(
mask_util
.
encode
(
np
.
array
(
cls_segms
[
i
][:,
:,
np
.
newaxis
],
order
=
'F'
,
dtype
=
'uint8'
))[
0
])
# encoded with RLE
return
[
encoded_mask_results
]
def
custom_multi_gpu_test
(
model
,
data_loader
,
tmpdir
=
None
,
gpu_collect
=
False
):
"""Test model with multiple gpus.
This method tests model with multiple gpus and collects the results
under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
it encodes results to gpu tensors and use gpu communication for results
collection. On cpu mode it saves the results on different gpus to 'tmpdir'
and collects them by the rank 0 worker.
Args:
model (nn.Module): Model to be tested.
data_loader (nn.Dataloader): Pytorch data loader.
tmpdir (str): Path of directory to save the temporary results from
different gpus under cpu mode.
gpu_collect (bool): Option to use either gpu or cpu to collect results.
Returns:
list: The prediction results.
"""
model
.
eval
()
bbox_results
=
[]
mask_results
=
[]
dataset
=
data_loader
.
dataset
rank
,
world_size
=
get_dist_info
()
if
rank
==
0
:
prog_bar
=
mmcv
.
ProgressBar
(
len
(
dataset
))
time
.
sleep
(
2
)
# This line can prevent deadlock problem in some cases.
have_mask
=
False
for
i
,
data
in
enumerate
(
data_loader
):
with
torch
.
no_grad
():
result
=
model
(
return_loss
=
False
,
rescale
=
True
,
**
data
)
# encode mask results
if
isinstance
(
result
,
dict
):
if
'bbox_results'
in
result
.
keys
():
bbox_result
=
result
[
'bbox_results'
]
batch_size
=
len
(
result
[
'bbox_results'
])
bbox_results
.
extend
(
bbox_result
)
if
'mask_results'
in
result
.
keys
()
and
result
[
'mask_results'
]
is
not
None
:
mask_result
=
custom_encode_mask_results
(
result
[
'mask_results'
])
mask_results
.
extend
(
mask_result
)
have_mask
=
True
else
:
batch_size
=
len
(
result
)
bbox_results
.
extend
(
result
)
#if isinstance(result[0], tuple):
# assert False, 'this code is for instance segmentation, which our code will not utilize.'
# result = [(bbox_results, encode_mask_results(mask_results))
# for bbox_results, mask_results in result]
if
rank
==
0
:
for
_
in
range
(
batch_size
*
world_size
):
prog_bar
.
update
()
# collect results from all ranks
if
gpu_collect
:
bbox_results
=
collect_results_gpu
(
bbox_results
,
len
(
dataset
))
if
have_mask
:
mask_results
=
collect_results_gpu
(
mask_results
,
len
(
dataset
))
else
:
mask_results
=
None
else
:
bbox_results
=
collect_results_cpu
(
bbox_results
,
len
(
dataset
),
tmpdir
)
tmpdir
=
tmpdir
+
'_mask'
if
tmpdir
is
not
None
else
None
if
have_mask
:
mask_results
=
collect_results_cpu
(
mask_results
,
len
(
dataset
),
tmpdir
)
else
:
mask_results
=
None
if
mask_results
is
None
:
return
bbox_results
return
{
'bbox_results'
:
bbox_results
,
'mask_results'
:
mask_results
}
def
collect_results_cpu
(
result_part
,
size
,
tmpdir
=
None
):
rank
,
world_size
=
get_dist_info
()
# create a tmp dir if it is not specified
if
tmpdir
is
None
:
MAX_LEN
=
512
# 32 is whitespace
dir_tensor
=
torch
.
full
((
MAX_LEN
,
),
32
,
dtype
=
torch
.
uint8
,
device
=
'cuda'
)
if
rank
==
0
:
mmcv
.
mkdir_or_exist
(
'.dist_test'
)
tmpdir
=
tempfile
.
mkdtemp
(
dir
=
'.dist_test'
)
tmpdir
=
torch
.
tensor
(
bytearray
(
tmpdir
.
encode
()),
dtype
=
torch
.
uint8
,
device
=
'cuda'
)
dir_tensor
[:
len
(
tmpdir
)]
=
tmpdir
dist
.
broadcast
(
dir_tensor
,
0
)
tmpdir
=
dir_tensor
.
cpu
().
numpy
().
tobytes
().
decode
().
rstrip
()
else
:
mmcv
.
mkdir_or_exist
(
tmpdir
)
# dump the part result to the dir
mmcv
.
dump
(
result_part
,
osp
.
join
(
tmpdir
,
f
'part_
{
rank
}
.pkl'
))
dist
.
barrier
()
# collect all parts
if
rank
!=
0
:
return
None
else
:
# load results of all parts from tmp dir
part_list
=
[]
for
i
in
range
(
world_size
):
part_file
=
osp
.
join
(
tmpdir
,
f
'part_
{
i
}
.pkl'
)
part_list
.
append
(
mmcv
.
load
(
part_file
))
# sort the results
ordered_results
=
[]
'''
bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample,
'''
#for res in zip(*part_list):
for
res
in
part_list
:
ordered_results
.
extend
(
list
(
res
))
# the dataloader may pad some samples
ordered_results
=
ordered_results
[:
size
]
# remove tmp dir
shutil
.
rmtree
(
tmpdir
)
return
ordered_results
def
collect_results_gpu
(
result_part
,
size
):
collect_results_cpu
(
result_part
,
size
)
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/apis/train.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
.mmdet_train
import
custom_train_detector
from
mmseg.apis
import
train_segmentor
from
mmdet.apis
import
train_detector
def
custom_train_model
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
eval_model
=
None
,
meta
=
None
):
"""A function wrapper for launching model training according to cfg.
Because we need different eval_hook in runner. Should be deprecated in the
future.
"""
if
cfg
.
model
.
type
in
[
'EncoderDecoder3D'
]:
assert
False
else
:
custom_train_detector
(
model
,
dataset
,
cfg
,
distributed
=
distributed
,
validate
=
validate
,
timestamp
=
timestamp
,
eval_model
=
eval_model
,
meta
=
meta
)
def
train_model
(
model
,
dataset
,
cfg
,
distributed
=
False
,
validate
=
False
,
timestamp
=
None
,
meta
=
None
):
"""A function wrapper for launching model training according to cfg.
Because we need different eval_hook in runner. Should be deprecated in the
future.
"""
if
cfg
.
model
.
type
in
[
'EncoderDecoder3D'
]:
train_segmentor
(
model
,
dataset
,
cfg
,
distributed
=
distributed
,
validate
=
validate
,
timestamp
=
timestamp
,
meta
=
meta
)
else
:
train_detector
(
model
,
dataset
,
cfg
,
distributed
=
distributed
,
validate
=
validate
,
timestamp
=
timestamp
,
meta
=
meta
)
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py
0 → 100644
View file @
19472568
from
.bevformer_head
import
BEVFormerHead
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
copy
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
Linear
,
bias_init_with_prob
from
mmcv.utils
import
TORCH_VERSION
,
digit_version
from
mmdet.core
import
(
multi_apply
,
multi_apply
,
reduce_mean
)
from
mmdet.models.utils.transformer
import
inverse_sigmoid
from
mmdet.models
import
HEADS
from
mmdet.models.dense_heads
import
DETRHead
from
mmdet3d.core.bbox.coders
import
build_bbox_coder
from
projects.mmdet3d_plugin.core.bbox.util
import
normalize_bbox
from
mmcv.cnn.bricks.transformer
import
build_positional_encoding
from
mmcv.runner
import
force_fp32
,
auto_fp16
from
projects.mmdet3d_plugin.models.utils.bricks
import
run_time
import
numpy
as
np
import
mmcv
import
cv2
as
cv
from
projects.mmdet3d_plugin.models.utils.visual
import
save_tensor
@
HEADS
.
register_module
()
class
BEVFormerHead
(
DETRHead
):
"""Head of Detr3D.
Args:
with_box_refine (bool): Whether to refine the reference points
in the decoder. Defaults to False.
as_two_stage (bool) : Whether to generate the proposal from
the outputs of encoder.
transformer (obj:`ConfigDict`): ConfigDict is used for building
the Encoder and Decoder.
bev_h, bev_w (int): spatial shape of BEV queries.
"""
def
__init__
(
self
,
*
args
,
with_box_refine
=
False
,
as_two_stage
=
False
,
transformer
=
None
,
bbox_coder
=
None
,
num_cls_fcs
=
2
,
code_weights
=
None
,
bev_h
=
30
,
bev_w
=
30
,
**
kwargs
):
self
.
bev_h
=
bev_h
self
.
bev_w
=
bev_w
self
.
fp16_enabled
=
False
self
.
with_box_refine
=
with_box_refine
self
.
as_two_stage
=
as_two_stage
if
self
.
as_two_stage
:
transformer
[
'as_two_stage'
]
=
self
.
as_two_stage
if
'code_size'
in
kwargs
:
self
.
code_size
=
kwargs
[
'code_size'
]
else
:
self
.
code_size
=
10
if
code_weights
is
not
None
:
self
.
code_weights
=
code_weights
else
:
self
.
code_weights
=
[
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.2
,
0.2
]
self
.
bbox_coder
=
build_bbox_coder
(
bbox_coder
)
self
.
pc_range
=
self
.
bbox_coder
.
pc_range
self
.
real_w
=
self
.
pc_range
[
3
]
-
self
.
pc_range
[
0
]
self
.
real_h
=
self
.
pc_range
[
4
]
-
self
.
pc_range
[
1
]
self
.
num_cls_fcs
=
num_cls_fcs
-
1
super
(
BEVFormerHead
,
self
).
__init__
(
*
args
,
transformer
=
transformer
,
**
kwargs
)
self
.
code_weights
=
nn
.
Parameter
(
torch
.
tensor
(
self
.
code_weights
,
requires_grad
=
False
),
requires_grad
=
False
)
def
_init_layers
(
self
):
"""Initialize classification branch and regression branch of head."""
cls_branch
=
[]
for
_
in
range
(
self
.
num_reg_fcs
):
cls_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
embed_dims
))
cls_branch
.
append
(
nn
.
LayerNorm
(
self
.
embed_dims
))
cls_branch
.
append
(
nn
.
ReLU
(
inplace
=
True
))
cls_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
cls_out_channels
))
fc_cls
=
nn
.
Sequential
(
*
cls_branch
)
reg_branch
=
[]
for
_
in
range
(
self
.
num_reg_fcs
):
reg_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
embed_dims
))
reg_branch
.
append
(
nn
.
ReLU
())
reg_branch
.
append
(
Linear
(
self
.
embed_dims
,
self
.
code_size
))
reg_branch
=
nn
.
Sequential
(
*
reg_branch
)
def
_get_clones
(
module
,
N
):
return
nn
.
ModuleList
([
copy
.
deepcopy
(
module
)
for
i
in
range
(
N
)])
# last reg_branch is used to generate proposal from
# encode feature map when as_two_stage is True.
num_pred
=
(
self
.
transformer
.
decoder
.
num_layers
+
1
)
if
\
self
.
as_two_stage
else
self
.
transformer
.
decoder
.
num_layers
if
self
.
with_box_refine
:
self
.
cls_branches
=
_get_clones
(
fc_cls
,
num_pred
)
self
.
reg_branches
=
_get_clones
(
reg_branch
,
num_pred
)
else
:
self
.
cls_branches
=
nn
.
ModuleList
(
[
fc_cls
for
_
in
range
(
num_pred
)])
self
.
reg_branches
=
nn
.
ModuleList
(
[
reg_branch
for
_
in
range
(
num_pred
)])
if
not
self
.
as_two_stage
:
self
.
bev_embedding
=
nn
.
Embedding
(
self
.
bev_h
*
self
.
bev_w
,
self
.
embed_dims
)
self
.
query_embedding
=
nn
.
Embedding
(
self
.
num_query
,
self
.
embed_dims
*
2
)
def
init_weights
(
self
):
"""Initialize weights of the DeformDETR head."""
self
.
transformer
.
init_weights
()
if
self
.
loss_cls
.
use_sigmoid
:
bias_init
=
bias_init_with_prob
(
0.01
)
for
m
in
self
.
cls_branches
:
nn
.
init
.
constant_
(
m
[
-
1
].
bias
,
bias_init
)
@
auto_fp16
(
apply_to
=
(
'mlvl_feats'
))
def
forward
(
self
,
mlvl_feats
,
img_metas
,
prev_bev
=
None
,
only_bev
=
False
):
"""Forward function.
Args:
mlvl_feats (tuple[Tensor]): Features from the upstream
network, each is a 5D-tensor with shape
(B, N, C, H, W).
prev_bev: previous bev featues
only_bev: only compute BEV features with encoder.
Returns:
all_cls_scores (Tensor): Outputs from the classification head, \
shape [nb_dec, bs, num_query, cls_out_channels]. Note \
cls_out_channels should includes background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression \
head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \
Shape [nb_dec, bs, num_query, 9].
"""
bs
,
num_cam
,
_
,
_
,
_
=
mlvl_feats
[
0
].
shape
dtype
=
mlvl_feats
[
0
].
dtype
object_query_embeds
=
self
.
query_embedding
.
weight
.
to
(
dtype
)
bev_queries
=
self
.
bev_embedding
.
weight
.
to
(
dtype
)
bev_mask
=
torch
.
zeros
((
bs
,
self
.
bev_h
,
self
.
bev_w
),
device
=
bev_queries
.
device
).
to
(
dtype
)
bev_pos
=
self
.
positional_encoding
(
bev_mask
).
to
(
dtype
)
if
only_bev
:
# only use encoder to obtain BEV features, TODO: refine the workaround
return
self
.
transformer
.
get_bev_features
(
mlvl_feats
,
bev_queries
,
self
.
bev_h
,
self
.
bev_w
,
grid_length
=
(
self
.
real_h
/
self
.
bev_h
,
self
.
real_w
/
self
.
bev_w
),
bev_pos
=
bev_pos
,
img_metas
=
img_metas
,
prev_bev
=
prev_bev
,
)
else
:
outputs
=
self
.
transformer
(
mlvl_feats
,
bev_queries
,
object_query_embeds
,
self
.
bev_h
,
self
.
bev_w
,
grid_length
=
(
self
.
real_h
/
self
.
bev_h
,
self
.
real_w
/
self
.
bev_w
),
bev_pos
=
bev_pos
,
reg_branches
=
self
.
reg_branches
if
self
.
with_box_refine
else
None
,
# noqa:E501
cls_branches
=
self
.
cls_branches
if
self
.
as_two_stage
else
None
,
img_metas
=
img_metas
,
prev_bev
=
prev_bev
)
bev_embed
,
hs
,
init_reference
,
inter_references
=
outputs
hs
=
hs
.
permute
(
0
,
2
,
1
,
3
)
outputs_classes
=
[]
outputs_coords
=
[]
for
lvl
in
range
(
hs
.
shape
[
0
]):
if
lvl
==
0
:
reference
=
init_reference
else
:
reference
=
inter_references
[
lvl
-
1
]
reference
=
inverse_sigmoid
(
reference
)
outputs_class
=
self
.
cls_branches
[
lvl
](
hs
[
lvl
])
tmp
=
self
.
reg_branches
[
lvl
](
hs
[
lvl
])
# TODO: check the shape of reference
assert
reference
.
shape
[
-
1
]
==
3
tmp
[...,
0
:
2
]
+=
reference
[...,
0
:
2
]
tmp
[...,
0
:
2
]
=
tmp
[...,
0
:
2
].
sigmoid
()
tmp
[...,
4
:
5
]
+=
reference
[...,
2
:
3
]
tmp
[...,
4
:
5
]
=
tmp
[...,
4
:
5
].
sigmoid
()
tmp
[...,
0
:
1
]
=
(
tmp
[...,
0
:
1
]
*
(
self
.
pc_range
[
3
]
-
self
.
pc_range
[
0
])
+
self
.
pc_range
[
0
])
tmp
[...,
1
:
2
]
=
(
tmp
[...,
1
:
2
]
*
(
self
.
pc_range
[
4
]
-
self
.
pc_range
[
1
])
+
self
.
pc_range
[
1
])
tmp
[...,
4
:
5
]
=
(
tmp
[...,
4
:
5
]
*
(
self
.
pc_range
[
5
]
-
self
.
pc_range
[
2
])
+
self
.
pc_range
[
2
])
# TODO: check if using sigmoid
outputs_coord
=
tmp
outputs_classes
.
append
(
outputs_class
)
outputs_coords
.
append
(
outputs_coord
)
outputs_classes
=
torch
.
stack
(
outputs_classes
)
outputs_coords
=
torch
.
stack
(
outputs_coords
)
outs
=
{
'bev_embed'
:
bev_embed
,
'all_cls_scores'
:
outputs_classes
,
'all_bbox_preds'
:
outputs_coords
,
'enc_cls_scores'
:
None
,
'enc_bbox_preds'
:
None
,
}
return
outs
def
_get_target_single
(
self
,
cls_score
,
bbox_pred
,
gt_labels
,
gt_bboxes
,
gt_bboxes_ignore
=
None
):
""""Compute regression and classification targets for one image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_score (Tensor): Box score logits from a single decoder layer
for one image. Shape [num_query, cls_out_channels].
bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
for one image, with normalized coordinate (cx, cy, w, h) and
shape [num_query, 4].
gt_bboxes (Tensor): Ground truth bboxes for one image with
shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (Tensor): Ground truth class indices for one image
with shape (num_gts, ).
gt_bboxes_ignore (Tensor, optional): Bounding boxes
which can be ignored. Default None.
Returns:
tuple[Tensor]: a tuple containing the following for one image.
- labels (Tensor): Labels of each image.
- label_weights (Tensor]): Label weights of each image.
- bbox_targets (Tensor): BBox targets of each image.
- bbox_weights (Tensor): BBox weights of each image.
- pos_inds (Tensor): Sampled positive indices for each image.
- neg_inds (Tensor): Sampled negative indices for each image.
"""
num_bboxes
=
bbox_pred
.
size
(
0
)
# assigner and sampler
gt_c
=
gt_bboxes
.
shape
[
-
1
]
assign_result
=
self
.
assigner
.
assign
(
bbox_pred
,
cls_score
,
gt_bboxes
,
gt_labels
,
gt_bboxes_ignore
)
sampling_result
=
self
.
sampler
.
sample
(
assign_result
,
bbox_pred
,
gt_bboxes
)
pos_inds
=
sampling_result
.
pos_inds
neg_inds
=
sampling_result
.
neg_inds
# label targets
labels
=
gt_bboxes
.
new_full
((
num_bboxes
,),
self
.
num_classes
,
dtype
=
torch
.
long
)
labels
[
pos_inds
]
=
gt_labels
[
sampling_result
.
pos_assigned_gt_inds
]
label_weights
=
gt_bboxes
.
new_ones
(
num_bboxes
)
# bbox targets
bbox_targets
=
torch
.
zeros_like
(
bbox_pred
)[...,
:
gt_c
]
bbox_weights
=
torch
.
zeros_like
(
bbox_pred
)
bbox_weights
[
pos_inds
]
=
1.0
# DETR
bbox_targets
[
pos_inds
]
=
sampling_result
.
pos_gt_bboxes
return
(
labels
,
label_weights
,
bbox_targets
,
bbox_weights
,
pos_inds
,
neg_inds
)
def
get_targets
(
self
,
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
gt_bboxes_ignore_list
=
None
):
""""Compute regression and classification targets for a batch image.
Outputs from a single decoder layer of a single feature level are used.
Args:
cls_scores_list (list[Tensor]): Box score logits from a single
decoder layer for each image with shape [num_query,
cls_out_channels].
bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
decoder layer for each image, with normalized coordinate
(cx, cy, w, h) and shape [num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
tuple: a tuple containing the following targets.
- labels_list (list[Tensor]): Labels for all images.
- label_weights_list (list[Tensor]): Label weights for all \
images.
- bbox_targets_list (list[Tensor]): BBox targets for all \
images.
- bbox_weights_list (list[Tensor]): BBox weights for all \
images.
- num_total_pos (int): Number of positive samples in all \
images.
- num_total_neg (int): Number of negative samples in all \
images.
"""
assert
gt_bboxes_ignore_list
is
None
,
\
'Only supports for gt_bboxes_ignore setting to None.'
num_imgs
=
len
(
cls_scores_list
)
gt_bboxes_ignore_list
=
[
gt_bboxes_ignore_list
for
_
in
range
(
num_imgs
)
]
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
pos_inds_list
,
neg_inds_list
)
=
multi_apply
(
self
.
_get_target_single
,
cls_scores_list
,
bbox_preds_list
,
gt_labels_list
,
gt_bboxes_list
,
gt_bboxes_ignore_list
)
num_total_pos
=
sum
((
inds
.
numel
()
for
inds
in
pos_inds_list
))
num_total_neg
=
sum
((
inds
.
numel
()
for
inds
in
neg_inds_list
))
return
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
num_total_pos
,
num_total_neg
)
def
loss_single
(
self
,
cls_scores
,
bbox_preds
,
gt_bboxes_list
,
gt_labels_list
,
gt_bboxes_ignore_list
=
None
):
""""Loss function for outputs from a single decoder layer of a single
feature level.
Args:
cls_scores (Tensor): Box score logits from a single decoder layer
for all images. Shape [bs, num_query, cls_out_channels].
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
for all images, with normalized coordinate (cx, cy, w, h) and
shape [bs, num_query, 4].
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
gt_bboxes_ignore_list (list[Tensor], optional): Bounding
boxes which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components for outputs from
a single decoder layer.
"""
num_imgs
=
cls_scores
.
size
(
0
)
cls_scores_list
=
[
cls_scores
[
i
]
for
i
in
range
(
num_imgs
)]
bbox_preds_list
=
[
bbox_preds
[
i
]
for
i
in
range
(
num_imgs
)]
cls_reg_targets
=
self
.
get_targets
(
cls_scores_list
,
bbox_preds_list
,
gt_bboxes_list
,
gt_labels_list
,
gt_bboxes_ignore_list
)
(
labels_list
,
label_weights_list
,
bbox_targets_list
,
bbox_weights_list
,
num_total_pos
,
num_total_neg
)
=
cls_reg_targets
labels
=
torch
.
cat
(
labels_list
,
0
)
label_weights
=
torch
.
cat
(
label_weights_list
,
0
)
bbox_targets
=
torch
.
cat
(
bbox_targets_list
,
0
)
bbox_weights
=
torch
.
cat
(
bbox_weights_list
,
0
)
# classification loss
cls_scores
=
cls_scores
.
reshape
(
-
1
,
self
.
cls_out_channels
)
# construct weighted avg_factor to match with the official DETR repo
cls_avg_factor
=
num_total_pos
*
1.0
+
\
num_total_neg
*
self
.
bg_cls_weight
if
self
.
sync_cls_avg_factor
:
cls_avg_factor
=
reduce_mean
(
cls_scores
.
new_tensor
([
cls_avg_factor
]))
cls_avg_factor
=
max
(
cls_avg_factor
,
1
)
loss_cls
=
self
.
loss_cls
(
cls_scores
,
labels
,
label_weights
,
avg_factor
=
cls_avg_factor
)
# Compute the average number of gt boxes accross all gpus, for
# normalization purposes
num_total_pos
=
loss_cls
.
new_tensor
([
num_total_pos
])
num_total_pos
=
torch
.
clamp
(
reduce_mean
(
num_total_pos
),
min
=
1
).
item
()
# regression L1 loss
bbox_preds
=
bbox_preds
.
reshape
(
-
1
,
bbox_preds
.
size
(
-
1
))
normalized_bbox_targets
=
normalize_bbox
(
bbox_targets
,
self
.
pc_range
)
isnotnan
=
torch
.
isfinite
(
normalized_bbox_targets
).
all
(
dim
=-
1
)
bbox_weights
=
bbox_weights
*
self
.
code_weights
loss_bbox
=
self
.
loss_bbox
(
bbox_preds
[
isnotnan
,
:
10
],
normalized_bbox_targets
[
isnotnan
,
:
10
],
bbox_weights
[
isnotnan
,
:
10
],
avg_factor
=
num_total_pos
)
if
digit_version
(
TORCH_VERSION
)
>=
digit_version
(
'1.8'
):
loss_cls
=
torch
.
nan_to_num
(
loss_cls
)
loss_bbox
=
torch
.
nan_to_num
(
loss_bbox
)
return
loss_cls
,
loss_bbox
@
force_fp32
(
apply_to
=
(
'preds_dicts'
))
def
loss
(
self
,
gt_bboxes_list
,
gt_labels_list
,
preds_dicts
,
gt_bboxes_ignore
=
None
,
img_metas
=
None
):
""""Loss function.
Args:
gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels_list (list[Tensor]): Ground truth class indices for each
image with shape (num_gts, ).
preds_dicts:
all_cls_scores (Tensor): Classification score of all
decoder layers, has shape
[nb_dec, bs, num_query, cls_out_channels].
all_bbox_preds (Tensor): Sigmoid regression
outputs of all decode layers. Each is a 4D-tensor with
normalized coordinate format (cx, cy, w, h) and shape
[nb_dec, bs, num_query, 4].
enc_cls_scores (Tensor): Classification scores of
points on encode feature map , has shape
(N, h*w, num_classes). Only be passed when as_two_stage is
True, otherwise is None.
enc_bbox_preds (Tensor): Regression results of each points
on the encode feature map, has shape (N, h*w, 4). Only be
passed when as_two_stage is True, otherwise is None.
gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
which can be ignored for each image. Default None.
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
assert
gt_bboxes_ignore
is
None
,
\
f
'
{
self
.
__class__
.
__name__
}
only supports '
\
f
'for gt_bboxes_ignore setting to None.'
all_cls_scores
=
preds_dicts
[
'all_cls_scores'
]
all_bbox_preds
=
preds_dicts
[
'all_bbox_preds'
]
enc_cls_scores
=
preds_dicts
[
'enc_cls_scores'
]
enc_bbox_preds
=
preds_dicts
[
'enc_bbox_preds'
]
num_dec_layers
=
len
(
all_cls_scores
)
device
=
gt_labels_list
[
0
].
device
gt_bboxes_list
=
[
torch
.
cat
(
(
gt_bboxes
.
gravity_center
,
gt_bboxes
.
tensor
[:,
3
:]),
dim
=
1
).
to
(
device
)
for
gt_bboxes
in
gt_bboxes_list
]
all_gt_bboxes_list
=
[
gt_bboxes_list
for
_
in
range
(
num_dec_layers
)]
all_gt_labels_list
=
[
gt_labels_list
for
_
in
range
(
num_dec_layers
)]
all_gt_bboxes_ignore_list
=
[
gt_bboxes_ignore
for
_
in
range
(
num_dec_layers
)
]
losses_cls
,
losses_bbox
=
multi_apply
(
self
.
loss_single
,
all_cls_scores
,
all_bbox_preds
,
all_gt_bboxes_list
,
all_gt_labels_list
,
all_gt_bboxes_ignore_list
)
loss_dict
=
dict
()
# loss of proposal generated from encode feature map.
if
enc_cls_scores
is
not
None
:
binary_labels_list
=
[
torch
.
zeros_like
(
gt_labels_list
[
i
])
for
i
in
range
(
len
(
all_gt_labels_list
))
]
enc_loss_cls
,
enc_losses_bbox
=
\
self
.
loss_single
(
enc_cls_scores
,
enc_bbox_preds
,
gt_bboxes_list
,
binary_labels_list
,
gt_bboxes_ignore
)
loss_dict
[
'enc_loss_cls'
]
=
enc_loss_cls
loss_dict
[
'enc_loss_bbox'
]
=
enc_losses_bbox
# loss from the last decoder layer
loss_dict
[
'loss_cls'
]
=
losses_cls
[
-
1
]
loss_dict
[
'loss_bbox'
]
=
losses_bbox
[
-
1
]
# loss from other decoder layers
num_dec_layer
=
0
for
loss_cls_i
,
loss_bbox_i
in
zip
(
losses_cls
[:
-
1
],
losses_bbox
[:
-
1
]):
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_cls'
]
=
loss_cls_i
loss_dict
[
f
'd
{
num_dec_layer
}
.loss_bbox'
]
=
loss_bbox_i
num_dec_layer
+=
1
return
loss_dict
@
force_fp32
(
apply_to
=
(
'preds_dicts'
))
def
get_bboxes
(
self
,
preds_dicts
,
img_metas
,
rescale
=
False
):
"""Generate bboxes from bbox head predictions.
Args:
preds_dicts (tuple[list[dict]]): Prediction results.
img_metas (list[dict]): Point cloud and image's meta info.
Returns:
list[dict]: Decoded bbox, scores and labels after nms.
"""
preds_dicts
=
self
.
bbox_coder
.
decode
(
preds_dicts
)
num_samples
=
len
(
preds_dicts
)
ret_list
=
[]
for
i
in
range
(
num_samples
):
preds
=
preds_dicts
[
i
]
bboxes
=
preds
[
'bboxes'
]
bboxes
[:,
2
]
=
bboxes
[:,
2
]
-
bboxes
[:,
5
]
*
0.5
code_size
=
bboxes
.
shape
[
-
1
]
bboxes
=
img_metas
[
i
][
'box_type_3d'
](
bboxes
,
code_size
)
scores
=
preds
[
'scores'
]
labels
=
preds
[
'labels'
]
ret_list
.
append
([
bboxes
,
scores
,
labels
])
return
ret_list
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/__init__.py
0 → 100644
View file @
19472568
from
.bevformer
import
BEVFormer
from
.bevformer_fp16
import
BEVFormer_fp16
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
tkinter.messagebox
import
NO
import
torch
from
mmcv.runner
import
force_fp32
,
auto_fp16
from
mmdet.models
import
DETECTORS
from
mmdet3d.core
import
bbox3d2result
from
mmdet3d.models.detectors.mvx_two_stage
import
MVXTwoStageDetector
from
projects.mmdet3d_plugin.models.utils.grid_mask
import
GridMask
import
time
import
copy
import
numpy
as
np
import
mmdet3d
from
projects.mmdet3d_plugin.models.utils.bricks
import
run_time
@
DETECTORS
.
register_module
()
class
BEVFormer
(
MVXTwoStageDetector
):
"""BEVFormer.
Args:
video_test_mode (bool): Decide whether to use temporal information during inference.
"""
def
__init__
(
self
,
use_grid_mask
=
False
,
pts_voxel_layer
=
None
,
pts_voxel_encoder
=
None
,
pts_middle_encoder
=
None
,
pts_fusion_layer
=
None
,
img_backbone
=
None
,
pts_backbone
=
None
,
img_neck
=
None
,
pts_neck
=
None
,
pts_bbox_head
=
None
,
img_roi_head
=
None
,
img_rpn_head
=
None
,
train_cfg
=
None
,
test_cfg
=
None
,
pretrained
=
None
,
video_test_mode
=
False
):
super
(
BEVFormer
,
self
).
__init__
(
pts_voxel_layer
,
pts_voxel_encoder
,
pts_middle_encoder
,
pts_fusion_layer
,
img_backbone
,
pts_backbone
,
img_neck
,
pts_neck
,
pts_bbox_head
,
img_roi_head
,
img_rpn_head
,
train_cfg
,
test_cfg
,
pretrained
)
self
.
grid_mask
=
GridMask
(
True
,
True
,
rotate
=
1
,
offset
=
False
,
ratio
=
0.5
,
mode
=
1
,
prob
=
0.7
)
self
.
use_grid_mask
=
use_grid_mask
self
.
fp16_enabled
=
False
# temporal
self
.
video_test_mode
=
video_test_mode
self
.
prev_frame_info
=
{
'prev_bev'
:
None
,
'scene_token'
:
None
,
'prev_pos'
:
0
,
'prev_angle'
:
0
,
}
def
extract_img_feat
(
self
,
img
,
img_metas
,
len_queue
=
None
):
"""Extract features of images."""
B
=
img
.
size
(
0
)
if
img
is
not
None
:
# input_shape = img.shape[-2:]
# # update real input shape of each single img
# for img_meta in img_metas:
# img_meta.update(input_shape=input_shape)
if
img
.
dim
()
==
5
and
img
.
size
(
0
)
==
1
:
img
.
squeeze_
()
elif
img
.
dim
()
==
5
and
img
.
size
(
0
)
>
1
:
B
,
N
,
C
,
H
,
W
=
img
.
size
()
img
=
img
.
reshape
(
B
*
N
,
C
,
H
,
W
)
if
self
.
use_grid_mask
:
img
=
self
.
grid_mask
(
img
)
img_feats
=
self
.
img_backbone
(
img
)
if
isinstance
(
img_feats
,
dict
):
img_feats
=
list
(
img_feats
.
values
())
else
:
return
None
if
self
.
with_img_neck
:
img_feats
=
self
.
img_neck
(
img_feats
)
img_feats_reshaped
=
[]
for
img_feat
in
img_feats
:
BN
,
C
,
H
,
W
=
img_feat
.
size
()
if
len_queue
is
not
None
:
img_feats_reshaped
.
append
(
img_feat
.
view
(
int
(
B
/
len_queue
),
len_queue
,
int
(
BN
/
B
),
C
,
H
,
W
))
else
:
img_feats_reshaped
.
append
(
img_feat
.
view
(
B
,
int
(
BN
/
B
),
C
,
H
,
W
))
return
img_feats_reshaped
@
auto_fp16
(
apply_to
=
(
'img'
))
def
extract_feat
(
self
,
img
,
img_metas
=
None
,
len_queue
=
None
):
"""Extract features from images and points."""
img_feats
=
self
.
extract_img_feat
(
img
,
img_metas
,
len_queue
=
len_queue
)
return
img_feats
def
forward_pts_train
(
self
,
pts_feats
,
gt_bboxes_3d
,
gt_labels_3d
,
img_metas
,
gt_bboxes_ignore
=
None
,
prev_bev
=
None
):
"""Forward function'
Args:
pts_feats (list[torch.Tensor]): Features of point cloud branch
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sampole
img_metas (list[dict]): Meta information of samples.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
prev_bev (torch.Tensor, optional): BEV features of previous frame.
Returns:
dict: Losses of each branch.
"""
outs
=
self
.
pts_bbox_head
(
pts_feats
,
img_metas
,
prev_bev
)
loss_inputs
=
[
gt_bboxes_3d
,
gt_labels_3d
,
outs
]
losses
=
self
.
pts_bbox_head
.
loss
(
*
loss_inputs
,
img_metas
=
img_metas
)
return
losses
def
forward_dummy
(
self
,
img
):
dummy_metas
=
None
return
self
.
forward_test
(
img
=
img
,
img_metas
=
[[
dummy_metas
]])
def
forward
(
self
,
return_loss
=
True
,
**
kwargs
):
"""Calls either forward_train or forward_test depending on whether
return_loss=True.
Note this setting will change the expected inputs. When
`return_loss=True`, img and img_metas are single-nested (i.e.
torch.Tensor and list[dict]), and when `resturn_loss=False`, img and
img_metas should be double nested (i.e. list[torch.Tensor],
list[list[dict]]), with the outer list indicating test time
augmentations.
"""
if
return_loss
:
return
self
.
forward_train
(
**
kwargs
)
else
:
return
self
.
forward_test
(
**
kwargs
)
def
obtain_history_bev
(
self
,
imgs_queue
,
img_metas_list
):
"""Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated.
"""
self
.
eval
()
with
torch
.
no_grad
():
prev_bev
=
None
bs
,
len_queue
,
num_cams
,
C
,
H
,
W
=
imgs_queue
.
shape
imgs_queue
=
imgs_queue
.
reshape
(
bs
*
len_queue
,
num_cams
,
C
,
H
,
W
)
img_feats_list
=
self
.
extract_feat
(
img
=
imgs_queue
,
len_queue
=
len_queue
)
for
i
in
range
(
len_queue
):
img_metas
=
[
each
[
i
]
for
each
in
img_metas_list
]
# img_feats = self.extract_feat(img=img, img_metas=img_metas)
img_feats
=
[
each_scale
[:,
i
]
for
each_scale
in
img_feats_list
]
prev_bev
=
self
.
pts_bbox_head
(
img_feats
,
img_metas
,
prev_bev
,
only_bev
=
True
)
self
.
train
()
return
prev_bev
@
auto_fp16
(
apply_to
=
(
'img'
,
'points'
))
def
forward_train
(
self
,
points
=
None
,
img_metas
=
None
,
gt_bboxes_3d
=
None
,
gt_labels_3d
=
None
,
gt_labels
=
None
,
gt_bboxes
=
None
,
img
=
None
,
proposals
=
None
,
gt_bboxes_ignore
=
None
,
img_depth
=
None
,
img_mask
=
None
,
):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals ([list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
len_queue
=
img
.
size
(
1
)
prev_img
=
img
[:,
:
-
1
,
...]
img
=
img
[:,
-
1
,
...]
prev_img_metas
=
copy
.
deepcopy
(
img_metas
)
prev_bev
=
self
.
obtain_history_bev
(
prev_img
,
prev_img_metas
)
img_metas
=
[
each
[
len_queue
-
1
]
for
each
in
img_metas
]
img_feats
=
self
.
extract_feat
(
img
=
img
,
img_metas
=
img_metas
)
losses
=
dict
()
losses_pts
=
self
.
forward_pts_train
(
img_feats
,
gt_bboxes_3d
,
gt_labels_3d
,
img_metas
,
gt_bboxes_ignore
,
prev_bev
)
losses
.
update
(
losses_pts
)
return
losses
def
forward_test
(
self
,
img_metas
,
img
=
None
,
**
kwargs
):
for
var
,
name
in
[(
img_metas
,
'img_metas'
)]:
if
not
isinstance
(
var
,
list
):
raise
TypeError
(
'{} must be a list, but got {}'
.
format
(
name
,
type
(
var
)))
img
=
[
img
]
if
img
is
None
else
img
if
img_metas
[
0
][
0
][
'scene_token'
]
!=
self
.
prev_frame_info
[
'scene_token'
]:
# the first sample of each scene is truncated
self
.
prev_frame_info
[
'prev_bev'
]
=
None
# update idx
self
.
prev_frame_info
[
'scene_token'
]
=
img_metas
[
0
][
0
][
'scene_token'
]
# do not use temporal information
if
not
self
.
video_test_mode
:
self
.
prev_frame_info
[
'prev_bev'
]
=
None
# Get the delta of ego position and angle between two timestamps.
tmp_pos
=
copy
.
deepcopy
(
img_metas
[
0
][
0
][
'can_bus'
][:
3
])
tmp_angle
=
copy
.
deepcopy
(
img_metas
[
0
][
0
][
'can_bus'
][
-
1
])
if
self
.
prev_frame_info
[
'prev_bev'
]
is
not
None
:
img_metas
[
0
][
0
][
'can_bus'
][:
3
]
-=
self
.
prev_frame_info
[
'prev_pos'
]
img_metas
[
0
][
0
][
'can_bus'
][
-
1
]
-=
self
.
prev_frame_info
[
'prev_angle'
]
else
:
img_metas
[
0
][
0
][
'can_bus'
][
-
1
]
=
0
img_metas
[
0
][
0
][
'can_bus'
][:
3
]
=
0
new_prev_bev
,
bbox_results
=
self
.
simple_test
(
img_metas
[
0
],
img
[
0
],
prev_bev
=
self
.
prev_frame_info
[
'prev_bev'
],
**
kwargs
)
# During inference, we save the BEV features and ego motion of each timestamp.
self
.
prev_frame_info
[
'prev_pos'
]
=
tmp_pos
self
.
prev_frame_info
[
'prev_angle'
]
=
tmp_angle
self
.
prev_frame_info
[
'prev_bev'
]
=
new_prev_bev
return
bbox_results
def
simple_test_pts
(
self
,
x
,
img_metas
,
prev_bev
=
None
,
rescale
=
False
):
"""Test function"""
outs
=
self
.
pts_bbox_head
(
x
,
img_metas
,
prev_bev
=
prev_bev
)
bbox_list
=
self
.
pts_bbox_head
.
get_bboxes
(
outs
,
img_metas
,
rescale
=
rescale
)
bbox_results
=
[
bbox3d2result
(
bboxes
,
scores
,
labels
)
for
bboxes
,
scores
,
labels
in
bbox_list
]
return
outs
[
'bev_embed'
],
bbox_results
def
simple_test
(
self
,
img_metas
,
img
=
None
,
prev_bev
=
None
,
rescale
=
False
):
"""Test function without augmentaiton."""
img_feats
=
self
.
extract_feat
(
img
=
img
,
img_metas
=
img_metas
)
bbox_list
=
[
dict
()
for
i
in
range
(
len
(
img_metas
))]
new_prev_bev
,
bbox_pts
=
self
.
simple_test_pts
(
img_feats
,
img_metas
,
prev_bev
,
rescale
=
rescale
)
for
result_dict
,
pts_bbox
in
zip
(
bbox_list
,
bbox_pts
):
result_dict
[
'pts_bbox'
]
=
pts_bbox
return
new_prev_bev
,
bbox_list
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
tkinter.messagebox
import
NO
import
torch
from
mmcv.runner
import
force_fp32
,
auto_fp16
from
mmdet.models
import
DETECTORS
from
mmdet3d.core
import
bbox3d2result
from
mmdet3d.models.detectors.mvx_two_stage
import
MVXTwoStageDetector
from
projects.mmdet3d_plugin.models.utils.grid_mask
import
GridMask
from
projects.mmdet3d_plugin.bevformer.detectors.bevformer
import
BEVFormer
import
time
import
copy
import
numpy
as
np
import
mmdet3d
from
projects.mmdet3d_plugin.models.utils.bricks
import
run_time
@
DETECTORS
.
register_module
()
class
BEVFormer_fp16
(
BEVFormer
):
"""
The default version BEVFormer currently can not support FP16.
We provide this version to resolve this issue.
"""
@
auto_fp16
(
apply_to
=
(
'img'
,
'prev_bev'
,
'points'
))
def
forward_train
(
self
,
points
=
None
,
img_metas
=
None
,
gt_bboxes_3d
=
None
,
gt_labels_3d
=
None
,
gt_labels
=
None
,
gt_bboxes
=
None
,
img
=
None
,
proposals
=
None
,
gt_bboxes_ignore
=
None
,
img_depth
=
None
,
img_mask
=
None
,
prev_bev
=
None
,
):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals ([list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats
=
self
.
extract_feat
(
img
=
img
,
img_metas
=
img_metas
)
losses
=
dict
()
losses_pts
=
self
.
forward_pts_train
(
img_feats
,
gt_bboxes_3d
,
gt_labels_3d
,
img_metas
,
gt_bboxes_ignore
,
prev_bev
=
prev_bev
)
losses
.
update
(
losses_pts
)
return
losses
def
val_step
(
self
,
data
,
optimizer
):
"""
In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`.
This is not the standard function of `val_step`.
"""
img
=
data
[
'img'
]
img_metas
=
data
[
'img_metas'
]
img_feats
=
self
.
extract_feat
(
img
=
img
,
img_metas
=
img_metas
)
prev_bev
=
data
.
get
(
'prev_bev'
,
None
)
prev_bev
=
self
.
pts_bbox_head
(
img_feats
,
img_metas
,
prev_bev
=
prev_bev
,
only_bev
=
True
)
return
prev_bev
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/hooks/__init__.py
0 → 100644
View file @
19472568
from
.custom_hooks
import
TransferWeight
\ No newline at end of file
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py
0 → 100644
View file @
19472568
from
mmcv.runner.hooks.hook
import
HOOKS
,
Hook
from
projects.mmdet3d_plugin.models.utils
import
run_time
@
HOOKS
.
register_module
()
class
TransferWeight
(
Hook
):
def
__init__
(
self
,
every_n_inters
=
1
):
self
.
every_n_inters
=
every_n_inters
def
after_train_iter
(
self
,
runner
):
if
self
.
every_n_inner_iters
(
runner
,
self
.
every_n_inters
):
runner
.
eval_model
.
load_state_dict
(
runner
.
model
.
state_dict
())
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/__init__.py
0 → 100644
View file @
19472568
from
.transformer
import
PerceptionTransformer
from
.spatial_cross_attention
import
SpatialCrossAttention
,
MSDeformableAttention3D
,
MSIPM3D
from
.temporal_self_attention
import
TemporalSelfAttention
from
.encoder
import
BEVFormerEncoder
,
BEVFormerLayer
from
.decoder
import
DetectionTransformerDecoder
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
copy
import
warnings
import
torch
import
torch.nn
as
nn
from
mmcv
import
ConfigDict
,
deprecated_api_warning
from
mmcv.cnn
import
Linear
,
build_activation_layer
,
build_norm_layer
from
mmcv.runner.base_module
import
BaseModule
,
ModuleList
,
Sequential
from
mmcv.cnn.bricks.registry
import
(
ATTENTION
,
FEEDFORWARD_NETWORK
,
POSITIONAL_ENCODING
,
TRANSFORMER_LAYER
,
TRANSFORMER_LAYER_SEQUENCE
)
# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
try
:
from
mmcv.ops.multi_scale_deform_attn
import
MultiScaleDeformableAttention
# noqa F401
warnings
.
warn
(
ImportWarning
(
'``MultiScaleDeformableAttention`` has been moved to '
'``mmcv.ops.multi_scale_deform_attn``, please change original path '
# noqa E501
'``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '
# noqa E501
'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '
# noqa E501
))
except
ImportError
:
warnings
.
warn
(
'Fail to import ``MultiScaleDeformableAttention`` from '
'``mmcv.ops.multi_scale_deform_attn``, '
'You should install ``mmcv-full`` if you need this module. '
)
from
mmcv.cnn.bricks.transformer
import
build_feedforward_network
,
build_attention
@
TRANSFORMER_LAYER
.
register_module
()
class
MyCustomBaseTransformerLayer
(
BaseModule
):
"""Base `TransformerLayer` for vision transformer.
It can be built from `mmcv.ConfigDict` and support more flexible
customization, for example, using any number of `FFN or LN ` and
use different kinds of `attention` by specifying a list of `ConfigDict`
named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
when you specifying `norm` as the first element of `operation_order`.
More details about the `prenorm`: `On Layer Normalization in the
Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for `self_attention` or `cross_attention` modules,
The order of the configs in the list should be consistent with
corresponding attentions in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config. Default: None.
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for FFN, The order of the configs in the list should be
consistent with corresponding ffn in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config.
operation_order (tuple[str]): The execution order of operation
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
Support `prenorm` when you specifying first element as `norm`.
Default:None.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): Key, Query and Value are shape
of (batch, n, embed_dim)
or (n, batch, embed_dim). Default to False.
"""
def
__init__
(
self
,
attn_cfgs
=
None
,
ffn_cfgs
=
dict
(
type
=
'FFN'
,
embed_dims
=
256
,
feedforward_channels
=
1024
,
num_fcs
=
2
,
ffn_drop
=
0.
,
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
),
operation_order
=
None
,
norm_cfg
=
dict
(
type
=
'LN'
),
init_cfg
=
None
,
batch_first
=
True
,
**
kwargs
):
deprecated_args
=
dict
(
feedforward_channels
=
'feedforward_channels'
,
ffn_dropout
=
'ffn_drop'
,
ffn_num_fcs
=
'num_fcs'
)
for
ori_name
,
new_name
in
deprecated_args
.
items
():
if
ori_name
in
kwargs
:
warnings
.
warn
(
f
'The arguments `
{
ori_name
}
` in BaseTransformerLayer '
f
'has been deprecated, now you should set `
{
new_name
}
` '
f
'and other FFN related arguments '
f
'to a dict named `ffn_cfgs`. '
)
ffn_cfgs
[
new_name
]
=
kwargs
[
ori_name
]
super
(
MyCustomBaseTransformerLayer
,
self
).
__init__
(
init_cfg
)
self
.
batch_first
=
batch_first
assert
set
(
operation_order
)
&
set
(
[
'self_attn'
,
'norm'
,
'ffn'
,
'cross_attn'
])
==
\
set
(
operation_order
),
f
'The operation_order of'
\
f
'
{
self
.
__class__
.
__name__
}
should '
\
f
'contains all four operation type '
\
f
"
{
[
'self_attn'
,
'norm'
,
'ffn'
,
'cross_attn'
]
}
"
num_attn
=
operation_order
.
count
(
'self_attn'
)
+
operation_order
.
count
(
'cross_attn'
)
if
isinstance
(
attn_cfgs
,
dict
):
attn_cfgs
=
[
copy
.
deepcopy
(
attn_cfgs
)
for
_
in
range
(
num_attn
)]
else
:
assert
num_attn
==
len
(
attn_cfgs
),
f
'The length '
\
f
'of attn_cfg
{
num_attn
}
is '
\
f
'not consistent with the number of attention'
\
f
'in operation_order
{
operation_order
}
.'
self
.
num_attn
=
num_attn
self
.
operation_order
=
operation_order
self
.
norm_cfg
=
norm_cfg
self
.
pre_norm
=
operation_order
[
0
]
==
'norm'
self
.
attentions
=
ModuleList
()
index
=
0
for
operation_name
in
operation_order
:
if
operation_name
in
[
'self_attn'
,
'cross_attn'
]:
if
'batch_first'
in
attn_cfgs
[
index
]:
assert
self
.
batch_first
==
attn_cfgs
[
index
][
'batch_first'
]
else
:
attn_cfgs
[
index
][
'batch_first'
]
=
self
.
batch_first
attention
=
build_attention
(
attn_cfgs
[
index
])
# Some custom attentions used as `self_attn`
# or `cross_attn` can have different behavior.
attention
.
operation_name
=
operation_name
self
.
attentions
.
append
(
attention
)
index
+=
1
self
.
embed_dims
=
self
.
attentions
[
0
].
embed_dims
self
.
ffns
=
ModuleList
()
num_ffns
=
operation_order
.
count
(
'ffn'
)
if
isinstance
(
ffn_cfgs
,
dict
):
ffn_cfgs
=
ConfigDict
(
ffn_cfgs
)
if
isinstance
(
ffn_cfgs
,
dict
):
ffn_cfgs
=
[
copy
.
deepcopy
(
ffn_cfgs
)
for
_
in
range
(
num_ffns
)]
assert
len
(
ffn_cfgs
)
==
num_ffns
for
ffn_index
in
range
(
num_ffns
):
if
'embed_dims'
not
in
ffn_cfgs
[
ffn_index
]:
ffn_cfgs
[
'embed_dims'
]
=
self
.
embed_dims
else
:
assert
ffn_cfgs
[
ffn_index
][
'embed_dims'
]
==
self
.
embed_dims
self
.
ffns
.
append
(
build_feedforward_network
(
ffn_cfgs
[
ffn_index
]))
self
.
norms
=
ModuleList
()
num_norms
=
operation_order
.
count
(
'norm'
)
for
_
in
range
(
num_norms
):
self
.
norms
.
append
(
build_norm_layer
(
norm_cfg
,
self
.
embed_dims
)[
1
])
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
query_pos
=
None
,
key_pos
=
None
,
attn_masks
=
None
,
query_key_padding_mask
=
None
,
key_padding_mask
=
None
,
**
kwargs
):
"""Forward function for `TransformerDecoderLayer`.
**kwargs contains some specific arguments of attentions.
Args:
query (Tensor): The input query with shape
[num_queries, bs, embed_dims] if
self.batch_first is False, else
[bs, num_queries embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
value (Tensor): The value tensor with same shape as `key`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor] | None): 2D Tensor used in
calculation of corresponding attention. The length of
it should equal to the number of `attention` in
`operation_order`. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in `self_attn` layer.
Defaults to None.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
norm_index
=
0
attn_index
=
0
ffn_index
=
0
identity
=
query
if
attn_masks
is
None
:
attn_masks
=
[
None
for
_
in
range
(
self
.
num_attn
)]
elif
isinstance
(
attn_masks
,
torch
.
Tensor
):
attn_masks
=
[
copy
.
deepcopy
(
attn_masks
)
for
_
in
range
(
self
.
num_attn
)
]
warnings
.
warn
(
f
'Use same attn_mask in all attentions in '
f
'
{
self
.
__class__
.
__name__
}
'
)
else
:
assert
len
(
attn_masks
)
==
self
.
num_attn
,
f
'The length of '
\
f
'attn_masks
{
len
(
attn_masks
)
}
must be equal '
\
f
'to the number of attention in '
\
f
'operation_order
{
self
.
num_attn
}
'
for
layer
in
self
.
operation_order
:
if
layer
==
'self_attn'
:
temp_key
=
temp_value
=
query
query
=
self
.
attentions
[
attn_index
](
query
,
temp_key
,
temp_value
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
query_pos
,
key_pos
=
query_pos
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
query_key_padding_mask
,
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'norm'
:
query
=
self
.
norms
[
norm_index
](
query
)
norm_index
+=
1
elif
layer
==
'cross_attn'
:
query
=
self
.
attentions
[
attn_index
](
query
,
key
,
value
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
query_pos
,
key_pos
=
key_pos
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
key_padding_mask
,
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'ffn'
:
query
=
self
.
ffns
[
ffn_index
](
query
,
identity
if
self
.
pre_norm
else
None
)
ffn_index
+=
1
return
query
@
TRANSFORMER_LAYER
.
register_module
()
class
MyCustomBaseTransformerLayerWithoutSelfAttn
(
BaseModule
):
"""Base `TransformerLayer` for vision transformer.
It can be built from `mmcv.ConfigDict` and support more flexible
customization, for example, using any number of `FFN or LN ` and
use different kinds of `attention` by specifying a list of `ConfigDict`
named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
when you specifying `norm` as the first element of `operation_order`.
More details about the `prenorm`: `On Layer Normalization in the
Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for `self_attention` or `cross_attention` modules,
The order of the configs in the list should be consistent with
corresponding attentions in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config. Default: None.
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for FFN, The order of the configs in the list should be
consistent with corresponding ffn in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config.
operation_order (tuple[str]): The execution order of operation
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
Support `prenorm` when you specifying first element as `norm`.
Default:None.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): Key, Query and Value are shape
of (batch, n, embed_dim)
or (n, batch, embed_dim). Default to False.
"""
def
__init__
(
self
,
attn_cfgs
=
None
,
ffn_cfgs
=
dict
(
type
=
'FFN'
,
embed_dims
=
256
,
feedforward_channels
=
1024
,
num_fcs
=
2
,
ffn_drop
=
0.
,
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
),
operation_order
=
None
,
norm_cfg
=
dict
(
type
=
'LN'
),
init_cfg
=
None
,
batch_first
=
True
,
**
kwargs
):
deprecated_args
=
dict
(
feedforward_channels
=
'feedforward_channels'
,
ffn_dropout
=
'ffn_drop'
,
ffn_num_fcs
=
'num_fcs'
)
for
ori_name
,
new_name
in
deprecated_args
.
items
():
if
ori_name
in
kwargs
:
warnings
.
warn
(
f
'The arguments `
{
ori_name
}
` in BaseTransformerLayer '
f
'has been deprecated, now you should set `
{
new_name
}
` '
f
'and other FFN related arguments '
f
'to a dict named `ffn_cfgs`. '
)
ffn_cfgs
[
new_name
]
=
kwargs
[
ori_name
]
super
(
MyCustomBaseTransformerLayerWithoutSelfAttn
,
self
).
__init__
(
init_cfg
)
self
.
batch_first
=
batch_first
assert
set
(
operation_order
)
&
set
(
[
'norm'
,
'ffn'
,
'cross_attn'
])
==
\
set
(
operation_order
),
f
'The operation_order of'
\
f
'
{
self
.
__class__
.
__name__
}
should '
\
f
'contains all three operation type '
\
f
"
{
[
'norm'
,
'ffn'
,
'cross_attn'
]
}
"
num_attn
=
operation_order
.
count
(
'cross_attn'
)
if
isinstance
(
attn_cfgs
,
dict
):
attn_cfgs
=
[
copy
.
deepcopy
(
attn_cfgs
)
for
_
in
range
(
num_attn
)]
else
:
assert
num_attn
==
len
(
attn_cfgs
),
f
'The length '
\
f
'of attn_cfg
{
num_attn
}
is '
\
f
'not consistent with the number of attention'
\
f
'in operation_order
{
operation_order
}
.'
self
.
num_attn
=
num_attn
self
.
operation_order
=
operation_order
self
.
norm_cfg
=
norm_cfg
self
.
pre_norm
=
operation_order
[
0
]
==
'norm'
self
.
attentions
=
ModuleList
()
index
=
0
for
operation_name
in
operation_order
:
if
operation_name
in
[
'self_attn'
,
'cross_attn'
]:
if
'batch_first'
in
attn_cfgs
[
index
]:
assert
self
.
batch_first
==
attn_cfgs
[
index
][
'batch_first'
]
else
:
attn_cfgs
[
index
][
'batch_first'
]
=
self
.
batch_first
attention
=
build_attention
(
attn_cfgs
[
index
])
# Some custom attentions used as `self_attn`
# or `cross_attn` can have different behavior.
attention
.
operation_name
=
operation_name
self
.
attentions
.
append
(
attention
)
index
+=
1
self
.
embed_dims
=
self
.
attentions
[
0
].
embed_dims
self
.
ffns
=
ModuleList
()
num_ffns
=
operation_order
.
count
(
'ffn'
)
if
isinstance
(
ffn_cfgs
,
dict
):
ffn_cfgs
=
ConfigDict
(
ffn_cfgs
)
if
isinstance
(
ffn_cfgs
,
dict
):
ffn_cfgs
=
[
copy
.
deepcopy
(
ffn_cfgs
)
for
_
in
range
(
num_ffns
)]
assert
len
(
ffn_cfgs
)
==
num_ffns
for
ffn_index
in
range
(
num_ffns
):
if
'embed_dims'
not
in
ffn_cfgs
[
ffn_index
]:
ffn_cfgs
[
'embed_dims'
]
=
self
.
embed_dims
else
:
assert
ffn_cfgs
[
ffn_index
][
'embed_dims'
]
==
self
.
embed_dims
self
.
ffns
.
append
(
build_feedforward_network
(
ffn_cfgs
[
ffn_index
]))
self
.
norms
=
ModuleList
()
num_norms
=
operation_order
.
count
(
'norm'
)
for
_
in
range
(
num_norms
):
self
.
norms
.
append
(
build_norm_layer
(
norm_cfg
,
self
.
embed_dims
)[
1
])
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
query_pos
=
None
,
key_pos
=
None
,
attn_masks
=
None
,
query_key_padding_mask
=
None
,
key_padding_mask
=
None
,
**
kwargs
):
"""Forward function for `TransformerDecoderLayer`.
**kwargs contains some specific arguments of attentions.
Args:
query (Tensor): The input query with shape
[num_queries, bs, embed_dims] if
self.batch_first is False, else
[bs, num_queries embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
value (Tensor): The value tensor with same shape as `key`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor] | None): 2D Tensor used in
calculation of corresponding attention. The length of
it should equal to the number of `attention` in
`operation_order`. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in `self_attn` layer.
Defaults to None.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
norm_index
=
0
attn_index
=
0
ffn_index
=
0
identity
=
query
if
attn_masks
is
None
:
attn_masks
=
[
None
for
_
in
range
(
self
.
num_attn
)]
elif
isinstance
(
attn_masks
,
torch
.
Tensor
):
attn_masks
=
[
copy
.
deepcopy
(
attn_masks
)
for
_
in
range
(
self
.
num_attn
)
]
warnings
.
warn
(
f
'Use same attn_mask in all attentions in '
f
'
{
self
.
__class__
.
__name__
}
'
)
else
:
assert
len
(
attn_masks
)
==
self
.
num_attn
,
f
'The length of '
\
f
'attn_masks
{
len
(
attn_masks
)
}
must be equal '
\
f
'to the number of attention in '
\
f
'operation_order
{
self
.
num_attn
}
'
for
layer
in
self
.
operation_order
:
if
layer
==
'self_attn'
:
temp_key
=
temp_value
=
query
query
=
self
.
attentions
[
attn_index
](
query
,
temp_key
,
temp_value
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
query_pos
,
key_pos
=
query_pos
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
query_key_padding_mask
,
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'norm'
:
query
=
self
.
norms
[
norm_index
](
query
)
norm_index
+=
1
elif
layer
==
'cross_attn'
:
query
=
self
.
attentions
[
attn_index
](
query
,
key
,
value
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
query_pos
,
key_pos
=
key_pos
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
key_padding_mask
,
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'ffn'
:
query
=
self
.
ffns
[
ffn_index
](
query
,
identity
if
self
.
pre_norm
else
None
)
ffn_index
+=
1
return
query
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/decoder.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
mmcv.ops.multi_scale_deform_attn
import
multi_scale_deformable_attn_pytorch
import
mmcv
import
cv2
as
cv
import
copy
import
warnings
from
matplotlib
import
pyplot
as
plt
import
numpy
as
np
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmcv.cnn
import
xavier_init
,
constant_init
from
mmcv.cnn.bricks.registry
import
(
ATTENTION
,
TRANSFORMER_LAYER_SEQUENCE
)
from
mmcv.cnn.bricks.transformer
import
TransformerLayerSequence
import
math
from
mmcv.runner.base_module
import
BaseModule
,
ModuleList
,
Sequential
from
mmcv.utils
import
(
ConfigDict
,
build_from_cfg
,
deprecated_api_warning
,
to_2tuple
)
from
mmcv.utils
import
ext_loader
from
.multi_scale_deformable_attn_function
import
MultiScaleDeformableAttnFunction_fp32
,
\
MultiScaleDeformableAttnFunction_fp16
ext_module
=
ext_loader
.
load_ext
(
'_ext'
,
[
'ms_deform_attn_backward'
,
'ms_deform_attn_forward'
])
def
inverse_sigmoid
(
x
,
eps
=
1e-5
):
"""Inverse function of sigmoid.
Args:
x (Tensor): The tensor to do the
inverse.
eps (float): EPS avoid numerical
overflow. Defaults 1e-5.
Returns:
Tensor: The x has passed the inverse
function of sigmoid, has same
shape with input.
"""
x
=
x
.
clamp
(
min
=
0
,
max
=
1
)
x1
=
x
.
clamp
(
min
=
eps
)
x2
=
(
1
-
x
).
clamp
(
min
=
eps
)
return
torch
.
log
(
x1
/
x2
)
@
TRANSFORMER_LAYER_SEQUENCE
.
register_module
()
class
DetectionTransformerDecoder
(
TransformerLayerSequence
):
"""Implements the decoder in DETR3D transformer.
Args:
return_intermediate (bool): Whether to return intermediate outputs.
coder_norm_cfg (dict): Config of last normalization layer. Default:
`LN`.
"""
def
__init__
(
self
,
*
args
,
return_intermediate
=
False
,
**
kwargs
):
super
(
DetectionTransformerDecoder
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
return_intermediate
=
return_intermediate
self
.
fp16_enabled
=
False
def
forward
(
self
,
query
,
*
args
,
reference_points
=
None
,
reg_branches
=
None
,
key_padding_mask
=
None
,
**
kwargs
):
"""Forward function for `Detr3DTransformerDecoder`.
Args:
query (Tensor): Input query with shape
`(num_query, bs, embed_dims)`.
reference_points (Tensor): The reference
points of offset. has shape
(bs, num_query, 4) when as_two_stage,
otherwise has shape ((bs, num_query, 2).
reg_branch: (obj:`nn.ModuleList`): Used for
refining the regression results. Only would
be passed when with_box_refine is True,
otherwise would be passed a `None`.
Returns:
Tensor: Results with shape [1, num_query, bs, embed_dims] when
return_intermediate is `False`, otherwise it has shape
[num_layers, num_query, bs, embed_dims].
"""
output
=
query
intermediate
=
[]
intermediate_reference_points
=
[]
for
lid
,
layer
in
enumerate
(
self
.
layers
):
reference_points_input
=
reference_points
[...,
:
2
].
unsqueeze
(
2
)
# BS NUM_QUERY NUM_LEVEL 2
output
=
layer
(
output
,
*
args
,
reference_points
=
reference_points_input
,
key_padding_mask
=
key_padding_mask
,
**
kwargs
)
output
=
output
.
permute
(
1
,
0
,
2
)
if
reg_branches
is
not
None
:
tmp
=
reg_branches
[
lid
](
output
)
assert
reference_points
.
shape
[
-
1
]
==
3
new_reference_points
=
torch
.
zeros_like
(
reference_points
)
new_reference_points
[...,
:
2
]
=
tmp
[
...,
:
2
]
+
inverse_sigmoid
(
reference_points
[...,
:
2
])
new_reference_points
[...,
2
:
3
]
=
tmp
[
...,
4
:
5
]
+
inverse_sigmoid
(
reference_points
[...,
2
:
3
])
new_reference_points
=
new_reference_points
.
sigmoid
()
reference_points
=
new_reference_points
.
detach
()
output
=
output
.
permute
(
1
,
0
,
2
)
if
self
.
return_intermediate
:
intermediate
.
append
(
output
)
intermediate_reference_points
.
append
(
reference_points
)
if
self
.
return_intermediate
:
return
torch
.
stack
(
intermediate
),
torch
.
stack
(
intermediate_reference_points
)
return
output
,
reference_points
@
ATTENTION
.
register_module
()
class
CustomMSDeformableAttention
(
BaseModule
):
"""An attention module used in Deformable-Detr.
`Deformable DETR: Deformable Transformers for End-to-End Object Detection.
<https://arxiv.org/pdf/2010.04159.pdf>`_.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
num_heads (int): Parallel attention heads. Default: 64.
num_levels (int): The number of feature map used in
Attention. Default: 4.
num_points (int): The number of sampling points for
each query in each head. Default: 4.
im2col_step (int): The step used in image_to_column.
Default: 64.
dropout (float): A Dropout layer on `inp_identity`.
Default: 0.1.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default to False.
norm_cfg (dict): Config dict for normalization layer.
Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def
__init__
(
self
,
embed_dims
=
256
,
num_heads
=
8
,
num_levels
=
4
,
num_points
=
4
,
im2col_step
=
64
,
dropout
=
0.1
,
batch_first
=
False
,
norm_cfg
=
None
,
init_cfg
=
None
):
super
().
__init__
(
init_cfg
)
if
embed_dims
%
num_heads
!=
0
:
raise
ValueError
(
f
'embed_dims must be divisible by num_heads, '
f
'but got
{
embed_dims
}
and
{
num_heads
}
'
)
dim_per_head
=
embed_dims
//
num_heads
self
.
norm_cfg
=
norm_cfg
self
.
dropout
=
nn
.
Dropout
(
dropout
)
self
.
batch_first
=
batch_first
self
.
fp16_enabled
=
False
# you'd better set dim_per_head to a power of 2
# which is more efficient in the CUDA implementation
def
_is_power_of_2
(
n
):
if
(
not
isinstance
(
n
,
int
))
or
(
n
<
0
):
raise
ValueError
(
'invalid input for _is_power_of_2: {} (type: {})'
.
format
(
n
,
type
(
n
)))
return
(
n
&
(
n
-
1
)
==
0
)
and
n
!=
0
if
not
_is_power_of_2
(
dim_per_head
):
warnings
.
warn
(
"You'd better set embed_dims in "
'MultiScaleDeformAttention to make '
'the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.'
)
self
.
im2col_step
=
im2col_step
self
.
embed_dims
=
embed_dims
self
.
num_levels
=
num_levels
self
.
num_heads
=
num_heads
self
.
num_points
=
num_points
self
.
sampling_offsets
=
nn
.
Linear
(
embed_dims
,
num_heads
*
num_levels
*
num_points
*
2
)
self
.
attention_weights
=
nn
.
Linear
(
embed_dims
,
num_heads
*
num_levels
*
num_points
)
self
.
value_proj
=
nn
.
Linear
(
embed_dims
,
embed_dims
)
self
.
output_proj
=
nn
.
Linear
(
embed_dims
,
embed_dims
)
self
.
init_weights
()
def
init_weights
(
self
):
"""Default initialization for Parameters of Module."""
constant_init
(
self
.
sampling_offsets
,
0.
)
thetas
=
torch
.
arange
(
self
.
num_heads
,
dtype
=
torch
.
float32
)
*
(
2.0
*
math
.
pi
/
self
.
num_heads
)
grid_init
=
torch
.
stack
([
thetas
.
cos
(),
thetas
.
sin
()],
-
1
)
grid_init
=
(
grid_init
/
grid_init
.
abs
().
max
(
-
1
,
keepdim
=
True
)[
0
]).
view
(
self
.
num_heads
,
1
,
1
,
2
).
repeat
(
1
,
self
.
num_levels
,
self
.
num_points
,
1
)
for
i
in
range
(
self
.
num_points
):
grid_init
[:,
:,
i
,
:]
*=
i
+
1
self
.
sampling_offsets
.
bias
.
data
=
grid_init
.
view
(
-
1
)
constant_init
(
self
.
attention_weights
,
val
=
0.
,
bias
=
0.
)
xavier_init
(
self
.
value_proj
,
distribution
=
'uniform'
,
bias
=
0.
)
xavier_init
(
self
.
output_proj
,
distribution
=
'uniform'
,
bias
=
0.
)
self
.
_is_init
=
True
@
deprecated_api_warning
({
'residual'
:
'identity'
},
cls_name
=
'MultiScaleDeformableAttention'
)
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
identity
=
None
,
query_pos
=
None
,
key_padding_mask
=
None
,
reference_points
=
None
,
spatial_shapes
=
None
,
level_start_index
=
None
,
flag
=
'decoder'
,
**
kwargs
):
"""Forward Function of MultiScaleDeformAttention.
Args:
query (Tensor): Query of Transformer with shape
(num_query, bs, embed_dims).
key (Tensor): The key tensor with shape
`(num_key, bs, embed_dims)`.
value (Tensor): The value tensor with shape
`(num_key, bs, embed_dims)`.
identity (Tensor): The tensor used for addition, with the
same shape as `query`. Default None. If None,
`query` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, num_levels, 2),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different levels. With shape (num_levels, 2),
last dimension represents (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape ``(num_levels, )`` and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
Returns:
Tensor: forwarded results with shape [num_query, bs, embed_dims].
"""
if
value
is
None
:
value
=
query
if
identity
is
None
:
identity
=
query
if
query_pos
is
not
None
:
query
=
query
+
query_pos
if
not
self
.
batch_first
:
# change to (bs, num_query ,embed_dims)
query
=
query
.
permute
(
1
,
0
,
2
)
value
=
value
.
permute
(
1
,
0
,
2
)
bs
,
num_query
,
_
=
query
.
shape
bs
,
num_value
,
_
=
value
.
shape
assert
(
spatial_shapes
[:,
0
]
*
spatial_shapes
[:,
1
]).
sum
()
==
num_value
value
=
self
.
value_proj
(
value
)
if
key_padding_mask
is
not
None
:
value
=
value
.
masked_fill
(
key_padding_mask
[...,
None
],
0.0
)
value
=
value
.
view
(
bs
,
num_value
,
self
.
num_heads
,
-
1
)
sampling_offsets
=
self
.
sampling_offsets
(
query
).
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
,
self
.
num_points
,
2
)
attention_weights
=
self
.
attention_weights
(
query
).
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
*
self
.
num_points
)
attention_weights
=
attention_weights
.
softmax
(
-
1
)
attention_weights
=
attention_weights
.
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
,
self
.
num_points
)
if
reference_points
.
shape
[
-
1
]
==
2
:
offset_normalizer
=
torch
.
stack
(
[
spatial_shapes
[...,
1
],
spatial_shapes
[...,
0
]],
-
1
)
sampling_locations
=
reference_points
[:,
:,
None
,
:,
None
,
:]
\
+
sampling_offsets
\
/
offset_normalizer
[
None
,
None
,
None
,
:,
None
,
:]
elif
reference_points
.
shape
[
-
1
]
==
4
:
sampling_locations
=
reference_points
[:,
:,
None
,
:,
None
,
:
2
]
\
+
sampling_offsets
/
self
.
num_points
\
*
reference_points
[:,
:,
None
,
:,
None
,
2
:]
\
*
0.5
else
:
raise
ValueError
(
f
'Last dim of reference_points must be'
f
' 2 or 4, but get
{
reference_points
.
shape
[
-
1
]
}
instead.'
)
if
torch
.
cuda
.
is_available
()
and
value
.
is_cuda
:
# using fp16 deformable attention is unstable because it performs many sum operations
if
value
.
dtype
==
torch
.
float16
:
MultiScaleDeformableAttnFunction
=
MultiScaleDeformableAttnFunction_fp32
else
:
MultiScaleDeformableAttnFunction
=
MultiScaleDeformableAttnFunction_fp32
output
=
MultiScaleDeformableAttnFunction
.
apply
(
value
,
spatial_shapes
,
level_start_index
,
sampling_locations
,
attention_weights
,
self
.
im2col_step
)
else
:
output
=
multi_scale_deformable_attn_pytorch
(
value
,
spatial_shapes
,
sampling_locations
,
attention_weights
)
output
=
self
.
output_proj
(
output
)
if
not
self
.
batch_first
:
# (num_query, bs ,embed_dims)
output
=
output
.
permute
(
1
,
0
,
2
)
return
self
.
dropout
(
output
)
+
identity
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/encoder.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from
projects.mmdet3d_plugin.models.utils.bricks
import
run_time
from
projects.mmdet3d_plugin.models.utils.visual
import
save_tensor
from
.custom_base_transformer_layer
import
MyCustomBaseTransformerLayer
import
copy
import
warnings
from
mmcv.cnn.bricks.registry
import
(
ATTENTION
,
TRANSFORMER_LAYER
,
TRANSFORMER_LAYER_SEQUENCE
)
from
mmcv.cnn.bricks.transformer
import
TransformerLayerSequence
from
mmcv.runner
import
force_fp32
,
auto_fp16
import
numpy
as
np
import
torch
import
cv2
as
cv
import
mmcv
from
mmcv.utils
import
TORCH_VERSION
,
digit_version
from
mmcv.utils
import
ext_loader
ext_module
=
ext_loader
.
load_ext
(
'_ext'
,
[
'ms_deform_attn_backward'
,
'ms_deform_attn_forward'
])
@
TRANSFORMER_LAYER_SEQUENCE
.
register_module
()
class
BEVFormerEncoder
(
TransformerLayerSequence
):
"""
Attention with both self and cross
Implements the decoder in DETR transformer.
Args:
return_intermediate (bool): Whether to return intermediate outputs.
coder_norm_cfg (dict): Config of last normalization layer. Default:
`LN`.
"""
def
__init__
(
self
,
*
args
,
pc_range
=
None
,
num_points_in_pillar
=
4
,
return_intermediate
=
False
,
dataset_type
=
'nuscenes'
,
**
kwargs
):
super
(
BEVFormerEncoder
,
self
).
__init__
(
*
args
,
**
kwargs
)
self
.
return_intermediate
=
return_intermediate
self
.
num_points_in_pillar
=
num_points_in_pillar
self
.
pc_range
=
pc_range
self
.
fp16_enabled
=
False
@
staticmethod
def
get_reference_points
(
H
,
W
,
Z
=
8
,
num_points_in_pillar
=
4
,
dim
=
'3d'
,
bs
=
1
,
device
=
'cuda'
,
dtype
=
torch
.
float
):
"""Get the reference points used in SCA and TSA.
Args:
H, W: spatial shape of bev.
Z: hight of pillar.
D: sample D points uniformly from each pillar.
device (obj:`device`): The device where
reference_points should be.
Returns:
Tensor: reference points used in decoder, has \
shape (bs, num_keys, num_levels, 2).
"""
# reference points in 3D space, used in spatial cross-attention (SCA)
if
dim
==
'3d'
:
zs
=
torch
.
linspace
(
0.5
,
Z
-
0.5
,
num_points_in_pillar
,
dtype
=
dtype
,
device
=
device
).
view
(
-
1
,
1
,
1
).
expand
(
num_points_in_pillar
,
H
,
W
)
/
Z
xs
=
torch
.
linspace
(
0.5
,
W
-
0.5
,
W
,
dtype
=
dtype
,
device
=
device
).
view
(
1
,
1
,
W
).
expand
(
num_points_in_pillar
,
H
,
W
)
/
W
ys
=
torch
.
linspace
(
0.5
,
H
-
0.5
,
H
,
dtype
=
dtype
,
device
=
device
).
view
(
1
,
H
,
1
).
expand
(
num_points_in_pillar
,
H
,
W
)
/
H
ref_3d
=
torch
.
stack
((
xs
,
ys
,
zs
),
-
1
)
ref_3d
=
ref_3d
.
permute
(
0
,
3
,
1
,
2
).
flatten
(
2
).
permute
(
0
,
2
,
1
)
ref_3d
=
ref_3d
[
None
].
repeat
(
bs
,
1
,
1
,
1
)
return
ref_3d
# reference points on 2D bev plane, used in temporal self-attention (TSA).
elif
dim
==
'2d'
:
ref_y
,
ref_x
=
torch
.
meshgrid
(
torch
.
linspace
(
0.5
,
H
-
0.5
,
H
,
dtype
=
dtype
,
device
=
device
),
torch
.
linspace
(
0.5
,
W
-
0.5
,
W
,
dtype
=
dtype
,
device
=
device
)
)
ref_y
=
ref_y
.
reshape
(
-
1
)[
None
]
/
H
ref_x
=
ref_x
.
reshape
(
-
1
)[
None
]
/
W
ref_2d
=
torch
.
stack
((
ref_x
,
ref_y
),
-
1
)
ref_2d
=
ref_2d
.
repeat
(
bs
,
1
,
1
).
unsqueeze
(
2
)
return
ref_2d
# This function must use fp32!!!
@
force_fp32
(
apply_to
=
(
'reference_points'
,
'img_metas'
))
def
point_sampling
(
self
,
reference_points
,
pc_range
,
img_metas
):
lidar2img
=
[]
for
img_meta
in
img_metas
:
lidar2img
.
append
(
img_meta
[
'lidar2img'
])
lidar2img
=
np
.
asarray
(
lidar2img
)
lidar2img
=
reference_points
.
new_tensor
(
lidar2img
)
# (B, N, 4, 4)
reference_points
=
reference_points
.
clone
()
reference_points
[...,
0
:
1
]
=
reference_points
[...,
0
:
1
]
*
\
(
pc_range
[
3
]
-
pc_range
[
0
])
+
pc_range
[
0
]
reference_points
[...,
1
:
2
]
=
reference_points
[...,
1
:
2
]
*
\
(
pc_range
[
4
]
-
pc_range
[
1
])
+
pc_range
[
1
]
reference_points
[...,
2
:
3
]
=
reference_points
[...,
2
:
3
]
*
\
(
pc_range
[
5
]
-
pc_range
[
2
])
+
pc_range
[
2
]
reference_points
=
torch
.
cat
(
(
reference_points
,
torch
.
ones_like
(
reference_points
[...,
:
1
])),
-
1
)
reference_points
=
reference_points
.
permute
(
1
,
0
,
2
,
3
)
D
,
B
,
num_query
=
reference_points
.
size
()[:
3
]
num_cam
=
lidar2img
.
size
(
1
)
reference_points
=
reference_points
.
view
(
D
,
B
,
1
,
num_query
,
4
).
repeat
(
1
,
1
,
num_cam
,
1
,
1
).
unsqueeze
(
-
1
)
lidar2img
=
lidar2img
.
view
(
1
,
B
,
num_cam
,
1
,
4
,
4
).
repeat
(
D
,
1
,
1
,
num_query
,
1
,
1
)
reference_points_cam
=
torch
.
matmul
(
lidar2img
.
to
(
torch
.
float32
),
reference_points
.
to
(
torch
.
float32
)).
squeeze
(
-
1
)
eps
=
1e-5
bev_mask
=
(
reference_points_cam
[...,
2
:
3
]
>
eps
)
reference_points_cam
=
reference_points_cam
[...,
0
:
2
]
/
torch
.
maximum
(
reference_points_cam
[...,
2
:
3
],
torch
.
ones_like
(
reference_points_cam
[...,
2
:
3
])
*
eps
)
reference_points_cam
[...,
0
]
/=
img_metas
[
0
][
'img_shape'
][
0
][
1
]
reference_points_cam
[...,
1
]
/=
img_metas
[
0
][
'img_shape'
][
0
][
0
]
bev_mask
=
(
bev_mask
&
(
reference_points_cam
[...,
1
:
2
]
>
0.0
)
&
(
reference_points_cam
[...,
1
:
2
]
<
1.0
)
&
(
reference_points_cam
[...,
0
:
1
]
<
1.0
)
&
(
reference_points_cam
[...,
0
:
1
]
>
0.0
))
if
digit_version
(
TORCH_VERSION
)
>=
digit_version
(
'1.8'
):
bev_mask
=
torch
.
nan_to_num
(
bev_mask
)
else
:
bev_mask
=
bev_mask
.
new_tensor
(
np
.
nan_to_num
(
bev_mask
.
cpu
().
numpy
()))
reference_points_cam
=
reference_points_cam
.
permute
(
2
,
1
,
3
,
0
,
4
)
bev_mask
=
bev_mask
.
permute
(
2
,
1
,
3
,
0
,
4
).
squeeze
(
-
1
)
return
reference_points_cam
,
bev_mask
@
auto_fp16
()
def
forward
(
self
,
bev_query
,
key
,
value
,
*
args
,
bev_h
=
None
,
bev_w
=
None
,
bev_pos
=
None
,
spatial_shapes
=
None
,
level_start_index
=
None
,
valid_ratios
=
None
,
prev_bev
=
None
,
shift
=
0.
,
**
kwargs
):
"""Forward function for `TransformerDecoder`.
Args:
bev_query (Tensor): Input BEV query with shape
`(num_query, bs, embed_dims)`.
key & value (Tensor): Input multi-cameta features with shape
(num_cam, num_value, bs, embed_dims)
reference_points (Tensor): The reference
points of offset. has shape
(bs, num_query, 4) when as_two_stage,
otherwise has shape ((bs, num_query, 2).
valid_ratios (Tensor): The radios of valid
points on the feature map, has shape
(bs, num_levels, 2)
Returns:
Tensor: Results with shape [1, num_query, bs, embed_dims] when
return_intermediate is `False`, otherwise it has shape
[num_layers, num_query, bs, embed_dims].
"""
output
=
bev_query
intermediate
=
[]
ref_3d
=
self
.
get_reference_points
(
bev_h
,
bev_w
,
self
.
pc_range
[
5
]
-
self
.
pc_range
[
2
],
self
.
num_points_in_pillar
,
dim
=
'3d'
,
bs
=
bev_query
.
size
(
1
),
device
=
bev_query
.
device
,
dtype
=
bev_query
.
dtype
)
ref_2d
=
self
.
get_reference_points
(
bev_h
,
bev_w
,
dim
=
'2d'
,
bs
=
bev_query
.
size
(
1
),
device
=
bev_query
.
device
,
dtype
=
bev_query
.
dtype
)
reference_points_cam
,
bev_mask
=
self
.
point_sampling
(
ref_3d
,
self
.
pc_range
,
kwargs
[
'img_metas'
])
# bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper.
# shift_ref_2d = ref_2d # .clone()
shift_ref_2d
=
ref_2d
.
clone
()
shift_ref_2d
+=
shift
[:,
None
,
None
,
:]
# (num_query, bs, embed_dims) -> (bs, num_query, embed_dims)
bev_query
=
bev_query
.
permute
(
1
,
0
,
2
)
bev_pos
=
bev_pos
.
permute
(
1
,
0
,
2
)
bs
,
len_bev
,
num_bev_level
,
_
=
ref_2d
.
shape
if
prev_bev
is
not
None
:
prev_bev
=
prev_bev
.
permute
(
1
,
0
,
2
)
prev_bev
=
torch
.
stack
(
[
prev_bev
,
bev_query
],
1
).
reshape
(
bs
*
2
,
len_bev
,
-
1
)
hybird_ref_2d
=
torch
.
stack
([
shift_ref_2d
,
ref_2d
],
1
).
reshape
(
bs
*
2
,
len_bev
,
num_bev_level
,
2
)
else
:
hybird_ref_2d
=
torch
.
stack
([
ref_2d
,
ref_2d
],
1
).
reshape
(
bs
*
2
,
len_bev
,
num_bev_level
,
2
)
for
lid
,
layer
in
enumerate
(
self
.
layers
):
output
=
layer
(
bev_query
,
key
,
value
,
*
args
,
bev_pos
=
bev_pos
,
ref_2d
=
hybird_ref_2d
,
ref_3d
=
ref_3d
,
bev_h
=
bev_h
,
bev_w
=
bev_w
,
spatial_shapes
=
spatial_shapes
,
level_start_index
=
level_start_index
,
reference_points_cam
=
reference_points_cam
,
bev_mask
=
bev_mask
,
prev_bev
=
prev_bev
,
**
kwargs
)
bev_query
=
output
if
self
.
return_intermediate
:
intermediate
.
append
(
output
)
if
self
.
return_intermediate
:
return
torch
.
stack
(
intermediate
)
return
output
@
TRANSFORMER_LAYER
.
register_module
()
class
BEVFormerLayer
(
MyCustomBaseTransformerLayer
):
"""Implements decoder layer in DETR transformer.
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )):
Configs for self_attention or cross_attention, the order
should be consistent with it in `operation_order`. If it is
a dict, it would be expand to the number of attention in
`operation_order`.
feedforward_channels (int): The hidden dimension for FFNs.
ffn_dropout (float): Probability of an element to be zeroed
in ffn. Default 0.0.
operation_order (tuple[str]): The execution order of operation
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
Default:None
act_cfg (dict): The activation config for FFNs. Default: `LN`
norm_cfg (dict): Config dict for normalization layer.
Default: `LN`.
ffn_num_fcs (int): The number of fully-connected layers in FFNs.
Default:2.
"""
def
__init__
(
self
,
attn_cfgs
,
feedforward_channels
,
ffn_dropout
=
0.0
,
operation_order
=
None
,
act_cfg
=
dict
(
type
=
'ReLU'
,
inplace
=
True
),
norm_cfg
=
dict
(
type
=
'LN'
),
ffn_num_fcs
=
2
,
**
kwargs
):
super
(
BEVFormerLayer
,
self
).
__init__
(
attn_cfgs
=
attn_cfgs
,
feedforward_channels
=
feedforward_channels
,
ffn_dropout
=
ffn_dropout
,
operation_order
=
operation_order
,
act_cfg
=
act_cfg
,
norm_cfg
=
norm_cfg
,
ffn_num_fcs
=
ffn_num_fcs
,
**
kwargs
)
self
.
fp16_enabled
=
False
assert
len
(
operation_order
)
==
6
assert
set
(
operation_order
)
==
set
(
[
'self_attn'
,
'norm'
,
'cross_attn'
,
'ffn'
])
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
bev_pos
=
None
,
query_pos
=
None
,
key_pos
=
None
,
attn_masks
=
None
,
query_key_padding_mask
=
None
,
key_padding_mask
=
None
,
ref_2d
=
None
,
ref_3d
=
None
,
bev_h
=
None
,
bev_w
=
None
,
reference_points_cam
=
None
,
mask
=
None
,
spatial_shapes
=
None
,
level_start_index
=
None
,
prev_bev
=
None
,
**
kwargs
):
"""Forward function for `TransformerDecoderLayer`.
**kwargs contains some specific arguments of attentions.
Args:
query (Tensor): The input query with shape
[num_queries, bs, embed_dims] if
self.batch_first is False, else
[bs, num_queries embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
value (Tensor): The value tensor with same shape as `key`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor] | None): 2D Tensor used in
calculation of corresponding attention. The length of
it should equal to the number of `attention` in
`operation_order`. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in `self_attn` layer.
Defaults to None.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
norm_index
=
0
attn_index
=
0
ffn_index
=
0
identity
=
query
if
attn_masks
is
None
:
attn_masks
=
[
None
for
_
in
range
(
self
.
num_attn
)]
elif
isinstance
(
attn_masks
,
torch
.
Tensor
):
attn_masks
=
[
copy
.
deepcopy
(
attn_masks
)
for
_
in
range
(
self
.
num_attn
)
]
warnings
.
warn
(
f
'Use same attn_mask in all attentions in '
f
'
{
self
.
__class__
.
__name__
}
'
)
else
:
assert
len
(
attn_masks
)
==
self
.
num_attn
,
f
'The length of '
\
f
'attn_masks
{
len
(
attn_masks
)
}
must be equal '
\
f
'to the number of attention in '
\
f
'operation_order
{
self
.
num_attn
}
'
for
layer
in
self
.
operation_order
:
# temporal self attention
if
layer
==
'self_attn'
:
query
=
self
.
attentions
[
attn_index
](
query
,
prev_bev
,
prev_bev
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
bev_pos
,
key_pos
=
bev_pos
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
query_key_padding_mask
,
reference_points
=
ref_2d
,
spatial_shapes
=
torch
.
tensor
(
[[
bev_h
,
bev_w
]],
device
=
query
.
device
),
level_start_index
=
torch
.
tensor
([
0
],
device
=
query
.
device
),
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'norm'
:
query
=
self
.
norms
[
norm_index
](
query
)
norm_index
+=
1
# spaital cross attention
elif
layer
==
'cross_attn'
:
query
=
self
.
attentions
[
attn_index
](
query
,
key
,
value
,
identity
if
self
.
pre_norm
else
None
,
query_pos
=
query_pos
,
key_pos
=
key_pos
,
reference_points
=
ref_3d
,
reference_points_cam
=
reference_points_cam
,
mask
=
mask
,
attn_mask
=
attn_masks
[
attn_index
],
key_padding_mask
=
key_padding_mask
,
spatial_shapes
=
spatial_shapes
,
level_start_index
=
level_start_index
,
**
kwargs
)
attn_index
+=
1
identity
=
query
elif
layer
==
'ffn'
:
query
=
self
.
ffns
[
ffn_index
](
query
,
identity
if
self
.
pre_norm
else
None
)
ffn_index
+=
1
return
query
docker-hub/MapTRv2/MapTR/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py
0 → 100644
View file @
19472568
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import
torch
from
torch.cuda.amp
import
custom_bwd
,
custom_fwd
from
torch.autograd.function
import
Function
,
once_differentiable
from
mmcv.utils
import
ext_loader
ext_module
=
ext_loader
.
load_ext
(
'_ext'
,
[
'ms_deform_attn_backward'
,
'ms_deform_attn_forward'
])
class
MultiScaleDeformableAttnFunction_fp16
(
Function
):
@
staticmethod
@
custom_fwd
(
cast_inputs
=
torch
.
float16
)
def
forward
(
ctx
,
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
im2col_step
):
"""GPU version of multi-scale deformable attention.
Args:
value (Tensor): The value has shape
(bs, num_keys, mum_heads, embed_dims//num_heads)
value_spatial_shapes (Tensor): Spatial shape of
each feature map, has shape (num_levels, 2),
last dimension 2 represent (h, w)
sampling_locations (Tensor): The location of sampling points,
has shape
(bs ,num_queries, num_heads, num_levels, num_points, 2),
the last dimension 2 represent (x, y).
attention_weights (Tensor): The weight of sampling points used
when calculate the attention, has shape
(bs ,num_queries, num_heads, num_levels, num_points),
im2col_step (Tensor): The step used in image to column.
Returns:
Tensor: has shape (bs, num_queries, embed_dims)
"""
ctx
.
im2col_step
=
im2col_step
output
=
ext_module
.
ms_deform_attn_forward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
im2col_step
=
ctx
.
im2col_step
)
ctx
.
save_for_backward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
)
return
output
@
staticmethod
@
once_differentiable
@
custom_bwd
def
backward
(
ctx
,
grad_output
):
"""GPU version of backward function.
Args:
grad_output (Tensor): Gradient
of output tensor of forward.
Returns:
Tuple[Tensor]: Gradient
of input tensors in forward.
"""
value
,
value_spatial_shapes
,
value_level_start_index
,
\
sampling_locations
,
attention_weights
=
ctx
.
saved_tensors
grad_value
=
torch
.
zeros_like
(
value
)
grad_sampling_loc
=
torch
.
zeros_like
(
sampling_locations
)
grad_attn_weight
=
torch
.
zeros_like
(
attention_weights
)
ext_module
.
ms_deform_attn_backward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
grad_output
.
contiguous
(),
grad_value
,
grad_sampling_loc
,
grad_attn_weight
,
im2col_step
=
ctx
.
im2col_step
)
return
grad_value
,
None
,
None
,
\
grad_sampling_loc
,
grad_attn_weight
,
None
class
MultiScaleDeformableAttnFunction_fp32
(
Function
):
@
staticmethod
@
custom_fwd
(
cast_inputs
=
torch
.
float32
)
def
forward
(
ctx
,
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
im2col_step
):
"""GPU version of multi-scale deformable attention.
Args:
value (Tensor): The value has shape
(bs, num_keys, mum_heads, embed_dims//num_heads)
value_spatial_shapes (Tensor): Spatial shape of
each feature map, has shape (num_levels, 2),
last dimension 2 represent (h, w)
sampling_locations (Tensor): The location of sampling points,
has shape
(bs ,num_queries, num_heads, num_levels, num_points, 2),
the last dimension 2 represent (x, y).
attention_weights (Tensor): The weight of sampling points used
when calculate the attention, has shape
(bs ,num_queries, num_heads, num_levels, num_points),
im2col_step (Tensor): The step used in image to column.
Returns:
Tensor: has shape (bs, num_queries, embed_dims)
"""
ctx
.
im2col_step
=
im2col_step
output
=
ext_module
.
ms_deform_attn_forward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
im2col_step
=
ctx
.
im2col_step
)
ctx
.
save_for_backward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
)
return
output
@
staticmethod
@
once_differentiable
@
custom_bwd
def
backward
(
ctx
,
grad_output
):
"""GPU version of backward function.
Args:
grad_output (Tensor): Gradient
of output tensor of forward.
Returns:
Tuple[Tensor]: Gradient
of input tensors in forward.
"""
value
,
value_spatial_shapes
,
value_level_start_index
,
\
sampling_locations
,
attention_weights
=
ctx
.
saved_tensors
grad_value
=
torch
.
zeros_like
(
value
)
grad_sampling_loc
=
torch
.
zeros_like
(
sampling_locations
)
grad_attn_weight
=
torch
.
zeros_like
(
attention_weights
)
ext_module
.
ms_deform_attn_backward
(
value
,
value_spatial_shapes
,
value_level_start_index
,
sampling_locations
,
attention_weights
,
grad_output
.
contiguous
(),
grad_value
,
grad_sampling_loc
,
grad_attn_weight
,
im2col_step
=
ctx
.
im2col_step
)
return
grad_value
,
None
,
None
,
\
grad_sampling_loc
,
grad_attn_weight
,
None
Prev
1
2
3
4
5
6
7
8
9
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment