lishj6 / BEVFomer / Commits / 4cd43886

Commit 4cd43886 authored Sep 01, 2025 by lishj6
init
parent a9a1fe81

Changes: 207
Showing 20 changed files with 2814 additions and 0 deletions (+2814, -0)
projects/mmdet3d_plugin/datasets/pipelines/formating.py              +39   -0
projects/mmdet3d_plugin/datasets/pipelines/loading.py                +0    -0
projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py           +330  -0
projects/mmdet3d_plugin/datasets/samplers/__init__.py                +4    -0
projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py     +41   -0
projects/mmdet3d_plugin/datasets/samplers/group_sampler.py           +110  -0
projects/mmdet3d_plugin/datasets/samplers/sampler.py                 +7    -0
projects/mmdet3d_plugin/dd3d/__init__.py                             +1    -0
projects/mmdet3d_plugin/dd3d/datasets/__init__.py                    +0    -0
projects/mmdet3d_plugin/dd3d/datasets/nuscenes.py                    +360  -0
projects/mmdet3d_plugin/dd3d/datasets/transform_utils.py             +136  -0
projects/mmdet3d_plugin/dd3d/layers/iou_loss.py                      +71   -0
projects/mmdet3d_plugin/dd3d/layers/normalization.py                 +40   -0
projects/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py                +80   -0
projects/mmdet3d_plugin/dd3d/modeling/__init__.py                    +1    -0
projects/mmdet3d_plugin/dd3d/modeling/core.py                        +217  -0
projects/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py     +46   -0
projects/mmdet3d_plugin/dd3d/modeling/fcos2d.py                      +382  -0
projects/mmdet3d_plugin/dd3d/modeling/fcos3d.py                      +427  -0
projects/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py               +522  -0
projects/mmdet3d_plugin/datasets/pipelines/formating.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet3d.core.bbox import BaseInstance3DBoxes
from mmdet3d.core.points import BasePoints
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
from mmdet3d.datasets.pipelines import DefaultFormatBundle3D


@PIPELINES.register_module()
class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
    """Default formatting bundle.

    It simplifies the pipeline of formatting common fields for voxels,
    including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
    "gt_semantic_seg".
    These fields are formatted as follows.

    - img: (1) transpose, (2) to tensor, (3) to DataContainer (stack=True)
    - proposals: (1) to tensor, (2) to DataContainer
    - gt_bboxes: (1) to tensor, (2) to DataContainer
    - gt_bboxes_ignore: (1) to tensor, (2) to DataContainer
    - gt_labels: (1) to tensor, (2) to DataContainer
    """

    def __call__(self, results):
        """Call function to transform and format common fields in results.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            dict: The result dict contains the data that is formatted with
                default bundle.
        """
        # Format 3D data
        results = super(CustomDefaultFormatBundle3D, self).__call__(results)
        results['gt_map_masks'] = DC(to_tensor(results['gt_map_masks']), stack=True)
        return results
\ No newline at end of file
projects/mmdet3d_plugin/datasets/pipelines/loading.py
0 → 100644
projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py
0 → 100644
import numpy as np
from numpy import random
import mmcv
from mmdet.datasets.builder import PIPELINES
from mmcv.parallel import DataContainer as DC


@PIPELINES.register_module()
class PadMultiViewImage(object):
    """Pad the multi-view image.

    There are two padding modes: (1) pad to a fixed size and (2) pad to the
    minimum size that is divisible by some number.
    Added keys are "pad_shape", "pad_fixed_size" and "pad_size_divisor".

    Args:
        size (tuple, optional): Fixed padding size.
        size_divisor (int, optional): The divisor of padded size.
        pad_val (float, optional): Padding value, 0 by default.
    """

    def __init__(self, size=None, size_divisor=None, pad_val=0):
        self.size = size
        self.size_divisor = size_divisor
        self.pad_val = pad_val
        # only one of size and size_divisor should be valid
        assert size is not None or size_divisor is not None
        assert size is None or size_divisor is None

    def _pad_img(self, results):
        """Pad images according to ``self.size``."""
        if self.size is not None:
            padded_img = [mmcv.impad(img, shape=self.size, pad_val=self.pad_val)
                          for img in results['img']]
        elif self.size_divisor is not None:
            padded_img = [mmcv.impad_to_multiple(img, self.size_divisor, pad_val=self.pad_val)
                          for img in results['img']]
        results['ori_shape'] = [img.shape for img in results['img']]
        results['img'] = padded_img
        results['img_shape'] = [img.shape for img in padded_img]
        results['pad_shape'] = [img.shape for img in padded_img]
        results['pad_fixed_size'] = self.size
        results['pad_size_divisor'] = self.size_divisor

    def __call__(self, results):
        """Call function to pad images, masks, semantic segmentation maps.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        self._pad_img(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(size={self.size}, '
        repr_str += f'size_divisor={self.size_divisor}, '
        repr_str += f'pad_val={self.pad_val})'
        return repr_str


@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
    """Normalize the image.
    Added key is "img_norm_cfg".
    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Call function to normalize images.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Normalized results, 'img_norm_cfg' key is added into
                result dict.
        """
        results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
                          for img in results['img']]
        results['img_norm_cfg'] = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
        return repr_str


@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
    """Apply photometric distortion to images sequentially; every transformation
    is applied with a probability of 0.5. The position of random contrast is in
    second or second to last.
    1. random brightness
    2. random contrast (mode 0)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 1)
    8. randomly swap channels
    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def __call__(self, results):
        """Call function to perform photometric distortion on images.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Result dict with images distorted.
        """
        imgs = results['img']
        new_imgs = []
        for img in imgs:
            assert img.dtype == np.float32, \
                'PhotoMetricDistortion needs the input image of dtype np.float32,' \
                ' please set "to_float32=True" in "LoadImageFromFile" pipeline'
            # random brightness
            if random.randint(2):
                delta = random.uniform(-self.brightness_delta, self.brightness_delta)
                img += delta

            # mode == 0 --> do random contrast first
            # mode == 1 --> do random contrast last
            mode = random.randint(2)
            if mode == 1:
                if random.randint(2):
                    alpha = random.uniform(self.contrast_lower, self.contrast_upper)
                    img *= alpha

            # convert color from BGR to HSV
            img = mmcv.bgr2hsv(img)

            # random saturation
            if random.randint(2):
                img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper)

            # random hue
            if random.randint(2):
                img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
                img[..., 0][img[..., 0] > 360] -= 360
                img[..., 0][img[..., 0] < 0] += 360

            # convert color from HSV to BGR
            img = mmcv.hsv2bgr(img)

            # random contrast
            if mode == 0:
                if random.randint(2):
                    alpha = random.uniform(self.contrast_lower, self.contrast_upper)
                    img *= alpha

            # randomly swap channels
            if random.randint(2):
                img = img[..., random.permutation(3)]
            new_imgs.append(img)
        results['img'] = new_imgs
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
        repr_str += 'contrast_range='
        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
        repr_str += 'saturation_range='
        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
        repr_str += f'hue_delta={self.hue_delta})'
        return repr_str


@PIPELINES.register_module()
class CustomCollect3D(object):
    """Collect data from the loader relevant to the specific task.
    This is usually the last stage of the data loader pipeline. Typically keys
    is set to some subset of "img", "proposals", "gt_bboxes",
    "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
    The "img_meta" item is always populated. The contents of the "img_meta"
    dictionary depends on "meta_keys". By default this includes:
        - 'img_shape': shape of the image input to the network as a tuple \
            (h, w, c). Note that images may be zero padded on the \
            bottom/right if the batch tensor is larger than this shape.
        - 'scale_factor': a float indicating the preprocessing scale
        - 'flip': a boolean indicating if image flip transform was used
        - 'filename': path to the image file
        - 'ori_shape': original shape of the image as a tuple (h, w, c)
        - 'pad_shape': image shape after padding
        - 'lidar2img': transform from lidar to image
        - 'depth2img': transform from depth to image
        - 'cam2img': transform from camera to image
        - 'pcd_horizontal_flip': a boolean indicating if point cloud is \
            flipped horizontally
        - 'pcd_vertical_flip': a boolean indicating if point cloud is \
            flipped vertically
        - 'box_mode_3d': 3D box mode
        - 'box_type_3d': 3D box type
        - 'img_norm_cfg': a dict of normalization information:
            - mean: per channel mean subtraction
            - std: per channel std divisor
            - to_rgb: bool indicating if bgr was converted to rgb
        - 'pcd_trans': point cloud transformations
        - 'sample_idx': sample index
        - 'pcd_scale_factor': point cloud scale factor
        - 'pcd_rotation': rotation applied to point cloud
        - 'pts_filename': path to point cloud file.
    Args:
        keys (Sequence[str]): Keys of results to be collected in ``data``.
        meta_keys (Sequence[str], optional): Meta keys to be converted to
            ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
            Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
            'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
            'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
            'box_type_3d', 'img_norm_cfg', 'pcd_trans',
            'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
    """

    def __init__(self,
                 keys,
                 meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                            'lidar2cam', 'depth2img', 'cam2img', 'pad_shape',
                            'scale_factor', 'flip', 'pcd_horizontal_flip',
                            'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
                            'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx',
                            'next_idx', 'pcd_scale_factor', 'pcd_rotation',
                            'pts_filename', 'transformation_3d_flow', 'scene_token',
                            'can_bus')):
        self.keys = keys
        self.meta_keys = meta_keys

    def __call__(self, results):
        """Call function to collect keys in results. The keys in ``meta_keys``
        will be converted to :obj:`mmcv.DataContainer`.
        Args:
            results (dict): Result dict contains the data to collect.
        Returns:
            dict: The result dict contains the following keys
                - keys in ``self.keys``
                - ``img_metas``
        """
        data = {}
        img_metas = {}
        for key in self.meta_keys:
            if key in results:
                img_metas[key] = results[key]

        data['img_metas'] = DC(img_metas, cpu_only=True)
        for key in self.keys:
            if key not in results:
                data[key] = None
            else:
                data[key] = results[key]
        return data

    def __repr__(self):
        """str: Return a string that describes the module."""
        return self.__class__.__name__ + \
            f'(keys={self.keys}, meta_keys={self.meta_keys})'


@PIPELINES.register_module()
class RandomScaleImageMultiViewImage(object):
    """Randomly scale the multi-view images and update the lidar2img matrices.
    Args:
        scales (list[float]): Candidate scale factors. Currently exactly one
            scale must be given.
    """

    def __init__(self, scales=[]):
        self.scales = scales
        assert len(self.scales) == 1

    def __call__(self, results):
        """Call function to scale images and the lidar-to-image projections.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Updated result dict.
        """
        rand_ind = np.random.permutation(range(len(self.scales)))[0]
        rand_scale = self.scales[rand_ind]

        y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
        x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
        scale_factor = np.eye(4)
        scale_factor[0, 0] *= rand_scale
        scale_factor[1, 1] *= rand_scale
        results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False)
                          for idx, img in enumerate(results['img'])]
        lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
        results['lidar2img'] = lidar2img
        results['img_shape'] = [img.shape for img in results['img']]
        results['ori_shape'] = [img.shape for img in results['img']]

        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(scales={self.scales})'
        return repr_str
\ No newline at end of file
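Since these transforms are registered in mmdet's PIPELINES registry, they are intended to be referenced by type name from a dataset config. A minimal sketch of such a pipeline follows; the normalization statistics, class_names, and collected keys are illustrative placeholders, not values taken from this commit:

img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)  # placeholder stats
train_pipeline = [
    # ... loading transforms (e.g. multi-view image loading) go here ...
    dict(type='PhotoMetricDistortionMultiViewImage'),
    dict(type='NormalizeMultiviewImage', **img_norm_cfg),
    dict(type='PadMultiViewImage', size_divisor=32),
    # CustomDefaultFormatBundle3D additionally expects 'gt_map_masks' in results (see its __call__).
    dict(type='CustomDefaultFormatBundle3D', class_names=class_names),
    dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']),
]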
projects/mmdet3d_plugin/datasets/samplers/__init__.py
0 → 100644
from .group_sampler import DistributedGroupSampler
from .distributed_sampler import DistributedSampler
from .sampler import SAMPLER, build_sampler
projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py
0 → 100644
import math

import torch
from torch.utils.data import DistributedSampler as _DistributedSampler

from .sampler import SAMPLER


@SAMPLER.register_module()
class DistributedSampler(_DistributedSampler):

    def __init__(self,
                 dataset=None,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 seed=0):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        # for the compatibility from PyTorch 1.3+
        self.seed = seed if seed is not None else 0

    def __iter__(self):
        # deterministically shuffle based on epoch
        if self.shuffle:
            assert False
        else:
            indices = torch.arange(len(self.dataset)).tolist()

        # add extra samples to make it evenly divisible
        # in case that indices is shorter than half of total_size
        indices = (indices * math.ceil(self.total_size / len(indices)))[:self.total_size]
        assert len(indices) == self.total_size

        # subsample
        per_replicas = self.total_size // self.num_replicas
        # indices = indices[self.rank:self.total_size:self.num_replicas]
        indices = indices[self.rank * per_replicas:(self.rank + 1) * per_replicas]
        assert len(indices) == self.num_samples

        return iter(indices)
projects/mmdet3d_plugin/datasets/samplers/group_sampler.py
0 → 100644
# Copyright (c) OpenMMLab. All rights reserved.
import math

import numpy as np
import torch
from mmcv.runner import get_dist_info
from torch.utils.data import Sampler

from .sampler import SAMPLER
import random
from IPython import embed


@SAMPLER.register_module()
class DistributedGroupSampler(Sampler):
    """Sampler that restricts data loading to a subset of the dataset.
    It is especially useful in conjunction with
    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSampler instance as a DataLoader sampler,
    and load a subset of the original dataset that is exclusive to it.
    .. note::
        Dataset is assumed to be of constant size.
    Arguments:
        dataset: Dataset used for sampling.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
        seed (int, optional): random seed used to shuffle the sampler if
            ``shuffle=True``. This number should be identical across all
            processes in the distributed group. Default: 0.
    """

    def __init__(self,
                 dataset,
                 samples_per_gpu=1,
                 num_replicas=None,
                 rank=None,
                 seed=0):
        _rank, _num_replicas = get_dist_info()
        if num_replicas is None:
            num_replicas = _num_replicas
        if rank is None:
            rank = _rank
        self.dataset = dataset
        self.samples_per_gpu = samples_per_gpu
        self.num_replicas = num_replicas
        self.rank = rank
        self.epoch = 0
        self.seed = seed if seed is not None else 0

        assert hasattr(self.dataset, 'flag')
        self.flag = self.dataset.flag
        self.group_sizes = np.bincount(self.flag)

        self.num_samples = 0
        for i, j in enumerate(self.group_sizes):
            self.num_samples += int(
                math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
                          self.num_replicas)) * self.samples_per_gpu
        self.total_size = self.num_samples * self.num_replicas

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch + self.seed)

        indices = []
        for i, size in enumerate(self.group_sizes):
            if size > 0:
                indice = np.where(self.flag == i)[0]
                assert len(indice) == size
                # add .numpy() to avoid bug when selecting indice in parrots.
                # TODO: check whether torch.randperm() can be replaced by
                # numpy.random.permutation().
                indice = indice[list(
                    torch.randperm(int(size), generator=g).numpy())].tolist()
                extra = int(
                    math.ceil(size * 1.0 / self.samples_per_gpu / self.num_replicas)
                ) * self.samples_per_gpu * self.num_replicas - len(indice)
                # pad indice
                tmp = indice.copy()
                for _ in range(extra // size):
                    indice.extend(tmp)
                indice.extend(tmp[:extra % size])
                indices.extend(indice)

        assert len(indices) == self.total_size

        indices = [
            indices[j] for i in list(
                torch.randperm(len(indices) // self.samples_per_gpu, generator=g))
            for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu)
        ]

        # subsample
        offset = self.num_samples * self.rank
        indices = indices[offset:offset + self.num_samples]
        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch
projects/mmdet3d_plugin/datasets/samplers/sampler.py
0 → 100644
from mmcv.utils.registry import Registry, build_from_cfg

SAMPLER = Registry('sampler')


def build_sampler(cfg, default_args):
    return build_from_cfg(cfg, SAMPLER, default_args)
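The registry above follows the usual mmcv pattern, so a sampler is built from a config dict by type name. A minimal sketch, where my_dataset, the rank, and the replica count are placeholders (the dataset must expose a `flag` attribute, as asserted in DistributedGroupSampler):

from projects.mmdet3d_plugin.datasets.samplers import build_sampler

sampler_cfg = dict(type='DistributedGroupSampler', samples_per_gpu=1)
# default_args supplies the constructor arguments not present in the config dict.
sampler = build_sampler(sampler_cfg,
                        default_args=dict(dataset=my_dataset, rank=0, num_replicas=8))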
projects/mmdet3d_plugin/dd3d/__init__.py
0 → 100644
from .modeling import *
\ No newline at end of file
projects/mmdet3d_plugin/dd3d/datasets/__init__.py
0 → 100644
projects/mmdet3d_plugin/dd3d/datasets/nuscenes.py
0 → 100644
# Copyright 2021 Toyota Research Institute. All rights reserved.
# import functools
from collections import OrderedDict

import numpy as np
import seaborn as sns
from torch.utils.data import Dataset
from tqdm import tqdm

# from detectron2.data import MetadataCatalog
from detectron2.structures.boxes import BoxMode
from nuscenes.eval.detection.utils import category_to_detection_name
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes

# from tridet.data import collect_dataset_dicts
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
from projects.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
from projects.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color

# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
#   tracker if required.
DATASET_NAME_TO_VERSION = {
    "nusc_train": "v1.0-trainval",
    "nusc_val": "v1.0-trainval",
    "nusc_val-subsample-8": "v1.0-trainval",
    "nusc_trainval": "v1.0-trainval",
    "nusc_test": "v1.0-test",
    "nusc_mini_train": "v1.0-mini",
    "nusc_mini_val": "v1.0-mini",
}

CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT',
                'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')

ATTRIBUTE_IDS = {
    'vehicle.moving': 0,
    'vehicle.parked': 1,
    'vehicle.stopped': 2,
    'pedestrian.moving': 0,
    'pedestrian.standing': 1,
    'pedestrian.sitting_lying_down': 2,
    'cycle.with_rider': 0,
    'cycle.without_rider': 1,
}

CATEGORY_IDS = OrderedDict({
    'barrier': 0,
    'bicycle': 1,
    'bus': 2,
    'car': 3,
    'construction_vehicle': 4,
    'motorcycle': 5,
    'pedestrian': 6,
    'traffic_cone': 7,
    'trailer': 8,
    'truck': 9,
})

COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
COLORMAP = OrderedDict({
    'barrier': COLORS[8],  # yellow
    'bicycle': COLORS[0],  # blue
    'bus': COLORS[6],  # pink
    'car': COLORS[2],  # green
    'construction_vehicle': COLORS[7],  # gray
    'motorcycle': COLORS[4],  # purple
    'pedestrian': COLORS[1],  # orange
    'traffic_cone': COLORS[3],  # red
    'trailer': COLORS[9],  # skyblue
    'truck': COLORS[5],  # brown
})

MAX_NUM_ATTRIBUTES = 3


def _compute_iou(box1, box2):
    """
    Parameters
    ----------
    box1, box2:
        (x1, y1, x2, y2)
    """
    xx1 = max(box1[0], box2[0])
    yy1 = max(box1[1], box2[1])
    xx2 = min(box1[2], box2[2])
    yy2 = min(box1[3], box2[3])
    if xx1 >= xx2 or yy1 >= yy2:
        return 0.
    inter = (xx2 - xx1) * (yy2 - yy1)
    a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    return inter / (a1 + a2 - inter)


class NuscenesDataset(Dataset):

    def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3,
                 min_box_visibility=0.2, **unused):
        self.data_root = data_root
        assert name in DATASET_NAME_TO_VERSION
        version = DATASET_NAME_TO_VERSION[name]
        self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)

        self.datum_names = datum_names
        self.min_num_lidar_points = min_num_lidar_points
        self.min_box_visibility = min_box_visibility

        self.dataset_item_info = self._build_dataset_item_info(name)

        # Index instance tokens to their IDs
        self._instance_token_to_id = self._index_instance_tokens()

        # Construct the mapping from datum_token (image id) to index
        print("Generating the mapping from image id to idx...")
        self.datumtoken2idx = {}
        for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
            self.datumtoken2idx[datum_token] = idx
        print("Done.")

    def _build_dataset_item_info(self, name):
        scenes_in_split = self._get_split_scenes(name)

        dataset_items = []
        for _, scene_token in tqdm(scenes_in_split):
            scene = self.nusc.get('scene', scene_token)
            sample_token = scene['first_sample_token']
            for sample_idx in range(scene['nbr_samples']):
                if name.endswith('subsample-8') and sample_idx % 8 > 0:
                    # Sample-level subsampling.
                    continue

                sample = self.nusc.get('sample', sample_token)
                for datum_name, datum_token in sample['data'].items():
                    if datum_name not in self.datum_names:
                        continue
                    dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
                sample_token = sample['next']
        return dataset_items

    def _get_split_scenes(self, name):
        scenes_in_splits = create_splits_scenes()
        if name == "nusc_trainval":
            scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
        elif name == "nusc_val-subsample-8":
            scenes = scenes_in_splits["val"]
        else:
            assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
            split = name[5:]
            assert split in scenes_in_splits, f"Invalid dataset: {split}"
            scenes = scenes_in_splits[split]

        # Mapping from scene name to token.
        name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
        return [(name, name_to_token[name]) for name in scenes]

    def __len__(self):
        return len(self.dataset_item_info)

    def _build_id(self, scene_name, sample_idx, datum_name):
        sample_id = f"{scene_name}_{sample_idx:03d}"
        image_id = f"{sample_id}_{datum_name}"
        return image_id, sample_id

    def _index_instance_tokens(self):
        """Index instance tokens for uniquely identifying instances across samples."""
        instance_token_to_id = {}
        for record in self.nusc.sample_annotation:
            instance_token = record['instance_token']
            if instance_token not in instance_token_to_id:
                next_instance_id = len(instance_token_to_id)
                instance_token_to_id[instance_token] = next_instance_id
        return instance_token_to_id

    def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
        annotations = []
        for _ann in annotation_list:
            ann = self.nusc.get('sample_annotation', _ann.token)
            if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
                continue
            annotation = OrderedDict()

            # --------
            # Category
            # --------
            category = category_to_detection_name(ann['category_name'])
            if category is None:
                continue
            annotation['category_id'] = CATEGORY_IDS[category]

            # ------
            # 3D box
            # ------
            # NOTE: ann['rotation'], ann['translation'] is in global frame.
            pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center)  # pose in sensor frame
            # DEBUG:
            # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
            # pose_WO_2 = pose_WS * pose_SO
            # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
            bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
            annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]

            # --------------------------------------
            # 2D box -- project 8 corners of 3D bbox
            # --------------------------------------
            corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
            l, t = corners[:, 0].min(), corners[:, 1].min()
            r, b = corners[:, 0].max(), corners[:, 1].max()

            x1 = max(0, l)
            y1 = max(0, t)
            x2 = min(image_shape[1], r)
            y2 = min(image_shape[0], b)

            iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
            if iou < self.min_box_visibility:
                continue

            annotation['bbox'] = [x1, y1, x2, y2]
            annotation['bbox_mode'] = BoxMode.XYXY_ABS

            # --------
            # Track ID
            # --------
            annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]

            # ---------
            # Attribute
            # ---------
            attr_tokens = ann['attribute_tokens']
            assert len(attr_tokens) < 2  # NOTE: Allow only a single attribute.
            attribute_id = MAX_NUM_ATTRIBUTES  # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
            if attr_tokens:
                attribute = self.nusc.get('attribute', attr_tokens[0])['name']
                attribute_id = ATTRIBUTE_IDS[attribute]
            annotation['attribute_id'] = attribute_id

            # -----
            # Speed
            # -----
            vel_global = self.nusc.box_velocity(ann['token'])
            speed = np.linalg.norm(vel_global)  # NOTE: This can be NaN.
            # DEBUG:
            # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
            annotation['speed'] = speed

            annotations.append(annotation)

        return annotations

    def _get_ego_velocity(self, current, max_time_diff=1.5):
        """Velocity of ego-vehicle in m/s."""
        has_prev = current['prev'] != ''
        has_next = current['next'] != ''

        # Cannot estimate velocity for a single annotation.
        if not has_prev and not has_next:
            return np.array([np.nan, np.nan, np.nan])

        if has_prev:
            first = self.nusc.get('sample_data', current['prev'])
        else:
            first = current

        if has_next:
            last = self.nusc.get('sample_data', current['next'])
        else:
            last = current

        pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
        pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
        pos_diff = np.float32(pos_last) - np.float32(pos_first)

        time_last = 1e-6 * last['timestamp']
        time_first = 1e-6 * first['timestamp']
        time_diff = time_last - time_first

        if has_next and has_prev:
            # If doing centered difference, allow for up to double the max_time_diff.
            max_time_diff *= 2

        if time_diff > max_time_diff:
            # If time_diff is too big, don't return an estimate.
            return np.array([np.nan, np.nan, np.nan])
        else:
            return pos_diff / time_diff

    def __getitem__(self, idx):
        datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
        datum = self.nusc.get('sample_data', datum_token)
        assert datum['is_key_frame']

        filename, _annotations, K = self.nusc.get_sample_data(datum_token)
        image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
        height, width = datum['height'], datum['width']
        d2_dict = OrderedDict(
            file_name=filename,
            height=height,
            width=width,
            image_id=image_id,
            sample_id=sample_id,
            sample_token=sample_token
        )

        # Intrinsics
        d2_dict['intrinsics'] = list(K.flatten())

        # Get pose of the sensor (S) from vehicle (V) frame
        _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
        pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))

        # Get ego-pose of the vehicle (V) from global/world (W) frame
        _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
        pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
        pose_WS = pose_WV * pose_VS

        d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
        d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}

        d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))

        d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)

        return d2_dict

    def getitem_by_datumtoken(self, datum_token):
        # idx = self.datumtoken2idx[datum_token]
        # ret = self.__getitem__(idx)
        datum = self.nusc.get('sample_data', datum_token)
        sample_token = datum['sample_token']

        filename, _annotations, K = self.nusc.get_sample_data(datum_token)
        height, width = datum['height'], datum['width']
        d2_dict = OrderedDict(
            file_name=filename,
            height=height,
            width=width,
            image_id=0,
            sample_id=0,
            sample_token=sample_token
        )
        # Intrinsics
        d2_dict['intrinsics'] = list(K.flatten())
        # Get pose of the sensor (S) from vehicle (V) frame
        _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
        pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
        # Get ego-pose of the vehicle (V) from global/world (W) frame
        _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
        pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
        pose_WS = pose_WV * pose_VS

        d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
        d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}

        d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
        d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
        return d2_dict
\ No newline at end of file
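For reference, a minimal instantiation of the dataset above might look as follows; the data root path is a placeholder, and 'nusc_mini_train' is one of the keys defined in DATASET_NAME_TO_VERSION:

dataset = NuscenesDataset(name='nusc_mini_train', data_root='data/nuscenes')  # placeholder path
print(len(dataset))           # number of key-frame camera images across the split
item = dataset[0]             # detectron2-style dict: file_name, intrinsics, pose, annotations, ...
print(item['image_id'], len(item['annotations']))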
projects/mmdet3d_plugin/dd3d/datasets/transform_utils.py
0 → 100644
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from detectron2:
#   https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py
import numpy as np
import torch

from detectron2.data import transforms as T
from detectron2.structures import Boxes, BoxMode, Instances

from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D

__all__ = ["transform_instance_annotations", "annotations_to_instances"]


def transform_instance_annotations(
    annotation,
    transforms,
    image_size,
):
    """Adapted from:
        https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254

    The changes from original:
        - The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional.
        - Add optional 3D bounding box support.
        - If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory.

    ===============================================================================================================

    Apply transforms to box, segmentation and keypoints annotations of a single instance.
    It will use `transforms.apply_box` for the box, and
    `transforms.apply_coords` for segmentation polygons & keypoints.
    If you need anything more specially designed for each data structure,
    you'll need to implement your own version of this function or the transforms.

    Args:
        annotation (dict): dict of instance annotations for a single instance.
            It will be modified in-place.
        transforms (TransformList or list[Transform]):
        image_size (tuple): the height, width of the transformed image
        keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.

    Returns:
        dict:
            the same input dict with fields "bbox", "segmentation", "keypoints"
            transformed according to `transforms`.
            The "bbox_mode" field will be set to XYXY_ABS.
    """
    if isinstance(transforms, (tuple, list)):
        transforms = T.TransformList(transforms)
    # (dennis.park) Here 2D bounding box is optional.
    if "bbox" in annotation:
        assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not."
        # bbox is 1d (per-instance bounding box)
        bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
        bbox = transforms.apply_box(np.array([bbox]))[0]
        # clip transformed bbox to image size
        bbox = bbox.clip(min=0)
        bbox = np.minimum(bbox, list(image_size + image_size)[::-1])

        annotation["bbox"] = bbox
        annotation["bbox_mode"] = BoxMode.XYXY_ABS

    # Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed.
    if "bbox3d" in annotation:
        bbox3d = np.array(annotation["bbox3d"])
        annotation['bbox3d'] = transforms.apply_box3d(bbox3d)

    return annotation


def _create_empty_instances(image_size):
    target = Instances(image_size)

    target.gt_boxes = Boxes([])
    target.gt_classes = torch.tensor([], dtype=torch.int64)
    target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32))

    return target


def annotations_to_instances(
    annos,
    image_size,
    intrinsics=None,
):
    """
    Create an :class:`Instances` object used by the models,
    from instance annotations in the dataset dict.

    Args:
        annos (list[dict]): a list of instance annotations in one image, each
            element for one instance.
        image_size (tuple): height, width

    Returns:
        Instances:
            It will contain fields "gt_boxes", "gt_classes",
            "gt_masks", "gt_keypoints", if they can be obtained from `annos`.
            This is the format that builtin models expect.
    """
    if len(annos) == 0:
        return _create_empty_instances(image_size)

    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
    target = Instances(image_size)
    target.gt_boxes = Boxes(boxes)

    classes = [obj["category_id"] for obj in annos]
    classes = torch.tensor(classes, dtype=torch.int64)
    target.gt_classes = classes

    if len(annos) and "bbox3d" in annos[0]:
        assert intrinsics is not None
        target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics)
        if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]:
            raise ValueError(
                f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: "
                f"a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}."
            )

    # NOTE: add nuscenes attributes here
    # NOTE: instances will be filtered later
    # NuScenes attributes
    if len(annos) and "attribute_id" in annos[0]:
        attributes = [obj["attribute_id"] for obj in annos]
        target.gt_attributes = torch.tensor(attributes, dtype=torch.int64)

    # Speed (magnitude of velocity)
    if len(annos) and "speed" in annos[0]:
        speeds = [obj["speed"] for obj in annos]
        target.gt_speeds = torch.tensor(speeds, dtype=torch.float32)

    assert len(boxes) == len(classes) == len(attributes) == len(speeds), \
        'the numbers of annotations should be the same'

    return target
projects/mmdet3d_plugin/dd3d/layers/iou_loss.py
0 → 100644
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
#   https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py
import torch
from torch import nn


class IOULoss(nn.Module):
    """
    Intersection Over Union (IoU) loss which supports three
    different IoU computations:
    * IoU
    * Linear IoU
    * gIoU
    """

    def __init__(self, loc_loss_type='iou'):
        super(IOULoss, self).__init__()
        self.loc_loss_type = loc_loss_type

    def forward(self, pred, target, weight=None):
        """
        Args:
            pred: Nx4 predicted bounding boxes
            target: Nx4 target bounding boxes
            weight: N loss weight for each instance
        """
        pred_left = pred[:, 0]
        pred_top = pred[:, 1]
        pred_right = pred[:, 2]
        pred_bottom = pred[:, 3]

        target_left = target[:, 0]
        target_top = target[:, 1]
        target_right = target[:, 2]
        target_bottom = target[:, 3]

        target_aera = (target_left + target_right) * \
                      (target_top + target_bottom)
        pred_aera = (pred_left + pred_right) * \
                    (pred_top + pred_bottom)

        w_intersect = torch.min(pred_left, target_left) + \
                      torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
                      torch.min(pred_top, target_top)

        g_w_intersect = torch.max(pred_left, target_left) + \
                        torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
                        torch.max(pred_top, target_top)
        ac_uion = g_w_intersect * g_h_intersect

        area_intersect = w_intersect * h_intersect
        area_union = target_aera + pred_aera - area_intersect

        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_uion - area_union) / ac_uion
        if self.loc_loss_type == 'iou':
            losses = -torch.log(ious)
        elif self.loc_loss_type == 'linear_iou':
            losses = 1 - ious
        elif self.loc_loss_type == 'giou':
            losses = 1 - gious
        else:
            raise NotImplementedError

        if weight is not None:
            return (losses * weight).sum()
        else:
            return losses.sum()
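As the area computation above suggests, `pred` and `target` are per-location (left, top, right, bottom) distances to the box sides, as in FCOS, rather than corner coordinates. A minimal usage sketch with illustrative values:

import torch

criterion = IOULoss(loc_loss_type='giou')
# (left, top, right, bottom) distances from each location to the box sides.
pred = torch.tensor([[1.0, 1.0, 2.0, 2.0],
                     [0.5, 0.5, 0.5, 0.5]])
target = torch.tensor([[1.0, 1.0, 2.0, 2.0],
                       [1.0, 1.0, 1.0, 1.0]])
weight = torch.tensor([1.0, 0.5])
loss = criterion(pred, target, weight)  # scalar: weighted sum of per-location gIoU losses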
projects/mmdet3d_plugin/dd3d/layers/normalization.py
0 → 100644
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
#   https://github.com/aim-uofa/AdelaiDet/
import logging

import torch
from torch import nn

LOG = logging.getLogger(__name__)


class Scale(nn.Module):
    def __init__(self, init_value=1.0):
        super(Scale, self).__init__()
        self.scale = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input * self.scale


class Offset(nn.Module):
    def __init__(self, init_value=0.):
        super(Offset, self).__init__()
        self.bias = nn.Parameter(torch.FloatTensor([init_value]))

    def forward(self, input):
        return input + self.bias


class ModuleListDial(nn.ModuleList):
    def __init__(self, modules=None):
        super(ModuleListDial, self).__init__(modules)
        self.cur_position = 0

    def forward(self, x):
        result = self[self.cur_position](x)
        self.cur_position += 1
        if self.cur_position >= len(self):
            self.cur_position = 0
        return result
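ModuleListDial's cycling behavior is easiest to see with a small sketch: when one shared head is applied to several FPN levels in sequence, each call "dials" to the next per-level submodule and wraps around after the last one. The channel count and spatial sizes below are arbitrary example values:

import torch
from torch import nn

norms = ModuleListDial([nn.BatchNorm2d(8) for _ in range(3)])   # one BN per FPN level
feats = [torch.randn(2, 8, 16 // (2 ** i), 16 // (2 ** i)) for i in range(3)]
outs = [norms(f) for f in feats]   # call i uses norms[i]; a fourth call would reuse norms[0]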
projects/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py
0 → 100644
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from fvcore:
#   https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py
import torch


def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float,
                   reduction: str = "none") -> torch.Tensor:
    """
    Smooth L1 loss defined in the Fast R-CNN paper as:

                      | 0.5 * x ** 2 / beta   if abs(x) < beta
        smoothl1(x) = |
                      | abs(x) - 0.5 * beta   otherwise,

    where x = input - target.

    Smooth L1 loss is related to Huber loss, which is defined as:

                   | 0.5 * x ** 2                  if abs(x) < beta
        huber(x) = |
                   | beta * (abs(x) - 0.5 * beta)  otherwise

    Smooth L1 loss is equal to huber(x) / beta. This leads to the following
    differences:

    - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
      converges to a constant 0 loss.
    - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
      converges to L2 loss.
    - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
      slope of 1. For Huber loss, the slope of the L1 segment is beta.

    Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
    portion replaced with a quadratic function such that at abs(x) = beta, its
    slope is 1. The quadratic segment smooths the L1 loss near x = 0.

    Args:
        input (Tensor): input tensor of any shape
        target (Tensor): target value tensor with the same shape as input
        beta (float): L1 to L2 change point.
            For beta values < 1e-5, L1 loss is computed.
        reduction: 'none' | 'mean' | 'sum'
            'none': No reduction will be applied to the output.
            'mean': The output will be averaged.
            'sum': The output will be summed.

    Returns:
        The loss with the reduction option applied.

    Note:
        PyTorch's builtin "Smooth L1 loss" implementation does not actually
        implement Smooth L1 loss, nor does it implement Huber loss. It implements
        the special case of both in which they are equal (beta=1).
        See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
    """
    # (dennis.park) Make it work with mixed precision training.
    beta = torch.as_tensor(beta).to(input.dtype)
    if beta < 1e-5:
        # if beta == 0, then torch.where will result in nan gradients when
        # the chain rule is applied due to pytorch implementation details
        # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
        # zeros, rather than "no gradient"). To avoid this issue, we define
        # small values of beta to be exactly l1 loss.
        loss = torch.abs(input - target)
    else:
        n = torch.abs(input - target)
        cond = n < beta
        # Quadratic branch divided by beta, consistent with the docstring and the
        # original fvcore formulation commented out below.
        a = 0.5 * n ** 2 / beta
        b = n - 0.5 * beta
        a, b = a.to(input.dtype), b.to(input.dtype)
        loss = torch.where(cond, a, b)
        # loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)

    if reduction == "mean":
        loss = loss.mean()
    elif reduction == "sum":
        loss = loss.sum()
    return loss
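A short worked example of the two branches (values chosen for illustration): with beta = 1.0, residuals below beta use the quadratic branch 0.5 * x**2 / beta and the rest use |x| - 0.5 * beta.

import torch

x = torch.tensor([0.05, 0.5, 2.0])
y = torch.zeros(3)
loss = smooth_l1_loss(x, y, beta=1.0, reduction="none")
# -> tensor([0.00125, 0.12500, 1.50000])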
projects/mmdet3d_plugin/dd3d/modeling/__init__.py
0 → 100644
from .nuscenes_dd3d import NuscenesDD3D
\ No newline at end of file
projects/mmdet3d_plugin/dd3d/modeling/core.py
0 → 100644
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
from torch import nn

# from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.layers import ShapeSpec
from mmcv.runner import force_fp32

from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss
from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss
# from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
# from tridet.modeling.feature_extractor import build_feature_extractor
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level


# @META_ARCH_REGISTRY.register()
class DD3D(nn.Module):
    def __init__(self,
                 num_classes,
                 in_channels,
                 strides,
                 fcos2d_cfg=dict(),
                 fcos2d_loss_cfg=dict(),
                 fcos3d_cfg=dict(),
                 fcos3d_loss_cfg=dict(),
                 target_assign_cfg=dict(),
                 box3d_on=True,
                 feature_locations_offset="none"):
        super().__init__()
        # NOTE: do not need backbone
        # self.backbone = build_feature_extractor(cfg)
        # backbone_output_shape = self.backbone.output_shape()
        # self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys())
        self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides]

        self.feature_locations_offset = feature_locations_offset

        self.fcos2d_head = FCOS2DHead(num_classes=num_classes,
                                      input_shape=self.backbone_output_shape,
                                      **fcos2d_cfg)
        self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg)
        # NOTE: inference later
        # self.fcos2d_inference = FCOS2DInference(cfg)

        if box3d_on:
            self.fcos3d_head = FCOS3DHead(num_classes=num_classes,
                                          input_shape=self.backbone_output_shape,
                                          **fcos3d_cfg)
            self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg)
            # NOTE: inference later
            # self.fcos3d_inference = FCOS3DInference(cfg)
            self.only_box2d = False
        else:
            self.only_box2d = True

        self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes,
                                                  input_shape=self.backbone_output_shape,
                                                  box3d_on=box3d_on,
                                                  **target_assign_cfg)

        # NOTE: inference later
        # self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS
        # self.do_nms = cfg.DD3D.INFERENCE.DO_NMS
        # self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS
        # self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH
        # nuScenes inference aggregates detections over all 6 cameras.
        # self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE
        self.num_classes = num_classes

        # NOTE: do not need normalize
        # self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
        # self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))

    # NOTE:
    # @property
    # def device(self):
    #     return self.pixel_mean.device

    # def preprocess_image(self, x):
    #     return (x - self.pixel_mean) / self.pixel_std

    @force_fp32(apply_to=('features'))
    def forward(self, features, batched_inputs):
        # NOTE:
        # images = [x["image"].to(self.device) for x in batched_inputs]
        # images = [self.preprocess_image(x) for x in images]
        # NOTE: directly use inv_intrinsics
        # if 'intrinsics' in batched_inputs[0]:
        #     intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
        # else:
        #     intrinsics = None
        # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
        if 'inv_intrinsics' in batched_inputs[0]:
            inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
            inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
        else:
            inv_intrinsics = None

        # NOTE:
        # gt_dense_depth = None
        # if 'depth' in batched_inputs[0]:
        #     gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
        #     gt_dense_depth = ImageList.from_tensors(
        #         gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
        #     )

        # NOTE: directly input feature
        # features = self.backbone(images.tensor)
        # features = [features[f] for f in self.in_features]

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
        else:
            gt_instances = None

        locations = self.compute_locations(features)
        logits, box2d_reg, centerness, _ = self.fcos2d_head(features)
        if not self.only_box2d:
            box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
        # NOTE: directly use inv_intrinsics
        # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None

        if self.training:
            assert gt_instances is not None
            feature_shapes = [x.shape[-2:] for x in features]
            training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
            # NOTE:
            # if gt_dense_depth is not None:
            #     training_targets.update({"dense_depth": gt_dense_depth})

            losses = {}
            fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
            losses.update(fcos2d_loss)

            if not self.only_box2d:
                fcos3d_loss = self.fcos3d_loss(
                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth,
                    inv_intrinsics, fcos2d_info, training_targets
                )
                losses.update(fcos3d_loss)
            return losses
        else:
            # TODO: do not support inference now
            raise NotImplementedError

            pred_instances, fcos2d_info = self.fcos2d_inference(
                logits, box2d_reg, centerness, locations, images.image_sizes
            )
            if not self.only_box2d:
                # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place.
                self.fcos3d_inference(
                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics,
                    pred_instances, fcos2d_info
                )

                # 3D score == 2D score x confidence.
                score_key = "scores_3d"
            else:
                score_key = "scores"

            # Transpose to "image-first", i.e. (B, L)
            pred_instances = list(zip(*pred_instances))
            pred_instances = [Instances.cat(instances) for instances in pred_instances]

            # 2D NMS and pick top-K.
            if self.do_nms:
                pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)

            if not self.only_box2d and self.do_bev_nms:
                # Bird-eye-view NMS.
                dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
                if 'pose' in batched_inputs[0]:
                    poses = [x['pose'] for x in batched_inputs]
                else:
                    poses = [x['extrinsics'] for x in batched_inputs]
                pred_instances = nuscenes_sample_aggregate(
                    pred_instances,
                    dummy_group_idxs,
                    self.num_classes,
                    poses,
                    iou_threshold=self.bev_nms_iou_thresh,
                    include_boxes3d_global=False
                )

            if self.postprocess_in_inference:
                processed_results = []
                for results_per_image, input_per_image, image_size in \
                        zip(pred_instances, batched_inputs, images.image_sizes):
                    height = input_per_image.get("height", image_size[0])
                    width = input_per_image.get("width", image_size[1])
                    r = resize_instances(results_per_image, height, width)
                    processed_results.append({"instances": r})
            else:
                processed_results = [{"instances": x} for x in pred_instances]

            return processed_results

    def compute_locations(self, features):
        locations = []
        in_strides = [x.stride for x in self.backbone_output_shape]
        for level, feature in enumerate(features):
            h, w = feature.size()[-2:]
            locations_per_level = compute_locations_per_level(
                h, w, in_strides[level], feature.dtype, feature.device,
                offset=self.feature_locations_offset
            )
            locations.append(locations_per_level)
        return locations

    def forward_train(self, features, batched_inputs):
        self.train()
        return self.forward(features, batched_inputs)
\ No newline at end of file
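A minimal construction sketch for the module above, assuming the default arguments of FCOS2DHead, FCOS3DHead, the loss modules, and DD3DTargetPreparer are sufficient; the class count, channel count, and strides here are illustrative, not values taken from this commit:

dd3d_head = DD3D(
    num_classes=10,                  # e.g. the nuScenes detection classes
    in_channels=256,                 # channels of each input feature level
    strides=[8, 16, 32, 64, 128],    # one stride per feature level
    box3d_on=True,
)
# In training, `features` is a list of multi-scale feature maps and `batched_inputs`
# carries per-image dicts with 'instances' and 'inv_intrinsics'; forward() returns a loss dict.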
projects/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py
0 → 100644
# Copyright 2021 Toyota Research Institute. All rights reserved.
import logging

import torch
import torch.nn as nn

from projects.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss

LOG = logging.getLogger(__name__)


class DisentangledBox3DLoss(nn.Module):
    def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
        super().__init__()
        self.smooth_l1_loss_beta = smooth_l1_loss_beta
        self.max_loss_per_group = max_loss_per_group

    def forward(self, box3d_pred, box3d_targets, locations, weights=None):

        box3d_pred = box3d_pred.to(torch.float32)
        box3d_targets = box3d_targets.to(torch.float32)

        target_corners = box3d_targets.corners

        disentangled_losses = {}
        for component_key in ["quat", "proj_ctr", "depth", "size"]:
            disentangled_boxes = box3d_targets.clone()
            setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
            pred_corners = disentangled_boxes.to(torch.float32).corners

            loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)

            # Bound the loss (clamp is not in-place, so keep the returned tensor).
            loss = loss.clamp(max=self.max_loss_per_group)

            if weights is not None:
                # loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
                loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
            else:
                loss = loss.reshape(-1, 24).mean()

            disentangled_losses["loss_box3d_" + component_key] = loss

        entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)

        return disentangled_losses, entangled_l1_dist
projects/mmdet3d_plugin/dd3d/modeling/fcos2d.py
0 → 100644
View file @
4cd43886
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
#   https://github.com/aim-uofa/AdelaiDet
import torch
from fvcore.nn import sigmoid_focal_loss
from torch import nn
from torch.nn import functional as F

from detectron2.layers import Conv2d, batched_nms, cat, get_norm
from detectron2.structures import Boxes, Instances
from detectron2.utils.comm import get_world_size
from mmcv.runner import force_fp32

from projects.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss
from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum

INF = 100000000


def compute_ctrness_targets(reg_targets):
    if len(reg_targets) == 0:
        return reg_targets.new_zeros(len(reg_targets))
    left_right = reg_targets[:, [0, 2]]
    top_bottom = reg_targets[:, [1, 3]]
    ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
        (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
    return torch.sqrt(ctrness)


class FCOS2DHead(nn.Module):
    def __init__(self,
                 num_classes,
                 input_shape,
                 num_cls_convs=4,
                 num_box_convs=4,
                 norm='BN',
                 use_deformable=False,
                 use_scale=True,
                 box2d_scale_init_factor=1.0,
                 version='v2'):
        super().__init__()
        self.num_classes = num_classes
        self.in_strides = [shape.stride for shape in input_shape]
        self.num_levels = len(input_shape)

        self.use_scale = use_scale
        self.box2d_scale_init_factor = box2d_scale_init_factor
        self._version = version

        in_channels = [s.channels for s in input_shape]
        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
        in_channels = in_channels[0]

        if use_deformable:
            raise ValueError("Not supported yet.")

        head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs}

        for head_name, num_convs in head_configs.items():
            tower = []
            if self._version == "v1":
                for _ in range(num_convs):
                    conv_func = nn.Conv2d
                    tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
                    if norm == "GN":
                        raise NotImplementedError()
                    elif norm == "NaiveGN":
                        raise NotImplementedError()
                    elif norm == "BN":
                        tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)]))
                    elif norm == "SyncBN":
                        raise NotImplementedError()
                    tower.append(nn.ReLU())
            elif self._version == "v2":
                for _ in range(num_convs):
                    if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
                        # NOTE: need to add norm here!
                        # Each FPN level has its own batchnorm layer.
                        # NOTE: do not use dd3d train.py!
                        # "BN" is converted to "SyncBN" in distributed training (see train.py)
                        norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
                    else:
                        norm_layer = get_norm(norm, in_channels)
                    tower.append(
                        Conv2d(
                            in_channels,
                            in_channels,
                            kernel_size=3,
                            stride=1,
                            padding=1,
                            bias=norm_layer is None,
                            norm=norm_layer,
                            activation=F.relu
                        )
                    )
            else:
                raise ValueError(f"Invalid FCOS2D version: {self._version}")
            self.add_module(f'{head_name}_tower', nn.Sequential(*tower))

        self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
        self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1)
        self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1)

        if self.use_scale:
            if self._version == "v1":
                self.scales_reg = nn.ModuleList([
                    Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
                ])
            else:
                self.scales_box2d_reg = nn.ModuleList([
                    Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
                ])

        self.init_weights()

    def init_weights(self):
        for tower in [self.cls_tower, self.box2d_tower]:
            for l in tower.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
                    if l.bias is not None:
                        torch.nn.init.constant_(l.bias, 0)

        predictors = [self.cls_logits, self.box2d_reg, self.centerness]

        for modules in predictors:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.kaiming_uniform_(l.weight, a=1)
                    if l.bias is not None:  # depth head may not have bias.
                        torch.nn.init.constant_(l.bias, 0)

    def forward(self, x):
        logits = []
        box2d_reg = []
        centerness = []

        extra_output = {"cls_tower_out": []}

        for l, feature in enumerate(x):
            cls_tower_out = self.cls_tower(feature)
            bbox_tower_out = self.box2d_tower(feature)

            # 2D box
            logits.append(self.cls_logits(cls_tower_out))
            centerness.append(self.centerness(bbox_tower_out))
            box_reg = self.box2d_reg(bbox_tower_out)
            if self.use_scale:
                # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
                if self._version == "v1":
                    box_reg = self.scales_reg[l](box_reg)
                else:
                    box_reg = self.scales_box2d_reg[l](box_reg)
            # Note that we use relu, as in the improved FCOS, instead of exp.
            box2d_reg.append(F.relu(box_reg))

            extra_output['cls_tower_out'].append(cls_tower_out)

        return logits, box2d_reg, centerness, extra_output


class FCOS2DLoss(nn.Module):
    def __init__(
        self,
        num_classes,
        focal_loss_alpha=0.25,
        focal_loss_gamma=2.0,
        loc_loss_type='giou',
    ):
        super().__init__()
        self.focal_loss_alpha = focal_loss_alpha
        self.focal_loss_gamma = focal_loss_gamma

        self.box2d_reg_loss_fn = IOULoss(loc_loss_type)

        self.num_classes = num_classes

    @force_fp32(apply_to=('logits', 'box2d_reg', 'centerness'))
    def forward(self, logits, box2d_reg, centerness, targets):
        labels = targets['labels']
        box2d_reg_targets = targets['box2d_reg_targets']
        pos_inds = targets["pos_inds"]

        if len(labels) != box2d_reg_targets.shape[0]:
            raise ValueError(
                f"The size of 'labels' and 'box2d_reg_targets' does not match: "
                f"a={len(labels)}, b={box2d_reg_targets.shape[0]}"
            )

        # Flatten predictions
        logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits])
        box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg])
        centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness])

        # -------------------
        # Classification loss
        # -------------------
        num_pos_local = pos_inds.numel()
        num_gpus = get_world_size()
        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
        num_pos_avg = max(total_num_pos / num_gpus, 1.0)

        # prepare one_hot
        cls_target = torch.zeros_like(logits)
        cls_target[pos_inds, labels[pos_inds]] = 1

        loss_cls = sigmoid_focal_loss(
            logits,
            cls_target,
            alpha=self.focal_loss_alpha,
            gamma=self.focal_loss_gamma,
            reduction="sum",
        ) / num_pos_avg

        # NOTE: The rest of losses only consider foreground pixels.
        box2d_reg_pred = box2d_reg_pred[pos_inds]
        box2d_reg_targets = box2d_reg_targets[pos_inds]

        centerness_pred = centerness_pred[pos_inds]

        # Compute centerness targets here using 2D regression targets of foreground pixels.
        centerness_targets = compute_ctrness_targets(box2d_reg_targets)

        # Denominator for all foreground losses.
        ctrness_targets_sum = centerness_targets.sum()
        loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)

        # NOTE: change the return after reduce_sum
        if pos_inds.numel() == 0:
            losses = {
                "loss_cls": loss_cls,
                "loss_box2d_reg": box2d_reg_pred.sum() * 0.,
                "loss_centerness": centerness_pred.sum() * 0.,
            }
            return losses, {}

        # ----------------------
        # 2D box regression loss
        # ----------------------
        loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom

        # ---------------
        # Centerness loss
        # ---------------
        loss_centerness = F.binary_cross_entropy_with_logits(
            centerness_pred, centerness_targets, reduction="sum"
        ) / num_pos_avg

        loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness}
        extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets}

        return loss_dict, extra_info


class FCOS2DInference():
    def __init__(self, cfg):
        self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR
        self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH
        self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK
        self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK
        self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH
        self.num_classes = cfg.DD3D.NUM_CLASSES

    def __call__(self, logits, box2d_reg, centerness, locations, image_sizes):

        pred_instances = []  # List[List[Instances]], shape = (L, B)
        extra_info = []
        for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \
                enumerate(zip(logits, box2d_reg, centerness, locations)):

            instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map(
                logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes
            )  # List of Instances; one for each image.

            for instances_per_im in instances_per_lvl:
                instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl

            pred_instances.append(instances_per_lvl)
            extra_info.append(extra_info_per_lvl)

        return pred_instances, extra_info

    def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes):
        N, C, _, __ = logits.shape

        # put in the same format as locations
        scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid()
        box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4)
        centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()

        # if self.thresh_with_ctr is True, we multiply the classification
        # scores with centerness scores before applying the threshold.
        if self.thresh_with_ctr:
            scores = scores * centerness[:, :, None]

        candidate_mask = scores > self.pre_nms_thresh

        pre_nms_topk = candidate_mask.reshape(N, -1).sum(1)
        pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk)

        if not self.thresh_with_ctr:
            scores = scores * centerness[:, :, None]

        results = []
        all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], []
        for i in range(N):
            scores_per_im = scores[i]
            candidate_mask_per_im = candidate_mask[i]
            scores_per_im = scores_per_im[candidate_mask_per_im]

            candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False)
            fg_inds_per_im = candidate_inds_per_im[:, 0]
            class_inds_per_im = candidate_inds_per_im[:, 1]

            # Cache info here.
            all_fg_inds_per_im.append(fg_inds_per_im)
            all_class_inds_per_im.append(class_inds_per_im)

            box2d_reg_per_im = box2d_reg[i][fg_inds_per_im]
            locations_per_im = locations[fg_inds_per_im]

            pre_nms_topk_per_im = pre_nms_topk[i]

            if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item():
                scores_per_im, topk_indices = \
                    scores_per_im.topk(pre_nms_topk_per_im, sorted=False)
                class_inds_per_im = class_inds_per_im[topk_indices]
                box2d_reg_per_im = box2d_reg_per_im[topk_indices]
                locations_per_im = locations_per_im[topk_indices]
            else:
                topk_indices = None

            all_topk_indices.append(topk_indices)

            detections = torch.stack([
                locations_per_im[:, 0] - box2d_reg_per_im[:, 0],
                locations_per_im[:, 1] - box2d_reg_per_im[:, 1],
                locations_per_im[:, 0] + box2d_reg_per_im[:, 2],
                locations_per_im[:, 1] + box2d_reg_per_im[:, 3],
            ], dim=1)

            instances = Instances(image_sizes[i])
            instances.pred_boxes = Boxes(detections)
            instances.scores = torch.sqrt(scores_per_im)
            instances.pred_classes = class_inds_per_im
            instances.locations = locations_per_im

            results.append(instances)

        extra_info = {
            "fg_inds_per_im": all_fg_inds_per_im,
            "class_inds_per_im": all_class_inds_per_im,
            "topk_indices": all_topk_indices
        }
        return results, extra_info

    def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"):
        results = []
        for instances in instances_per_im:
            if self.nms_thresh > 0:
                # Multiclass NMS.
                keep = batched_nms(
                    instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes,
                    self.nms_thresh
                )
                instances = instances[keep]
            num_detections = len(instances)

            # Limit to max_per_image detections **over all classes**
            if num_detections > self.post_nms_topk > 0:
                scores = instances.scores
                # image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1)
                image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1)
                keep = scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                instances = instances[keep]
            results.append(instances)
        return results
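As a quick sanity check on compute_ctrness_targets above: it reproduces the FCOS centerness sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b))) from the per-location 2D regression targets. A small usage sketch; the import path follows the file location shown in this commit and assumes the plugin is importable:

import torch
from projects.mmdet3d_plugin.dd3d.modeling.fcos2d import compute_ctrness_targets

# One location with regression targets (l, t, r, b) = (4, 2, 4, 8):
# centerness = sqrt((min(4, 4) / max(4, 4)) * (min(2, 8) / max(2, 8))) = sqrt(1.0 * 0.25) = 0.5
reg_targets = torch.tensor([[4., 2., 4., 8.]])
print(compute_ctrness_targets(reg_targets))  # tensor([0.5000])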
projects/mmdet3d_plugin/dd3d/modeling/fcos3d.py
0 → 100644
View file @
4cd43886
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from torch import nn

from detectron2.layers import Conv2d, cat, get_norm
from mmcv.runner import force_fp32

from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale
from .disentangled_box3d_loss import DisentangledBox3DLoss
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d

EPS = 1e-7


def predictions_to_boxes3d(
    quat,
    proj_ctr,
    depth,
    size,
    locations,
    inv_intrinsics,
    canon_box_sizes,
    min_depth,
    max_depth,
    scale_depth_by_focal_lengths_factor,
    scale_depth_by_focal_lengths=True,
    quat_is_allocentric=True,
    depth_is_distance=False
):
    # Normalize to make quat unit norm.
    quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
    # Make sure again it's numerically unit-norm.
    quat = quat / quat.norm(dim=1, keepdim=True)

    if scale_depth_by_focal_lengths:
        pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1)
        depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor)

    if depth_is_distance:
        depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS)

    depth = depth.reshape(-1, 1).clamp(min_depth, max_depth)

    proj_ctr = proj_ctr + locations

    if quat_is_allocentric:
        quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics)

    size = (size.tanh() + 1.) * canon_box_sizes  # max size = 2 * canon_size

    return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)


class FCOS3DHead(nn.Module):
    def __init__(
        self,
        num_classes,
        input_shape,
        num_convs=4,
        norm='BN',
        use_scale=True,
        depth_scale_init_factor=0.3,
        proj_ctr_scale_init_factor=1.0,
        use_per_level_predictors=False,
        class_agnostic=False,
        use_deformable=False,
        mean_depth_per_level=None,
        std_depth_per_level=None,
    ):
        super().__init__()
        self.num_classes = num_classes
        self.in_strides = [shape.stride for shape in input_shape]
        self.num_levels = len(input_shape)

        self.use_scale = use_scale
        self.depth_scale_init_factor = depth_scale_init_factor
        self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor
        self.use_per_level_predictors = use_per_level_predictors

        self.register_buffer("mean_depth_per_level", torch.Tensor(mean_depth_per_level))
        self.register_buffer("std_depth_per_level", torch.Tensor(std_depth_per_level))

        in_channels = [s.channels for s in input_shape]
        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
        in_channels = in_channels[0]

        if use_deformable:
            raise ValueError("Not supported yet.")

        box3d_tower = []
        for i in range(num_convs):
            if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
                # NOTE: need to add norm here!
                # Each FPN level has its own batchnorm layer.
                # NOTE: do not use dd3d train.py!
                # "BN" is converted to "SyncBN" in distributed training (see train.py)
                norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
            else:
                norm_layer = get_norm(norm, in_channels)
            box3d_tower.append(
                Conv2d(
                    in_channels,
                    in_channels,
                    kernel_size=3,
                    stride=1,
                    padding=1,
                    bias=norm_layer is None,
                    norm=norm_layer,
                    activation=F.relu
                )
            )
        self.add_module('box3d_tower', nn.Sequential(*box3d_tower))

        num_classes = self.num_classes if not class_agnostic else 1
        num_levels = self.num_levels if use_per_level_predictors else 1

        # 3D box branches.
        self.box3d_quat = nn.ModuleList([
            Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
            for _ in range(num_levels)
        ])
        self.box3d_ctr = nn.ModuleList([
            Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
            for _ in range(num_levels)
        ])
        self.box3d_depth = nn.ModuleList([
            Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale))
            for _ in range(num_levels)
        ])
        self.box3d_size = nn.ModuleList([
            Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
            for _ in range(num_levels)
        ])
        self.box3d_conf = nn.ModuleList([
            Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
            for _ in range(num_levels)
        ])

        if self.use_scale:
            self.scales_proj_ctr = nn.ModuleList([
                Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides
            ])
            # (pre-)compute (mean, std) of depth for each level, and determine the init value here.
            self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
            self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])

            self.scales_depth = nn.ModuleList([
                Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level
            ])
            self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level])

        self._init_weights()

    def _init_weights(self):
        for l in self.box3d_tower.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
                if l.bias is not None:
                    torch.nn.init.constant_(l.bias, 0)

        predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf]

        for modules in predictors:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.kaiming_uniform_(l.weight, a=1)
                    if l.bias is not None:  # depth head may not have bias.
                        torch.nn.init.constant_(l.bias, 0)

    def forward(self, x):
        box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], []
        dense_depth = None
        for l, features in enumerate(x):
            box3d_tower_out = self.box3d_tower(features)

            _l = l if self.use_per_level_predictors else 0

            # 3D box
            quat = self.box3d_quat[_l](box3d_tower_out)
            proj_ctr = self.box3d_ctr[_l](box3d_tower_out)
            depth = self.box3d_depth[_l](box3d_tower_out)
            size3d = self.box3d_size[_l](box3d_tower_out)
            conf3d = self.box3d_conf[_l](box3d_tower_out)

            if self.use_scale:
                # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
                proj_ctr = self.scales_proj_ctr[l](proj_ctr)
                size3d = self.scales_size[l](size3d)
                conf3d = self.scales_conf[l](conf3d)
                depth = self.offsets_depth[l](self.scales_depth[l](depth))

            box3d_quat.append(quat)
            box3d_ctr.append(proj_ctr)
            box3d_depth.append(depth)
            box3d_size.append(size3d)
            box3d_conf.append(conf3d)

        return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth


class FCOS3DLoss(nn.Module):
    def __init__(
        self,
        num_classes,
        min_depth=0.1,
        max_depth=80.0,
        box3d_loss_weight=2.0,
        conf3d_loss_weight=1.0,
        conf_3d_temperature=1.0,
        smooth_l1_loss_beta=0.05,
        max_loss_per_group=20,
        predict_allocentric_rot=True,
        scale_depth_by_focal_lengths=True,
        scale_depth_by_focal_lengths_factor=500.0,
        class_agnostic=False,
        predict_distance=False,
        canon_box_sizes=None
    ):
        super().__init__()
        self.canon_box_sizes = canon_box_sizes
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.predict_allocentric_rot = predict_allocentric_rot
        self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths
        self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor
        self.predict_distance = predict_distance

        self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group)
        self.box3d_loss_weight = box3d_loss_weight
        self.conf3d_loss_weight = conf3d_loss_weight
        self.conf_3d_temperature = conf_3d_temperature

        self.num_classes = num_classes
        self.class_agnostic = class_agnostic

    @force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size', 'box3d_conf', 'inv_intrinsics'))
    def forward(
        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info,
        targets
    ):
        labels = targets['labels']
        box3d_targets = targets['box3d_targets']
        pos_inds = targets["pos_inds"]

        if pos_inds.numel() == 0:
            losses = {
                "loss_box3d_quat": torch.stack([x.sum() * 0. for x in box3d_quat]).sum(),
                "loss_box3d_proj_ctr": torch.stack([x.sum() * 0. for x in box3d_ctr]).sum(),
                "loss_box3d_depth": torch.stack([x.sum() * 0. for x in box3d_depth]).sum(),
                "loss_box3d_size": torch.stack([x.sum() * 0. for x in box3d_size]).sum(),
                "loss_conf3d": torch.stack([x.sum() * 0. for x in box3d_conf]).sum()
            }
            return losses

        if len(labels) != len(box3d_targets):
            raise ValueError(
                f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}"
            )

        num_classes = self.num_classes if not self.class_agnostic else 1

        box3d_quat_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4, num_classes) for x in box3d_quat])
        box3d_ctr_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 2, num_classes) for x in box3d_ctr])
        box3d_depth_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_depth])
        box3d_size_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 3, num_classes) for x in box3d_size])
        box3d_conf_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_conf])

        # ----------------------
        # 3D box disentangled loss
        # ----------------------
        box3d_targets = box3d_targets[pos_inds]

        box3d_quat_pred = box3d_quat_pred[pos_inds]
        box3d_ctr_pred = box3d_ctr_pred[pos_inds]
        box3d_depth_pred = box3d_depth_pred[pos_inds]
        box3d_size_pred = box3d_size_pred[pos_inds]
        box3d_conf_pred = box3d_conf_pred[pos_inds]

        if self.class_agnostic:
            box3d_quat_pred = box3d_quat_pred.squeeze(-1)
            box3d_ctr_pred = box3d_ctr_pred.squeeze(-1)
            box3d_depth_pred = box3d_depth_pred.squeeze(-1)
            box3d_size_pred = box3d_size_pred.squeeze(-1)
            box3d_conf_pred = box3d_conf_pred.squeeze(-1)
        else:
            I = labels[pos_inds][..., None, None]
            box3d_quat_pred = torch.gather(box3d_quat_pred, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
            box3d_ctr_pred = torch.gather(box3d_ctr_pred, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
            box3d_depth_pred = torch.gather(box3d_depth_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
            box3d_size_pred = torch.gather(box3d_size_pred, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
            box3d_conf_pred = torch.gather(box3d_conf_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)

        canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[labels[pos_inds]]

        locations = targets["locations"][pos_inds]
        im_inds = targets["im_inds"][pos_inds]
        inv_intrinsics = inv_intrinsics[im_inds]

        box3d_pred = predictions_to_boxes3d(
            box3d_quat_pred,
            box3d_ctr_pred,
            box3d_depth_pred,
            box3d_size_pred,
            locations,
            inv_intrinsics,
            canon_box_sizes,
            self.min_depth,
            self.max_depth,
            scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
            scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
            quat_is_allocentric=self.predict_allocentric_rot,
            depth_is_distance=self.predict_distance
        )

        centerness_targets = fcos2d_info["centerness_targets"]
        loss_denom = fcos2d_info["loss_denom"]
        losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets)

        losses_box3d = {k: self.box3d_loss_weight * v / loss_denom for k, v in losses_box3d.items()}

        conf_3d_targets = torch.exp(-1. / self.conf_3d_temperature * box3d_l1_error)
        loss_conf3d = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none')
        loss_conf3d = self.conf3d_loss_weight * (loss_conf3d * centerness_targets).sum() / loss_denom

        losses = {"loss_conf3d": loss_conf3d, **losses_box3d}

        return losses


class FCOS3DInference():
    def __init__(self, cfg):
        self.canon_box_sizes = cfg.DD3D.FCOS3D.CANONICAL_BOX3D_SIZES
        self.min_depth = cfg.DD3D.FCOS3D.MIN_DEPTH
        self.max_depth = cfg.DD3D.FCOS3D.MAX_DEPTH
        self.predict_allocentric_rot = cfg.DD3D.FCOS3D.PREDICT_ALLOCENTRIC_ROT
        self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS
        self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR
        self.predict_distance = cfg.DD3D.FCOS3D.PREDICT_DISTANCE
        self.num_classes = cfg.DD3D.NUM_CLASSES
        self.class_agnostic = cfg.DD3D.FCOS3D.CLASS_AGNOSTIC_BOX3D

    def __call__(
        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
    ):
        # pred_instances: # List[List[Instances]], shape = (L, B)
        for lvl, (box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl) in \
                enumerate(zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)):

            # In-place modification: update per-level pred_instances.
            self.forward_for_single_feature_map(
                box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl, inv_intrinsics,
                pred_instances[lvl], fcos2d_info[lvl]
            )  # List of Instances; one for each image.

    def forward_for_single_feature_map(
        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
    ):
        N = box3d_quat.shape[0]

        num_classes = self.num_classes if not self.class_agnostic else 1

        box3d_quat = box3d_quat.permute(0, 2, 3, 1).reshape(N, -1, 4, num_classes)
        box3d_ctr = box3d_ctr.permute(0, 2, 3, 1).reshape(N, -1, 2, num_classes)
        box3d_depth = box3d_depth.permute(0, 2, 3, 1).reshape(N, -1, num_classes)
        box3d_size = box3d_size.permute(0, 2, 3, 1).reshape(N, -1, 3, num_classes)
        box3d_conf = box3d_conf.permute(0, 2, 3, 1).reshape(N, -1, num_classes).sigmoid()

        for i in range(N):
            fg_inds_per_im = fcos2d_info['fg_inds_per_im'][i]
            class_inds_per_im = fcos2d_info['class_inds_per_im'][i]
            topk_indices = fcos2d_info['topk_indices'][i]

            box3d_quat_per_im = box3d_quat[i][fg_inds_per_im]
            box3d_ctr_per_im = box3d_ctr[i][fg_inds_per_im]
            box3d_depth_per_im = box3d_depth[i][fg_inds_per_im]
            box3d_size_per_im = box3d_size[i][fg_inds_per_im]
            box3d_conf_per_im = box3d_conf[i][fg_inds_per_im]

            if self.class_agnostic:
                box3d_quat_per_im = box3d_quat_per_im.squeeze(-1)
                box3d_ctr_per_im = box3d_ctr_per_im.squeeze(-1)
                box3d_depth_per_im = box3d_depth_per_im.squeeze(-1)
                box3d_size_per_im = box3d_size_per_im.squeeze(-1)
                box3d_conf_per_im = box3d_conf_per_im.squeeze(-1)
            else:
                I = class_inds_per_im[..., None, None]
                box3d_quat_per_im = torch.gather(box3d_quat_per_im, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
                box3d_ctr_per_im = torch.gather(box3d_ctr_per_im, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
                box3d_depth_per_im = torch.gather(box3d_depth_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
                box3d_size_per_im = torch.gather(box3d_size_per_im, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
                box3d_conf_per_im = torch.gather(box3d_conf_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)

            if topk_indices is not None:
                box3d_quat_per_im = box3d_quat_per_im[topk_indices]
                box3d_ctr_per_im = box3d_ctr_per_im[topk_indices]
                box3d_depth_per_im = box3d_depth_per_im[topk_indices]
                box3d_size_per_im = box3d_size_per_im[topk_indices]
                box3d_conf_per_im = box3d_conf_per_im[topk_indices]

            # scores_per_im = pred_instances[i].scores.square()
            # NOTE: Before refactoring, the squared score was used. Is raw 2D score better?
            scores_per_im = pred_instances[i].scores
            scores_3d_per_im = scores_per_im * box3d_conf_per_im

            canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[pred_instances[i].pred_classes]
            inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3)
            locations = pred_instances[i].locations
            pred_boxes3d = predictions_to_boxes3d(
                box3d_quat_per_im,
                box3d_ctr_per_im,
                box3d_depth_per_im,
                box3d_size_per_im,
                locations,
                inv_K,
                canon_box_sizes,
                self.min_depth,
                self.max_depth,
                scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
                scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
                quat_is_allocentric=self.predict_allocentric_rot,
                depth_is_distance=self.predict_distance
            )

            # In-place modification: add fields to instances.
            pred_instances[i].pred_boxes3d = pred_boxes3d
            pred_instances[i].scores_3d = scores_3d_per_im
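A note on the depth handling in predictions_to_boxes3d above: when scale_depth_by_focal_lengths is on, the raw depth output is divided by pixel_size * factor, where pixel_size = ||(1/fx, 1/fy)|| comes from the inverse intrinsics, so the same raw network value decodes to a larger metric depth on a longer-focal-length camera. A minimal numeric sketch of just that step, with a made-up intrinsic matrix (the values are illustrative only, not taken from any dataset):

import torch

fx, fy, cx, cy = 1000.0, 1000.0, 800.0, 450.0
K = torch.tensor([[fx, 0., cx], [0., fy, cy], [0., 0., 1.]])
inv_K = torch.inverse(K)[None]                  # (1, 3, 3), plays the role of inv_intrinsics

raw_depth = torch.tensor([20.0])                # raw network output for one foreground location
factor = 500.0                                  # scale_depth_by_focal_lengths_factor

pixel_size = torch.norm(torch.stack([inv_K[:, 0, 0], inv_K[:, 1, 1]], dim=-1), dim=-1)
decoded = raw_depth / (pixel_size * factor)     # ~28.3 m here: 20 / (sqrt(2)*1e-3 * 500)
print(decoded)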
projects/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py
0 → 100644
View file @
4cd43886
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from fvcore.nn.smooth_l1_loss import smooth_l1_loss
from torch import nn

from detectron2.layers import Conv2d, cat
# from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.utils import comm as d2_comm
from mmdet.models.builder import HEADS
from mmcv.runner import force_fp32

from projects.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES
from .core import DD3D
# from tridet.modeling.dd3d.postprocessing import get_group_idxs, nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum

INF = 100000000.


class NuscenesDD3DTargetPreparer(DD3DTargetPreparer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True"

    def __call__(self, locations, gt_instances, feature_shapes):
        num_loc_list = [len(loc) for loc in locations]

        # compute locations to size ranges
        loc_to_size_range = []
        for l, loc_per_level in enumerate(locations):
            loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
            loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))

        loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
        locations = torch.cat(locations, dim=0)

        training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)

        training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
        training_targets["im_inds"] = [
            locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
        ]

        box2d = training_targets.pop("box2d", None)

        # transpose im first training_targets to level first ones
        training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"}

        training_targets["fpn_levels"] = [
            loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
        ]

        # Flatten targets: (L x B x H x W, TARGET_SIZE)
        labels = cat([x.reshape(-1) for x in training_targets["labels"]])
        box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
        target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
        locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
        im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
        fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])

        pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)

        targets = {
            "labels": labels,
            "box2d_reg_targets": box2d_reg_targets,
            "locations": locations,
            "target_inds": target_inds,
            "im_inds": im_inds,
            "fpn_levels": fpn_levels,
            "pos_inds": pos_inds
        }

        if self.dd3d_enabled:
            box3d_targets = Boxes3D.cat(training_targets["box3d"])
            targets.update({"box3d_targets": box3d_targets})

            if box2d is not None:
                # Original format is B x L x (H x W, 4)
                # Need to be in L x (B, 4, H, W).
                batched_box2d = []
                for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
                    # B x (H x W, 4)
                    h, w = feature_shapes[lvl]
                    batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
                    batched_box2d.append(batched_box2d_lvl)
                targets.update({"batched_box2d": batched_box2d})

        # Nuscenes targets -- attribute / speed
        attributes = cat([x.reshape(-1) for x in training_targets["attributes"]])
        speeds = cat([x.reshape(-1) for x in training_targets["speeds"]])
        targets.update({'attributes': attributes, 'speeds': speeds})

        return targets

    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
        labels = []
        box2d_reg = []

        if self.dd3d_enabled:
            box3d = []

        target_inds = []
        xs, ys = locations[:, 0], locations[:, 1]

        # NuScenes targets -- attribute / speed
        attributes, speeds = [], []

        num_targets = 0
        for im_i in range(len(targets)):
            targets_per_im = targets[im_i]
            bboxes = targets_per_im.gt_boxes.tensor
            labels_per_im = targets_per_im.gt_classes

            # no gt
            if bboxes.numel() == 0:
                labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
                # reg_targets.append(locations.new_zeros((locations.size(0), 4)))
                box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
                target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)

                if self.dd3d_enabled:
                    box3d.append(
                        Boxes3D(
                            locations.new_zeros(locations.size(0), 4),
                            locations.new_zeros(locations.size(0), 2),
                            locations.new_zeros(locations.size(0), 1),
                            locations.new_zeros(locations.size(0), 3),
                            locations.new_zeros(locations.size(0), 3, 3),
                        ).to(torch.float32)
                    )

                # NOTE: attributes and speeds.
                attributes.append(labels_per_im.new_zeros(locations.size(0)))
                speeds.append(labels_per_im.new_zeros(locations.size(0)))
                continue

            area = targets_per_im.gt_boxes.area()

            l = xs[:, None] - bboxes[:, 0][None]
            t = ys[:, None] - bboxes[:, 1][None]
            r = bboxes[:, 2][None] - xs[:, None]
            b = bboxes[:, 3][None] - ys[:, None]
            # reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
            box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)

            if self.center_sample:
                is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
            else:
                is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0

            max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
            # limit the regression range for each location
            is_cared_in_the_level = \
                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
                (max_reg_targets_per_im <= size_ranges[:, [1]])

            locations_to_gt_area = area[None].repeat(len(locations), 1)
            locations_to_gt_area[is_in_boxes == 0] = INF
            locations_to_gt_area[is_cared_in_the_level == 0] = INF

            # if there are still more than one objects for a location,
            # we choose the one with minimal area
            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)

            box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
            target_inds_per_im = locations_to_gt_inds + num_targets
            num_targets += len(targets_per_im)

            labels_per_im = labels_per_im[locations_to_gt_inds]
            labels_per_im[locations_to_min_area == INF] = self.num_classes

            labels.append(labels_per_im)
            box2d_reg.append(box2d_reg_per_im)
            target_inds.append(target_inds_per_im)

            if self.dd3d_enabled:
                # 3D box targets
                box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
                box3d.append(box3d_per_im)

            # NuScenes targets -- attribute / speed
            attributes_per_im = targets_per_im.gt_attributes[locations_to_gt_inds]
            speeds_per_im = targets_per_im.gt_speeds[locations_to_gt_inds]
            attributes.append(attributes_per_im)
            speeds.append(speeds_per_im)

        ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
        if self.dd3d_enabled:
            ret.update({"box3d": box3d})

        # NuScenes targets -- attribute / speed
        ret.update({"attributes": attributes, "speeds": speeds})

        return ret


class NuscenesLoss(nn.Module):
    def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
        super().__init__()
        self.attr_loss_weight = attr_loss_weight
        self.speed_loss_weight = speed_loss_weight

    @force_fp32(apply_to=('attr_logits', 'speeds'))
    def forward(self, attr_logits, speeds, fcos2d_info, targets):
        # Flatten predictions
        attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
        speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])

        pos_inds = targets['pos_inds']

        losses = {}

        # 1. Attributes
        attr_logits = attr_logits[pos_inds]
        target_attr = targets['attributes'][pos_inds]
        valid_attr_mask = target_attr != MAX_NUM_ATTRIBUTES  # No attrs associated with class, or just attr missing.

        if pos_inds.numel() == 0:
            attr_weights = attr_logits.new_tensor(0.0)  # torch.tensor(0.0).cuda()
        else:
            attr_weights = fcos2d_info['centerness_targets'][valid_attr_mask]
        # Denominator for all foreground losses -- re-computed for features with valid attributes.
        # attr_loss_denom = max(reduce_sum(attr_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
        # NOTE: compute attr_weights_sum, and then feed it to reduce_sum() works, but not above.
        attr_weights_sum = attr_weights.sum()
        attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / d2_comm.get_world_size(), 1e-6)

        if valid_attr_mask.sum() == 0:
            losses.update({"loss_attr": attr_logits.sum() * 0.})
        else:
            attr_logits = attr_logits[valid_attr_mask]
            target_attr = target_attr[valid_attr_mask]

            xent = F.cross_entropy(attr_logits, target_attr)
            loss_attr = (xent * attr_weights).sum() / attr_loss_denom

            losses.update({"loss_attr": self.attr_loss_weight * loss_attr})

        # 2. Speed
        speeds = speeds[pos_inds]
        target_speeds = targets['speeds'][pos_inds]
        # NOTE: some GT speeds are NaN.
        valid_gt_mask = torch.logical_not(torch.isnan(target_speeds))

        if pos_inds.numel() == 0:
            speed_weights = speeds.new_tensor(0.0)  # torch.tensor(0.0).cuda()
        else:
            speed_weights = fcos2d_info['centerness_targets'][valid_gt_mask]
        # Denominator for all foreground losses -- re-computed for features with valid speeds.
        # speed_loss_denom = max(reduce_sum(speed_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
        speed_weights_sum = speed_weights.sum()
        speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / d2_comm.get_world_size(), 1e-6)

        # NOTE: move after reduce sum
        if pos_inds.numel() == 0:
            losses = {"loss_attr": attr_logits.sum() * 0., "loss_speed": speeds.sum() * 0.}
            # NOTE: This is probably un-reachable, because the training filter images with empty annotations.
            # NOTE: If not, attr_weights can be unavailable in the reduce_sum below().
            return losses

        if valid_gt_mask.sum() == 0:
            losses.update({"loss_speed": speeds.sum() * 0.})
            # return losses
        else:
            speeds = speeds[valid_gt_mask]
            target_speeds = target_speeds[valid_gt_mask]

            l1_error = smooth_l1_loss(speeds, target_speeds, beta=0.05)
            loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom
            losses.update({"loss_speed": self.speed_loss_weight * loss_speed})

        return losses


class NuscenesInference():
    def __init__(self, cfg):
        pass

    def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info):
        """Add 'pred_attribute', 'pred_speed' to Instances in 'pred_instances'."""
        N = attr_logits[0].shape[0]
        for lvl, (attr_logits_lvl, speed_lvl, info_lvl, instances_lvl) in \
                enumerate(zip(attr_logits, speeds, fcos2d_info, pred_instances)):
            attr_logits_lvl = attr_logits_lvl.permute(0, 2, 3, 1).reshape(N, -1, MAX_NUM_ATTRIBUTES)
            speed_lvl = speed_lvl.permute(0, 2, 3, 1).reshape(N, -1)
            for i in range(N):
                fg_inds_per_im = info_lvl['fg_inds_per_im'][i]
                topk_indices = info_lvl['topk_indices'][i]

                attr_logits_per_im = attr_logits_lvl[i][fg_inds_per_im]
                speed_per_im = speed_lvl[i][fg_inds_per_im]

                if topk_indices is not None:
                    attr_logits_per_im = attr_logits_per_im[topk_indices]
                    speed_per_im = speed_per_im[topk_indices]

                if len(attr_logits_per_im) == 0:
                    instances_lvl[i].pred_attributes = instances_lvl[i].pred_classes.new_tensor([])
                    instances_lvl[i].pred_speeds = instances_lvl[i].scores.new_tensor([])
                else:
                    instances_lvl[i].pred_attributes = attr_logits_per_im.argmax(dim=1)
                    instances_lvl[i].pred_speeds = speed_per_im


@HEADS.register_module()
class NuscenesDD3D(DD3D):
    def __init__(self,
                 num_classes,
                 in_channels,
                 strides,
                 fcos2d_cfg=dict(),
                 fcos2d_loss_cfg=dict(),
                 fcos3d_cfg=dict(),
                 fcos3d_loss_cfg=dict(),
                 target_assign_cfg=dict(),
                 nusc_loss_weight=dict(),
                 box3d_on=True,
                 feature_locations_offset="none"):
        super().__init__(num_classes,
                         in_channels,
                         strides,
                         fcos2d_cfg=fcos2d_cfg,
                         fcos2d_loss_cfg=fcos2d_loss_cfg,
                         fcos3d_cfg=fcos3d_cfg,
                         fcos3d_loss_cfg=fcos3d_loss_cfg,
                         target_assign_cfg=target_assign_cfg,
                         box3d_on=box3d_on,
                         feature_locations_offset=feature_locations_offset)

        # backbone_output_shape = self.backbone_output_shape
        # in_channels = backbone_output_shape[0].channels

        # --------------------------------------------------------------------------
        # NuScenes predictions -- attribute / speed, computed from cls_tower output.
        # --------------------------------------------------------------------------
        self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True)
        self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu)

        # init weights
        for modules in [self.attr_logits, self.speed]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.kaiming_uniform_(l.weight, a=1)
                    if l.bias is not None:  # depth head may not have bias.
                        torch.nn.init.constant_(l.bias, 0)

        # Re-define target preparer
        del self.prepare_targets
        self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes,
                                                          input_shape=self.backbone_output_shape,
                                                          box3d_on=box3d_on,
                                                          **target_assign_cfg)

        self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight)
        # NOTE: inference later
        # self.nuscenes_inference = NuscenesInference(cfg)
        # self.num_images_per_sample = cfg.MODEL.FCOS3D.NUSC_NUM_IMAGES_PER_SAMPLE

        # NOTE: inference later
        # self.num_images_per_sample = cfg.DD3D.NUSC.INFERENCE.NUM_IMAGES_PER_SAMPLE
        # assert self.num_images_per_sample == 6
        # assert cfg.DATALOADER.TEST.NUM_IMAGES_PER_GROUP == 6

        # NOTE: NuScenes evaluator allows max. 500 detections per sample.
        # self.max_num_dets_per_sample = cfg.DD3D.NUSC.INFERENCE.MAX_NUM_DETS_PER_SAMPLE

    @force_fp32(apply_to=('features'))
    def forward(self, features, batched_inputs):
        # NOTE:
        # images = [x["image"].to(self.device) for x in batched_inputs]
        # images = [self.preprocess_image(x) for x in images]

        # NOTE: directly use inv_intrinsics
        # if 'intrinsics' in batched_inputs[0]:
        #     intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
        # else:
        #     intrinsics = None
        # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
        if 'inv_intrinsics' in batched_inputs[0]:
            inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
            inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
        else:
            inv_intrinsics = None

        # NOTE:
        # gt_dense_depth = None
        # if 'depth' in batched_inputs[0]:
        #     gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
        #     gt_dense_depth = ImageList.from_tensors(
        #         gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
        #     )

        # NOTE: directly input feature
        # features = self.backbone(images.tensor)
        # features = [features[f] for f in self.in_features]

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
        else:
            gt_instances = None

        locations = self.compute_locations(features)
        logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features)
        if not self.only_box2d:
            box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
        # NOTE: directly use inv_intrinsics
        # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None

        # --------------------------------------------------------------------------
        # NuScenes predictions -- attribute / speed, computed from cls_tower output.
        # --------------------------------------------------------------------------
        attr_logits, speeds = [], []
        for x in fcos2d_extra_output['cls_tower_out']:
            attr_logits.append(self.attr_logits(x))
            speeds.append(self.speed(x))

        if self.training:
            assert gt_instances is not None
            feature_shapes = [x.shape[-2:] for x in features]
            training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
            # NOTE:
            # if gt_dense_depth is not None:
            #     training_targets.update({"dense_depth": gt_dense_depth})

            losses = {}
            fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
            losses.update(fcos2d_loss)

            if not self.only_box2d:
                fcos3d_loss = self.fcos3d_loss(
                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
                    fcos2d_info, training_targets
                )
                losses.update(fcos3d_loss)

            # Nuscenes loss -- attribute / speed
            nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets)
            losses.update(nuscenes_loss)
            return losses
        else:
            # TODO: do not support inference now
            raise NotImplementedError

            pred_instances, fcos2d_info = self.fcos2d_inference(
                logits, box2d_reg, centerness, locations, images.image_sizes
            )
            if not self.only_box2d:
                # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances'.
                self.fcos3d_inference(
                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
                    fcos2d_info
                )
                score_key = "scores_3d"
            else:
                score_key = "scores"

            # This adds 'pred_attributes', 'pred_speed' to Instances in 'pred_instances'.
            self.nuscenes_inference(attr_logits, speeds, pred_instances, fcos2d_info)

            # Transpose to "image-first", i.e. (B, L)
            pred_instances = list(zip(*pred_instances))
            pred_instances = [Instances.cat(instances) for instances in pred_instances]

            # 2D NMS and pick top-K.
            if self.do_nms:
                pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)

            if not self.only_box2d and self.do_bev_nms:
                # Bird-eye-view NMS.
                dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
                if 'pose' in batched_inputs[0]:
                    poses = [x['pose'] for x in batched_inputs]
                else:
                    poses = [x['extrinsics'] for x in batched_inputs]
                pred_instances = nuscenes_sample_aggregate(
                    pred_instances,
                    dummy_group_idxs,
                    self.num_classes,
                    poses,
                    iou_threshold=self.bev_nms_iou_thresh,
                    include_boxes3d_global=False
                )

            if self.postprocess_in_inference:
                processed_results = []
                for results_per_image, input_per_image, image_size in \
                        zip(pred_instances, batched_inputs, images.image_sizes):
                    height = input_per_image.get("height", image_size[0])
                    width = input_per_image.get("width", image_size[1])
                    r = resize_instances(results_per_image, height, width)
                    processed_results.append({"instances": r})

                # ----------------------------------------------------------
                # NuScenes specific: cross-image (i.e. sample-level) BEV NMS.
                # ----------------------------------------------------------
                sample_tokens = [x['sample_token'] for x in batched_inputs]
                group_idxs = get_group_idxs(sample_tokens, self.num_images_per_sample)
                instances = [x['instances'] for x in processed_results]
                global_poses = [x['pose'] for x in batched_inputs]
                filtered_instances = nuscenes_sample_aggregate(
                    instances,
                    group_idxs,
                    self.num_classes,
                    global_poses,
                    self.bev_nms_iou_thresh,
                    max_num_dets_per_sample=self.max_num_dets_per_sample
                )
                processed_results = [{"instances": x} for x in filtered_instances]
            else:
                processed_results = [{"instances": x} for x in pred_instances]

            return processed_results
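Since NuscenesDD3D is registered with the mmdet HEADS registry, in this repo it would normally be built from a config dict rather than constructed directly. A hedged sketch of what such a config might look like: the keyword names mirror NuscenesDD3D.__init__ above, but every numeric value and every key inside the nested *_cfg dicts is an illustrative placeholder, not taken from this repository's configs.

from mmdet.models.builder import HEADS

dd3d_head_cfg = dict(
    type='NuscenesDD3D',
    num_classes=10,
    in_channels=256,
    strides=[8, 16, 32, 64, 128],
    fcos2d_cfg=dict(norm='BN', num_cls_convs=4, num_box_convs=4),
    fcos2d_loss_cfg=dict(focal_loss_alpha=0.25, focal_loss_gamma=2.0),
    fcos3d_cfg=dict(norm='BN', num_convs=4,
                    mean_depth_per_level=[32.0, 16.0, 8.0, 4.0, 2.0],
                    std_depth_per_level=[14.0, 7.0, 4.0, 2.0, 1.0]),
    fcos3d_loss_cfg=dict(min_depth=0.1, max_depth=80.0),
    target_assign_cfg=dict(),   # placeholder; the real config supplies the target-assignment options
    nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2),
)

# head = HEADS.build(dd3d_head_cfg)  # requires the plugin to be imported so the class is registered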