OpenDAS / dcnv3 / Commits

Commit 80e8c1d3, authored Apr 21, 2023 by Charlie W

Add openlane v2

Parent: dbf29e61
Changes: 93. Showing 20 changed files with 5144 additions and 0 deletions (+5144 -0).
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu (+174 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h (+31 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh (+1045 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/dcnv3.h (+59 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/vision.cpp (+17 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/test.py (+263 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/__init__.py (+2 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/baseline.py (+216 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/road_bev.py (+288 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/__init__.py (+5 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/custom_detr_head.py (+389 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/lc_deformable_detr_head.py (+550 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/relationship_head.py (+116 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/te_deformable_detr_head.py (+638 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/topology_head.py (+111 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/__init__.py (+6 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/bevformer_constructer.py (+199 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/custom_base_transformer_layer.py (+260 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/decoder.py (+378 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/encoder.py (+397 -0)
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.cu (new file, mode 100644)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "cuda/dcnv3_im2col_cuda.cuh"
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/torch.h>

at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(channels == (group * group_channels),
               "Input channels and group times group channels wont match: (%d vs %d).",
               channels, group * group_channels);

    auto output = at::zeros({batch, height_out, width_out, group * group_channels},
                            input.options());
    const int batch_n = im2col_step_;
    auto output_n = output.view({batch / batch_n, batch_n, height_out, width_out,
                                 group * group_channels});
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size = height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto columns = output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_forward_cuda", ([&] {
                dcnv3_im2col_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() + n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    columns.data<scalar_t>(), kernel_h, kernel_w, stride_h,
                    stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                    group_channels, batch_n, height_in, width_in, height_out,
                    width_out, offset_scale);
            }));
    }

    return output;
}
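
For reference, the tensor shapes this entry point expects follow directly from the per_input_size, per_offset_size, and per_mask_size expressions above: everything is channel-last (NHWC). A minimal sketch of the shape arithmetic in Python, using a hypothetical helper that is not part of this commit:

def dcnv3_shapes(batch, height_in, width_in, kernel_h, kernel_w, stride_h,
                 stride_w, pad_h, pad_w, dilation_h, dilation_w, group,
                 group_channels):
    # Same output-size arithmetic as dcnv3_cuda_forward.
    height_out = (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1
    width_out = (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1
    # Channel-last (NHWC) layouts, matching input.size(1) == height_in above.
    input_shape = (batch, height_in, width_in, group * group_channels)
    offset_shape = (batch, height_out, width_out, group * kernel_h * kernel_w * 2)
    mask_shape = (batch, height_out, width_out, group * kernel_h * kernel_w)
    output_shape = (batch, height_out, width_out, group * group_channels)
    return input_shape, offset_shape, mask_shape, output_shape

# Example: a 3x3 kernel with stride 1 and pad 1 keeps the spatial size (8 -> 8).
print(dcnv3_shapes(2, 8, 8, 3, 3, 1, 1, 1, 1, 1, 1, 4, 16))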
std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step) {
    AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous");
    AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous");
    AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous");
    AT_ASSERTM(grad_output.is_contiguous(),
               "grad_output tensor has to be contiguous");
    AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
    AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
    AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
    AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");

    const int batch = input.size(0);
    const int height_in = input.size(1);
    const int width_in = input.size(2);
    const int channels = input.size(3);
    const int height_out =
        (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
    const int width_out =
        (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
    const int im2col_step_ = std::min(batch, im2col_step);

    AT_ASSERTM(batch % im2col_step_ == 0,
               "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
    AT_ASSERTM(channels == (group * group_channels),
               "Input channels and group times group channels wont match: (%d vs %d).",
               channels, group * group_channels);

    auto dtype = input.dtype();
    if (dtype == at::kHalf) {
        dtype = at::kFloat;
    }

    auto grad_input = at::zeros_like(input, dtype);
    auto grad_offset = at::zeros_like(offset, dtype);
    auto grad_mask = at::zeros_like(mask, dtype);

    const int batch_n = im2col_step_;
    auto per_input_size = height_in * width_in * group * group_channels;
    auto per_offset_size = height_out * width_out * group * kernel_h * kernel_w * 2;
    auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w;
    auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n,
                                           height_out * width_out, group,
                                           group_channels});

    for (int n = 0; n < batch / im2col_step_; ++n) {
        auto grad_output_g = grad_output_n.select(0, n);
        // AT_DISPATCH_FLOATING_TYPES(
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            input.type(), "ms_deform_attn_backward_cuda", ([&] {
                dcnv3_col2im_cuda(
                    at::cuda::getCurrentCUDAStream(),
                    grad_output_g.data<scalar_t>(),
                    input.data<scalar_t>() + n * im2col_step_ * per_input_size,
                    offset.data<scalar_t>() + n * im2col_step_ * per_offset_size,
                    mask.data<scalar_t>() + n * im2col_step_ * per_mask_size,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, batch_n,
                    height_in, width_in, height_out, width_out, offset_scale,
                    grad_input.data<opmath_t>() + n * im2col_step_ * per_input_size,
                    grad_offset.data<opmath_t>() + n * im2col_step_ * per_offset_size,
                    grad_mask.data<opmath_t>() + n * im2col_step_ * per_mask_size);
            }));
    }

    if (input.dtype() == torch::kHalf) {
        return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf),
                grad_mask.to(torch::kHalf)};
    } else {
        return {grad_input, grad_offset, grad_mask};
    }
}
\ No newline at end of file
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_cuda.h (new file, mode 100644)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>

at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset,
                              const at::Tensor &mask, const int kernel_h,
                              const int kernel_w, const int stride_h,
                              const int stride_w, const int pad_h,
                              const int pad_w, const int dilation_h,
                              const int dilation_w, const int group,
                              const int group_channels,
                              const float offset_scale, const int im2col_step);

std::vector<at::Tensor>
dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset,
                    const at::Tensor &mask, const int kernel_h,
                    const int kernel_w, const int stride_h, const int stride_w,
                    const int pad_h, const int pad_w, const int dilation_h,
                    const int dilation_w, const int group,
                    const int group_channels, const float offset_scale,
                    const at::Tensor &grad_output, const int im2col_step);
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh (new file, mode 100644)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <algorithm>
#include <cstdio>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 256;
inline int GET_BLOCKS(const int N, const int num_threads) {
    return (N + num_threads - 1) / num_threads;
}
#define opmath_t at::opmath_type<scalar_t>
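
GET_BLOCKS is plain ceiling division: with the 256-thread default, a launch covering num_kernels = batch_n * height_out * width_out * group * group_channels work items uses ceil(num_kernels / 256) blocks. A quick check of the arithmetic, as a sketch in Python:

def get_blocks(n, num_threads=256):
    # Ceiling division, mirroring the GET_BLOCKS helper above.
    return (n + num_threads - 1) // num_threads

# e.g. 2 * 8 * 8 * 4 * 16 = 8192 work items -> 32 blocks of 256 threads
print(get_blocks(2 * 8 * 8 * 4 * 16))  # 32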
template <typename scalar_t>
__device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data,
                                          const int &height, const int &width,
                                          const int &group,
                                          const int &group_channels,
                                          const opmath_t &h, const opmath_t &w,
                                          const int &g, const int &c) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = group * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = g * group_channels + c;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
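
The sampling above is standard bilinear interpolation with zero padding: any of the four corner taps that falls outside the feature map simply contributes nothing. A NumPy sketch of the same computation for a single (group, channel) plane, for intuition only (NumPy is an assumption here, not a dependency of this code):

import math
import numpy as np

def bilinear_sample(plane, h, w):
    """Bilinear sample of a 2D array at fractional (h, w), zero outside."""
    H, W = plane.shape
    h_low, w_low = math.floor(h), math.floor(w)
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low
    hh, hw = 1 - lh, 1 - lw
    def at(y, x):
        # Out-of-range corners contribute zero, as in the CUDA code.
        return plane[y, x] if 0 <= y < H and 0 <= x < W else 0.0
    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high)
            + lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high))

plane = np.arange(16.0).reshape(4, 4)
print(bilinear_sample(plane, 1.5, 2.5))  # 8.5, the mean of the 4 surrounding cells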
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    *grad_mask = top_grad * val;
    *grad_offset = offset_scale * grad_w_weight * top_grad_im;
    *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im;
}
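
The weight bookkeeping here is the derivative of the bilinear formula above. Writing $l_h = h - \lfloor h \rfloor$ and $l_w = w - \lfloor w \rfloor$, the sampled value and its partials are

$$v = (1-l_h)(1-l_w)\,v_1 + (1-l_h)\,l_w\,v_2 + l_h(1-l_w)\,v_3 + l_h\,l_w\,v_4$$

$$\frac{\partial v}{\partial h} = -(1-l_w)\,v_1 - l_w\,v_2 + (1-l_w)\,v_3 + l_w\,v_4,
\qquad
\frac{\partial v}{\partial w} = -(1-l_h)\,v_1 + (1-l_h)\,v_2 - l_h\,v_3 + l_h\,v_4$$

so grad_h_weight and grad_w_weight accumulate exactly these partials. The stores at the end apply the chain rule through loc = p0 + (k * dilation + offset) * offset_scale, which contributes a factor of offset_scale,

$$\frac{\partial v}{\partial \mathrm{offset}_w} = \mathrm{offset\_scale} \cdot \frac{\partial v}{\partial w},
\qquad
\frac{\partial v}{\partial \mathrm{offset}_h} = \mathrm{offset\_scale} \cdot \frac{\partial v}{\partial h},$$

and multiply by top_grad_im = top_grad * mask, the upstream gradient already weighted by the modulation scalar.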
template <typename scalar_t>
__device__ void dcnv3_col2im_bilinear_gm(
    const scalar_t *&bottom_data, const int &height, const int &width,
    const int &nheads, const int &group_channels, const opmath_t &h,
    const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale,
    const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im,
    opmath_t *grad_offset, opmath_t *grad_mask) {
    const int h_low = floor(h);
    const int w_low = floor(w);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;

    const opmath_t lh = h - h_low;
    const opmath_t lw = w - w_low;
    const opmath_t hh = 1 - lh, hw = 1 - lw;

    const int w_stride = nheads * group_channels;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    const int base_ptr = m * group_channels + c;

    const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const opmath_t top_grad_im = top_grad * mask;
    opmath_t grad_h_weight = 0, grad_w_weight = 0;

    opmath_t v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
        grad_h_weight -= hw * v1;
        grad_w_weight -= hh * v1;
        atomicAdd(grad_im + ptr1, w1 * top_grad_im);
    }
    opmath_t v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
        grad_h_weight -= lw * v2;
        grad_w_weight += hh * v2;
        atomicAdd(grad_im + ptr2, w2 * top_grad_im);
    }
    opmath_t v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
        grad_h_weight += hw * v3;
        grad_w_weight -= lh * v3;
        atomicAdd(grad_im + ptr3, w3 * top_grad_im);
    }
    opmath_t v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
        grad_h_weight += lw * v4;
        grad_w_weight += lh * v4;
        atomicAdd(grad_im + ptr4, w4 * top_grad_im);
    }

    const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    atomicAdd(grad_mask, top_grad * val);
    atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im);
    atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im);
}
template <typename scalar_t>
__global__ void dcnv3_im2col_gpu_kernel(
    const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset,
    const scalar_t *data_mask, scalar_t *data_col, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const int input_size = height_in * width_in;
        scalar_t *data_col_ptr = data_col + index;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int qid_stride = group * group_channels;
        opmath_t col = 0;
        const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride;
        // top-left
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    col += dcnv3_im2col_bilinear(data_im_ptr, height_in,
                                                 width_in, group,
                                                 group_channels, loc_h, loc_w,
                                                 g_col, c_col) *
                           weight;
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
            }
        }
        *data_col_ptr = col;
    }
}
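
Each thread of this kernel owns one output scalar; the flat index is peeled apart in (channel, group, width, height, batch) order by the repeated modulo/divide chain at the top. A Python sketch of the same decomposition, using a hypothetical helper with the kernel's layout:

def unflatten_index(index, group_channels, group, width_out, height_out):
    # Mirrors the modulo/divide chain at the top of dcnv3_im2col_gpu_kernel.
    c_col = index % group_channels; index //= group_channels
    g_col = index % group;          index //= group
    w_out = index % width_out;      index //= width_out
    h_out = index % height_out;     index //= height_out
    b_col = index
    return b_col, h_out, w_out, g_col, c_col

# Work item 5000 in a (batch=2, H_out=8, W_out=8, group=4, channels=16) launch
print(unflatten_index(5000, 16, 4, 8, 8))  # (1, 1, 6, 0, 8) as (b, h, w, g, c)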
// debug
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockSize; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t, unsigned int blockSize>
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        __shared__ opmath_t cache_grad_offset[blockSize * 2];
        __shared__ opmath_t cache_grad_mask[blockSize];
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
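
The v2 reduction above is the classic power-of-two shared-memory tree: at each step, the active half of the threads folds the upper half of the cache into the lower half, so a block of size B finishes in log2(B) rounds instead of the B-1 serial additions of v1. The same pattern in a few lines of Python, as an illustration only:

def tree_reduce(vals):
    # vals must have power-of-two length, like the blockSize template argument.
    s = len(vals) // 2
    while s > 0:
        for tid in range(s):           # threads 0..s-1 are active this round
            vals[tid] += vals[tid + s]
        s >>= 1                        # halve the stride each round
    return vals[0]

print(tree_reduce([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))  # 36.0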
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();
                if (tid == 0) {
                    opmath_t _grad_w = cache_grad_offset[0],
                             _grad_h = cache_grad_offset[1],
                             _grad_a = cache_grad_mask[0];
                    int sid = 2;
                    for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
                        _grad_w += cache_grad_offset[sid];
                        _grad_h += cache_grad_offset[sid + 1];
                        _grad_a += cache_grad_mask[tid];
                        sid += 2;
                    }

                    *grad_offset = _grad_w;
                    *(grad_offset + 1) = _grad_h;
                    *grad_mask = _grad_a;
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
                     s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    *grad_offset = cache_grad_offset[0];
                    *(grad_offset + 1) = cache_grad_offset[1];
                    *grad_mask = cache_grad_mask[0];
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        extern __shared__ int _s[];
        opmath_t *cache_grad_offset = (opmath_t *)_s;
        opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x;
        unsigned int tid = threadIdx.x;
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                *(cache_grad_offset + (threadIdx.x << 1)) = 0;
                *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0;
                *(cache_grad_mask + threadIdx.x) = 0;
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr,
                        cache_grad_offset + (threadIdx.x << 1),
                        cache_grad_mask + threadIdx.x);
                }

                __syncthreads();

                for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
                     s >>= 1, spre >>= 1) {
                    if (tid < s) {
                        const unsigned int xid1 = tid << 1;
                        const unsigned int xid2 = (tid + s) << 1;
                        cache_grad_mask[tid] += cache_grad_mask[tid + s];
                        cache_grad_offset[xid1] += cache_grad_offset[xid2];
                        cache_grad_offset[xid1 + 1] +=
                            cache_grad_offset[xid2 + 1];
                        if (tid + (s << 1) < spre) {
                            cache_grad_mask[tid] +=
                                cache_grad_mask[tid + (s << 1)];
                            cache_grad_offset[xid1] +=
                                cache_grad_offset[xid2 + (s << 1)];
                            cache_grad_offset[xid1 + 1] +=
                                cache_grad_offset[xid2 + 1 + (s << 1)];
                        }
                    }
                    __syncthreads();
                }

                if (tid == 0) {
                    atomicAdd(grad_offset, cache_grad_offset[0]);
                    atomicAdd(grad_offset + 1, cache_grad_offset[1]);
                    atomicAdd(grad_mask, cache_grad_mask[0]);
                }
                __syncthreads();

                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
__global__ void dcnv3_col2im_gpu_kernel_gm(
    const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int height_in, const int width_in,
    const int height_out, const int width_out, const opmath_t offset_scale,
    opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) {
    CUDA_KERNEL_LOOP(index, num_kernels) {
        int _temp = index;
        const int c_col = _temp % group_channels;
        _temp /= group_channels;
        const int sampling_index = _temp;
        const int g_col = _temp % group;
        _temp /= group;
        const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w +
                         (_temp % width_out) * stride_w;
        _temp /= width_out;
        const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h +
                         (_temp % height_out) * stride_h;
        _temp /= height_out;
        const int b_col = _temp;

        const opmath_t top_grad = grad_col[index];
        const int input_size = height_in * width_in;
        const int kernel_size = kernel_h * kernel_w;
        int data_weight_ptr = sampling_index * kernel_size;
        int data_loc_w_ptr = data_weight_ptr << 1;
        const int grad_sampling_ptr = data_weight_ptr;
        grad_offset += grad_sampling_ptr << 1;
        grad_mask += grad_sampling_ptr;
        const int qid_stride = group * group_channels;
        const int im_ptr_offset = b_col * input_size * qid_stride;
        const scalar_t *data_im_ptr = data_im + im_ptr_offset;
        opmath_t *grad_im_ptr = grad_im + im_ptr_offset;
        const opmath_t p0_w_ =
            p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale;
        const opmath_t p0_h_ =
            p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale;
        for (int i = 0; i < kernel_w; ++i) {
            for (int j = 0; j < kernel_h; ++j) {
                const opmath_t offset_w = data_offset[data_loc_w_ptr];
                const opmath_t offset_h = data_offset[data_loc_w_ptr + 1];
                const opmath_t loc_w =
                    p0_w_ + (i * dilation_w + offset_w) * offset_scale;
                const opmath_t loc_h =
                    p0_h_ + (j * dilation_h + offset_h) * offset_scale;
                const opmath_t weight = data_mask[data_weight_ptr];
                if (loc_h > -1 && loc_w > -1 && loc_h < height_in &&
                    loc_w < width_in) {
                    dcnv3_col2im_bilinear_gm(
                        data_im_ptr, height_in, width_in, group, group_channels,
                        loc_h, loc_w, g_col, c_col, offset_scale, top_grad,
                        weight, grad_im_ptr, grad_offset, grad_mask);
                }
                data_weight_ptr += 1;
                data_loc_w_ptr += 2;
                grad_mask += 1;
                grad_offset += 2;
            }
        }
    }
}
template <typename scalar_t>
void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im,
                       const scalar_t *data_offset, const scalar_t *data_mask,
                       scalar_t *data_col, const int kernel_h,
                       const int kernel_w, const int stride_h,
                       const int stride_w, const int pad_h, const int pad_w,
                       const int dilation_h, const int dilation_w,
                       const int group, const int group_channels,
                       const int batch_n, const int height_in,
                       const int width_in, const int height_out,
                       const int width_out, const opmath_t offset_scale) {
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_threads = CUDA_NUM_THREADS;
    dcnv3_im2col_gpu_kernel<scalar_t>
        <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
           stream>>>(num_kernels, data_im, data_offset, data_mask, data_col,
                     kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                     dilation_h, dilation_w, group, group_channels, height_in,
                     width_in, height_out, width_out, offset_scale);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err));
    }
}
template <typename scalar_t>
void dcnv3_col2im_cuda(
    cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im,
    const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_h,
    const int pad_w, const int dilation_h, const int dilation_w, const int group,
    const int group_channels, const int batch_n, const int height_in,
    const int width_in, const int height_out, const int width_out,
    const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset,
    opmath_t *grad_mask) {
    const int num_threads =
        (group_channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : group_channels;
    const int num_kernels =
        batch_n * height_out * width_out * group * group_channels;
    const int num_actual_kernels =
        batch_n * height_out * width_out * group * group_channels;
    if (group_channels > 1024) {
        if ((group_channels & 1023) == 0) {
            dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                   num_threads * 3 * sizeof(opmath_t), stream>>>(
                    num_kernels, grad_col, data_im, data_offset, data_mask,
                    kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                    dilation_h, dilation_w, group, group_channels, height_in,
                    width_in, height_out, width_out, offset_scale, grad_im,
                    grad_offset, grad_mask);
        } else {
            dcnv3_col2im_gpu_kernel_gm<scalar_t>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
        }
    } else {
        switch (group_channels) {
        case 1:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 2:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 4:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 8:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 16:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 32:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 64:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 128:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 256:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 512:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        case 1024:
            dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
                <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
                   stream>>>(num_kernels, grad_col, data_im, data_offset,
                             data_mask, kernel_h, kernel_w, stride_h, stride_w,
                             pad_h, pad_w, dilation_h, dilation_w, group,
                             group_channels, height_in, width_in, height_out,
                             width_out, offset_scale, grad_im, grad_offset,
                             grad_mask);
            break;
        default:
            if (group_channels < 64) {
                dcnv3_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                       num_threads * 3 * sizeof(opmath_t), stream>>>(
                        num_kernels, grad_col, data_im, data_offset, data_mask,
                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                        dilation_h, dilation_w, group, group_channels,
                        height_in, width_in, height_out, width_out,
                        offset_scale, grad_im, grad_offset, grad_mask);
            } else {
                dcnv3_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
                    <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                       num_threads * 3 * sizeof(opmath_t), stream>>>(
                        num_kernels, grad_col, data_im, data_offset, data_mask,
                        kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w,
                        dilation_h, dilation_w, group, group_channels,
                        height_in, width_in, height_out, width_out,
                        offset_scale, grad_im, grad_offset, grad_mask);
            }
        }
    }

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err));
    }
}
\ No newline at end of file
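
The launcher picks a backward kernel from group_channels: power-of-two sizes up to 32 take the serial-reduction v1 template, 64 through 1024 take the tree-reduction v2 template, multiples of 1024 above that take the multi-block atomic variant, and everything else falls back to the dynamic-shared-memory kernels. A sketch of that decision table in Python (the string names mirror the templates above; the helper itself is hypothetical):

def pick_col2im_kernel(group_channels):
    # Mirrors the branching in dcnv3_col2im_cuda.
    if group_channels > 1024:
        if group_channels % 1024 == 0:   # (group_channels & 1023) == 0
            return "shm_reduce_v2_multi_blocks"
        return "gm"                      # pure global-memory atomics
    if group_channels in (1, 2, 4, 8, 16, 32):
        return "shm_blocksize_aware_reduce_v1"
    if group_channels in (64, 128, 256, 512, 1024):
        return "shm_blocksize_aware_reduce_v2"
    return "shm_reduce_v1" if group_channels < 64 else "shm_reduce_v2"

print(pick_col2im_kernel(16), pick_col2im_kernel(384))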
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/dcnv3.h (new file, mode 100644)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/dcnv3_cpu.h"
#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif

at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
                         const at::Tensor &mask, const int kernel_h,
                         const int kernel_w, const int stride_h,
                         const int stride_w, const int pad_h, const int pad_w,
                         const int dilation_h, const int dilation_w,
                         const int group, const int group_channels,
                         const float offset_scale, const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
                                  stride_h, stride_w, pad_h, pad_w, dilation_h,
                                  dilation_w, group, group_channels,
                                  offset_scale, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}

std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
               const at::Tensor &mask, const int kernel_h, const int kernel_w,
               const int stride_h, const int stride_w, const int pad_h,
               const int pad_w, const int dilation_h, const int dilation_w,
               const int group, const int group_channels,
               const float offset_scale, const at::Tensor &grad_output,
               const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w, dilation_h,
                                   dilation_w, group, group_channels,
                                   offset_scale, grad_output, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/src/vision.cpp (new file, mode 100644)
/*!
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "dcnv3.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward");
    m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward");
}
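
Once built as a torch extension, the two bindings are callable from Python. The module name comes from the build script, which is not part of this diff, so the import name below is an assumption; in practice the repo wraps these calls in DCNv3Function (see test.py next). A rough sketch of a direct call:

import torch
import DCNv3  # assumed extension name; actually set by the (not shown) setup script

N, H, W, group, group_channels = 2, 8, 8, 4, 16
Kh = Kw = 3
x = torch.rand(N, H, W, group * group_channels).cuda()     # NHWC input
offset = torch.rand(N, H, W, group * Kh * Kw * 2).cuda()   # 3x3/stride 1/pad 1 keeps H, W
mask = torch.rand(N, H, W, group * Kh * Kw).cuda()
# Argument order matches dcnv3_forward: kernel, stride, pad, dilation pairs,
# then group, group_channels, offset_scale, im2col_step.
out = DCNv3.dcnv3_forward(x, offset, mask, Kh, Kw, 1, 1, 1, 1, 1, 1,
                          group, group_channels, 1.0, 2)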
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/backbones/ops_dcnv3/test.py (new file, mode 100644)
# --------------------------------------------------------
# InternImage
# Copyright (c) 2022 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
__future__
import
absolute_import
from
__future__
import
print_function
from
__future__
import
division
import
time
import
torch
import
torch.nn
as
nn
import
math
from
torch.autograd
import
gradcheck
from
functions.dcnv3_func
import
DCNv3Function
,
dcnv3_core_pytorch
H_in
,
W_in
=
8
,
8
N
,
M
,
D
=
2
,
4
,
16
Kh
,
Kw
=
3
,
3
P
=
Kh
*
Kw
offset_scale
=
2.0
pad
=
1
dilation
=
1
stride
=
1
H_out
=
(
H_in
+
2
*
pad
-
(
dilation
*
(
Kh
-
1
)
+
1
))
//
stride
+
1
W_out
=
(
W_in
+
2
*
pad
-
(
dilation
*
(
Kw
-
1
)
+
1
))
//
stride
+
1
torch
.
manual_seed
(
3
)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
    mask /= mask.sum(-1, keepdim=True)
    mask = mask.reshape(N, H_out, W_out, M*P)

    output_pytorch = dcnv3_core_pytorch(
        input.double(), offset.double(), mask.double(),
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale).detach().cpu()

    im2col_step = 2
    output_cuda = DCNv3Function.apply(
        input.double(), offset.double(), mask.double(),
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch)
    max_abs_err = (output_cuda - output_pytorch).abs().max()
    max_rel_err = ((output_cuda - output_pytorch).abs() /
                   output_pytorch.abs()).max()
    print('>>> forward double')
    print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
    mask /= mask.sum(-1, keepdim=True)
    mask = mask.reshape(N, H_out, W_out, M*P)

    output_pytorch = dcnv3_core_pytorch(
        input, offset, mask,
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale).detach().cpu()

    im2col_step = 2
    output_cuda = DCNv3Function.apply(
        input, offset, mask,
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu()

    fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
    max_abs_err = (output_cuda - output_pytorch).abs().max()
    max_rel_err = ((output_cuda - output_pytorch).abs() /
                   output_pytorch.abs()).max()
    print('>>> forward float')
    print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_double(channels=4,
                                             grad_input=True,
                                             grad_offset=True,
                                             grad_mask=True):
    # H_in, W_in = 4, 4
    N = 2
    M = 2
    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
    D = channels
    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
    mask0 /= mask0.sum(-1, keepdim=True)
    mask0 = mask0.reshape(N, H_out, W_out, M*P)
    input0.requires_grad = grad_input
    offset0.requires_grad = grad_offset
    mask0.requires_grad = grad_mask

    output_pytorch = dcnv3_core_pytorch(
        input0.double(), offset0.double(), mask0.double(),
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale)
    output_pytorch.sum().backward()

    input1 = input0.detach()
    offset1 = offset0.detach()
    mask1 = mask0.detach()
    input1.requires_grad = grad_input
    offset1.requires_grad = grad_offset
    mask1.requires_grad = grad_mask

    im2col_step = 2
    output_cuda = DCNv3Function.apply(
        input1.double(), offset1.double(), mask1.double(),
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale, im2col_step)
    output_cuda.sum().backward()

    print(f'>>> backward double: channels {D}')
    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (input0.grad - input1.grad).abs().max()
    max_rel_err = ((input0.grad - input1.grad).abs() /
                   input0.grad.abs()).max()
    print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')

    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (offset0.grad - offset1.grad).abs().max()
    max_rel_err = ((offset0.grad - offset1.grad).abs() /
                   offset0.grad.abs()).max()
    print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')

    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (mask0.grad - mask1.grad).abs().max()
    max_rel_err = ((mask0.grad - mask1.grad).abs() /
                   mask0.grad.abs()).max()
    print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_backward_equal_with_pytorch_float(channels=4,
                                            grad_input=True,
                                            grad_offset=True,
                                            grad_mask=True):
    # H_in, W_in = 4, 4
    N = 2
    M = 2
    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
    D = channels
    input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
    offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
    mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
    mask0 /= mask0.sum(-1, keepdim=True)
    mask0 = mask0.reshape(N, H_out, W_out, M*P)
    input0.requires_grad = grad_input
    offset0.requires_grad = grad_offset
    mask0.requires_grad = grad_mask

    output_pytorch = dcnv3_core_pytorch(
        input0, offset0, mask0,
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale)
    output_pytorch.sum().backward()

    input1 = input0.detach()
    offset1 = offset0.detach()
    mask1 = mask0.detach()
    input1.requires_grad = grad_input
    offset1.requires_grad = grad_offset
    mask1.requires_grad = grad_mask

    im2col_step = 2
    output_cuda = DCNv3Function.apply(
        input1, offset1, mask1,
        Kh, Kw, stride, stride, Kh // 2, Kw // 2,
        dilation, dilation, M, D, offset_scale, im2col_step)
    output_cuda.sum().backward()

    print(f'>>> backward float: channels {D}')
    bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (input0.grad - input1.grad).abs().max()
    max_rel_err = ((input0.grad - input1.grad).abs() /
                   input0.grad.abs()).max()
    print(f'* {bwdok} input_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')

    bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (offset0.grad - offset1.grad).abs().max()
    max_rel_err = ((offset0.grad - offset1.grad).abs() /
                   offset0.grad.abs()).max()
    print(f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')

    bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3)
    max_abs_err = (mask0.grad - mask1.grad).abs().max()
    max_rel_err = ((mask0.grad - mask1.grad).abs() /
                   mask0.grad.abs()).max()
    print(f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_time_cost(im2col_step=128):
    N = 512
    H_in, W_in = 64, 64
    H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1
    W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1
    input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01
    offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10
    mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5
    mask /= mask.sum(-1, keepdim=True)
    mask = mask.reshape(N, H_out, W_out, M*P)
    print(f'>>> time cost: im2col_step {im2col_step}; input {input.shape}; points {P}')
    repeat = 100
    for i in range(repeat):
        output_cuda = DCNv3Function.apply(
            input, offset, mask,
            Kh, Kw, stride, stride, Kh // 2, Kw // 2,
            dilation, dilation, M, D, 1.0, im2col_step)
    torch.cuda.synchronize()
    start = time.time()
    for i in range(repeat):
        output_cuda = DCNv3Function.apply(
            input, offset, mask,
            Kh, Kw, stride, stride, Kh // 2, Kw // 2,
            dilation, dilation, M, D, 1.0, im2col_step)
    torch.cuda.synchronize()
    print(f'forward time cost: {(time.time() - start) / repeat}')
if __name__ == '__main__':
    check_forward_equal_with_pytorch_double()
    check_forward_equal_with_pytorch_float()
    for channels in [1, 16, 30, 32, 64, 71, 1025]:
        check_backward_equal_with_pytorch_double(channels, True, True, True)
    for channels in [1, 16, 30, 32, 64, 71, 1025]:
        check_backward_equal_with_pytorch_float(channels, True, True, True)
    for i in range(3):
        im2col_step = 128 * (2 ** i)
        check_time_cost(im2col_step)
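The script imports gradcheck but never calls it. For completeness, a minimal numerical gradient check in double precision could look like the sketch below; it is not part of the original file, the tiny shapes are chosen only to keep the finite-difference check fast, and mask normalization is skipped so all three inputs stay leaf tensors:

# Hypothetical gradcheck sketch for DCNv3Function (not in the original script).
def check_gradcheck(channels=2):
    n, m, d = 1, 2, channels
    h = w = 3  # 3x3 kernel, pad 1, stride 1 keeps output at h x w
    inp = (torch.rand(n, h, w, m * d, dtype=torch.float64,
                      device='cuda') * 0.01).requires_grad_(True)
    off = torch.rand(n, h, w, m * P * 2, dtype=torch.float64,
                     device='cuda').requires_grad_(True)
    msk = torch.rand(n, h, w, m * P, dtype=torch.float64,
                     device='cuda').requires_grad_(True)
    ok = gradcheck(DCNv3Function.apply,
                   (inp, off, msk, Kh, Kw, stride, stride, Kh // 2, Kw // 2,
                    dilation, dilation, m, d, offset_scale, 1),
                   eps=1e-6, atol=1e-4)
    print(f'* {ok} gradcheck')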
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/__init__.py
0 → 100644
from .baseline import Baseline
from .road_bev import ROAD_BEVFormer
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/baseline.py
0 → 100644
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# baseline.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import torch

from mmdet3d.models import DETECTORS, build_neck, build_head
from mmdet3d.models.detectors import MVXTwoStageDetector
@DETECTORS.register_module()
class Baseline(MVXTwoStageDetector):

    def __init__(self,
                 img_backbone=None,
                 img_neck=None,
                 img_view_transformer=None,
                 lc_head=None,
                 te_head=None,
                 lclc_head=None,
                 lcte_head=None,
                 **kwargs):
        super().__init__(img_backbone=img_backbone, img_neck=img_neck, **kwargs)
        self.img_view_transformer = build_neck(img_view_transformer)
        self.lc_head = build_head(lc_head)
        self.te_head = build_head(te_head)
        self.lclc_head = build_head(lclc_head)
        self.lcte_head = build_head(lcte_head)
    def simple_forward(self, img, img_metas):

        # extract image features
        B, N, C, imH, imW = img.shape
        img = img.view(B * N, C, imH, imW)
        x = self.img_backbone(img)
        if self.with_img_neck:
            x = self.img_neck(x)
            if type(x) in [list, tuple]:
                x = x[0]

        # view transformation
        bev_feat = self.img_view_transformer(
            x,
            torch.cat([
                torch.cat([
                    torch.tensor(l, device=x.device, dtype=torch.float32).unsqueeze(0)
                    for l in img_metas[b]['lidar2img']
                ], dim=0).unsqueeze(0)
                for b in range(B)
            ], dim=0),
            (img_metas[0]['img_shape'][0][0], img_metas[0]['img_shape'][0][1]),
        )

        _, output_dim, output_H, output_W = x.shape
        pv_feat = x.view(B, N, output_dim, output_H, output_W)[:, 0, ...]

        # lc
        lc_img_metas = [{
            'batch_input_shape': (bev_feat.shape[-2], bev_feat.shape[-1]),
            'img_shape': (bev_feat.shape[-2], bev_feat.shape[-1], None),
            'scale_factor': None,  # dummy
        } for _ in range(B)]
        all_lc_cls_scores_list, all_lc_preds_list, lc_outs_dec_list = self.lc_head(
            [bev_feat],
            lc_img_metas,
        )

        # te
        te_img_metas = [{
            'batch_input_shape': (img_metas[b]['pad_shape'][0][0], img_metas[b]['pad_shape'][0][1]),
            'img_shape': (img_metas[b]['img_shape'][0][0], img_metas[b]['img_shape'][0][1], None),
            'scale_factor': img_metas[b]['scale_factor'],
        } for b in range(B)]
        all_te_cls_scores_list, all_te_preds_list, te_outs_dec_list = self.te_head(
            [pv_feat],
            te_img_metas,
        )

        # topology_lclc
        all_lclc_preds_list = self.lclc_head(
            lc_outs_dec_list,
            lc_outs_dec_list,
        )

        # topology_lcte
        all_lcte_preds_list = self.lcte_head(
            lc_outs_dec_list,
            te_outs_dec_list,
        )

        return {
            'all_lc_cls_scores_list': all_lc_cls_scores_list,
            'all_lc_preds_list': all_lc_preds_list,
            'lc_img_metas': lc_img_metas,
            'all_te_cls_scores_list': all_te_cls_scores_list,
            'all_te_preds_list': all_te_preds_list,
            'te_img_metas': te_img_metas,
            'all_lclc_preds_list': all_lclc_preds_list,
            'all_lcte_preds_list': all_lcte_preds_list,
        }
    def forward_train(self,
                      img,
                      img_metas,
                      gt_lc=None,
                      gt_lc_labels=None,
                      gt_te=None,
                      gt_te_labels=None,
                      gt_topology_lclc=None,
                      gt_topology_lcte=None,
                      **kwargs):

        outs = self.simple_forward(img, img_metas)

        losses = dict()

        # lc
        lc_loss_dict, lc_assign_results = self.lc_head.loss(
            outs['all_lc_cls_scores_list'],
            outs['all_lc_preds_list'],
            gt_lc,
            gt_lc_labels,
            outs['lc_img_metas'],
        )
        losses.update({f'lc_{key}': val for key, val in lc_loss_dict.items()})

        # te
        te_loss_dict, te_assign_results = self.te_head.loss(
            outs['all_te_cls_scores_list'],
            outs['all_te_preds_list'],
            gt_te,
            gt_te_labels,
            outs['te_img_metas'],
        )
        losses.update({f'te_{key}': val for key, val in te_loss_dict.items()})

        # topology_lclc
        topology_lclc_loss_dict = self.lclc_head.loss(
            outs['all_lclc_preds_list'],
            lc_assign_results,
            lc_assign_results,
            gt_topology_lclc,
        )
        losses.update({f'topology_lclc_{key}': val for key, val in topology_lclc_loss_dict.items()})

        # topology_lcte
        topology_lcte_loss_dict = self.lcte_head.loss(
            outs['all_lcte_preds_list'],
            lc_assign_results,
            te_assign_results,
            gt_topology_lcte)
        losses.update({f'topology_lcte_{key}': val for key, val in topology_lcte_loss_dict.items()})

        return losses
    def forward_test(self, img, img_metas, **kwargs):

        outs = self.simple_forward(img, img_metas)

        pred_lc = self.lc_head.get_bboxes(
            outs['all_lc_cls_scores_list'],
            outs['all_lc_preds_list'],
            outs['lc_img_metas'],
        )
        pred_te = self.te_head.get_bboxes(
            outs['all_te_cls_scores_list'],
            outs['all_te_preds_list'],
            outs['te_img_metas'],
            rescale=True,
        )
        pred_topology_lclc = self.lclc_head.get_topology(outs['all_lclc_preds_list'])
        pred_topology_lcte = self.lcte_head.get_topology(outs['all_lcte_preds_list'])

        assert len(pred_lc) == len(pred_te) == 1, \
            'evaluation implemented for bs=1'
        return [{
            'pred_lc': pred_lc[0],
            'pred_te': pred_te[0],
            'pred_topology_lclc': pred_topology_lclc[0],
            'pred_topology_lcte': pred_topology_lcte[0],
        }]
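Because Baseline is registered in DETECTORS, it is normally instantiated from a config file rather than constructed directly. A minimal usage sketch, assuming mmdet3d's standard builder and a config that defines a model dict with type='Baseline' (the config path below is a hypothetical placeholder, not a file from this commit):

# Hypothetical usage sketch; 'configs/baseline.py' is a placeholder path.
from mmcv import Config
from mmdet3d.models import build_model

cfg = Config.fromfile('configs/baseline.py')
model = build_model(cfg.model, train_cfg=None, test_cfg=None)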
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/detectors/road_bev.py
0 → 100644
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Tianyu Li
# ---------------------------------------------
import time
import copy

import numpy as np
import torch
from mmcv.runner import force_fp32, auto_fp16
from mmdet.core import bbox2result
from mmdet.models import DETECTORS
from mmdet.models.builder import build_head
from mmdet3d.models.builder import build_neck
from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector
@DETECTORS.register_module()
class ROAD_BEVFormer(MVXTwoStageDetector):

    def __init__(self,
                 pts_voxel_layer=None,
                 pts_voxel_encoder=None,
                 pts_middle_encoder=None,
                 pts_fusion_layer=None,
                 img_backbone=None,
                 pts_backbone=None,
                 img_neck=None,
                 pts_neck=None,
                 pts_bbox_head=None,
                 img_roi_head=None,
                 img_rpn_head=None,
                 bev_constructor=None,
                 bbox_head=None,
                 bbox_train_cfg=None,
                 lclc_head=None,
                 lcte_head=None,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 video_test_mode=False):

        super(ROAD_BEVFormer, self).__init__(
            pts_voxel_layer, pts_voxel_encoder, pts_middle_encoder,
            pts_fusion_layer, img_backbone, pts_backbone, img_neck, pts_neck,
            pts_bbox_head, img_roi_head, img_rpn_head, train_cfg, test_cfg,
            pretrained)

        if bev_constructor is not None:
            self.bev_constructor = build_neck(bev_constructor)

        if bbox_head is not None:
            bbox_head.update(train_cfg=bbox_train_cfg)
            self.bbox_head = build_head(bbox_head)
        else:
            self.bbox_head = None

        if lclc_head is not None:
            self.lclc_head = build_head(lclc_head)
        else:
            self.lclc_head = None

        if lcte_head is not None:
            self.lcte_head = build_head(lcte_head)
        else:
            self.lcte_head = None

        self.fp16_enabled = False

        # temporal
        self.video_test_mode = video_test_mode
        self.prev_frame_info = {
            'prev_bev': None,
            'scene_token': None,
            'prev_pos': 0,
            'prev_angle': 0,
        }
    def extract_img_feat(self, img, img_metas, len_queue=None):
        """Extract features of images."""
        B = img.size(0)
        if img is not None:
            # input_shape = img.shape[-2:]
            # # update real input shape of each single img
            # for img_meta in img_metas:
            #     img_meta.update(input_shape=input_shape)
            if img.dim() == 5 and img.size(0) == 1:
                img.squeeze_()
            elif img.dim() == 5 and img.size(0) > 1:
                B, N, C, H, W = img.size()
                img = img.reshape(B * N, C, H, W)
            img_feats = self.img_backbone(img)
            if isinstance(img_feats, dict):
                img_feats = list(img_feats.values())
        else:
            return None
        if self.with_img_neck:
            img_feats = self.img_neck(img_feats)

        img_feats_reshaped = []
        for img_feat in img_feats:
            BN, C, H, W = img_feat.size()
            if len_queue is not None:
                img_feats_reshaped.append(
                    img_feat.view(int(B / len_queue), len_queue, int(BN / B), C, H, W))
            else:
                img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W))
        return img_feats_reshaped

    @auto_fp16(apply_to=('img'))
    def extract_feat(self, img, img_metas=None, len_queue=None):
        """Extract features from images and points."""
        img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue)
        return img_feats
    def forward_dummy(self, img):
        dummy_metas = None
        return self.forward_test(img=img, img_metas=[[dummy_metas]])

    def forward(self, return_loss=True, **kwargs):
        """Calls either forward_train or forward_test depending on whether
        return_loss=True.
        Note this setting will change the expected inputs. When
        `return_loss=True`, img and img_metas are single-nested (i.e.
        torch.Tensor and list[dict]), and when `return_loss=False`, img and
        img_metas should be double nested (i.e. list[torch.Tensor],
        list[list[dict]]), with the outer list indicating test time
        augmentations.
        """
        if return_loss:
            return self.forward_train(**kwargs)
        else:
            return self.forward_test(**kwargs)
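    # The two call conventions described in the docstring above, sketched with
    # placeholder names (imgs/metas are illustrative, not defined in this file):
    #   model(return_loss=True, img=imgs, img_metas=metas)        # Tensor, list[dict]
    #   model(return_loss=False, img=[imgs], img_metas=[metas])   # list[Tensor], list[list[dict]]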
    def obtain_history_bev(self, imgs_queue, img_metas_list):
        """Obtain history BEV features iteratively. To save GPU memory,
        gradients are not calculated.
        """
        self.eval()
        with torch.no_grad():
            prev_bev = None
            bs, len_queue, num_cams, C, H, W = imgs_queue.shape
            imgs_queue = imgs_queue.reshape(bs * len_queue, num_cams, C, H, W)
            img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue)
            for i in range(len_queue):
                img_metas = [each[i] for each in img_metas_list]
                # img_feats = self.extract_feat(img=img, img_metas=img_metas)
                img_feats = [each_scale[:, i] for each_scale in img_feats_list]
                prev_bev = self.bev_constructor(img_feats, img_metas, prev_bev)
            self.train()
            return prev_bev
    @auto_fp16(apply_to=('img', 'points'))
    def forward_train(self,
                      img=None,
                      img_metas=None,
                      gt_te=None,
                      gt_te_labels=None,
                      gt_lc=None,
                      gt_lc_labels=None,
                      gt_topology_lclc=None,
                      gt_topology_lcte=None,
                      gt_bboxes_ignore=None,
                      ):

        prev_bev = None
        img_feats = self.extract_feat(img=img, img_metas=img_metas)
        bev_feats = self.bev_constructor(img_feats, img_metas, prev_bev)

        losses = dict()
        outs = self.pts_bbox_head(img_feats, bev_feats, img_metas)
        loss_inputs = [outs, gt_lc, gt_lc_labels]
        lane_losses, lane_assign_result = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas)
        for loss in lane_losses:
            losses['lane_head.' + loss] = lane_losses[loss]
        lane_feats = outs['history_states']

        if self.lclc_head is not None:
            lclc_losses = self.lclc_head.forward_train(lane_feats, lane_assign_result,
                                                       lane_feats, lane_assign_result,
                                                       gt_topology_lclc)
            for loss in lclc_losses:
                losses['lclc_head.' + loss] = lclc_losses[loss]

        if self.bbox_head is not None:
            front_view_img_feats = [lvl[:, 0] for lvl in img_feats]
            batch_input_shape = tuple(img[0, 0].size()[-2:])
            bbox_img_metas = []
            for img_meta in img_metas:
                bbox_img_metas.append(
                    dict(
                        batch_input_shape=batch_input_shape,
                        img_shape=img_meta['img_shape'][0],
                        scale_factor=img_meta['scale_factor'][0]))
                img_meta['batch_input_shape'] = batch_input_shape

            te_losses = {}
            bbox_outs = self.bbox_head(front_view_img_feats, bbox_img_metas)
            bbox_losses, te_assign_result = self.bbox_head.loss(bbox_outs, gt_te, gt_te_labels,
                                                                bbox_img_metas, gt_bboxes_ignore)
            for loss in bbox_losses:
                te_losses['bbox_head.' + loss] = bbox_losses[loss]

            if self.lcte_head is not None:
                te_feats = bbox_outs['history_states']
                lcte_losses = self.lcte_head.forward_train(lane_feats, lane_assign_result,
                                                           te_feats, te_assign_result,
                                                           gt_topology_lcte)
                for loss in lcte_losses:
                    te_losses['lcte_head.' + loss] = lcte_losses[loss]

            num_gt_bboxes = sum([len(gt) for gt in gt_te_labels])
            if num_gt_bboxes == 0:
                for loss in te_losses:
                    te_losses[loss] *= 0

            losses.update(te_losses)

        return losses
    def forward_test(self, img_metas, img=None, **kwargs):
        for var, name in [(img_metas, 'img_metas')]:
            if not isinstance(var, list):
                raise TypeError('{} must be a list, but got {}'.format(name, type(var)))
        img = [img] if img is None else img

        new_prev_bev, results_list = self.simple_test(
            img_metas, img, prev_bev=None, **kwargs)
        return results_list
    def simple_test_pts(self, x, img_metas, img=None, prev_bev=None, rescale=False):
        """Test function"""
        batchsize = len(img_metas)

        bev_feats = self.bev_constructor(x, img_metas, prev_bev)
        outs = self.pts_bbox_head(x, bev_feats, img_metas)

        lane_results = self.pts_bbox_head.get_lanes(outs, img_metas, rescale=rescale)

        if self.lclc_head is not None:
            lane_feats = outs['history_states']
            lclc_results = self.lclc_head.get_relationship(lane_feats, lane_feats)
            lclc_results = [result.detach().cpu().numpy() for result in lclc_results]
        else:
            lclc_results = [None for _ in range(batchsize)]

        if self.bbox_head is not None:
            front_view_img_feats = [lvl[:, 0] for lvl in x]
            batch_input_shape = tuple(img[0, 0].size()[-2:])
            bbox_img_metas = []
            for img_meta in img_metas:
                bbox_img_metas.append(
                    dict(
                        batch_input_shape=batch_input_shape,
                        img_shape=img_meta['img_shape'][0],
                        scale_factor=img_meta['scale_factor'][0]))
                img_meta['batch_input_shape'] = batch_input_shape
            bbox_outs = self.bbox_head(front_view_img_feats, bbox_img_metas)
            bbox_results = self.bbox_head.get_bboxes(bbox_outs, bbox_img_metas, rescale=rescale)
        else:
            bbox_results = [None for _ in range(batchsize)]

        if self.bbox_head is not None and self.lcte_head is not None:
            te_feats = bbox_outs['history_states']
            lcte_results = self.lcte_head.get_relationship(lane_feats, te_feats)
            lcte_results = [result.detach().cpu().numpy() for result in lcte_results]
        else:
            lcte_results = [None for _ in range(batchsize)]

        return bev_feats, bbox_results, lane_results, lclc_results, lcte_results
    def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False):
        """Test function without augmentation."""
        img_feats = self.extract_feat(img=img, img_metas=img_metas)

        results_list = [dict() for i in range(len(img_metas))]
        new_prev_bev, bbox_results, lane_results, lclc_results, lcte_results = self.simple_test_pts(
            img_feats, img_metas, img, prev_bev, rescale=rescale)
        for result_dict, bbox, lane, lclc, lcte in zip(results_list,
                                                       bbox_results,
                                                       lane_results,
                                                       lclc_results,
                                                       lcte_results):
            result_dict['pred_te'] = bbox
            result_dict['pred_lc'] = lane
            result_dict['pred_topology_lclc'] = lclc
            result_dict['pred_topology_lcte'] = lcte
        return new_prev_bev, results_list
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/__init__.py
0 → 100644
from .custom_detr_head import *
from .topology_head import *
from .lc_deformable_detr_head import LCDeformableDETRHead
from .te_deformable_detr_head import TEDeformableDETRHead
from .relationship_head import RelationshipHead
\ No newline at end of file
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/custom_detr_head.py
0 → 100644
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_detr_head.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import torch
import torch.nn.functional as F
from mmcv.cnn import Linear
from mmdet.core import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh, multi_apply, reduce_mean
from mmdet.models import HEADS, DETRHead
@HEADS.register_module()
class CustomDETRHead(DETRHead):

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_query,
                 object_type,
                 num_reg_dim=4,
                 num_layers=1,
                 feedforward_channels=512,
                 embed_dims=64,
                 num_heads=4,
                 dropout=0.1,
                 ffn_dropout=0.1,
                 **kwargs):

        self.object_type = object_type
        if self.object_type == 'lane':
            self.num_reg_dim = num_reg_dim
            assert self.num_reg_dim % 3 == 0
            self.bev_range = kwargs['bev_range']
        elif self.object_type == 'bbox':
            self.num_reg_dim = 4
            assert self.num_reg_dim == num_reg_dim == 4
        else:
            raise NotImplementedError

        transformer = dict(
            type='Transformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=num_layers,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=embed_dims,
                            num_heads=num_heads,
                            dropout=dropout)
                    ],
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=embed_dims,
                        feedforward_channels=feedforward_channels,
                        num_fcs=2,
                        ffn_drop=0.,
                        act_cfg=dict(type='ReLU', inplace=True),
                    ),
                    feedforward_channels=feedforward_channels,
                    ffn_dropout=ffn_dropout,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DetrTransformerDecoder',
                return_intermediate=True,
                num_layers=num_layers,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=dict(
                        type='MultiheadAttention',
                        embed_dims=embed_dims,
                        num_heads=num_heads,
                        dropout=dropout),
                    ffn_cfgs=dict(
                        type='FFN',
                        embed_dims=embed_dims,
                        feedforward_channels=feedforward_channels,
                        num_fcs=2,
                        ffn_drop=0.,
                        act_cfg=dict(type='ReLU', inplace=True),
                    ),
                    feedforward_channels=feedforward_channels,
                    ffn_dropout=ffn_dropout,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')),
            ))
        positional_encoding = dict(
            type='SinePositionalEncoding',
            num_feats=embed_dims // 2,
            normalize=True)

        super().__init__(
            num_classes=num_classes,
            in_channels=in_channels,
            num_query=num_query,
            transformer=transformer,
            positional_encoding=positional_encoding,
            **kwargs,
        )
    def _init_layers(self):
        super()._init_layers()
        self.fc_reg = Linear(self.embed_dims, self.num_reg_dim)

    def forward_single(self, x, img_metas):
        # construct binary masks which are used for the transformer.
        # NOTE following the official DETR repo, non-zero values represent
        # ignored positions, while zero values mean valid positions.
        if self.object_type == 'lane':
            masks = x.new_zeros((x.shape[0], x.shape[2], x.shape[3]))
        else:
            batch_size = x.size(0)
            input_img_h, input_img_w = img_metas[0]['batch_input_shape']
            masks = x.new_ones((batch_size, input_img_h, input_img_w))
            for img_id in range(batch_size):
                img_h, img_w, _ = img_metas[img_id]['img_shape']
                masks[img_id, :img_h, :img_w] = 0

        x = self.input_proj(x)
        # interpolate masks to have the same spatial shape with x
        masks = F.interpolate(masks.unsqueeze(1),
                              size=x.shape[-2:]).to(torch.bool).squeeze(1)
        # position encoding
        pos_embed = self.positional_encoding(masks)  # [bs, embed_dim, h, w]
        # outs_dec: [nb_dec, bs, num_query, embed_dim]
        outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
                                       pos_embed)

        all_cls_scores = self.fc_cls(outs_dec)
        all_bbox_preds = self.fc_reg(self.activate(self.reg_ffn(outs_dec))).sigmoid()
        return all_cls_scores, all_bbox_preds, outs_dec
    def loss(self,
             all_cls_scores_list,
             all_bbox_preds_list,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        # NOTE by default only the outputs from the last feature scale are used.
        all_cls_scores = all_cls_scores_list[-1]
        all_bbox_preds = all_bbox_preds_list[-1]
        assert gt_bboxes_ignore is None, \
            'Only supports for gt_bboxes_ignore setting to None.'

        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou, assign_result = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        if self.object_type != 'lane':
            loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            if self.object_type != 'lane':
                loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict, assign_result
    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg, assign_result) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(cls_scores, labels, label_weights,
                                 avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        if self.object_type == 'lane':
            bbox_preds = bbox_preds.reshape(-1, self.num_reg_dim)
            loss_iou = None
        else:
            # construct factors used to rescale bboxes
            factors = []
            for img_meta, bbox_pred in zip(img_metas, bbox_preds):
                img_h, img_w, _ = img_meta['img_shape']
                factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]
                                              ).unsqueeze(0).repeat(bbox_pred.size(0), 1)
                factors.append(factor)
            factors = torch.cat(factors, 0)

            # DETR regresses the relative position of boxes (cxcywh) in the image,
            # thus the learning target is normalized by the image size. So here
            # we need to re-scale them for calculating the IoU loss.
            bbox_preds = bbox_preds.reshape(-1, self.num_reg_dim)
            bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
            bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

            # regression IoU loss, by default GIoU loss
            loss_iou = self.loss_iou(bboxes, bboxes_gt, bbox_weights,
                                     avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(bbox_preds, bbox_targets, bbox_weights,
                                   avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou, assign_result
    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         pos_inds_list, neg_inds_list, pos_assigned_gt_inds_list) = multi_apply(
            self._get_target_single, cls_scores_list, bbox_preds_list,
            gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        assign_result = dict(
            pos_inds=pos_inds_list,
            neg_inds=neg_inds_list,
            pos_assigned_gt_inds=pos_assigned_gt_inds_list)
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg, assign_result)
    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_bboxes,
                           gt_labels,
                           img_meta,
                           gt_bboxes_ignore=None):
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, img_meta,
                                             gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        pos_assigned_gt_inds = sampling_result.pos_assigned_gt_inds

        # label targets
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0

        if self.object_type == 'lane':
            pos_gt_bboxes = sampling_result.pos_gt_bboxes
            pos_gt_bboxes_normalized = torch.zeros_like(pos_gt_bboxes)
            for p in range(self.num_reg_dim // 3):
                pos_gt_bboxes_normalized[..., 3*p] = (pos_gt_bboxes[..., 3*p] - self.bev_range[0]) / (self.bev_range[3] - self.bev_range[0])
                pos_gt_bboxes_normalized[..., 3*p+1] = (pos_gt_bboxes[..., 3*p+1] - self.bev_range[1]) / (self.bev_range[4] - self.bev_range[1])
                pos_gt_bboxes_normalized[..., 3*p+2] = (pos_gt_bboxes[..., 3*p+2] - self.bev_range[2]) / (self.bev_range[5] - self.bev_range[2])
            pos_gt_bboxes_targets = pos_gt_bboxes_normalized
        else:
            img_h, img_w, _ = img_meta['img_shape']
            # DETR regresses the relative position of boxes (cxcywh) in the image.
            # Thus the learning target should be normalized by the image size, and
            # the box format should be converted from the default x1y1x2y2 to cxcywh.
            factor = bbox_pred.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0)
            pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
            pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
        bbox_targets[pos_inds] = pos_gt_bboxes_targets

        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds, pos_assigned_gt_inds)
    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
        assert len(cls_score) == len(bbox_pred)
        if self.object_type == 'lane':
            cls_score = cls_score.sigmoid()
            det_bboxes = bbox_pred
            for p in range(self.num_reg_dim // 3):
                det_bboxes[..., 3*p] = det_bboxes[..., 3*p] * (self.bev_range[3] - self.bev_range[0]) + self.bev_range[0]
                det_bboxes[..., 3*p+1] = det_bboxes[..., 3*p+1] * (self.bev_range[4] - self.bev_range[1]) + self.bev_range[1]
                det_bboxes[..., 3*p+2] = det_bboxes[..., 3*p+2] * (self.bev_range[5] - self.bev_range[2]) + self.bev_range[2]
                det_bboxes[..., 3*p].clamp_(min=self.bev_range[0], max=self.bev_range[3])
                det_bboxes[..., 3*p+1].clamp_(min=self.bev_range[1], max=self.bev_range[4])
                det_bboxes[..., 3*p+2].clamp_(min=self.bev_range[2], max=self.bev_range[5])
        else:
            # exclude background
            if self.loss_cls.use_sigmoid:
                cls_score = cls_score.sigmoid()
            else:
                cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
            cls_score, det_labels = cls_score.max(-1)
            det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
            det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
            det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
            det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
            det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
            if rescale:
                det_bboxes /= det_bboxes.new_tensor(scale_factor)
            det_bboxes = torch.cat((det_bboxes, det_labels.unsqueeze(1)), -1)
        return det_bboxes.cpu().numpy(), cls_score.cpu().numpy()

    def onnx_export(self, **kwargs):
        raise NotImplementedError(f'TODO: replace 4 with self.num_reg_dim : {self.num_reg_dim}')
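The lane branch above maps each predicted coordinate from [0, 1] back to metric BEV coordinates via bev_range, and _get_target_single applies the inverse map when normalizing ground truth. A worked example with a hypothetical bev_range = [-50, -25, -3, 50, 25, 3] (xmin, ymin, zmin, xmax, ymax, zmax): a sigmoid output of 0.75 for an x coordinate decodes to 0.75 * (50 - (-50)) + (-50) = 25, and normalizing that ground-truth x back gives (25 - (-50)) / (50 - (-50)) = 0.75, so the two transforms round-trip exactly.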
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/lc_deformable_detr_head.py
0 → 100644
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_detr_head.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact litianyu@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy

import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import mmcv
from mmcv.cnn import Linear, bias_init_with_prob, build_activation_layer
from mmcv.runner import auto_fp16, force_fp32
from mmcv.utils import TORCH_VERSION, digit_version
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads import AnchorFreeHead
from mmdet.models.utils import build_transformer
from mmdet.models.utils.transformer import inverse_sigmoid
@HEADS.register_module()
class LCDeformableDETRHead(AnchorFreeHead):

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_query=100,
                 with_box_refine=False,
                 with_shared_param=None,
                 transformer=None,
                 num_reg_fcs=2,
                 code_weights=None,
                 pc_range=None,
                 bev_h=30,
                 bev_w=30,
                 sync_cls_avg_factor=False,
                 loss_cls=dict(
                     type='CrossEntropyLoss',
                     bg_cls_weight=0.1,
                     use_sigmoid=False,
                     loss_weight=1.0,
                     class_weight=1.0),
                 loss_bbox=dict(type='L1Loss', loss_weight=5.0),
                 loss_iou=dict(type='GIoULoss', loss_weight=2.0),
                 train_cfg=dict(
                     assigner=dict(
                         type='HungarianAssigner',
                         cls_cost=dict(type='ClassificationCost', weight=1.),
                         reg_cost=dict(type='BBoxL1Cost', weight=5.0),
                         iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
                 test_cfg=dict(max_per_img=100),
                 init_cfg=None,
                 **kwargs):
        # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
        # since it brings inconvenience when the initialization of
        # `AnchorFreeHead` is called.
        super(AnchorFreeHead, self).__init__(init_cfg)
        self.bg_cls_weight = 0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        if train_cfg:
            assert 'assigner' in train_cfg, 'assigner should be provided ' \
                'when train_cfg is set.'
            assigner = train_cfg['assigner']
            assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \
                'The classification weight for loss and matcher should be ' \
                'exactly the same.'
            assert loss_bbox['loss_weight'] == assigner['reg_cost']['weight'], \
                'The regression L1 weight for loss and matcher ' \
                'should be exactly the same.'
            assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \
                'The regression iou weight for loss and matcher should be ' \
                'exactly the same.'
            self.assigner = build_assigner(assigner)
            # DETR sampling=False, so use PseudoSampler
            sampler_cfg = dict(type='PseudoSampler')
            self.sampler = build_sampler(sampler_cfg, context=self)
        self.num_query = num_query
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.fp16_enabled = False
        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)
        self.loss_iou = build_loss(loss_iou)

        if self.loss_cls.use_sigmoid:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True))
        self.activate = build_activation_layer(self.act_cfg)
        self.transformer = build_transformer(transformer)
        self.embed_dims = self.transformer.embed_dims

        self.bev_h = bev_h
        self.bev_w = bev_w
        self.fp16_enabled = False
        self.with_box_refine = with_box_refine
        if with_shared_param is not None:
            self.with_shared_param = with_shared_param
        else:
            self.with_shared_param = not self.with_box_refine
        self.as_two_stage = False
        if 'code_size' in kwargs:
            self.code_size = kwargs['code_size']
        else:
            self.code_size = 6
        if code_weights is not None:
            self.code_weights = code_weights
        else:
            self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
        self.code_weights = nn.Parameter(
            torch.tensor(self.code_weights, requires_grad=False),
            requires_grad=False)
        self.gt_c_save = self.code_size

        self.pc_range = pc_range
        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]
        self.num_reg_fcs = num_reg_fcs
        self._init_layers()
    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        cls_branch = []
        for _ in range(self.num_reg_fcs):
            cls_branch.append(Linear(self.embed_dims, self.embed_dims))
            cls_branch.append(nn.LayerNorm(self.embed_dims))
            cls_branch.append(nn.ReLU(inplace=True))
        cls_branch.append(Linear(self.embed_dims, self.cls_out_channels))
        fc_cls = nn.Sequential(*cls_branch)

        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, self.code_size))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if not self.with_shared_param:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        self.query_embedding = nn.Embedding(self.num_query,
                                            self.embed_dims * 2)
    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m[-1].bias, bias_init)
    @auto_fp16(apply_to=('mlvl_feats'))
    def forward(self, mlvl_feats, bev_feats, img_metas):
        """Forward function.
        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 5D-tensor with shape
                (B, N, C, H, W).
            prev_bev: previous BEV features.
            only_bev: only compute BEV features with the encoder.
        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
            all_lanes_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \
                Shape [nb_dec, bs, num_query, 9].
        """
        dtype = mlvl_feats[0].dtype
        object_query_embeds = self.query_embedding.weight.to(dtype)
        outputs = self.transformer(
            mlvl_feats,
            bev_feats,
            object_query_embeds,
            bev_h=self.bev_h,
            bev_w=self.bev_w,
            reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
            cls_branches=None,
            img_metas=img_metas)

        hs, init_reference, inter_references = outputs
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []
        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            assert reference.shape[-1] == 3
            for p in range(self.code_size // 3):
                tmp[..., 3*p:3*p+3] = tmp[..., 3*p:3*p+3] + reference
                tmp[..., 3*p:3*p+3] = tmp[..., 3*p:3*p+3].sigmoid()
                tmp[..., 3*p] = tmp[..., 3*p] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]
                tmp[..., 3*p+1] = tmp[..., 3*p+1] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]
                tmp[..., 3*p+2] = tmp[..., 3*p+2] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2]

            outputs_coord = tmp
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)
        outs = {
            'all_cls_scores': outputs_classes,
            'all_lanes_preds': outputs_coords,
            'enc_cls_scores': None,
            'enc_bbox_preds': None,
            'history_states': hs
        }
        return outs
    def _get_target_single(self,
                           cls_score,
                           lanes_pred,
                           gt_labels,
                           gt_lanes,
                           gt_bboxes_ignore=None):
        """Compute regression and classification targets for one image.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
                for one image, with normalized coordinate (cx, cy, w, h) and
                shape [num_query, 4].
            gt_bboxes (Tensor): Ground truth bboxes for one image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (Tensor): Ground truth class indices for one image
                with shape (num_gts, ).
            gt_bboxes_ignore (Tensor, optional): Bounding boxes
                which can be ignored. Default None.
        Returns:
            tuple[Tensor]: a tuple containing the following for one image.
                - labels (Tensor): Labels of each image.
                - label_weights (Tensor): Label weights of each image.
                - bbox_targets (Tensor): BBox targets of each image.
                - bbox_weights (Tensor): BBox weights of each image.
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_bboxes = lanes_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(lanes_pred, cls_score, gt_lanes,
                                             gt_labels, gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, lanes_pred,
                                              gt_lanes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        pos_assigned_gt_inds = sampling_result.pos_assigned_gt_inds

        labels = gt_lanes.new_full((num_bboxes,),
                                   self.num_classes,
                                   dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds].long()
        label_weights = gt_lanes.new_ones(num_bboxes)

        # bbox targets
        gt_c = gt_lanes.shape[-1]
        if gt_c == 0:
            gt_c = self.gt_c_save
            sampling_result.pos_gt_bboxes = torch.zeros(
                (0, gt_c)).to(sampling_result.pos_gt_bboxes.device)
        else:
            self.gt_c_save = gt_c

        bbox_targets = torch.zeros_like(lanes_pred)[..., :gt_c]
        bbox_weights = torch.zeros_like(lanes_pred)
        bbox_weights[pos_inds] = 1.0

        # DETR
        bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds, pos_assigned_gt_inds)
    def get_targets(self,
                    cls_scores_list,
                    lanes_preds_list,
                    gt_lanes_list,
                    gt_labels_list,
                    gt_bboxes_ignore_list=None):
        """Compute regression and classification targets for a batch of images.
        Outputs from a single decoder layer of a single feature level are used.
        Args:
            cls_scores_list (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
                decoder layer for each image, with normalized coordinate
                (cx, cy, w, h) and shape [num_query, 4].
            gt_lanes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            tuple: a tuple containing the following targets.
                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all \
                    images.
                - bbox_targets_list (list[Tensor]): BBox targets for all \
                    images.
                - bbox_weights_list (list[Tensor]): BBox weights for all \
                    images.
                - num_total_pos (int): Number of positive samples in all \
                    images.
                - num_total_neg (int): Number of negative samples in all \
                    images.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, lanes_targets_list,
         lanes_weights_list, pos_inds_list, neg_inds_list,
         pos_assigned_gt_inds_list) = multi_apply(
            self._get_target_single, cls_scores_list, lanes_preds_list,
            gt_labels_list, gt_lanes_list, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        assign_result = dict(
            pos_inds=pos_inds_list,
            neg_inds=neg_inds_list,
            pos_assigned_gt_inds=pos_assigned_gt_inds_list)
        return (labels_list, label_weights_list, lanes_targets_list,
                lanes_weights_list, num_total_pos, num_total_neg,
                assign_result)
    def loss_single(self,
                    cls_scores,
                    lanes_preds,
                    gt_lanes_list,
                    gt_labels_list,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.
        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            lanes_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape [bs, num_query, 4].
            gt_lanes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        lanes_preds_list = [lanes_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, lanes_preds_list,
                                           gt_lanes_list, gt_labels_list,
                                           gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
         num_total_pos, num_total_neg, assign_result) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(cls_scores, labels, label_weights,
                                 avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # regression L1 loss
        lanes_preds = lanes_preds.reshape(-1, lanes_preds.size(-1))
        isnotnan = torch.isfinite(bbox_targets).all(dim=-1)
        bbox_weights = bbox_weights * self.code_weights

        loss_bbox = self.loss_bbox(
            lanes_preds[isnotnan, :self.code_size],
            bbox_targets[isnotnan, :self.code_size],
            bbox_weights[isnotnan, :self.code_size],
            avg_factor=num_total_pos)

        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
            loss_cls = torch.nan_to_num(loss_cls)
            loss_bbox = torch.nan_to_num(loss_bbox)
        return loss_cls, loss_bbox, assign_result
    @force_fp32(apply_to=('preds_dicts'))
    def loss(self,
             preds_dicts,
             gt_lanes_3d,
             gt_labels_list,
             gt_bboxes_ignore=None,
             img_metas=None):
        """Loss function.
        Args:
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            preds_dicts:
                all_cls_scores (Tensor): Classification score of all
                    decoder layers, has shape
                    [nb_dec, bs, num_query, cls_out_channels].
                all_lanes_preds (Tensor): Sigmoid regression
                    outputs of all decode layers. Each is a 4D-tensor with
                    normalized coordinate format (cx, cy, w, h) and shape
                    [nb_dec, bs, num_query, 4].
                enc_cls_scores (Tensor): Classification scores of
                    points on the encode feature map, has shape
                    (N, h*w, num_classes). Only passed when as_two_stage is
                    True, otherwise is None.
                enc_bbox_preds (Tensor): Regression results of each point
                    on the encode feature map, has shape (N, h*w, 4). Only
                    passed when as_two_stage is True, otherwise is None.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'
        all_cls_scores = preds_dicts['all_cls_scores']
        all_lanes_preds = preds_dicts['all_lanes_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

        num_dec_layers = len(all_cls_scores)
        device = gt_labels_list[0].device
        gt_lanes_list = [lane for lane in gt_lanes_3d]

        all_gt_lanes_list = [gt_lanes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, assign_result = multi_apply(
            self.loss_single, all_cls_scores, all_lanes_preds,
            all_gt_lanes_list, all_gt_labels_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(all_gt_labels_list))
            ]
            enc_loss_cls, enc_losses_bbox = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_lanes_3d, binary_labels_list,
                                 gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox

        # loss from the last decoder layer
        loss_dict['loss_lane_cls'] = losses_cls[-1]
        loss_dict['loss_lane_reg'] = losses_bbox[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_lane_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_lane_reg'] = loss_bbox_i
            num_dec_layer += 1
        return loss_dict, assign_result
    @force_fp32(apply_to=('preds_dicts'))
    def get_lanes(self, preds_dicts, img_metas, rescale=False):
        """Generate bboxes from bbox head predictions.

        Args:
            preds_dicts (tuple[list[dict]]): Prediction results.
            img_metas (list[dict]): Point cloud and image's meta info.

        Returns:
            list[dict]: Decoded bbox, scores and labels after nms.
        """
        all_cls_scores = preds_dicts['all_cls_scores'][-1]
        all_lanes_preds = preds_dicts['all_lanes_preds'][-1]

        batch_size = all_cls_scores.size()[0]
        predictions_list = []
        for i in range(batch_size):
            cls_scores = all_cls_scores[i].sigmoid()
            predictions_list.append([
                all_lanes_preds[i].detach().cpu().numpy(),
                cls_scores.detach().cpu().numpy()
            ])
        return predictions_list
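
# A minimal sketch of consuming the decoded predictions above (the `head`
# variable and the 0.3 threshold are hypothetical, chosen only to illustrate
# the output layout):
#
#   lanes, scores = head.get_lanes(preds_dicts, img_metas)[0]
#   # lanes:  (num_query, code_size) numpy array of regressed lane codes
#   # scores: (num_query, cls_out_channels) numpy array of sigmoid scores
#   keep = scores.max(-1) > 0.3   # simple per-query score threshold
#   top_lanes = lanes[keep]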
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/relationship_head.py
import copy

import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F

import mmcv
from mmcv.cnn import Linear, bias_init_with_prob, build_activation_layer
from mmcv.cnn.bricks.transformer import build_feedforward_network
from mmcv.runner import auto_fp16, force_fp32
from mmcv.utils import TORCH_VERSION, digit_version
from mmdet.core import build_assigner, build_sampler, multi_apply, reduce_mean
from mmdet.models.builder import HEADS, build_loss
from mmdet.models.dense_heads import AnchorFreeHead
from mmdet.models.utils import build_transformer
from mmdet.models.utils.transformer import inverse_sigmoid
class MLP(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
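
# A minimal shape sketch for the MLP above (the dimensions are hypothetical,
# picked only to illustrate how the layer list is built):
#
#   mlp = MLP(input_dim=256, hidden_dim=256, output_dim=128, num_layers=3)
#   # layers: Linear(256, 256) -> ReLU -> Linear(256, 256) -> ReLU -> Linear(256, 128)
#   out = mlp(torch.rand(2, 100, 256))   # -> shape (2, 100, 128)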
@HEADS.register_module()
class RelationshipHead(nn.Module):

    def __init__(self,
                 in_channels_o1,
                 in_channels_o2=None,
                 shared_param=True,
                 loss_rel=dict(
                     type='FocalLoss',
                     use_sigmoid=True,
                     gamma=2.0,
                     alpha=0.25)):
        super().__init__()
        self.MLP_o1 = MLP(in_channels_o1, in_channels_o1, 128, 3)
        self.shared_param = shared_param
        if shared_param:
            self.MLP_o2 = self.MLP_o1
        else:
            self.MLP_o2 = MLP(in_channels_o2, in_channels_o2, 128, 3)
        self.classifier = MLP(256, 256, 1, 3)
        self.loss_rel = build_loss(loss_rel)
    def forward_train(self, o1_feats, o1_assign_results, o2_feats,
                      o2_assign_results, gt_adj):
        rel_pred = self.forward(o1_feats, o2_feats)
        losses = self.loss(rel_pred, gt_adj, o1_assign_results,
                           o2_assign_results)
        return losses

    def get_relationship(self, o1_feats, o2_feats):
        rel_pred = self.forward(o1_feats, o2_feats)
        rel_results = rel_pred.squeeze(-1).sigmoid()
        rel_results = [_ for _ in rel_results]
        return rel_results
    def forward(self, o1_feats, o2_feats):
        # feats: D, B, num_query, num_embedding
        o1_embeds = self.MLP_o1(o1_feats[-1])
        o2_embeds = self.MLP_o2(o2_feats[-1])

        num_query_o1 = o1_embeds.size(1)
        num_query_o2 = o2_embeds.size(1)
        o1_tensor = o1_embeds.unsqueeze(2).repeat(1, 1, num_query_o2, 1)
        o2_tensor = o2_embeds.unsqueeze(1).repeat(1, num_query_o1, 1, 1)
        relationship_tensor = torch.cat([o1_tensor, o2_tensor], dim=-1)
        relationship_pred = self.classifier(relationship_tensor)
        return relationship_pred
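
    # A minimal shape sketch of the pairwise grid built above (batch and
    # query counts are hypothetical): with o1_embeds (B=2, Nq1=4, 128) and
    # o2_embeds (B=2, Nq2=6, 128),
    #
    #   o1_tensor: (2, 4, 6, 128)   # each o1 query repeated along the o2 axis
    #   o2_tensor: (2, 4, 6, 128)   # each o2 query repeated along the o1 axis
    #   cat(...):  (2, 4, 6, 256)   # one 256-d feature per (o1, o2) pair
    #   pred:      (2, 4, 6, 1)     # one relationship logit per pair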
    def loss(self, rel_preds, gt_adjs, o1_assign_results, o2_assign_results):
        B, num_query_o1, num_query_o2, _ = rel_preds.size()
        o1_assign = o1_assign_results[-1]
        o1_pos_inds = o1_assign['pos_inds']
        o1_pos_assigned_gt_inds = o1_assign['pos_assigned_gt_inds']
        if self.shared_param:
            o2_assign = o1_assign
            o2_pos_inds = o1_pos_inds
            o2_pos_assigned_gt_inds = o1_pos_assigned_gt_inds
        else:
            o2_assign = o2_assign_results[-1]
            o2_pos_inds = o2_assign['pos_inds']
            o2_pos_assigned_gt_inds = o2_assign['pos_assigned_gt_inds']

        targets = []
        for i in range(B):
            gt_adj = gt_adjs[i]
            target = torch.zeros_like(
                rel_preds[i].squeeze(-1),
                dtype=gt_adj.dtype,
                device=rel_preds.device)
            xs = o1_pos_inds[i].unsqueeze(-1).repeat(1, o2_pos_inds[i].size(0))
            ys = o2_pos_inds[i].unsqueeze(0).repeat(o1_pos_inds[i].size(0), 1)
            target[xs, ys] = gt_adj[
                o1_pos_assigned_gt_inds[i]][:, o2_pos_assigned_gt_inds[i]]
            targets.append(target)
        targets = torch.stack(targets, dim=0)
        targets = 1 - targets.view(-1).long()
        rel_preds = rel_preds.view(-1, 1)
        # weight = (1 - targets) * 3 + targets
        loss_rel = self.loss_rel(rel_preds, targets)
        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
            loss_rel = torch.nan_to_num(loss_rel)
        return dict(loss_rel=loss_rel)
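
# A minimal sketch of the target scatter used in the loss above, with
# hypothetical indices (pos_assigned_gt_inds assumed to be the identity):
#
#   gt_adj = torch.tensor([[1, 0], [0, 1]])      # 2 GT o1 x 2 GT o2
#   pos_inds_o1 = torch.tensor([3, 7])           # queries matched to GT 0, 1
#   pos_inds_o2 = torch.tensor([2, 5])
#   target = torch.zeros(10, 10, dtype=gt_adj.dtype)
#   xs = pos_inds_o1.unsqueeze(-1).repeat(1, 2)  # (2, 2) row index grid
#   ys = pos_inds_o2.unsqueeze(0).repeat(2, 1)   # (2, 2) column index grid
#   target[xs, ys] = gt_adj                      # target[3, 2] = 1, target[7, 5] = 1
#
# The `1 - targets` flip afterwards makes class 0 the positive class for the
# focal loss.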
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/te_deformable_detr_head.py
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# te_deformable_detr_head.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact litianyu@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import Linear, bias_init_with_prob, constant_init
from mmcv.runner import force_fp32

from mmdet.core import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh,
                        multi_apply, reduce_mean)
from mmdet.models.utils.transformer import inverse_sigmoid
from mmdet.models import HEADS, build_loss
from mmdet.models.dense_heads import DETRHead
@HEADS.register_module()
class TEDeformableDETRHead(DETRHead):
    """Head of DeformDETR: Deformable DETR: Deformable Transformers for
    End-to-End Object Detection.

    Code is modified from the `official github repo
    <https://github.com/fundamentalvision/Deformable-DETR>`_.

    More details can be found in the `paper
    <https://arxiv.org/abs/2010.04159>`_ .

    Args:
        with_box_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to False.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        transformer (obj:`ConfigDict`): ConfigDict is used for building
            the Encoder and Decoder.
    """

    def __init__(self,
                 *args,
                 with_box_refine=False,
                 as_two_stage=False,
                 transformer=None,
                 **kwargs):
        self.with_box_refine = with_box_refine
        self.as_two_stage = as_two_stage
        if self.as_two_stage:
            transformer['as_two_stage'] = self.as_two_stage

        super(TEDeformableDETRHead, self).__init__(
            *args, transformer=transformer, **kwargs)
    def _init_layers(self):
        """Initialize classification branch and regression branch of head."""
        fc_cls = Linear(self.embed_dims, self.cls_out_channels)
        reg_branch = []
        for _ in range(self.num_reg_fcs):
            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
            reg_branch.append(nn.ReLU())
        reg_branch.append(Linear(self.embed_dims, 4))
        reg_branch = nn.Sequential(*reg_branch)

        def _get_clones(module, N):
            return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

        # last reg_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_box_refine:
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.reg_branches = _get_clones(reg_branch, num_pred)
        else:
            self.cls_branches = nn.ModuleList(
                [fc_cls for _ in range(num_pred)])
            self.reg_branches = nn.ModuleList(
                [reg_branch for _ in range(num_pred)])

        if not self.as_two_stage:
            self.query_embedding = nn.Embedding(self.num_query,
                                                self.embed_dims * 2)
    def init_weights(self):
        """Initialize weights of the DeformDETR head."""
        self.transformer.init_weights()
        if self.loss_cls.use_sigmoid:
            bias_init = bias_init_with_prob(0.01)
            for m in self.cls_branches:
                nn.init.constant_(m.bias, bias_init)
        for m in self.reg_branches:
            constant_init(m[-1], 0, bias=0)
        nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
        if self.as_two_stage:
            for m in self.reg_branches:
                nn.init.constant_(m[-1].bias.data[2:], 0.0)
    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information.

        Returns:
            all_cls_scores (Tensor): Outputs from the classification head, \
                shape [nb_dec, bs, num_query, cls_out_channels]. Note \
                cls_out_channels should include background.
            all_bbox_preds (Tensor): Sigmoid outputs from the regression \
                head with normalized coordinate format (cx, cy, w, h). \
                Shape [nb_dec, bs, num_query, 4].
            enc_outputs_class (Tensor): The score of each point on encode \
                feature map, has shape (N, h*w, num_class). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
            enc_outputs_coord (Tensor): The proposal generate from the \
                encode feature map, has shape (N, h*w, 4). Only when \
                as_two_stage is True it would be returned, otherwise \
                `None` would be returned.
        """
        batch_size = mlvl_feats[0].size(0)
        input_img_h, input_img_w = img_metas[0]['batch_input_shape']
        img_masks = mlvl_feats[0].new_ones(
            (batch_size, input_img_h, input_img_w))
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :img_h, :img_w] = 0

        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            mlvl_masks.append(
                F.interpolate(img_masks[None],
                              size=feat.shape[-2:]).to(torch.bool).squeeze(0))
            mlvl_positional_encodings.append(
                self.positional_encoding(mlvl_masks[-1]))

        query_embeds = None
        if not self.as_two_stage:
            query_embeds = self.query_embedding.weight
        hs, init_reference, inter_references, \
            enc_outputs_class, enc_outputs_coord = self.transformer(
                mlvl_feats,
                mlvl_masks,
                query_embeds,
                mlvl_positional_encodings,
                reg_branches=self.reg_branches if self.with_box_refine else None,  # noqa:E501
                cls_branches=self.cls_branches if self.as_two_stage else None  # noqa:E501
            )
        hs = hs.permute(0, 2, 1, 3)
        outputs_classes = []
        outputs_coords = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            outputs_class = self.cls_branches[lvl](hs[lvl])
            tmp = self.reg_branches[lvl](hs[lvl])
            if reference.shape[-1] == 4:
                tmp += reference
            else:
                assert reference.shape[-1] == 2
                tmp[..., :2] += reference
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)

        outputs_classes = torch.stack(outputs_classes)
        outputs_coords = torch.stack(outputs_coords)

        outs = {
            'all_cls_scores': outputs_classes,
            'all_bbox_preds': outputs_coords,
            'enc_cls_scores': enc_outputs_class if self.as_two_stage else None,
            'enc_bbox_preds': enc_outputs_coord.sigmoid() if self.as_two_stage else None,
            'history_states': hs
        }
        return outs
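
    # A minimal sketch of the per-layer box refinement above (numbers are
    # hypothetical): a 2-d reference point is mapped back to logit space,
    # offset by the raw regression output, then squashed again.
    #
    #   reference = torch.tensor([[0.50, 0.25]])     # normalized (cx, cy)
    #   tmp = torch.tensor([[0.4, -0.2, 0.1, 0.3]])  # raw reg output (cx, cy, w, h)
    #   tmp[..., :2] += inverse_sigmoid(reference)   # refine the center in logit space
    #   box = tmp.sigmoid()                          # normalized (cx, cy, w, h)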
    @force_fp32(apply_to=('preds_dicts'))
    def loss(self,
             preds_dicts,
             gt_bboxes_list,
             gt_labels_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map, has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        all_cls_scores = preds_dicts['all_cls_scores']
        all_bbox_preds = preds_dicts['all_bbox_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

        num_dec_layers = len(all_cls_scores)
        all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
        all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
        all_gt_bboxes_ignore_list = [
            gt_bboxes_ignore for _ in range(num_dec_layers)
        ]
        img_metas_list = [img_metas for _ in range(num_dec_layers)]

        losses_cls, losses_bbox, losses_iou, assign_result = multi_apply(
            self.loss_single, all_cls_scores, all_bbox_preds,
            all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
            all_gt_bboxes_ignore_list)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            binary_labels_list = [
                torch.zeros_like(gt_labels_list[i])
                for i in range(len(img_metas))
            ]
            # loss_single returns (loss_cls, loss_bbox, loss_iou, assign_result)
            enc_loss_cls, enc_losses_bbox, enc_losses_iou, _ = \
                self.loss_single(enc_cls_scores, enc_bbox_preds,
                                 gt_bboxes_list, binary_labels_list,
                                 img_metas, gt_bboxes_ignore)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_bbox'] = enc_losses_bbox
            loss_dict['enc_loss_iou'] = enc_losses_iou

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_bbox'] = losses_bbox[-1]
        loss_dict['loss_iou'] = losses_iou[-1]
        # loss from other decoder layers
        num_dec_layer = 0
        for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
                                                       losses_bbox[:-1],
                                                       losses_iou[:-1]):
            loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
            loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
            loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
            num_dec_layer += 1
        return loss_dict, assign_result
    def loss_single(self,
                    cls_scores,
                    bbox_preds,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (cx, cy, w, h) and
                shape [bs, num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components for outputs from
                a single decoder layer.
        """
        num_imgs = cls_scores.size(0)
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
                                           gt_bboxes_list, gt_labels_list,
                                           img_metas, gt_bboxes_ignore_list)
        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, num_total_pos, num_total_neg,
         assign_result) = cls_reg_targets
        labels = torch.cat(labels_list, 0)
        label_weights = torch.cat(label_weights_list, 0)
        bbox_targets = torch.cat(bbox_targets_list, 0)
        bbox_weights = torch.cat(bbox_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                cls_scores.new_tensor([cls_avg_factor]))
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt boxes across all gpus, for
        # normalization purposes
        num_total_pos = loss_cls.new_tensor([num_total_pos])
        num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale bboxes
        factors = []
        for img_meta, bbox_pred in zip(img_metas, bbox_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                           img_h]).unsqueeze(0).repeat(
                                               bbox_pred.size(0), 1)
            factors.append(factor)
        factors = torch.cat(factors, 0)

        # DETR regress the relative position of boxes (cxcywh) in the image,
        # thus the learning target is normalized by the image size. So here
        # we need to re-scale them for calculating IoU loss
        bbox_preds = bbox_preds.reshape(-1, 4)
        bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
        bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors

        # regression IoU loss, GIoU loss by default
        loss_iou = self.loss_iou(
            bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos)

        # regression L1 loss
        loss_bbox = self.loss_bbox(
            bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos)
        return loss_cls, loss_bbox, loss_iou, assign_result
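
    # A minimal numeric sketch of the rescaling above (hypothetical sizes):
    # the IoU loss needs absolute corner coordinates, so a normalized
    # (cx, cy, w, h) box on a 100x50 (w x h) image is denormalized first.
    #
    #   pred = torch.tensor([[0.5, 0.5, 0.2, 0.4]])      # normalized cxcywh
    #   factor = torch.tensor([[100., 50., 100., 50.]])  # (img_w, img_h, img_w, img_h)
    #   bbox_cxcywh_to_xyxy(pred) * factor               # -> [[40., 15., 60., 35.]]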
    def get_targets(self,
                    cls_scores_list,
                    bbox_preds_list,
                    gt_bboxes_list,
                    gt_labels_list,
                    img_metas,
                    gt_bboxes_ignore_list=None):
        """Compute regression and classification targets for a batch image.

        Outputs from a single decoder layer of a single feature level are used.

        Args:
            cls_scores_list (list[Tensor]): Box score logits from a single
                decoder layer for each image with shape [num_query,
                cls_out_channels].
            bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
                decoder layer for each image, with normalized coordinate
                (cx, cy, w, h) and shape [num_query, 4].
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
                boxes which can be ignored for each image. Default None.

        Returns:
            tuple: a tuple containing the following targets.

                - labels_list (list[Tensor]): Labels for all images.
                - label_weights_list (list[Tensor]): Label weights for all \
                    images.
                - bbox_targets_list (list[Tensor]): BBox targets for all \
                    images.
                - bbox_weights_list (list[Tensor]): BBox weights for all \
                    images.
                - num_total_pos (int): Number of positive samples in all \
                    images.
                - num_total_neg (int): Number of negative samples in all \
                    images.
        """
        assert gt_bboxes_ignore_list is None, \
            'Only supports for gt_bboxes_ignore setting to None.'
        num_imgs = len(cls_scores_list)
        gt_bboxes_ignore_list = [
            gt_bboxes_ignore_list for _ in range(num_imgs)
        ]

        (labels_list, label_weights_list, bbox_targets_list,
         bbox_weights_list, pos_inds_list, neg_inds_list,
         pos_assigned_gt_inds_list) = multi_apply(
             self._get_target_single, cls_scores_list, bbox_preds_list,
             gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
        num_total_pos = sum((inds.numel() for inds in pos_inds_list))
        num_total_neg = sum((inds.numel() for inds in neg_inds_list))
        assign_result = dict(
            pos_inds=pos_inds_list,
            neg_inds=neg_inds_list,
            pos_assigned_gt_inds=pos_assigned_gt_inds_list)
        return (labels_list, label_weights_list, bbox_targets_list,
                bbox_weights_list, num_total_pos, num_total_neg,
                assign_result)
    def _get_target_single(self,
                           cls_score,
                           bbox_pred,
                           gt_bboxes,
                           gt_labels,
                           img_meta,
                           gt_bboxes_ignore=None):
        """Compute regression and classification targets for one image.

        Outputs from a single decoder layer of a single feature level are used.

        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
                for one image, with normalized coordinate (cx, cy, w, h) and
                shape [num_query, 4].
            gt_bboxes (Tensor): Ground truth bboxes for one image with
                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (Tensor): Ground truth class indices for one image
                with shape (num_gts, ).
            img_meta (dict): Meta information for one image.
            gt_bboxes_ignore (Tensor, optional): Bounding boxes
                which can be ignored. Default None.

        Returns:
            tuple[Tensor]: a tuple containing the following for one image.

                - labels (Tensor): Labels of each image.
                - label_weights (Tensor): Label weights of each image.
                - bbox_targets (Tensor): BBox targets of each image.
                - bbox_weights (Tensor): BBox weights of each image.
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_bboxes = bbox_pred.size(0)
        # assigner and sampler
        assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
                                             gt_labels, img_meta,
                                             gt_bboxes_ignore)
        sampling_result = self.sampler.sample(assign_result, bbox_pred,
                                              gt_bboxes)
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        pos_assigned_gt_inds = sampling_result.pos_assigned_gt_inds

        # label targets
        labels = gt_bboxes.new_full((num_bboxes, ),
                                    self.num_classes,
                                    dtype=torch.long)
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
        label_weights = gt_bboxes.new_ones(num_bboxes)

        # bbox targets
        bbox_targets = torch.zeros_like(bbox_pred)
        bbox_weights = torch.zeros_like(bbox_pred)
        bbox_weights[pos_inds] = 1.0
        img_h, img_w, _ = img_meta['img_shape']

        # DETR regress the relative position of boxes (cxcywh) in the image.
        # Thus the learning target should be normalized by the image size, also
        # the box format should be converted from the default x1y1x2y2 to cxcywh.
        factor = bbox_pred.new_tensor([img_w, img_h, img_w,
                                       img_h]).unsqueeze(0)
        pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
        pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
        bbox_targets[pos_inds] = pos_gt_bboxes_targets
        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
                neg_inds, pos_assigned_gt_inds)
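
    # The target normalization above runs in the opposite direction: a GT box
    # in absolute xyxy becomes a normalized cxcywh learning target. A minimal
    # sketch with hypothetical numbers (img_w=100, img_h=50):
    #
    #   gt = torch.tensor([[40., 15., 60., 35.]])       # absolute xyxy
    #   factor = torch.tensor([[100., 50., 100., 50.]])
    #   bbox_xyxy_to_cxcywh(gt / factor)                # -> [[0.5, 0.5, 0.2, 0.4]]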
    def simple_test_bboxes(self, feats, img_metas, rescale=False):
        """Test det bboxes without test-time augmentation.

        Args:
            feats (tuple[torch.Tensor]): Multi-level features from the
                upstream network, each is a 4D-tensor.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
                The first item is ``bboxes`` with shape (n, 5),
                where 5 represent (tl_x, tl_y, br_x, br_y, score).
                The shape of the second tensor in the tuple is ``labels``
                with shape (n,).
        """
        # forward of this head requires img_metas
        outs = self.forward(feats, img_metas)
        results_list = self.get_bboxes(outs, img_metas, rescale=rescale)
        return results_list
    @force_fp32(apply_to=('preds_dicts'))
    def get_bboxes(self, preds_dicts, img_metas, rescale=False):
        """Transform network outputs for a batch into bbox predictions.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_bbox_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (cx, cy, w, h) and shape
                [nb_dec, bs, num_query, 4].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map, has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_bbox_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, 4). Only be
                passed when as_two_stage is True, otherwise is None.
            img_metas (list[dict]): Meta information of each image.
            rescale (bool, optional): If True, return boxes in original
                image space. Default False.

        Returns:
            list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
                The first item is an (n, 5) tensor, where the first 4 columns \
                are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
                5-th column is a score between 0 and 1. The second item is a \
                (n,) tensor where each item is the predicted class label of \
                the corresponding box.
        """
        all_cls_scores = preds_dicts['all_cls_scores']
        all_bbox_preds = preds_dicts['all_bbox_preds']
        enc_cls_scores = preds_dicts['enc_cls_scores']
        enc_bbox_preds = preds_dicts['enc_bbox_preds']

        cls_scores = all_cls_scores[-1]
        bbox_preds = all_bbox_preds[-1]

        result_list = []
        for img_id in range(len(img_metas)):
            cls_score = cls_scores[img_id]
            bbox_pred = bbox_preds[img_id]
            img_shape = img_metas[img_id]['img_shape']
            scale_factor = img_metas[img_id]['scale_factor']
            proposals = self._get_bboxes_single(cls_score, bbox_pred,
                                                img_shape, scale_factor,
                                                rescale)
            result_list.append(proposals)
        return result_list
    def _get_bboxes_single(self,
                           cls_score,
                           bbox_pred,
                           img_shape,
                           scale_factor,
                           rescale=False):
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_query, cls_out_channels].
            bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (cx, cy, w, h) and
                shape [num_query, 4].
            img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image arange
                as (w_scale, h_scale, w_scale, h_scale).
            rescale (bool, optional): If True, return boxes in original image
                space. Default False.

        Returns:
            tuple[Tensor]: Results of detected bboxes and labels.

                - det_bboxes: Predicted bboxes with shape [num_query, 5], \
                    where the first 4 columns are bounding box positions \
                    (tl_x, tl_y, br_x, br_y) and the 5-th column are scores \
                    between 0 and 1.
                - det_labels: Predicted labels of the corresponding box with \
                    shape [num_query].
        """
        assert len(cls_score) == len(bbox_pred)
        # exclude background
        if self.loss_cls.use_sigmoid:
            cls_score = cls_score.sigmoid()
        else:
            cls_score = F.softmax(cls_score, dim=-1)[..., :-1]
        scores, det_labels = cls_score.max(-1)
        det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
        det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
        det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
        det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
        det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
        if rescale:
            det_bboxes /= det_bboxes.new_tensor(scale_factor)
        det_bboxes = torch.cat((det_bboxes, det_labels.unsqueeze(1)), -1)

        pred = [
            det_bboxes.detach().cpu().numpy(),
            scores.detach().cpu().numpy()
        ]
        return pred
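
    # A minimal sketch of the strided slicing used above (values
    # hypothetical): the 0::2 slice selects the x coordinates (tl_x, br_x)
    # and 1::2 the y coordinates, so each axis is scaled independently.
    #
    #   boxes = torch.tensor([[0.4, 0.3, 0.6, 0.7]])  # normalized xyxy
    #   img_h, img_w = 50, 100
    #   boxes[:, 0::2] *= img_w                       # x columns -> 40., 60.
    #   boxes[:, 1::2] *= img_h                       # y columns -> 15., 35.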
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/heads/topology_head.py
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# topology_head.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import BaseModule
from mmdet.models import HEADS, build_loss
class MLP(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))

    def forward(self, x):
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x
@HEADS.register_module()
class TopologyHead(BaseModule):

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 loss_cls):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.mlp = MLP(self.in_channels, hidden_channels, out_channels,
                       num_layers)
        self.loss_cls = build_loss(loss_cls)
    def forward(self, all_a_preds_list, all_b_preds_list):
        # NOTE by default only the outputs from the last feature scale are used.
        all_a_preds_list = all_a_preds_list[-1]
        all_b_preds_list = all_b_preds_list[-1]

        num_out = all_a_preds_list.shape[0]
        assert num_out == all_a_preds_list.shape[0] == all_b_preds_list.shape[0]
        num_row, num_column = all_a_preds_list.shape[-2], \
            all_b_preds_list.shape[-2]
        assert self.in_channels == \
            all_a_preds_list.shape[-1] + all_b_preds_list.shape[-1], \
            f'self.in_channels = {self.in_channels} != ' \
            f'all_a_preds_list.shape[-1] {all_a_preds_list.shape[-1]} + ' \
            f'all_b_preds_list.shape[-1] {all_b_preds_list.shape[-1]}'

        outs = []
        for o in range(num_out):
            adj = torch.cat([
                all_a_preds_list[o].unsqueeze(2).repeat(1, 1, num_column, 1),
                all_b_preds_list[o].unsqueeze(1).repeat(1, num_row, 1, 1),
            ], dim=-1)
            outs.append(self.mlp(adj).sigmoid())
        return outs
    def loss(self, pred_adj_list, row_assign_results, column_assign_results,
             gt_adj):
        # NOTE by default only the outputs from the last decoder layer are used.
        pred_adj = pred_adj_list[-1]
        row_assign_result = row_assign_results[-1]
        column_assign_result = column_assign_results[-1]

        targets = []
        for b in range(pred_adj.shape[0]):
            target = pred_adj.new_zeros(pred_adj[b].shape[:-1])
            rs = row_assign_result['pos_inds'][b].unsqueeze(-1).repeat(
                1, column_assign_result['pos_inds'][b].shape[0])
            cs = column_assign_result['pos_inds'][b].unsqueeze(0).repeat(
                row_assign_result['pos_inds'][b].shape[0], 1)
            target[rs, cs] = gt_adj[b][
                row_assign_result['pos_assigned_gt_inds'][b]][:,
                column_assign_result['pos_assigned_gt_inds'][b]].float()
            targets.append(target)
        targets = 1 - torch.stack(targets, dim=0)  # 0 as positive

        loss_dict = dict()
        pred_adj = pred_adj.reshape(-1, self.out_channels)
        targets = targets.long().reshape(-1)
        loss_dict['loss_cls'] = self.loss_cls(pred_adj, targets)
        return loss_dict
    def get_topology(self, pred_adj_list):
        # NOTE by default only the outputs from the last decoder layer are used.
        pred_adj = pred_adj_list[-1].squeeze(-1)
        return pred_adj.cpu().numpy()
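
# A minimal usage sketch for TopologyHead (all config values hypothetical):
#
#   head = TopologyHead(in_channels=256, hidden_channels=256, out_channels=1,
#                       num_layers=3,
#                       loss_cls=dict(type='FocalLoss', use_sigmoid=True,
#                                     gamma=2.0, alpha=0.25))
#   a = torch.rand(3, 2, 4, 128)   # (num_dec, bs, num_row, C_a)
#   b = torch.rand(3, 2, 6, 128)   # (num_dec, bs, num_column, C_b)
#   outs = head([a], [b])          # list of 3 tensors, each (2, 4, 6, 1)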
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/__init__.py
from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D
from .temporal_self_attention import TemporalSelfAttention
from .encoder import BEVFormerEncoder, BEVFormerLayer
from .decoder import LaneDetectionTransformerDecoder
from .bevformer_constructer import BEVFormerConstructer
from .transformer import PerceptionTransformer
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/bevformer_constructer.py
import numpy as np
import torch
import torch.nn as nn
from torch.nn.init import normal_
from torchvision.transforms.functional import rotate

from mmcv.cnn import xavier_init
from mmcv.cnn.bricks.transformer import (build_transformer_layer_sequence,
                                         build_positional_encoding)
from mmcv.runner.base_module import BaseModule
from mmcv.runner import force_fp32, auto_fp16

from mmdet.models.utils.builder import TRANSFORMER
from mmdet3d.models import NECKS

from .temporal_self_attention import TemporalSelfAttention
from .spatial_cross_attention import MSDeformableAttention3D
from .decoder import CustomMSDeformableAttention
@NECKS.register_module()
class BEVFormerConstructer(BaseModule):
    """Implements the BEVFormer BEV Constructer.

    Args:
        as_two_stage (bool): Generate query from encoder features.
            Default: False.
        num_feature_levels (int): Number of feature maps from FPN.
            Default: 4.
        two_stage_num_proposals (int): Number of proposals when set
            `as_two_stage` as True. Default: 300.
    """

    def __init__(self,
                 num_feature_levels=4,
                 num_cams=6,
                 embed_dims=256,
                 rotate_prev_bev=True,
                 use_shift=True,
                 use_can_bus=True,
                 can_bus_norm=True,
                 use_cams_embeds=True,
                 pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                 bev_h=200,
                 bev_w=200,
                 rotate_center=[100, 100],
                 encoder=None,
                 positional_encoding=None,
                 **kwargs):
        super(BEVFormerConstructer, self).__init__(**kwargs)
        self.embed_dims = embed_dims
        self.num_feature_levels = num_feature_levels
        self.num_cams = num_cams
        self.fp16_enabled = False

        self.rotate_prev_bev = rotate_prev_bev
        self.use_shift = use_shift
        self.use_can_bus = use_can_bus
        self.can_bus_norm = can_bus_norm
        self.use_cams_embeds = use_cams_embeds

        self.encoder = build_transformer_layer_sequence(encoder)
        self.positional_encoding = build_positional_encoding(
            positional_encoding)
        self.pc_range = pc_range
        self.real_w = self.pc_range[3] - self.pc_range[0]
        self.real_h = self.pc_range[4] - self.pc_range[1]
        self.bev_h = bev_h
        self.bev_w = bev_w
        self.rotate_center = rotate_center

        self.init_layers()
    def init_layers(self):
        self.bev_embedding = nn.Embedding(self.bev_h * self.bev_w,
                                          self.embed_dims)
        self.level_embeds = nn.Parameter(
            torch.Tensor(self.num_feature_levels, self.embed_dims))
        self.cams_embeds = nn.Parameter(
            torch.Tensor(self.num_cams, self.embed_dims))
        self.can_bus_mlp = nn.Sequential(
            nn.Linear(18, self.embed_dims // 2),
            nn.ReLU(inplace=True),
            nn.Linear(self.embed_dims // 2, self.embed_dims),
            nn.ReLU(inplace=True),
        )
        if self.can_bus_norm:
            self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims))
    def init_weights(self):
        """Initialize the transformer weights."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if isinstance(m, MSDeformableAttention3D) or \
                    isinstance(m, TemporalSelfAttention) or \
                    isinstance(m, CustomMSDeformableAttention):
                try:
                    m.init_weight()
                except AttributeError:
                    m.init_weights()
        normal_(self.level_embeds)
        normal_(self.cams_embeds)
        xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.)
    # @auto_fp16(apply_to=('mlvl_feats', 'prev_bev'))
    def forward(self, mlvl_feats, img_metas, prev_bev=None, **kwargs):
        """Obtain BEV features."""
        bs, num_cam, _, _, _ = mlvl_feats[0].shape
        dtype = mlvl_feats[0].dtype

        bev_queries = self.bev_embedding.weight.to(dtype)
        bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1)
        bev_mask = torch.zeros((bs, self.bev_h, self.bev_w),
                               device=bev_queries.device).to(dtype)
        bev_pos = self.positional_encoding(bev_mask).to(dtype)
        bev_pos = bev_pos.flatten(2).permute(2, 0, 1)

        # obtain rotation angle and shift with ego motion
        delta_x = np.array([each['can_bus'][0] for each in img_metas])
        delta_y = np.array([each['can_bus'][1] for each in img_metas])
        ego_angle = np.array(
            [each['can_bus'][-2] / np.pi * 180 for each in img_metas])
        grid_length_y = self.real_h / self.bev_h
        grid_length_x = self.real_w / self.bev_w
        translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2)
        translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180
        bev_angle = ego_angle - translation_angle
        shift_y = translation_length * \
            np.cos(bev_angle / 180 * np.pi) / grid_length_y / self.bev_h
        shift_x = translation_length * \
            np.sin(bev_angle / 180 * np.pi) / grid_length_x / self.bev_w
        shift_y = shift_y * self.use_shift
        shift_x = shift_x * self.use_shift
        shift = bev_queries.new_tensor(
            [shift_x, shift_y]).permute(1, 0)  # xy, bs -> bs, xy

        if prev_bev is not None:
            if prev_bev.shape[1] == self.bev_h * self.bev_w:
                prev_bev = prev_bev.permute(1, 0, 2)
            if self.rotate_prev_bev:
                for i in range(bs):
                    # num_prev_bev = prev_bev.size(1)
                    rotation_angle = img_metas[i]['can_bus'][-1]
                    tmp_prev_bev = prev_bev[:, i].reshape(
                        self.bev_h, self.bev_w, -1).permute(2, 0, 1)
                    tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle,
                                          center=self.rotate_center)
                    tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape(
                        self.bev_h * self.bev_w, 1, -1)
                    prev_bev[:, i] = tmp_prev_bev[:, 0]

        # add can bus signals
        can_bus = bev_queries.new_tensor(
            [each['can_bus'] for each in img_metas])  # [:, :]
        can_bus = self.can_bus_mlp(can_bus)[None, :, :]
        bev_queries = bev_queries + can_bus * self.use_can_bus

        feat_flatten = []
        spatial_shapes = []
        for lvl, feat in enumerate(mlvl_feats):
            bs, num_cam, c, h, w = feat.shape
            spatial_shape = (h, w)
            feat = feat.flatten(3).permute(1, 0, 3, 2)
            if self.use_cams_embeds:
                feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype)
            feat = feat + self.level_embeds[None, None,
                                            lvl:lvl + 1, :].to(feat.dtype)
            spatial_shapes.append(spatial_shape)
            feat_flatten.append(feat)

        feat_flatten = torch.cat(feat_flatten, 2)
        spatial_shapes = torch.as_tensor(
            spatial_shapes, dtype=torch.long, device=bev_pos.device)
        level_start_index = torch.cat((spatial_shapes.new_zeros(
            (1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))

        feat_flatten = feat_flatten.permute(
            0, 2, 1, 3)  # (num_cam, H*W, bs, embed_dims)

        bev_embed = self.encoder(
            bev_queries,
            feat_flatten,
            feat_flatten,
            bev_h=self.bev_h,
            bev_w=self.bev_w,
            bev_pos=bev_pos,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            prev_bev=prev_bev,
            shift=shift,
            img_metas=img_metas,
            **kwargs)

        return bev_embed
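
    # A minimal sketch of the level_start_index computation above with
    # hypothetical feature-map sizes: two levels of (4, 6) and (2, 3) flatten
    # to 24 and 6 tokens, so the levels start at offsets 0 and 24.
    #
    #   spatial_shapes = torch.tensor([[4, 6], [2, 3]])
    #   torch.cat((spatial_shapes.new_zeros((1,)),
    #              spatial_shapes.prod(1).cumsum(0)[:-1]))  # -> tensor([0, 24])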
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/custom_base_transformer_layer.py
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import copy
import warnings

import torch
import torch.nn as nn

from mmcv import ConfigDict, deprecated_api_warning
from mmcv.cnn import Linear, build_activation_layer, build_norm_layer
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK,
                                      POSITIONAL_ENCODING, TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)

# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
try:
    from mmcv.ops.multi_scale_deform_attn import \
        MultiScaleDeformableAttention  # noqa F401
    warnings.warn(
        ImportWarning(
            '``MultiScaleDeformableAttention`` has been moved to '
            '``mmcv.ops.multi_scale_deform_attn``, please change original path '  # noqa E501
            '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` '  # noqa E501
            'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` '  # noqa E501
        ))
except ImportError:
    warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
                  '``mmcv.ops.multi_scale_deform_attn``, '
                  'You should install ``mmcv-full`` if you need this module. ')

from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention
@TRANSFORMER_LAYER.register_module()
class MyCustomBaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and supports more flexible
    customization, for example, using any number of `FFN or LN` and
    using different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specify `norm` as the first element of `operation_order`.
    More details about the `prenorm`: `On Layer Normalization in the
    Transformer Architecture <https://arxiv.org/abs/2002.04745>`_ .

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
            Configs for `self_attention` or `cross_attention` modules.
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None):
            Configs for FFN. The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Supports `prenorm` when you specify the first element as `norm`.
            Default: None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
    """
    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=True,
                 **kwargs):

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ')
                ffn_cfgs[new_name] = kwargs[ori_name]

        super(MyCustomBaseTransformerLayer, self).__init__(init_cfg)

        self.batch_first = batch_first

        assert set(operation_order) & set(
            ['self_attn', 'norm', 'ffn', 'cross_attn']) == \
            set(operation_order), f'The operation_order of' \
            f' {self.__class__.__name__} should ' \
            f'contain all four operation types ' \
            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), f'The length ' \
                f'of attn_cfg {num_attn} is ' \
                f'not consistent with the number of attention ' \
                f'in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                # index the per-FFN config, not the list itself
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index]))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `TransformerDecoderLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
        """
        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                f'attn_masks {len(attn_masks)} must be equal ' \
                f'to the number of attention in ' \
                f'operation_order {self.num_attn}'

        for layer in self.operation_order:
            if layer == 'self_attn':
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query
            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1
            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query
            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query
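
# A minimal config sketch for the layer above (all values hypothetical, using
# attention/FFN types that ship with mmcv):
#
#   layer = MyCustomBaseTransformerLayer(
#       attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
#       ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
#       operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
#                        'ffn', 'norm'))
#   # builds 2 attentions, 1 FFN and 3 norms, executed in the given order;
#   # putting 'norm' first instead would switch the layer to prenorm.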
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/decoder.py
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from cmath import pi
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch

import mmcv
import cv2 as cv
import copy
import warnings
from matplotlib import pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import (BaseTransformerLayer,
                                         TransformerLayerSequence)
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
                        to_2tuple)
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import \
    MultiScaleDeformableAttnFunction_fp32, MultiScaleDeformableAttnFunction_fp16

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
def inverse_sigmoid(x, eps=1e-5):
    """Inverse function of sigmoid.

    Args:
        x (Tensor): The tensor to do the
            inverse.
        eps (float): EPS avoid numerical
            overflow. Defaults 1e-5.

    Returns:
        Tensor: The x has passed the inverse
            function of sigmoid, has same
            shape with input.
    """
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)
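
# A minimal numeric check of inverse_sigmoid (values hypothetical): it is the
# logit function with clamping, so round-tripping through sigmoid recovers x
# away from the clamped boundaries.
#
#   x = torch.tensor([0.1, 0.5, 0.9])
#   torch.sigmoid(inverse_sigmoid(x))     # -> tensor([0.1000, 0.5000, 0.9000])
#   inverse_sigmoid(torch.tensor([0.5]))  # -> tensor([0.]), since log(0.5/0.5) = 0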
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class LaneDetectionTransformerDecoder(TransformerLayerSequence):

    def __init__(self, *args, return_intermediate=False, **kwargs):
        super(LaneDetectionTransformerDecoder, self).__init__(*args, **kwargs)
        self.return_intermediate = return_intermediate
        self.fp16_enabled = False

    def forward(self,
                query,
                *args,
                reference_points=None,
                reg_branches=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `Detr3DTransformerDecoder`.

        Args:
            query (Tensor): Input query with shape
                `(num_query, bs, embed_dims)`.
            reference_points (Tensor): The reference
                points of offset, has shape
                (bs, num_query, 4) when as_two_stage,
                otherwise has shape (bs, num_query, 2).
            reg_branches (obj:`nn.ModuleList`): Used for
                refining the regression results. Only would
                be passed when with_box_refine is True,
                otherwise would be passed a `None`.

        Returns:
            Tensor: Results with shape [1, num_query, bs, embed_dims] when
                return_intermediate is `False`, otherwise it has shape
                [num_layers, num_query, bs, embed_dims].
        """
        output = query
        intermediate = []
        intermediate_reference_points = []
        for lid, layer in enumerate(self.layers):
            reference_points_input = reference_points[..., :2].unsqueeze(
                2)  # BS NUM_QUERY NUM_LEVEL 2
            output = layer(
                output,
                *args,
                reference_points=reference_points_input,
                key_padding_mask=key_padding_mask,
                **kwargs)
            output = output.permute(1, 0, 2)

            if reg_branches is not None:
                tmp = reg_branches[lid](output)
                assert reference_points.shape[-1] == 3

                new_reference_points = torch.zeros_like(reference_points)
                ref_center = (tmp[..., :3] + tmp[..., -3:]) / 2
                new_reference_points = ref_center + inverse_sigmoid(
                    reference_points)
                new_reference_points = new_reference_points.sigmoid()
                reference_points = new_reference_points.detach()

            output = output.permute(1, 0, 2)
            if self.return_intermediate:
                intermediate.append(output)
                intermediate_reference_points.append(reference_points)

        if self.return_intermediate:
            return torch.stack(intermediate), torch.stack(
                intermediate_reference_points)

        return output, reference_points
@TRANSFORMER_LAYER.register_module()
class CustomDetrTransformerDecoderLayer(BaseTransformerLayer):
    """Implements a decoder layer in the DETR transformer.

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict):
            Configs for self_attention or cross_attention; the order
            should be consistent with `operation_order`. If it is
            a dict, it is expanded to the number of attention modules in
            `operation_order`.
        ffn_cfgs (`mmcv.ConfigDict` | dict): Configs for the FFNs.
        operation_order (tuple[str]): The execution order of operations
            in the transformer, such as
            ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm').
            Default: None.
        norm_cfg (dict): Config dict for the normalization layer.
            Default: `LN`.
    """

    def __init__(self,
                 attn_cfgs,
                 ffn_cfgs,
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 **kwargs):
        super(CustomDetrTransformerDecoderLayer, self).__init__(
            attn_cfgs=attn_cfgs,
            ffn_cfgs=ffn_cfgs,
            operation_order=operation_order,
            norm_cfg=norm_cfg,
            **kwargs)
        assert len(operation_order) == 6
        assert set(operation_order) == set(
            ['self_attn', 'norm', 'cross_attn', 'ffn'])
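
# Hypothetical registry usage (not part of the original file): how a decoder
# built from the two classes above might be configured. All dimensions and
# layer counts are placeholders, not this repo's actual settings.
def _demo_build_lane_decoder():
    from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
    return build_transformer_layer_sequence(
        dict(
            type='LaneDetectionTransformerDecoder',
            num_layers=6,
            return_intermediate=True,
            transformerlayers=dict(
                type='CustomDetrTransformerDecoderLayer',
                attn_cfgs=[
                    dict(type='MultiheadAttention', embed_dims=256,
                         num_heads=8, dropout=0.1),
                    dict(type='CustomMSDeformableAttention', embed_dims=256,
                         num_levels=1),
                ],
                ffn_cfgs=dict(type='FFN', embed_dims=256,
                              feedforward_channels=512),
                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                 'ffn', 'norm'))))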
@ATTENTION.register_module()
class CustomMSDeformableAttention(BaseModule):
    """An attention module used in Deformable-DETR.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of the attention.
            Default: 256.
        num_heads (int): Number of parallel attention heads. Default: 8.
        num_levels (int): The number of feature maps used in the
            attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): If True, key, query and value have shape
            (batch, n, embed_dim); otherwise (n, batch, embed_dim).
            Default: False.
        norm_cfg (dict): Config dict for the normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=False,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for parameters of the module."""
        constant_init(self.sampling_offsets, 0.)
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True
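
    # Illustrative note (not part of the original file): with num_heads=8 the
    # thetas above are k * pi / 4 for k = 0..7, so after normalization the
    # initial bias for head 0, level 0, point 0 is the unit vector (1, 0),
    # and point i is scaled by (i + 1). For example:
    #
    #     attn = CustomMSDeformableAttention(embed_dims=256, num_heads=8)
    #     bias = attn.sampling_offsets.bias.data.view(8, 4, 4, 2)
    #     assert torch.allclose(bias[0, 0, 0], torch.tensor([1., 0.]))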
    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiScaleDeformableAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                flag='decoder',
                **kwargs):
        """Forward function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of the transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_key, bs, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): ByteTensor masking `value`, with
                shape [bs, num_key].
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_levels, 2);
                all elements are in the range [0, 1], top-left (0, 0),
                bottom-right (1, 1), including the padding area.
                Alternatively (N, Length_{query}, num_levels, 4), where
                the two additional dimensions (w, h) form reference boxes.
            spatial_shapes (Tensor): Spatial shapes of features at
                different levels. With shape (num_levels, 2),
                the last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor with shape ``(num_levels, )`` which can be
                represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape
                [num_query, bs, embed_dims].
        """

        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)

        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)
        if reference_points.shape[-1] == 2:
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
        if torch.cuda.is_available() and value.is_cuda:
            # fp16 deformable attention is unstable because it performs many
            # sum operations, so the fp32 kernel is used for both dtypes.
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = \
                    MultiScaleDeformableAttnFunction_fp32
            output = MultiScaleDeformableAttnFunction.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        output = self.output_proj(output)

        if not self.batch_first:
            # (num_query, bs, embed_dims)
            output = output.permute(1, 0, 2)

        return self.dropout(output) + identity
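
# Hypothetical CPU smoke test (not part of the original file): drive the
# module with random two-level features; without CUDA tensors the
# pure-PyTorch fallback `multi_scale_deformable_attn_pytorch` is exercised.
# All sizes are placeholders.
def _demo_ms_deformable_attention():
    attn = CustomMSDeformableAttention(
        embed_dims=256, num_heads=8, num_levels=2, num_points=4)
    bs, num_query = 2, 100
    spatial_shapes = torch.tensor([[32, 32], [16, 16]])
    num_value = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())
    query = torch.randn(num_query, bs, 256)
    value = torch.randn(num_value, bs, 256)
    reference_points = torch.rand(bs, num_query, 2, 2)  # (bs, nq, levels, 2)
    level_start_index = torch.tensor([0, 32 * 32])
    out = attn(query, value=value, reference_points=reference_points,
               spatial_shapes=spatial_shapes,
               level_start_index=level_start_index)
    assert out.shape == (num_query, bs, 256)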
autonomous_driving/openlane-v2/plugin/mmdet3d/baseline/models/modules/encoder.py
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------

from .custom_base_transformer_layer import MyCustomBaseTransformerLayer
import copy
import warnings
from mmcv.cnn.bricks.registry import (ATTENTION,
                                      TRANSFORMER_LAYER,
                                      TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import TransformerLayerSequence
from mmcv.runner import force_fp32, auto_fp16
import numpy as np
import torch
import cv2 as cv
import mmcv
from mmcv.utils import TORCH_VERSION, digit_version
from mmcv.utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class BEVFormerEncoder(TransformerLayerSequence):
    """BEVFormer encoder with both self- and cross-attention.

    Args:
        return_intermediate (bool): Whether to return intermediate outputs.
        coder_norm_cfg (dict): Config of the last normalization layer.
            Default: `LN`.
    """

    def __init__(self,
                 *args,
                 pc_range=None,
                 num_points_in_pillar=4,
                 return_intermediate=False,
                 dataset_type='nuscenes',
                 **kwargs):
        super(BEVFormerEncoder, self).__init__(*args, **kwargs)
        self.return_intermediate = return_intermediate
        self.num_points_in_pillar = num_points_in_pillar
        self.pc_range = pc_range
        self.fp16_enabled = False

    @staticmethod
    def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d',
                             bs=1, device='cuda', dtype=torch.float):
        """Get the reference points used in SCA and TSA.

        Args:
            H, W: spatial shape of the BEV plane.
            Z: height of a pillar.
            num_points_in_pillar: number of points sampled uniformly
                from each pillar.
            device (obj:`device`): The device where
                reference_points should be.

        Returns:
            Tensor: reference points; shape (bs, num_points_in_pillar,
                H*W, 3) for dim='3d' and (bs, H*W, 1, 2) for dim='2d'.
        """
        # reference points in 3D space, used in spatial cross-attention (SCA)
        if dim == '3d':
            zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar,
                                dtype=dtype, device=device).view(
                                    -1, 1, 1).expand(
                                        num_points_in_pillar, H, W) / Z
            xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype,
                                device=device).view(1, 1, W).expand(
                                    num_points_in_pillar, H, W) / W
            ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype,
                                device=device).view(1, H, 1).expand(
                                    num_points_in_pillar, H, W) / H
            ref_3d = torch.stack((xs, ys, zs), -1)
            ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1)
            ref_3d = ref_3d[None].repeat(bs, 1, 1, 1)
            return ref_3d

        # reference points on the 2D BEV plane, used in temporal
        # self-attention (TSA).
        elif dim == '2d':
            ref_y, ref_x = torch.meshgrid(
                torch.linspace(
                    0.5, H - 0.5, H, dtype=dtype, device=device),
                torch.linspace(
                    0.5, W - 0.5, W, dtype=dtype, device=device))
            ref_y = ref_y.reshape(-1)[None] / H
            ref_x = ref_x.reshape(-1)[None] / W
            ref_2d = torch.stack((ref_x, ref_y), -1)
            ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2)
            return ref_2d
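
    # Illustrative shape check (not part of the original file), on CPU:
    #
    #     ref_3d = BEVFormerEncoder.get_reference_points(
    #         H=50, W=50, Z=8, num_points_in_pillar=4, dim='3d',
    #         bs=1, device='cpu')
    #     ref_2d = BEVFormerEncoder.get_reference_points(
    #         H=50, W=50, dim='2d', bs=1, device='cpu')
    #     assert ref_3d.shape == (1, 4, 50 * 50, 3)  # (bs, D, H*W, xyz)
    #     assert ref_2d.shape == (1, 50 * 50, 1, 2)  # (bs, H*W, 1, xy)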
    # This function must use fp32!!!
    @force_fp32(apply_to=('reference_points', 'img_metas'))
    def point_sampling(self, reference_points, pc_range, img_metas):
        lidar2img = []
        for img_meta in img_metas:
            lidar2img.append(img_meta['lidar2img'])
        lidar2img = np.asarray(lidar2img)
        lidar2img = reference_points.new_tensor(lidar2img)  # (B, N, 4, 4)
        reference_points = reference_points.clone()

        reference_points[..., 0:1] = reference_points[..., 0:1] * \
            (pc_range[3] - pc_range[0]) + pc_range[0]
        reference_points[..., 1:2] = reference_points[..., 1:2] * \
            (pc_range[4] - pc_range[1]) + pc_range[1]
        reference_points[..., 2:3] = reference_points[..., 2:3] * \
            (pc_range[5] - pc_range[2]) + pc_range[2]

        reference_points = torch.cat(
            (reference_points, torch.ones_like(reference_points[..., :1])),
            -1)

        reference_points = reference_points.permute(1, 0, 2, 3)
        D, B, num_query = reference_points.size()[:3]
        num_cam = lidar2img.size(1)

        reference_points = reference_points.view(
            D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1)

        lidar2img = lidar2img.view(
            1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1)

        reference_points_cam = torch.matmul(
            lidar2img.to(torch.float32),
            reference_points.to(torch.float32)).squeeze(-1)
        eps = 1e-5

        bev_mask = (reference_points_cam[..., 2:3] > eps)
        reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum(
            reference_points_cam[..., 2:3],
            torch.ones_like(reference_points_cam[..., 2:3]) * eps)

        reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1]
        reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0]

        bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0)
                    & (reference_points_cam[..., 1:2] < 1.0)
                    & (reference_points_cam[..., 0:1] < 1.0)
                    & (reference_points_cam[..., 0:1] > 0.0))
        if digit_version(TORCH_VERSION) >= digit_version('1.8'):
            bev_mask = torch.nan_to_num(bev_mask)
        else:
            bev_mask = bev_mask.new_tensor(
                np.nan_to_num(bev_mask.cpu().numpy()))

        reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4)
        bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1)

        return reference_points_cam, bev_mask
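
    # Worked example (not part of the original file): with P = lidar2img (a
    # 4x4 matrix combining camera intrinsics and extrinsics), a lidar-frame
    # point X = (x, y, z, 1) maps to P @ X = (u', v', d, w). The code above
    # keeps only points with depth d > eps, divides (u', v') by max(d, eps)
    # to get pixels, then normalizes by image width and height, so valid
    # projections fall in (0, 1) x (0, 1), which is exactly what bev_mask
    # tests. Sketch, assuming a single camera's matrix from img_metas:
    #
    #     P = reference_points.new_tensor(img_metas[0]['lidar2img'][0])
    #     X = reference_points.new_tensor([2.0, 0.5, 1.0, 1.0])
    #     u, v, d, _ = (P @ X).tolist()
    #     pixel = (u / max(d, 1e-5), v / max(d, 1e-5))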
    @auto_fp16()
    def forward(self,
                bev_query,
                key,
                value,
                *args,
                bev_h=None,
                bev_w=None,
                bev_pos=None,
                spatial_shapes=None,
                level_start_index=None,
                valid_ratios=None,
                prev_bev=None,
                shift=0.,
                **kwargs):
        """Forward function for `BEVFormerEncoder`.

        Args:
            bev_query (Tensor): Input BEV query with shape
                `(num_query, bs, embed_dims)`.
            key & value (Tensor): Input multi-camera features with shape
                (num_cam, num_value, bs, embed_dims).
            bev_pos (Tensor): Positional encoding of the BEV query, with
                the same shape as `bev_query`.
            prev_bev (Tensor): BEV features of the previous frame, used by
                temporal self-attention. Default: None.
            shift (Tensor): Per-sample (x, y) shift aligning the previous
                BEV grid with the current one.
            valid_ratios (Tensor): The ratios of valid
                points on the feature map, with shape
                (bs, num_levels, 2).

        Returns:
            Tensor: Results with shape [1, num_query, bs, embed_dims] when
                return_intermediate is `False`, otherwise it has shape
                [num_layers, num_query, bs, embed_dims].
        """

        output = bev_query
        intermediate = []

        ref_3d = self.get_reference_points(
            bev_h, bev_w, self.pc_range[5] - self.pc_range[2],
            self.num_points_in_pillar, dim='3d', bs=bev_query.size(1),
            device=bev_query.device, dtype=bev_query.dtype)
        ref_2d = self.get_reference_points(
            bev_h, bev_w, dim='2d', bs=bev_query.size(1),
            device=bev_query.device, dtype=bev_query.dtype)

        reference_points_cam, bev_mask = self.point_sampling(
            ref_3d, self.pc_range, kwargs['img_metas'])

        # Note: the original BEVFormer implementation omitted `.clone()` here
        # (a known bug kept there to reproduce the paper results); this
        # version applies the fix.
        shift_ref_2d = ref_2d.clone()
        shift_ref_2d += shift[:, None, None, :]

        # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims)
        bev_query = bev_query.permute(1, 0, 2)
        bev_pos = bev_pos.permute(1, 0, 2)
        bs, len_bev, num_bev_level, _ = ref_2d.shape
        if prev_bev is not None:
            prev_bev = prev_bev.permute(1, 0, 2)
            prev_bev = torch.stack(
                [prev_bev, bev_query], 1).reshape(bs * 2, len_bev, -1)
            hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape(
                bs * 2, len_bev, num_bev_level, 2)
        else:
            hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape(
                bs * 2, len_bev, num_bev_level, 2)

        for lid, layer in enumerate(self.layers):
            output = layer(
                bev_query,
                key,
                value,
                *args,
                bev_pos=bev_pos,
                ref_2d=hybird_ref_2d,
                ref_3d=ref_3d,
                bev_h=bev_h,
                bev_w=bev_w,
                spatial_shapes=spatial_shapes,
                level_start_index=level_start_index,
                reference_points_cam=reference_points_cam,
                bev_mask=bev_mask,
                prev_bev=prev_bev,
                **kwargs)

            bev_query = output
            if self.return_intermediate:
                intermediate.append(output)

        if self.return_intermediate:
            return torch.stack(intermediate)

        return output
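
# Hypothetical config sketch (not part of the original file): how this
# encoder is typically paired with BEVFormerLayer below. The attention type
# names and all numbers are placeholders borrowed from BEVFormer-style
# configs, not necessarily this repo's actual settings.
_DEMO_BEV_ENCODER_CFG = dict(
    type='BEVFormerEncoder',
    num_layers=3,
    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    num_points_in_pillar=4,
    return_intermediate=False,
    transformerlayers=dict(
        type='BEVFormerLayer',
        attn_cfgs=[
            dict(type='TemporalSelfAttention', embed_dims=256, num_levels=1),
            dict(
                type='SpatialCrossAttention',
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                deformable_attention=dict(
                    type='MSDeformableAttention3D', embed_dims=256,
                    num_points=8, num_levels=4),
                embed_dims=256),
        ],
        ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=512),
        operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                         'ffn', 'norm')))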
@TRANSFORMER_LAYER.register_module()
class BEVFormerLayer(MyCustomBaseTransformerLayer):
    """Implements an encoder layer in the BEVFormer transformer.

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict):
            Configs for self_attention or cross_attention; the order
            should be consistent with `operation_order`. If it is
            a dict, it is expanded to the number of attention modules in
            `operation_order`.
        ffn_cfgs (`mmcv.ConfigDict` | dict): Configs for the FFNs.
        operation_order (tuple[str]): The execution order of operations
            in the transformer, such as
            ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm').
            Default: None.
        act_cfg (dict): The activation config for FFNs.
            Default: dict(type='ReLU', inplace=True).
        norm_cfg (dict): Config dict for the normalization layer.
            Default: `LN`.
    """

    def __init__(self,
                 attn_cfgs,
                 ffn_cfgs,
                 operation_order=None,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'),
                 **kwargs):
        super(BEVFormerLayer, self).__init__(
            attn_cfgs=attn_cfgs,
            ffn_cfgs=ffn_cfgs,
            operation_order=operation_order,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            **kwargs)
        self.fp16_enabled = False
        assert len(operation_order) == 6
        assert set(operation_order) == set(
            ['self_attn', 'norm', 'cross_attn', 'ffn'])
    def forward(self,
                query,
                key=None,
                value=None,
                bev_pos=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                ref_2d=None,
                ref_3d=None,
                bev_h=None,
                bev_w=None,
                reference_points_cam=None,
                mask=None,
                spatial_shapes=None,
                level_start_index=None,
                prev_bev=None,
                **kwargs):
        """Forward function for `BEVFormerLayer`.

        **kwargs contains the specific arguments of the attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries, embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims].
            value (Tensor): The value tensor with the same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | None): 2D Tensors used in the
                calculation of the corresponding attention. The length
                should equal the number of `attention` entries in
                `operation_order`. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in the `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `key`, with
                shape [bs, num_keys]. Default: None.

        Returns:
            Tensor: forwarded results with shape
                [num_queries, bs, embed_dims].
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                f'attn_masks {len(attn_masks)} must be equal ' \
                f'to the number of attention in ' \
                f'operation_order {self.num_attn}'

        for layer in self.operation_order:
            # temporal self-attention
            if layer == 'self_attn':
                query = self.attentions[attn_index](
                    query,
                    prev_bev,
                    prev_bev,
                    identity if self.pre_norm else None,
                    query_pos=bev_pos,
                    key_pos=bev_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    reference_points=ref_2d,
                    spatial_shapes=torch.tensor(
                        [[bev_h, bev_w]], device=query.device),
                    level_start_index=torch.tensor([0], device=query.device),
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            # spatial cross-attention
            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    reference_points=ref_3d,
                    reference_points_cam=reference_points_cam,
                    mask=mask,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    spatial_shapes=spatial_shapes,
                    level_start_index=level_start_index,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query
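
# Minimal sketch (not part of the original file) of the attn_masks handling
# above: a single Tensor mask is deep-copied once per attention module, so
# both the self- and cross-attention in this layer get their own copy.
def _demo_attn_masks_expansion(num_attn=2):
    attn_masks = torch.zeros(5, 5, dtype=torch.bool)
    masks = [copy.deepcopy(attn_masks) for _ in range(num_attn)]
    assert len(masks) == num_attn
    return masks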