ModelZoo / InstructBLIP_pytorch · Commits

Commit c04f261a, authored Aug 22, 2024 by dongchy920

    InstruceBLIP

Pipeline #1594: canceled with stages · Changes: 421 · Pipelines: 1
Showing 20 changed files with 3438 additions and 0 deletions (+3438, −0); the commit touches 421 files in total, and GitLab truncates the remaining diffs for performance.
lavis/common/annotator/uniformer/mmcv/ops/deprecated_wrappers.py (+43, −0)
lavis/common/annotator/uniformer/mmcv/ops/focal_loss.py (+212, −0)
lavis/common/annotator/uniformer/mmcv/ops/furthest_point_sample.py (+83, −0)
lavis/common/annotator/uniformer/mmcv/ops/fused_bias_leakyrelu.py (+268, −0)
lavis/common/annotator/uniformer/mmcv/ops/gather_points.py (+57, −0)
lavis/common/annotator/uniformer/mmcv/ops/group_points.py (+224, −0)
lavis/common/annotator/uniformer/mmcv/ops/info.py (+36, −0)
lavis/common/annotator/uniformer/mmcv/ops/iou3d.py (+85, −0)
lavis/common/annotator/uniformer/mmcv/ops/knn.py (+77, −0)
lavis/common/annotator/uniformer/mmcv/ops/masked_conv.py (+111, −0)
lavis/common/annotator/uniformer/mmcv/ops/merge_cells.py (+149, −0)
lavis/common/annotator/uniformer/mmcv/ops/modulated_deform_conv.py (+282, −0)
lavis/common/annotator/uniformer/mmcv/ops/multi_scale_deform_attn.py (+358, −0)
lavis/common/annotator/uniformer/mmcv/ops/nms.py (+417, −0)
lavis/common/annotator/uniformer/mmcv/ops/pixel_group.py (+75, −0)
lavis/common/annotator/uniformer/mmcv/ops/point_sample.py (+336, −0)
lavis/common/annotator/uniformer/mmcv/ops/points_in_boxes.py (+133, −0)
lavis/common/annotator/uniformer/mmcv/ops/points_sampler.py (+177, −0)
lavis/common/annotator/uniformer/mmcv/ops/psa_mask.py (+92, −0)
lavis/common/annotator/uniformer/mmcv/ops/roi_align.py (+223, −0)
lavis/common/annotator/uniformer/mmcv/ops/deprecated_wrappers.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
# This file is for backward compatibility.
# Module wrappers for empty tensor have been moved to mmcv.cnn.bricks.
import warnings

from ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d


class Conv2d_deprecated(Conv2d):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.warn(
            'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead')


class ConvTranspose2d_deprecated(ConvTranspose2d):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.warn(
            'Importing ConvTranspose2d wrapper from "mmcv.ops" will be '
            'deprecated in the future. Please import them from "mmcv.cnn" '
            'instead')


class MaxPool2d_deprecated(MaxPool2d):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.warn(
            'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead')


class Linear_deprecated(Linear):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.warn(
            'Importing Linear wrapper from "mmcv.ops" will be deprecated in'
            ' the future. Please import them from "mmcv.cnn" instead')
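A usage sketch, illustrative rather than part of the diff: instantiating one of these wrappers behaves exactly like the `mmcv.cnn` layer it subclasses, but emits a deprecation warning. The import path assumes the vendored package is importable under the `lavis.common.annotator.uniformer` prefix these files use.

import warnings

from lavis.common.annotator.uniformer.mmcv.ops.deprecated_wrappers import \
    Conv2d_deprecated

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # Constructs a normal Conv2d wrapper; the warning fires in __init__.
    conv = Conv2d_deprecated(3, 8, kernel_size=3)
print(caught[0].message)  # points callers at "mmcv.cnn" instead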
lavis/common/annotator/uniformer/mmcv/ops/focal_loss.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward',
    'softmax_focal_loss_forward', 'softmax_focal_loss_backward'
])


class SigmoidFocalLossFunction(Function):

    @staticmethod
    def symbolic(g, input, target, gamma, alpha, weight, reduction):
        return g.op(
            'mmcv::MMCVSigmoidFocalLoss',
            input,
            target,
            gamma_f=gamma,
            alpha_f=alpha,
            weight_f=weight,
            reduction_s=reduction)

    @staticmethod
    def forward(ctx,
                input,
                target,
                gamma=2.0,
                alpha=0.25,
                weight=None,
                reduction='mean'):

        assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
        assert input.dim() == 2
        assert target.dim() == 1
        assert input.size(0) == target.size(0)
        if weight is None:
            weight = input.new_empty(0)
        else:
            assert weight.dim() == 1
            assert input.size(1) == weight.size(0)
        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
        assert reduction in ctx.reduction_dict.keys()

        ctx.gamma = float(gamma)
        ctx.alpha = float(alpha)
        ctx.reduction = ctx.reduction_dict[reduction]

        output = input.new_zeros(input.size())

        ext_module.sigmoid_focal_loss_forward(
            input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha)
        if ctx.reduction == ctx.reduction_dict['mean']:
            output = output.sum() / input.size(0)
        elif ctx.reduction == ctx.reduction_dict['sum']:
            output = output.sum()
        ctx.save_for_backward(input, target, weight)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, target, weight = ctx.saved_tensors

        grad_input = input.new_zeros(input.size())

        ext_module.sigmoid_focal_loss_backward(
            input,
            target,
            weight,
            grad_input,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        grad_input *= grad_output
        if ctx.reduction == ctx.reduction_dict['mean']:
            grad_input /= input.size(0)
        return grad_input, None, None, None, None, None


sigmoid_focal_loss = SigmoidFocalLossFunction.apply


class SigmoidFocalLoss(nn.Module):

    def __init__(self, gamma, alpha, weight=None, reduction='mean'):
        super(SigmoidFocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.register_buffer('weight', weight)
        self.reduction = reduction

    def forward(self, input, target):
        return sigmoid_focal_loss(input, target, self.gamma, self.alpha,
                                  self.weight, self.reduction)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(gamma={self.gamma}, '
        s += f'alpha={self.alpha}, '
        s += f'reduction={self.reduction})'
        return s


class SoftmaxFocalLossFunction(Function):

    @staticmethod
    def symbolic(g, input, target, gamma, alpha, weight, reduction):
        return g.op(
            'mmcv::MMCVSoftmaxFocalLoss',
            input,
            target,
            gamma_f=gamma,
            alpha_f=alpha,
            weight_f=weight,
            reduction_s=reduction)

    @staticmethod
    def forward(ctx,
                input,
                target,
                gamma=2.0,
                alpha=0.25,
                weight=None,
                reduction='mean'):

        assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor))
        assert input.dim() == 2
        assert target.dim() == 1
        assert input.size(0) == target.size(0)
        if weight is None:
            weight = input.new_empty(0)
        else:
            assert weight.dim() == 1
            assert input.size(1) == weight.size(0)
        ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2}
        assert reduction in ctx.reduction_dict.keys()

        ctx.gamma = float(gamma)
        ctx.alpha = float(alpha)
        ctx.reduction = ctx.reduction_dict[reduction]

        # numerically stable softmax over the class dimension
        channel_stats, _ = torch.max(input, dim=1)
        input_softmax = input - channel_stats.unsqueeze(1).expand_as(input)
        input_softmax.exp_()

        channel_stats = input_softmax.sum(dim=1)
        input_softmax /= channel_stats.unsqueeze(1).expand_as(input)

        output = input.new_zeros(input.size(0))
        ext_module.softmax_focal_loss_forward(
            input_softmax,
            target,
            weight,
            output,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        if ctx.reduction == ctx.reduction_dict['mean']:
            output = output.sum() / input.size(0)
        elif ctx.reduction == ctx.reduction_dict['sum']:
            output = output.sum()
        ctx.save_for_backward(input_softmax, target, weight)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input_softmax, target, weight = ctx.saved_tensors
        buff = input_softmax.new_zeros(input_softmax.size(0))
        grad_input = input_softmax.new_zeros(input_softmax.size())

        ext_module.softmax_focal_loss_backward(
            input_softmax,
            target,
            weight,
            buff,
            grad_input,
            gamma=ctx.gamma,
            alpha=ctx.alpha)

        grad_input *= grad_output
        if ctx.reduction == ctx.reduction_dict['mean']:
            grad_input /= input_softmax.size(0)
        return grad_input, None, None, None, None, None


softmax_focal_loss = SoftmaxFocalLossFunction.apply


class SoftmaxFocalLoss(nn.Module):

    def __init__(self, gamma, alpha, weight=None, reduction='mean'):
        super(SoftmaxFocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.register_buffer('weight', weight)
        self.reduction = reduction

    def forward(self, input, target):
        return softmax_focal_loss(input, target, self.gamma, self.alpha,
                                  self.weight, self.reduction)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(gamma={self.gamma}, '
        s += f'alpha={self.alpha}, '
        s += f'reduction={self.reduction})'
        return s
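A usage sketch for the sigmoid variant, illustrative only; it assumes a CUDA device and a compiled `_ext` extension, since `forward` asserts `torch.cuda.LongTensor` targets on GPU inputs.

import torch

from lavis.common.annotator.uniformer.mmcv.ops.focal_loss import SigmoidFocalLoss

# 8 samples, 4 classes; targets must be 1-D class indices (LongTensor).
logits = torch.randn(8, 4, device='cuda', requires_grad=True)
targets = torch.randint(0, 4, (8, ), device='cuda')

criterion = SigmoidFocalLoss(gamma=2.0, alpha=0.25)  # reduction='mean' default
loss = criterion(logits, targets)                    # scalar loss
loss.backward()                                      # gradients reach `logits` only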
lavis/common/annotator/uniformer/mmcv/ops/furthest_point_sample.py (new file, mode 100644)
import torch
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'furthest_point_sampling_forward',
    'furthest_point_sampling_with_dist_forward'
])


class FurthestPointSampling(Function):
    """Uses iterative furthest point sampling to select a set of features
    whose corresponding points have the furthest distance."""

    @staticmethod
    def forward(ctx, points_xyz: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """
        Args:
            points_xyz (Tensor): (B, N, 3) where N > num_points.
            num_points (int): Number of points in the sampled set.

        Returns:
            Tensor: (B, num_points) indices of the sampled points.
        """
        assert points_xyz.is_contiguous()

        B, N = points_xyz.size()[:2]
        output = torch.cuda.IntTensor(B, num_points)
        temp = torch.cuda.FloatTensor(B, N).fill_(1e10)

        ext_module.furthest_point_sampling_forward(
            points_xyz,
            temp,
            output,
            b=B,
            n=N,
            m=num_points,
        )
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(xyz, a=None):
        return None, None


class FurthestPointSamplingWithDist(Function):
    """Uses iterative furthest point sampling to select a set of features
    whose corresponding points have the furthest distance."""

    @staticmethod
    def forward(ctx, points_dist: torch.Tensor,
                num_points: int) -> torch.Tensor:
        """
        Args:
            points_dist (Tensor): (B, N, N) Distance between each point pair.
            num_points (int): Number of points in the sampled set.

        Returns:
            Tensor: (B, num_points) indices of the sampled points.
        """
        assert points_dist.is_contiguous()

        B, N, _ = points_dist.size()
        output = points_dist.new_zeros([B, num_points], dtype=torch.int32)
        temp = points_dist.new_zeros([B, N]).fill_(1e10)

        ext_module.furthest_point_sampling_with_dist_forward(
            points_dist, temp, output, b=B, n=N, m=num_points)
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(output)
        return output

    @staticmethod
    def backward(xyz, a=None):
        return None, None


furthest_point_sample = FurthestPointSampling.apply
furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply
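A sketch of the op in use (illustrative; needs a GPU, since the forward allocates `torch.cuda.IntTensor` outputs directly):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.furthest_point_sample import \
    furthest_point_sample

# Pick 128 well-spread points from each of two 1024-point clouds.
points = torch.rand(2, 1024, 3, device='cuda')
idx = furthest_point_sample(points, 128)  # (2, 128) int32 indices
# Gather the sampled coordinates back out of the original clouds.
sampled = points.gather(1, idx.long().unsqueeze(-1).expand(-1, -1, 3))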
lavis/common/annotator/uniformer/mmcv/ops/fused_bias_leakyrelu.py (new file, mode 100644)
# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator
# Augmentation (ADA)
# =======================================================================
# 1. Definitions
# "Licensor" means any person or entity that distributes its Work.
# "Software" means the original work of authorship made available under
# this License.
# "Work" means the Software and any additions to or derivative works of
# the Software that are made available under this License.
# The terms "reproduce," "reproduction," "derivative works," and
# "distribution" have the meaning as provided under U.S. copyright law;
# provided, however, that for the purposes of this License, derivative
# works shall not include works that remain separable from, or merely
# link (or bind by name) to the interfaces of, the Work.
# Works, including the Software, are "made available" under this License
# by including in or with the Work either (a) a copyright notice
# referencing the applicability of this License to the Work, or (b) a
# copy of this License.
# 2. License Grants
# 2.1 Copyright Grant. Subject to the terms and conditions of this
# License, each Licensor grants to you a perpetual, worldwide,
# non-exclusive, royalty-free, copyright license to reproduce,
# prepare derivative works of, publicly display, publicly perform,
# sublicense and distribute its Work and any resulting derivative
# works in any form.
# 3. Limitations
# 3.1 Redistribution. You may reproduce or distribute the Work only
# if (a) you do so under this License, (b) you include a complete
# copy of this License with your distribution, and (c) you retain
# without modification any copyright, patent, trademark, or
# attribution notices that are present in the Work.
# 3.2 Derivative Works. You may specify that additional or different
# terms apply to the use, reproduction, and distribution of your
# derivative works of the Work ("Your Terms") only if (a) Your Terms
# provide that the use limitation in Section 3.3 applies to your
# derivative works, and (b) you identify the specific derivative
# works that are subject to Your Terms. Notwithstanding Your Terms,
# this License (including the redistribution requirements in Section
# 3.1) will continue to apply to the Work itself.
# 3.3 Use Limitation. The Work and any derivative works thereof only
# may be used or intended for use non-commercially. Notwithstanding
# the foregoing, NVIDIA and its affiliates may use the Work and any
# derivative works commercially. As used herein, "non-commercially"
# means for research or evaluation purposes only.
# 3.4 Patent Claims. If you bring or threaten to bring a patent claim
# against any Licensor (including any claim, cross-claim or
# counterclaim in a lawsuit) to enforce any patents that you allege
# are infringed by any Work, then your rights under this License from
# such Licensor (including the grant in Section 2.1) will terminate
# immediately.
# 3.5 Trademarks. This License does not grant any rights to use any
# Licensor’s or its affiliates’ names, logos, or trademarks, except
# as necessary to reproduce the notices described in this License.
# 3.6 Termination. If you violate any term of this License, then your
# rights under this License (including the grant in Section 2.1) will
# terminate immediately.
# 4. Disclaimer of Warranty.
# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR
# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER
# THIS LICENSE.
# 5. Limitation of Liability.
# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL
# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE
# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT,
# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF
# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK
# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION,
# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER
# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGES.
# =======================================================================
import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu'])


class FusedBiasLeakyReLUFunctionBackward(Function):
    """Calculate second order deviation.

    This function computes the second order deviation for the fused leaky
    relu operation.
    """

    @staticmethod
    def forward(ctx, grad_output, out, negative_slope, scale):
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        empty = grad_output.new_empty(0)

        grad_input = ext_module.fused_bias_leakyrelu(
            grad_output,
            empty,
            out,
            act=3,
            grad=1,
            alpha=negative_slope,
            scale=scale)

        dim = [0]
        if grad_input.ndim > 2:
            dim += list(range(2, grad_input.ndim))

        grad_bias = grad_input.sum(dim).detach()

        return grad_input, grad_bias

    @staticmethod
    def backward(ctx, gradgrad_input, gradgrad_bias):
        out, = ctx.saved_tensors

        # The second order deviation, in fact, contains two parts; the
        # first part is zero. Thus, we directly consider the second part,
        # which is similar to the first order deviation in implementation.
        gradgrad_out = ext_module.fused_bias_leakyrelu(
            gradgrad_input,
            gradgrad_bias.to(out.dtype),
            out,
            act=3,
            grad=1,
            alpha=ctx.negative_slope,
            scale=ctx.scale)

        return gradgrad_out, None, None, None


class FusedBiasLeakyReLUFunction(Function):

    @staticmethod
    def forward(ctx, input, bias, negative_slope, scale):
        empty = input.new_empty(0)

        out = ext_module.fused_bias_leakyrelu(
            input,
            bias,
            empty,
            act=3,
            grad=0,
            alpha=negative_slope,
            scale=scale)
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        return out

    @staticmethod
    def backward(ctx, grad_output):
        out, = ctx.saved_tensors

        grad_input, grad_bias = FusedBiasLeakyReLUFunctionBackward.apply(
            grad_output, out, ctx.negative_slope, ctx.scale)

        return grad_input, grad_bias, None, None


class FusedBiasLeakyReLU(nn.Module):
    r"""Fused bias leaky ReLU.

    This function is introduced in StyleGAN2:
    http://arxiv.org/abs/1912.04958

    The bias term comes from the convolution operation. In addition, to keep
    the variance of the feature map or gradients unchanged, they also adopt a
    scale similar to Kaiming initialization. However, since
    :math:`1 + \alpha^2` is very close to 1, we can just ignore it, so the
    final scale is just :math:`\sqrt{2}`. Of course, you may change it to
    your own scale.

    TODO: Implement the CPU version.

    Args:
        num_channels (int): The channel number of the feature map.
        negative_slope (float, optional): Same as nn.LeakyRelu.
            Defaults to 0.2.
        scale (float, optional): A scalar to adjust the variance of the
            feature map. Defaults to 2**0.5.
    """

    def __init__(self, num_channels, negative_slope=0.2, scale=2**0.5):
        super(FusedBiasLeakyReLU, self).__init__()

        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.negative_slope = negative_slope
        self.scale = scale

    def forward(self, input):
        return fused_bias_leakyrelu(input, self.bias, self.negative_slope,
                                    self.scale)


def fused_bias_leakyrelu(input, bias, negative_slope=0.2, scale=2**0.5):
    r"""Fused bias leaky ReLU function.

    This function is introduced in StyleGAN2:
    http://arxiv.org/abs/1912.04958

    The bias term comes from the convolution operation. In addition, to keep
    the variance of the feature map or gradients unchanged, they also adopt a
    scale similar to Kaiming initialization. However, since
    :math:`1 + \alpha^2` is very close to 1, we can just ignore it, so the
    final scale is just :math:`\sqrt{2}`. Of course, you may change it to
    your own scale.

    Args:
        input (torch.Tensor): Input feature map.
        bias (nn.Parameter): The bias from convolution operation.
        negative_slope (float, optional): Same as nn.LeakyRelu.
            Defaults to 0.2.
        scale (float, optional): A scalar to adjust the variance of the
            feature map. Defaults to 2**0.5.

    Returns:
        torch.Tensor: Feature map after non-linear activation.
    """
    if not input.is_cuda:
        return bias_leakyrelu_ref(input, bias, negative_slope, scale)

    return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype),
                                            negative_slope, scale)


def bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5):

    if bias is not None:
        assert bias.ndim == 1
        assert bias.shape[0] == x.shape[1]
        x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)])

    x = F.leaky_relu(x, negative_slope)
    if scale != 1:
        x = x * scale

    return x
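Because `fused_bias_leakyrelu()` falls back to `bias_leakyrelu_ref` for CPU tensors, the op can be exercised without the CUDA extension; a minimal sketch:

import torch

from lavis.common.annotator.uniformer.mmcv.ops.fused_bias_leakyrelu import \
    bias_leakyrelu_ref

x = torch.randn(1, 8, 4, 4)  # (N, C, H, W)
bias = torch.zeros(8)        # one bias per channel
# Add the bias, apply leaky ReLU, then rescale by sqrt(2) so activation
# variance stays roughly unchanged (the StyleGAN2 trick described above).
out = bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5)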
lavis/common/annotator/uniformer/mmcv/ops/gather_points.py (new file, mode 100644)
import torch
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['gather_points_forward', 'gather_points_backward'])


class GatherPoints(Function):
    """Gather points with given index."""

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features (Tensor): (B, C, N) features to gather.
            indices (Tensor): (B, M) where M is the number of points.

        Returns:
            Tensor: (B, C, M) where M is the number of points.
        """
        assert features.is_contiguous()
        assert indices.is_contiguous()

        B, npoint = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, npoint)

        ext_module.gather_points_forward(
            features, indices, output, b=B, c=C, n=N, npoints=npoint)

        ctx.for_backwards = (indices, C, N)
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(indices)
        return output

    @staticmethod
    def backward(ctx, grad_out):
        idx, C, N = ctx.for_backwards
        B, npoint = idx.size()

        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
        grad_out_data = grad_out.data.contiguous()
        ext_module.gather_points_backward(
            grad_out_data,
            idx,
            grad_features.data,
            b=B,
            c=C,
            n=N,
            npoints=npoint)
        return grad_features, None


gather_points = GatherPoints.apply
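A sketch of the typical PointNet++ pairing of this op with furthest point sampling (illustrative; CUDA plus the compiled `_ext` extension assumed):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.furthest_point_sample import \
    furthest_point_sample
from lavis.common.annotator.uniformer.mmcv.ops.gather_points import gather_points

xyz = torch.rand(2, 1024, 3, device='cuda')        # (B, N, 3) coordinates
features = torch.rand(2, 64, 1024, device='cuda')  # (B, C, N) descriptors
idx = furthest_point_sample(xyz, 128)              # (B, 128) centroid indices
new_features = gather_points(features, idx)        # (B, 64, 128)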
lavis/common/annotator/uniformer/mmcv/ops/group_points.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple

import torch
from torch import nn as nn
from torch.autograd import Function

from ..utils import ext_loader
from .ball_query import ball_query
from .knn import knn

ext_module = ext_loader.load_ext(
    '_ext', ['group_points_forward', 'group_points_backward'])


class QueryAndGroup(nn.Module):
    """Groups points with a ball query of radius.

    Args:
        max_radius (float): The maximum radius of the balls.
            If None is given, we will use kNN sampling instead of ball query.
        sample_num (int): Maximum number of features to gather in the ball.
        min_radius (float, optional): The minimum radius of the balls.
            Default: 0.
        use_xyz (bool, optional): Whether to use xyz.
            Default: True.
        return_grouped_xyz (bool, optional): Whether to return grouped xyz.
            Default: False.
        normalize_xyz (bool, optional): Whether to normalize xyz.
            Default: False.
        uniform_sample (bool, optional): Whether to sample uniformly.
            Default: False.
        return_unique_cnt (bool, optional): Whether to return the count of
            unique samples. Default: False.
        return_grouped_idx (bool, optional): Whether to return grouped idx.
            Default: False.
    """

    def __init__(self,
                 max_radius,
                 sample_num,
                 min_radius=0,
                 use_xyz=True,
                 return_grouped_xyz=False,
                 normalize_xyz=False,
                 uniform_sample=False,
                 return_unique_cnt=False,
                 return_grouped_idx=False):
        super().__init__()
        self.max_radius = max_radius
        self.min_radius = min_radius
        self.sample_num = sample_num
        self.use_xyz = use_xyz
        self.return_grouped_xyz = return_grouped_xyz
        self.normalize_xyz = normalize_xyz
        self.uniform_sample = uniform_sample
        self.return_unique_cnt = return_unique_cnt
        self.return_grouped_idx = return_grouped_idx
        if self.return_unique_cnt:
            assert self.uniform_sample, \
                'uniform_sample should be True when ' \
                'returning the count of unique samples'
        if self.max_radius is None:
            assert not self.normalize_xyz, \
                'can not normalize grouped xyz when max_radius is None'

    def forward(self, points_xyz, center_xyz, features=None):
        """
        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) coordinates of the centroids.
            features (Tensor): (B, C, N) Descriptors of the features.

        Returns:
            Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
        """
        # if self.max_radius is None, we will perform kNN instead of ball
        # query; idx is of shape [B, npoint, sample_num]
        if self.max_radius is None:
            idx = knn(self.sample_num, points_xyz, center_xyz, False)
            idx = idx.transpose(1, 2).contiguous()
        else:
            idx = ball_query(self.min_radius, self.max_radius,
                             self.sample_num, points_xyz, center_xyz)

        if self.uniform_sample:
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    sample_ind = torch.randint(
                        0,
                        num_unique, (self.sample_num - num_unique, ),
                        dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = points_xyz.transpose(1, 2).contiguous()
        # (B, 3, npoint, sample_num)
        grouped_xyz = grouping_operation(xyz_trans, idx)
        grouped_xyz_diff = grouped_xyz - \
            center_xyz.transpose(1, 2).unsqueeze(-1)  # relative offsets
        if self.normalize_xyz:
            grouped_xyz_diff /= self.max_radius

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                # (B, C + 3, npoint, sample_num)
                new_features = torch.cat([grouped_xyz_diff, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            assert self.use_xyz, \
                'Cannot set features=None and use_xyz=False at the same time!'
            new_features = grouped_xyz_diff

        ret = [new_features]
        if self.return_grouped_xyz:
            ret.append(grouped_xyz)
        if self.return_unique_cnt:
            ret.append(unique_cnt)
        if self.return_grouped_idx:
            ret.append(idx)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)


class GroupAll(nn.Module):
    """Group xyz with feature.

    Args:
        use_xyz (bool): Whether to use xyz.
    """

    def __init__(self, use_xyz: bool = True):
        super().__init__()
        self.use_xyz = use_xyz

    def forward(self,
                xyz: torch.Tensor,
                new_xyz: torch.Tensor,
                features: torch.Tensor = None):
        """
        Args:
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            new_xyz (Tensor): new xyz coordinates of the features.
            features (Tensor): (B, C, N) features to group.

        Returns:
            Tensor: (B, C + 3, 1, N) Grouped feature.
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
        if features is not None:
            grouped_features = features.unsqueeze(2)
            if self.use_xyz:
                # (B, 3 + C, 1, N)
                new_features = torch.cat([grouped_xyz, grouped_features],
                                         dim=1)
            else:
                new_features = grouped_features
        else:
            new_features = grouped_xyz

        return new_features


class GroupingOperation(Function):
    """Group feature with given index."""

    @staticmethod
    def forward(ctx, features: torch.Tensor,
                indices: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features (Tensor): (B, C, N) tensor of features to group.
            indices (Tensor): (B, npoint, nsample) the indices of
                features to group with.

        Returns:
            Tensor: (B, C, npoint, nsample) Grouped features.
        """
        features = features.contiguous()
        indices = indices.contiguous()

        B, nfeatures, nsample = indices.size()
        _, C, N = features.size()
        output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)

        ext_module.group_points_forward(B, C, N, nfeatures, nsample, features,
                                        indices, output)

        ctx.for_backwards = (indices, N)
        return output

    @staticmethod
    def backward(ctx,
                 grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Args:
            grad_out (Tensor): (B, C, npoint, nsample) tensor of the
                gradients of the output from forward.

        Returns:
            Tensor: (B, C, N) gradient of the features.
        """
        idx, N = ctx.for_backwards

        B, C, npoint, nsample = grad_out.size()
        grad_features = torch.cuda.FloatTensor(B, C, N).zero_()

        grad_out_data = grad_out.data.contiguous()
        ext_module.group_points_backward(B, C, N, npoint, nsample,
                                         grad_out_data, idx,
                                         grad_features.data)
        return grad_features, None


grouping_operation = GroupingOperation.apply
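A usage sketch for ball-query grouping (illustrative; CUDA plus `_ext` assumed, and `ball_query` comes from a sibling module added elsewhere in this commit):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.group_points import QueryAndGroup

# 16 neighbours per centroid within a 0.2 radius; with use_xyz=True the
# relative xyz offsets are concatenated in front of the 64-d features.
grouper = QueryAndGroup(max_radius=0.2, sample_num=16, use_xyz=True)
points_xyz = torch.rand(2, 1024, 3, device='cuda')   # (B, N, 3)
center_xyz = torch.rand(2, 128, 3, device='cuda')    # (B, npoint, 3)
features = torch.rand(2, 64, 1024, device='cuda')    # (B, C, N)
grouped = grouper(points_xyz, center_xyz, features)  # (B, 3 + 64, 128, 16)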
lavis/common/annotator/uniformer/mmcv/ops/info.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import glob
import os

import torch

if torch.__version__ == 'parrots':
    import parrots

    def get_compiler_version():
        return 'GCC ' + parrots.version.compiler

    def get_compiling_cuda_version():
        return parrots.version.cuda
else:
    from ..utils import ext_loader
    ext_module = ext_loader.load_ext(
        '_ext', ['get_compiler_version', 'get_compiling_cuda_version'])

    def get_compiler_version():
        return ext_module.get_compiler_version()

    def get_compiling_cuda_version():
        return ext_module.get_compiling_cuda_version()


def get_onnxruntime_op_path():
    wildcard = os.path.join(
        os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
        '_ext_ort.*.so')

    paths = glob.glob(wildcard)
    if len(paths) > 0:
        return paths[0]
    else:
        return ''
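These helpers are mainly useful when diagnosing "built against one CUDA, running another" mismatches; a sketch (assuming the vendored `_ext` extension is importable):

from lavis.common.annotator.uniformer.mmcv.ops.info import (
    get_compiler_version, get_compiling_cuda_version)

print(get_compiler_version())        # compiler that built the _ext extension
print(get_compiling_cuda_version())  # CUDA toolkit it was compiled against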
lavis/common/annotator/uniformer/mmcv/ops/iou3d.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'iou3d_boxes_iou_bev_forward', 'iou3d_nms_forward',
    'iou3d_nms_normal_forward'
])


def boxes_iou_bev(boxes_a, boxes_b):
    """Calculate boxes IoU in the Bird's Eye View.

    Args:
        boxes_a (torch.Tensor): Input boxes a with shape (M, 5).
        boxes_b (torch.Tensor): Input boxes b with shape (N, 5).

    Returns:
        ans_iou (torch.Tensor): IoU result with shape (M, N).
    """
    ans_iou = boxes_a.new_zeros(
        torch.Size((boxes_a.shape[0], boxes_b.shape[0])))

    ext_module.iou3d_boxes_iou_bev_forward(boxes_a.contiguous(),
                                           boxes_b.contiguous(), ans_iou)

    return ans_iou


def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None):
    """NMS function GPU implementation (for BEV boxes). The overlap of two
    boxes for IoU calculation is defined as the exact overlapping area of the
    two boxes. In this function, one can also set ``pre_max_size`` and
    ``post_max_size``.

    Args:
        boxes (torch.Tensor): Input boxes with the shape of [N, 5]
            ([x1, y1, x2, y2, ry]).
        scores (torch.Tensor): Scores of boxes with the shape of [N].
        thresh (float): Overlap threshold of NMS.
        pre_max_size (int, optional): Max size of boxes before NMS.
            Default: None.
        post_max_size (int, optional): Max size of boxes after NMS.
            Default: None.

    Returns:
        torch.Tensor: Indexes after NMS.
    """
    assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]'
    order = scores.sort(0, descending=True)[1]

    if pre_max_size is not None:
        order = order[:pre_max_size]
    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = ext_module.iou3d_nms_forward(boxes, keep, thresh)
    keep = order[keep[:num_out].cuda(boxes.device)].contiguous()
    if post_max_size is not None:
        keep = keep[:post_max_size]
    return keep


def nms_normal_bev(boxes, scores, thresh):
    """Normal NMS function GPU implementation (for BEV boxes). The overlap of
    two boxes for IoU calculation is defined as the exact overlapping area of
    the two boxes WITH their yaw angle set to 0.

    Args:
        boxes (torch.Tensor): Input boxes with shape (N, 5).
        scores (torch.Tensor): Scores of predicted boxes with shape (N).
        thresh (float): Overlap threshold of NMS.

    Returns:
        torch.Tensor: Remaining indices with scores in descending order.
    """
    assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]'
    order = scores.sort(0, descending=True)[1]

    boxes = boxes[order].contiguous()

    keep = torch.zeros(boxes.size(0), dtype=torch.long)
    num_out = ext_module.iou3d_nms_normal_forward(boxes, keep, thresh)
    return order[keep[:num_out].cuda(boxes.device)].contiguous()
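A shape-level sketch of `nms_bev` (illustrative; CUDA plus `_ext` assumed — random boxes are enough to exercise the call, though real inputs would be valid [x1, y1, x2, y2, ry] boxes):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.iou3d import nms_bev

boxes = torch.rand(100, 5, device='cuda')  # [x1, y1, x2, y2, ry] per row
scores = torch.rand(100, device='cuda')
keep = nms_bev(boxes, scores, thresh=0.5, pre_max_size=50, post_max_size=20)
# `keep` indexes into the original `boxes`, highest scores first.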
lavis/common/annotator/uniformer/mmcv/ops/knn.py (new file, mode 100644)
import torch
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['knn_forward'])


class KNN(Function):
    r"""KNN (CUDA) based on heap data structure.

    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/pointops/src/knnquery_heap>`_.

    Find k-nearest points.
    """

    @staticmethod
    def forward(ctx,
                k: int,
                xyz: torch.Tensor,
                center_xyz: torch.Tensor = None,
                transposed: bool = False) -> torch.Tensor:
        """
        Args:
            k (int): number of nearest neighbors.
            xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N).
                xyz coordinates of the features.
            center_xyz (Tensor, optional): (B, npoint, 3) if transposed ==
                False, else (B, 3, npoint). Centers of the knn query.
                Default: None.
            transposed (bool, optional): whether the input tensors are
                transposed. Should not explicitly use this keyword when
                calling knn (=KNN.apply), just add the fourth param.
                Default: False.

        Returns:
            Tensor: (B, k, npoint) tensor with the indices of
            the features that form k-nearest neighbours.
        """
        assert (k > 0) & (k < 100), 'k should be in range(0, 100)'

        if center_xyz is None:
            center_xyz = xyz

        if transposed:
            xyz = xyz.transpose(2, 1).contiguous()
            center_xyz = center_xyz.transpose(2, 1).contiguous()

        assert xyz.is_contiguous()  # [B, N, 3]
        assert center_xyz.is_contiguous()  # [B, npoint, 3]

        center_xyz_device = center_xyz.get_device()
        assert center_xyz_device == xyz.get_device(), \
            'center_xyz and xyz should be put on the same device'
        if torch.cuda.current_device() != center_xyz_device:
            torch.cuda.set_device(center_xyz_device)

        B, npoint, _ = center_xyz.shape
        N = xyz.shape[1]

        idx = center_xyz.new_zeros((B, npoint, k)).int()
        dist2 = center_xyz.new_zeros((B, npoint, k)).float()

        ext_module.knn_forward(
            xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k)
        # idx shape to [B, k, npoint]
        idx = idx.transpose(2, 1).contiguous()
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None


knn = KNN.apply
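A usage sketch (illustrative; CUDA plus `_ext` assumed). Note the docstring's caveat: pass `transposed` positionally, since `Function.apply` does not accept keyword arguments.

import torch

from lavis.common.annotator.uniformer.mmcv.ops.knn import knn

xyz = torch.rand(2, 1024, 3, device='cuda')     # (B, N, 3) reference points
centers = torch.rand(2, 128, 3, device='cuda')  # (B, npoint, 3) query points
idx = knn(8, xyz, centers, False)               # (B, 8, npoint) int32 indices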
lavis/common/annotator/uniformer/mmcv/ops/masked_conv.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import math

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['masked_im2col_forward', 'masked_col2im_forward'])


class MaskedConv2dFunction(Function):

    @staticmethod
    def symbolic(g, features, mask, weight, bias, padding, stride):
        return g.op(
            'mmcv::MMCVMaskedConv2d',
            features,
            mask,
            weight,
            bias,
            padding_i=padding,
            stride_i=stride)

    @staticmethod
    def forward(ctx, features, mask, weight, bias, padding=0, stride=1):
        assert mask.dim() == 3 and mask.size(0) == 1
        assert features.dim() == 4 and features.size(0) == 1
        assert features.size()[2:] == mask.size()[1:]
        pad_h, pad_w = _pair(padding)
        stride_h, stride_w = _pair(stride)
        if stride_h != 1 or stride_w != 1:
            raise ValueError(
                'Stride could only be 1 in masked_conv2d currently.')
        out_channel, in_channel, kernel_h, kernel_w = weight.size()

        batch_size = features.size(0)
        out_h = int(
            math.floor((features.size(2) + 2 * pad_h -
                        (kernel_h - 1) - 1) / stride_h + 1))
        out_w = int(
            math.floor((features.size(3) + 2 * pad_w -
                        (kernel_w - 1) - 1) / stride_w + 1))
        mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False)
        output = features.new_zeros(batch_size, out_channel, out_h, out_w)
        if mask_inds.numel() > 0:
            mask_h_idx = mask_inds[:, 0].contiguous()
            mask_w_idx = mask_inds[:, 1].contiguous()
            data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
                                          mask_inds.size(0))
            ext_module.masked_im2col_forward(
                features,
                mask_h_idx,
                mask_w_idx,
                data_col,
                kernel_h=kernel_h,
                kernel_w=kernel_w,
                pad_h=pad_h,
                pad_w=pad_w)

            masked_output = torch.addmm(1, bias[:, None], 1,
                                        weight.view(out_channel, -1),
                                        data_col)
            ext_module.masked_col2im_forward(
                masked_output,
                mask_h_idx,
                mask_w_idx,
                output,
                height=out_h,
                width=out_w,
                channels=out_channel)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        return (None, ) * 5


masked_conv2d = MaskedConv2dFunction.apply


class MaskedConv2d(nn.Conv2d):
    """A MaskedConv2d which inherits the official Conv2d.

    The masked forward doesn't implement the backward function and only
    supports the stride parameter to be 1 currently.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True):
        super(MaskedConv2d,
              self).__init__(in_channels, out_channels, kernel_size, stride,
                             padding, dilation, groups, bias)

    def forward(self, input, mask=None):
        if mask is None:  # fallback to the normal Conv2d
            return super(MaskedConv2d, self).forward(input)
        else:
            return masked_conv2d(input, mask, self.weight, self.bias,
                                 self.padding)
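A usage sketch (illustrative; CUDA plus `_ext` assumed — the masked path asserts a batch size of 1):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.masked_conv import MaskedConv2d

conv = MaskedConv2d(16, 32, kernel_size=3, padding=1).cuda()
x = torch.rand(1, 16, 20, 20, device='cuda')
mask = (torch.rand(1, 20, 20, device='cuda') > 0.5).float()

out_masked = conv(x, mask)  # convolution evaluated only where mask > 0
out_plain = conv(x)         # mask=None falls back to ordinary Conv2d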
lavis/common/annotator/uniformer/mmcv/ops/merge_cells.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
from abc import abstractmethod

import torch
import torch.nn as nn
import torch.nn.functional as F

from ..cnn import ConvModule


class BaseMergeCell(nn.Module):
    """The basic class for cells used in NAS-FPN and NAS-FCOS.

    BaseMergeCell takes 2 inputs. After applying convolution
    on them, they are resized to the target size. Then,
    they go through binary_op, which depends on the type of cell.
    If with_out_conv is True, the result of output will go through
    another convolution layer.

    Args:
        in_channels (int): number of input channels in out_conv layer.
        out_channels (int): number of output channels in out_conv layer.
        with_out_conv (bool): Whether to use out_conv layer
        out_conv_cfg (dict): Config dict for convolution layer, which should
            contain "groups", "kernel_size", "padding", "bias" to build
            out_conv layer.
        out_norm_cfg (dict): Config dict for normalization layer in out_conv.
        out_conv_order (tuple): The order of conv/norm/activation layers in
            out_conv.
        with_input1_conv (bool): Whether to use convolution on input1.
        with_input2_conv (bool): Whether to use convolution on input2.
        input_conv_cfg (dict): Config dict for building input1_conv layer
            and input2_conv layer, which is expected to contain the type of
            convolution.
            Default: None, which means using conv2d.
        input_norm_cfg (dict): Config dict for normalization layer in
            input1_conv and input2_conv layer. Default: None.
        upsample_mode (str): Interpolation method used to resize the output
            of input1_conv and input2_conv to target size. Currently, we
            support ['nearest', 'bilinear']. Default: 'nearest'.
    """

    def __init__(self,
                 fused_channels=256,
                 out_channels=256,
                 with_out_conv=True,
                 out_conv_cfg=dict(
                     groups=1, kernel_size=3, padding=1, bias=True),
                 out_norm_cfg=None,
                 out_conv_order=('act', 'conv', 'norm'),
                 with_input1_conv=False,
                 with_input2_conv=False,
                 input_conv_cfg=None,
                 input_norm_cfg=None,
                 upsample_mode='nearest'):
        super(BaseMergeCell, self).__init__()
        assert upsample_mode in ['nearest', 'bilinear']
        self.with_out_conv = with_out_conv
        self.with_input1_conv = with_input1_conv
        self.with_input2_conv = with_input2_conv
        self.upsample_mode = upsample_mode

        if self.with_out_conv:
            self.out_conv = ConvModule(
                fused_channels,
                out_channels,
                **out_conv_cfg,
                norm_cfg=out_norm_cfg,
                order=out_conv_order)

        self.input1_conv = self._build_input_conv(
            out_channels, input_conv_cfg,
            input_norm_cfg) if with_input1_conv else nn.Sequential()
        self.input2_conv = self._build_input_conv(
            out_channels, input_conv_cfg,
            input_norm_cfg) if with_input2_conv else nn.Sequential()

    def _build_input_conv(self, channel, conv_cfg, norm_cfg):
        return ConvModule(
            channel,
            channel,
            3,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            bias=True)

    @abstractmethod
    def _binary_op(self, x1, x2):
        pass

    def _resize(self, x, size):
        if x.shape[-2:] == size:
            return x
        elif x.shape[-2:] < size:
            return F.interpolate(x, size=size, mode=self.upsample_mode)
        else:
            assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0
            kernel_size = x.shape[-1] // size[-1]
            x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size)
            return x

    def forward(self, x1, x2, out_size=None):
        assert x1.shape[:2] == x2.shape[:2]
        assert out_size is None or len(out_size) == 2
        if out_size is None:  # resize to the larger one
            out_size = max(x1.size()[2:], x2.size()[2:])

        x1 = self.input1_conv(x1)
        x2 = self.input2_conv(x2)

        x1 = self._resize(x1, out_size)
        x2 = self._resize(x2, out_size)

        x = self._binary_op(x1, x2)
        if self.with_out_conv:
            x = self.out_conv(x)
        return x


class SumCell(BaseMergeCell):

    def __init__(self, in_channels, out_channels, **kwargs):
        super(SumCell, self).__init__(in_channels, out_channels, **kwargs)

    def _binary_op(self, x1, x2):
        return x1 + x2


class ConcatCell(BaseMergeCell):

    def __init__(self, in_channels, out_channels, **kwargs):
        super(ConcatCell, self).__init__(in_channels * 2, out_channels,
                                         **kwargs)

    def _binary_op(self, x1, x2):
        ret = torch.cat([x1, x2], dim=1)
        return ret


class GlobalPoolingCell(BaseMergeCell):

    def __init__(self, in_channels=None, out_channels=None, **kwargs):
        super().__init__(in_channels, out_channels, **kwargs)
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))

    def _binary_op(self, x1, x2):
        x2_att = self.global_pool(x2).sigmoid()
        return x2 + x2_att * x1
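Unlike most files in this diff, the merge cells are pure PyTorch, so a sketch runs on CPU. `SumCell` resizes both inputs to the larger spatial size, adds them, and applies the output ConvModule:

import torch

from lavis.common.annotator.uniformer.mmcv.ops.merge_cells import SumCell

cell = SumCell(in_channels=256, out_channels=256)
p4 = torch.rand(1, 256, 32, 32)  # finer FPN level
p5 = torch.rand(1, 256, 16, 16)  # coarser level, upsampled to 32x32 inside
fused = cell(p4, p5)             # (1, 256, 32, 32)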
lavis/common/annotator/uniformer/mmcv/ops/modulated_deform_conv.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import math

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single

from annotator.uniformer.mmcv.utils import deprecated_api_warning
from ..cnn import CONV_LAYERS
from ..utils import ext_loader, print_log

ext_module = ext_loader.load_ext(
    '_ext',
    ['modulated_deform_conv_forward', 'modulated_deform_conv_backward'])


class ModulatedDeformConv2dFunction(Function):

    @staticmethod
    def symbolic(g, input, offset, mask, weight, bias, stride, padding,
                 dilation, groups, deform_groups):
        input_tensors = [input, offset, mask, weight]
        if bias is not None:
            input_tensors.append(bias)
        return g.op(
            'mmcv::MMCVModulatedDeformConv2d',
            *input_tensors,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups)

    @staticmethod
    def forward(ctx,
                input,
                offset,
                mask,
                weight,
                bias=None,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                deform_groups=1):
        if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.with_bias = bias is not None
        if not ctx.with_bias:
            bias = input.new_empty(0)  # fake tensor
        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        ctx.save_for_backward(input, offset, mask, weight, bias)
        output = input.new_empty(
            ModulatedDeformConv2dFunction._output_size(ctx, input, weight))
        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
        ext_module.modulated_deform_conv_forward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            output,
            ctx._bufs[1],
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, offset, mask, weight, bias = ctx.saved_tensors
        grad_input = torch.zeros_like(input)
        grad_offset = torch.zeros_like(offset)
        grad_mask = torch.zeros_like(mask)
        grad_weight = torch.zeros_like(weight)
        grad_bias = torch.zeros_like(bias)
        grad_output = grad_output.contiguous()
        ext_module.modulated_deform_conv_backward(
            input,
            weight,
            bias,
            ctx._bufs[0],
            offset,
            mask,
            ctx._bufs[1],
            grad_input,
            grad_weight,
            grad_bias,
            grad_offset,
            grad_mask,
            grad_output,
            kernel_h=weight.size(2),
            kernel_w=weight.size(3),
            stride_h=ctx.stride[0],
            stride_w=ctx.stride[1],
            pad_h=ctx.padding[0],
            pad_w=ctx.padding[1],
            dilation_h=ctx.dilation[0],
            dilation_w=ctx.dilation[1],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            with_bias=ctx.with_bias)
        if not ctx.with_bias:
            grad_bias = None

        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
                None, None, None, None, None)

    @staticmethod
    def _output_size(ctx, input, weight):
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(map(lambda s: s > 0, output_size)):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply


class ModulatedDeformConv2d(nn.Module):

    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
                            cls_name='ModulatedDeformConv2d')
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 deform_groups=1,
                 bias=True):
        super(ModulatedDeformConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deform_groups = deform_groups
        # enable compatibility with nn.Conv2d
        self.transposed = False
        self.output_padding = _single(0)

        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // groups,
                         *self.kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)
        self.init_weights()

    def init_weights(self):
        n = self.in_channels
        for k in self.kernel_size:
            n *= k
        stdv = 1. / math.sqrt(n)
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.zero_()

    def forward(self, x, offset, mask):
        return modulated_deform_conv2d(x, offset, mask, self.weight,
                                       self.bias, self.stride, self.padding,
                                       self.dilation, self.groups,
                                       self.deform_groups)


@CONV_LAYERS.register_module('DCNv2')
class ModulatedDeformConv2dPack(ModulatedDeformConv2d):
    """A ModulatedDeformable Conv Encapsulation that acts as normal Conv
    layers.

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int): Same as nn.Conv2d, while tuple is not supported.
        padding (int): Same as nn.Conv2d, while tuple is not supported.
        dilation (int): Same as nn.Conv2d, while tuple is not supported.
        groups (int): Same as nn.Conv2d.
        bias (bool or str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
            False.
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs)
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            self.deform_groups * 3 * self.kernel_size[0] *
            self.kernel_size[1],
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            dilation=self.dilation,
            bias=True)
        self.init_weights()

    def init_weights(self):
        super(ModulatedDeformConv2dPack, self).init_weights()
        if hasattr(self, 'conv_offset'):
            self.conv_offset.weight.data.zero_()
            self.conv_offset.bias.data.zero_()

    def forward(self, x):
        out = self.conv_offset(x)
        o1, o2, mask = torch.chunk(out, 3, dim=1)
        offset = torch.cat((o1, o2), dim=1)
        mask = torch.sigmoid(mask)
        return modulated_deform_conv2d(x, offset, mask, self.weight,
                                       self.bias, self.stride, self.padding,
                                       self.dilation, self.groups,
                                       self.deform_groups)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, ModulatedDeformConvPack
            # loads previous benchmark models.
            if (prefix + 'conv_offset.weight' not in state_dict
                    and prefix[:-1] + '_offset.weight' in state_dict):
                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
                    prefix[:-1] + '_offset.weight')
            if (prefix + 'conv_offset.bias' not in state_dict
                    and prefix[:-1] + '_offset.bias' in state_dict):
                state_dict[prefix + 'conv_offset.bias'] = state_dict.pop(
                    prefix[:-1] + '_offset.bias')

        if version is not None and version > 1:
            print_log(
                f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='root')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
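A usage sketch for the self-contained `Pack` variant, which predicts offsets and modulation masks from its own input and therefore drops in where a plain `nn.Conv2d` would go (illustrative; CUDA plus `_ext` assumed):

import torch

from lavis.common.annotator.uniformer.mmcv.ops.modulated_deform_conv import \
    ModulatedDeformConv2dPack

dcn = ModulatedDeformConv2dPack(16, 32, kernel_size=3, padding=1,
                                deform_groups=1).cuda()
x = torch.rand(2, 16, 28, 28, device='cuda')
out = dcn(x)  # (2, 32, 28, 28); offsets and masks come from dcn.conv_offset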
lavis/common/annotator/uniformer/mmcv/ops/multi_scale_deform_attn.py (new file, mode 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd.function import Function, once_differentiable

from annotator.uniformer.mmcv import deprecated_api_warning
from annotator.uniformer.mmcv.cnn import constant_init, xavier_init
from annotator.uniformer.mmcv.cnn.bricks.registry import ATTENTION
from annotator.uniformer.mmcv.runner import BaseModule
from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])


class MultiScaleDeformableAttnFunction(Function):

    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights, im2col_step):
        """GPU version of multi-scale deformable attention.

        Args:
            value (Tensor): The value has shape
                (bs, num_keys, num_heads, embed_dims//num_heads)
            value_spatial_shapes (Tensor): Spatial shape of
                each feature map, has shape (num_levels, 2);
                the last dimension 2 represents (h, w).
            sampling_locations (Tensor): The location of sampling points,
                has shape
                (bs, num_queries, num_heads, num_levels, num_points, 2);
                the last dimension 2 represents (x, y).
            attention_weights (Tensor): The weight of sampling points used
                when calculating the attention, has shape
                (bs, num_queries, num_heads, num_levels, num_points).
            im2col_step (Tensor): The step used in image to column.

        Returns:
            Tensor: has shape (bs, num_queries, embed_dims)
        """

        ctx.im2col_step = im2col_step
        output = ext_module.ms_deform_attn_forward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            im2col_step=ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes,
                              value_level_start_index, sampling_locations,
                              attention_weights)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """GPU version of backward function.

        Args:
            grad_output (Tensor): Gradient of output tensor of forward.

        Returns:
            Tuple[Tensor]: Gradient of input tensors in forward.
        """
        value, value_spatial_shapes, value_level_start_index, \
            sampling_locations, attention_weights = ctx.saved_tensors
        grad_value = torch.zeros_like(value)
        grad_sampling_loc = torch.zeros_like(sampling_locations)
        grad_attn_weight = torch.zeros_like(attention_weights)

        ext_module.ms_deform_attn_backward(
            value,
            value_spatial_shapes,
            value_level_start_index,
            sampling_locations,
            attention_weights,
            grad_output.contiguous(),
            grad_value,
            grad_sampling_loc,
            grad_attn_weight,
            im2col_step=ctx.im2col_step)

        return grad_value, None, None, \
            grad_sampling_loc, grad_attn_weight, None


def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes,
                                        sampling_locations,
                                        attention_weights):
    """CPU version of multi-scale deformable attention.

    Args:
        value (Tensor): The value has shape
            (bs, num_keys, num_heads, embed_dims//num_heads)
        value_spatial_shapes (Tensor): Spatial shape of
            each feature map, has shape (num_levels, 2);
            the last dimension 2 represents (h, w).
        sampling_locations (Tensor): The location of sampling points,
            has shape
            (bs, num_queries, num_heads, num_levels, num_points, 2);
            the last dimension 2 represents (x, y).
        attention_weights (Tensor): The weight of sampling points used
            when calculating the attention, has shape
            (bs, num_queries, num_heads, num_levels, num_points).

    Returns:
        Tensor: has shape (bs, num_queries, embed_dims)
    """

    bs, _, num_heads, embed_dims = value.shape
    _, num_queries, num_heads, num_levels, num_points, _ = \
        sampling_locations.shape
    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes],
                             dim=1)
    sampling_grids = 2 * sampling_locations - 1
    sampling_value_list = []
    for level, (H_, W_) in enumerate(value_spatial_shapes):
        # bs, H_*W_, num_heads, embed_dims ->
        # bs, H_*W_, num_heads*embed_dims ->
        # bs, num_heads*embed_dims, H_*W_ ->
        # bs*num_heads, embed_dims, H_, W_
        value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape(
            bs * num_heads, embed_dims, H_, W_)
        # bs, num_queries, num_heads, num_points, 2 ->
        # bs, num_heads, num_queries, num_points, 2 ->
        # bs*num_heads, num_queries, num_points, 2
        sampling_grid_l_ = sampling_grids[:, :, :,
                                          level].transpose(1, 2).flatten(0, 1)
        # bs*num_heads, embed_dims, num_queries, num_points
        sampling_value_l_ = F.grid_sample(
            value_l_,
            sampling_grid_l_,
            mode='bilinear',
            padding_mode='zeros',
            align_corners=False)
        sampling_value_list.append(sampling_value_l_)
    # (bs, num_queries, num_heads, num_levels, num_points) ->
    # (bs, num_heads, num_queries, num_levels, num_points) ->
    # (bs, num_heads, 1, num_queries, num_levels*num_points)
    attention_weights = attention_weights.transpose(1, 2).reshape(
        bs * num_heads, 1, num_queries, num_levels * num_points)
    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) *
              attention_weights).sum(-1).view(bs, num_heads * embed_dims,
                                              num_queries)
    return output.transpose(1, 2).contiguous()
@
ATTENTION
.
register_module
()
class
MultiScaleDeformableAttention
(
BaseModule
):
"""An attention module used in Deformable-Detr.
`Deformable DETR: Deformable Transformers for End-to-End Object Detection.
<https://arxiv.org/pdf/2010.04159.pdf>`_.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
num_heads (int): Parallel attention heads. Default: 64.
num_levels (int): The number of feature map used in
Attention. Default: 4.
num_points (int): The number of sampling points for
each query in each head. Default: 4.
im2col_step (int): The step used in image_to_column.
Default: 64.
dropout (float): A Dropout layer on `inp_identity`.
Default: 0.1.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default to False.
norm_cfg (dict): Config dict for normalization layer.
Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def
__init__
(
self
,
embed_dims
=
256
,
num_heads
=
8
,
num_levels
=
4
,
num_points
=
4
,
im2col_step
=
64
,
dropout
=
0.1
,
batch_first
=
False
,
norm_cfg
=
None
,
init_cfg
=
None
):
super
().
__init__
(
init_cfg
)
if
embed_dims
%
num_heads
!=
0
:
raise
ValueError
(
f
'embed_dims must be divisible by num_heads, '
f
'but got
{
embed_dims
}
and
{
num_heads
}
'
)
dim_per_head
=
embed_dims
//
num_heads
self
.
norm_cfg
=
norm_cfg
self
.
dropout
=
nn
.
Dropout
(
dropout
)
self
.
batch_first
=
batch_first
# you'd better set dim_per_head to a power of 2
# which is more efficient in the CUDA implementation
def
_is_power_of_2
(
n
):
if
(
not
isinstance
(
n
,
int
))
or
(
n
<
0
):
raise
ValueError
(
'invalid input for _is_power_of_2: {} (type: {})'
.
format
(
n
,
type
(
n
)))
return
(
n
&
(
n
-
1
)
==
0
)
and
n
!=
0
if
not
_is_power_of_2
(
dim_per_head
):
warnings
.
warn
(
"You'd better set embed_dims in "
'MultiScaleDeformAttention to make '
'the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.'
)
self
.
im2col_step
=
im2col_step
self
.
embed_dims
=
embed_dims
self
.
num_levels
=
num_levels
self
.
num_heads
=
num_heads
self
.
num_points
=
num_points
self
.
sampling_offsets
=
nn
.
Linear
(
embed_dims
,
num_heads
*
num_levels
*
num_points
*
2
)
self
.
attention_weights
=
nn
.
Linear
(
embed_dims
,
num_heads
*
num_levels
*
num_points
)
self
.
value_proj
=
nn
.
Linear
(
embed_dims
,
embed_dims
)
self
.
output_proj
=
nn
.
Linear
(
embed_dims
,
embed_dims
)
self
.
init_weights
()
def
init_weights
(
self
):
"""Default initialization for Parameters of Module."""
constant_init
(
self
.
sampling_offsets
,
0.
)
thetas
=
torch
.
arange
(
self
.
num_heads
,
dtype
=
torch
.
float32
)
*
(
2.0
*
math
.
pi
/
self
.
num_heads
)
grid_init
=
torch
.
stack
([
thetas
.
cos
(),
thetas
.
sin
()],
-
1
)
grid_init
=
(
grid_init
/
grid_init
.
abs
().
max
(
-
1
,
keepdim
=
True
)[
0
]).
view
(
self
.
num_heads
,
1
,
1
,
2
).
repeat
(
1
,
self
.
num_levels
,
self
.
num_points
,
1
)
for
i
in
range
(
self
.
num_points
):
grid_init
[:,
:,
i
,
:]
*=
i
+
1
self
.
sampling_offsets
.
bias
.
data
=
grid_init
.
view
(
-
1
)
constant_init
(
self
.
attention_weights
,
val
=
0.
,
bias
=
0.
)
xavier_init
(
self
.
value_proj
,
distribution
=
'uniform'
,
bias
=
0.
)
xavier_init
(
self
.
output_proj
,
distribution
=
'uniform'
,
bias
=
0.
)
self
.
_is_init
=
True
@
deprecated_api_warning
({
'residual'
:
'identity'
},
cls_name
=
'MultiScaleDeformableAttention'
)
def
forward
(
self
,
query
,
key
=
None
,
value
=
None
,
identity
=
None
,
query_pos
=
None
,
key_padding_mask
=
None
,
reference_points
=
None
,
spatial_shapes
=
None
,
level_start_index
=
None
,
**
kwargs
):
"""Forward Function of MultiScaleDeformAttention.
Args:
query (Tensor): Query of Transformer with shape
(num_query, bs, embed_dims).
key (Tensor): The key tensor with shape
`(num_key, bs, embed_dims)`.
value (Tensor): The value tensor with shape
`(num_key, bs, embed_dims)`.
identity (Tensor): The tensor used for addition, with the
same shape as `query`. Default None. If None,
`query` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, num_levels, 2),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different levels. With shape (num_levels, 2),
last dimension represents (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape ``(num_levels, )`` and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
Returns:
Tensor: forwarded results with shape [num_query, bs, embed_dims].
"""
if
value
is
None
:
value
=
query
if
identity
is
None
:
identity
=
query
if
query_pos
is
not
None
:
query
=
query
+
query_pos
if
not
self
.
batch_first
:
# change to (bs, num_query ,embed_dims)
query
=
query
.
permute
(
1
,
0
,
2
)
value
=
value
.
permute
(
1
,
0
,
2
)
bs
,
num_query
,
_
=
query
.
shape
bs
,
num_value
,
_
=
value
.
shape
assert
(
spatial_shapes
[:,
0
]
*
spatial_shapes
[:,
1
]).
sum
()
==
num_value
value
=
self
.
value_proj
(
value
)
if
key_padding_mask
is
not
None
:
value
=
value
.
masked_fill
(
key_padding_mask
[...,
None
],
0.0
)
value
=
value
.
view
(
bs
,
num_value
,
self
.
num_heads
,
-
1
)
sampling_offsets
=
self
.
sampling_offsets
(
query
).
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
,
self
.
num_points
,
2
)
attention_weights
=
self
.
attention_weights
(
query
).
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
*
self
.
num_points
)
attention_weights
=
attention_weights
.
softmax
(
-
1
)
attention_weights
=
attention_weights
.
view
(
bs
,
num_query
,
self
.
num_heads
,
self
.
num_levels
,
self
.
num_points
)
if
reference_points
.
shape
[
-
1
]
==
2
:
offset_normalizer
=
torch
.
stack
(
[
spatial_shapes
[...,
1
],
spatial_shapes
[...,
0
]],
-
1
)
sampling_locations
=
reference_points
[:,
:,
None
,
:,
None
,
:]
\
+
sampling_offsets
\
/
offset_normalizer
[
None
,
None
,
None
,
:,
None
,
:]
elif
reference_points
.
shape
[
-
1
]
==
4
:
sampling_locations
=
reference_points
[:,
:,
None
,
:,
None
,
:
2
]
\
+
sampling_offsets
/
self
.
num_points
\
*
reference_points
[:,
:,
None
,
:,
None
,
2
:]
\
*
0.5
else
:
raise
ValueError
(
f
'Last dim of reference_points must be'
f
' 2 or 4, but get
{
reference_points
.
shape
[
-
1
]
}
instead.'
)
if
torch
.
cuda
.
is_available
()
and
value
.
is_cuda
:
output
=
MultiScaleDeformableAttnFunction
.
apply
(
value
,
spatial_shapes
,
level_start_index
,
sampling_locations
,
attention_weights
,
self
.
im2col_step
)
else
:
output
=
multi_scale_deformable_attn_pytorch
(
value
,
spatial_shapes
,
sampling_locations
,
attention_weights
)
output
=
self
.
output_proj
(
output
)
if
not
self
.
batch_first
:
# (num_query, bs ,embed_dims)
output
=
output
.
permute
(
1
,
0
,
2
)
return
self
.
dropout
(
output
)
+
identity
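
# --- Illustrative sketch (not part of the original file): a shape-level
# smoke test for the pure-PyTorch fallback above. The sizes and the
# `_demo_ms_deform_attn` name are assumptions chosen only for illustration.
def _demo_ms_deform_attn():
    bs, num_heads, embed_dims = 2, 4, 8
    spatial_shapes = [(8, 8), (4, 4)]
    num_keys = sum(h * w for h, w in spatial_shapes)
    num_queries, num_levels, num_points = 5, 2, 4
    value = torch.rand(bs, num_keys, num_heads, embed_dims)
    sampling_locations = torch.rand(bs, num_queries, num_heads, num_levels,
                                    num_points, 2)
    # Weights normalized over all sampled points, as the module produces.
    attention_weights = torch.rand(
        bs, num_queries, num_heads,
        num_levels * num_points).softmax(-1).view(bs, num_queries, num_heads,
                                                  num_levels, num_points)
    out = multi_scale_deformable_attn_pytorch(value, spatial_shapes,
                                              sampling_locations,
                                              attention_weights)
    assert out.shape == (bs, num_queries, num_heads * embed_dims)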
lavis/common/annotator/uniformer/mmcv/ops/nms.py
import os

import numpy as np
import torch

from annotator.uniformer.mmcv.utils import deprecated_api_warning
from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated'])


# This function is modified from: https://github.com/pytorch/vision/
class NMSop(torch.autograd.Function):

    @staticmethod
    def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold,
                max_num):
        is_filtering_by_score = score_threshold > 0
        if is_filtering_by_score:
            valid_mask = scores > score_threshold
            bboxes, scores = bboxes[valid_mask], scores[valid_mask]
            valid_inds = torch.nonzero(
                valid_mask, as_tuple=False).squeeze(dim=1)

        inds = ext_module.nms(
            bboxes, scores, iou_threshold=float(iou_threshold), offset=offset)

        if max_num > 0:
            inds = inds[:max_num]
        if is_filtering_by_score:
            inds = valid_inds[inds]
        return inds

    @staticmethod
    def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold,
                 max_num):
        from ..onnx import is_custom_op_loaded
        has_custom_op = is_custom_op_loaded()
        # TensorRT nms plugin is aligned with original nms in ONNXRuntime
        is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT'
        if has_custom_op and (not is_trt_backend):
            return g.op(
                'mmcv::NonMaxSuppression',
                bboxes,
                scores,
                iou_threshold_f=float(iou_threshold),
                offset_i=int(offset))
        else:
            from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze
            from ..onnx.onnx_utils.symbolic_helper import _size_helper

            boxes = unsqueeze(g, bboxes, 0)
            scores = unsqueeze(g, unsqueeze(g, scores, 0), 0)

            if max_num > 0:
                max_num = g.op(
                    'Constant',
                    value_t=torch.tensor(max_num, dtype=torch.long))
            else:
                dim = g.op('Constant', value_t=torch.tensor(0))
                max_num = _size_helper(g, bboxes, dim)
            max_output_per_class = max_num
            iou_threshold = g.op(
                'Constant',
                value_t=torch.tensor([iou_threshold], dtype=torch.float))
            score_threshold = g.op(
                'Constant',
                value_t=torch.tensor([score_threshold], dtype=torch.float))
            nms_out = g.op('NonMaxSuppression', boxes, scores,
                           max_output_per_class, iou_threshold,
                           score_threshold)
            return squeeze(
                g,
                select(
                    g, nms_out, 1,
                    g.op(
                        'Constant',
                        value_t=torch.tensor([2], dtype=torch.long))), 1)


class SoftNMSop(torch.autograd.Function):

    @staticmethod
    def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method,
                offset):
        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
        inds = ext_module.softnms(
            boxes.cpu(),
            scores.cpu(),
            dets.cpu(),
            iou_threshold=float(iou_threshold),
            sigma=float(sigma),
            min_score=float(min_score),
            method=int(method),
            offset=int(offset))
        return dets, inds

    @staticmethod
    def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method,
                 offset):
        from packaging import version
        assert version.parse(torch.__version__) >= version.parse('1.7.0')
        nms_out = g.op(
            'mmcv::SoftNonMaxSuppression',
            boxes,
            scores,
            iou_threshold_f=float(iou_threshold),
            sigma_f=float(sigma),
            min_score_f=float(min_score),
            method_i=int(method),
            offset_i=int(offset),
            outputs=2)
        return nms_out


@deprecated_api_warning({'iou_thr': 'iou_threshold'})
def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0,
        max_num=-1):
    """Dispatch to either CPU or GPU NMS implementations.

    The input can be either a torch tensor or a numpy array. GPU NMS will be
    used if the input is a gpu tensor, otherwise CPU NMS will be used.
    The returned type will always be the same as the inputs.

    Arguments:
        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
        scores (torch.Tensor or np.ndarray): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).
        score_threshold (float): score threshold for NMS.
        max_num (int): maximum number of boxes after NMS.

    Returns:
        tuple: kept dets (boxes and scores) and indices, which always have
        the same data type as the input.

    Example:
        >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9],
        >>>                   [49.3, 32.9, 51.0, 35.3],
        >>>                   [49.2, 31.8, 51.0, 35.4],
        >>>                   [35.1, 11.5, 39.1, 15.7],
        >>>                   [35.6, 11.8, 39.3, 14.2],
        >>>                   [35.3, 11.5, 39.9, 14.5],
        >>>                   [35.2, 11.7, 39.7, 15.7]], dtype=np.float32)
        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],
        >>>                   dtype=np.float32)
        >>> iou_threshold = 0.6
        >>> dets, inds = nms(boxes, scores, iou_threshold)
        >>> assert len(inds) == len(dets) == 3
    """
    assert isinstance(boxes, (torch.Tensor, np.ndarray))
    assert isinstance(scores, (torch.Tensor, np.ndarray))
    is_numpy = False
    if isinstance(boxes, np.ndarray):
        is_numpy = True
        boxes = torch.from_numpy(boxes)
    if isinstance(scores, np.ndarray):
        scores = torch.from_numpy(scores)
    assert boxes.size(1) == 4
    assert boxes.size(0) == scores.size(0)
    assert offset in (0, 1)

    if torch.__version__ == 'parrots':
        indata_list = [boxes, scores]
        indata_dict = {
            'iou_threshold': float(iou_threshold),
            'offset': int(offset)
        }
        inds = ext_module.nms(*indata_list, **indata_dict)
    else:
        inds = NMSop.apply(boxes, scores, iou_threshold, offset,
                           score_threshold, max_num)
    dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1)
    if is_numpy:
        dets = dets.cpu().numpy()
        inds = inds.cpu().numpy()
    return dets, inds


@deprecated_api_warning({'iou_thr': 'iou_threshold'})
def soft_nms(boxes,
             scores,
             iou_threshold=0.3,
             sigma=0.5,
             min_score=1e-3,
             method='linear',
             offset=0):
    """Dispatch to only CPU Soft NMS implementations.

    The input can be either a torch tensor or a numpy array.
    The returned type will always be the same as the inputs.

    Arguments:
        boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4).
        scores (torch.Tensor or np.ndarray): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        sigma (float): hyperparameter for the gaussian method.
        min_score (float): score filter threshold.
        method (str): either 'linear' or 'gaussian'.
        offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset).

    Returns:
        tuple: kept dets (boxes and scores) and indices, which always have
        the same data type as the input.

    Example:
        >>> boxes = np.array([[4., 3., 5., 3.],
        >>>                   [4., 3., 5., 4.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.],
        >>>                   [3., 1., 3., 1.]], dtype=np.float32)
        >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0],
        >>>                   dtype=np.float32)
        >>> iou_threshold = 0.6
        >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5)
        >>> assert len(inds) == len(dets) == 5
    """
    assert isinstance(boxes, (torch.Tensor, np.ndarray))
    assert isinstance(scores, (torch.Tensor, np.ndarray))
    is_numpy = False
    if isinstance(boxes, np.ndarray):
        is_numpy = True
        boxes = torch.from_numpy(boxes)
    if isinstance(scores, np.ndarray):
        scores = torch.from_numpy(scores)
    assert boxes.size(1) == 4
    assert boxes.size(0) == scores.size(0)
    assert offset in (0, 1)
    method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2}
    assert method in method_dict.keys()

    if torch.__version__ == 'parrots':
        dets = boxes.new_empty((boxes.size(0), 5), device='cpu')
        indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()]
        indata_dict = {
            'iou_threshold': float(iou_threshold),
            'sigma': float(sigma),
            'min_score': min_score,
            'method': method_dict[method],
            'offset': int(offset)
        }
        inds = ext_module.softnms(*indata_list, **indata_dict)
    else:
        dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(),
                                     float(iou_threshold), float(sigma),
                                     float(min_score), method_dict[method],
                                     int(offset))

    dets = dets[:inds.size(0)]

    if is_numpy:
        dets = dets.cpu().numpy()
        inds = inds.cpu().numpy()
        return dets, inds
    else:
        return dets.to(device=boxes.device), inds.to(device=boxes.device)


def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False):
    """Performs non-maximum suppression in a batched fashion.

    Modified from https://github.com/pytorch/vision/blob
    /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39.
    In order to perform NMS independently per class, we add an offset to all
    the boxes. The offset is dependent only on the class idx, and is large
    enough so that boxes from different classes do not overlap.

    Arguments:
        boxes (torch.Tensor): boxes in shape (N, 4).
        scores (torch.Tensor): scores in shape (N, ).
        idxs (torch.Tensor): each index value corresponds to a bbox cluster,
            and NMS will not be applied between elements of different idxs,
            shape (N, ).
        nms_cfg (dict): specifies the nms type and other parameters like
            iou_thr. Possible keys include the following.

            - iou_thr (float): IoU threshold used for NMS.
            - split_thr (float): threshold number of boxes. In some cases the
              number of boxes is large (e.g., 200k). To avoid OOM during
              training, users can set `split_thr` to a small value.
              If the number of boxes is greater than the threshold, it will
              perform NMS on each group of boxes separately and sequentially.
              Defaults to 10000.
        class_agnostic (bool): if true, nms is class agnostic,
            i.e. IoU thresholding happens over all boxes,
            regardless of the predicted class.

    Returns:
        tuple: kept dets and indices.
    """
    nms_cfg_ = nms_cfg.copy()
    class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic)
    if class_agnostic:
        boxes_for_nms = boxes
    else:
        max_coordinate = boxes.max()
        offsets = idxs.to(boxes) * (
            max_coordinate + torch.tensor(1).to(boxes))
        boxes_for_nms = boxes + offsets[:, None]

    nms_type = nms_cfg_.pop('type', 'nms')
    nms_op = eval(nms_type)

    split_thr = nms_cfg_.pop('split_thr', 10000)
    # Won't split to multiple nms nodes when exporting to onnx
    if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export():
        dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_)
        boxes = boxes[keep]
        # -1 indexing works abnormally in TensorRT.
        # This assumes `dets` has 5 dimensions where
        # the last dimension is the score.
        # TODO: more elegant way to handle the dimension issue.
        # Some types of nms reweight the score, such as SoftNMS.
        scores = dets[:, 4]
    else:
        max_num = nms_cfg_.pop('max_num', -1)
        total_mask = scores.new_zeros(scores.size(), dtype=torch.bool)
        # Some types of nms reweight the score, such as SoftNMS.
        scores_after_nms = scores.new_zeros(scores.size())
        for id in torch.unique(idxs):
            mask = (idxs == id).nonzero(as_tuple=False).view(-1)
            dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_)
            total_mask[mask[keep]] = True
            scores_after_nms[mask[keep]] = dets[:, -1]
        keep = total_mask.nonzero(as_tuple=False).view(-1)

        scores, inds = scores_after_nms[keep].sort(descending=True)
        keep = keep[inds]
        boxes = boxes[keep]

        if max_num > 0:
            keep = keep[:max_num]
            boxes = boxes[:max_num]
            scores = scores[:max_num]

    return torch.cat([boxes, scores[:, None]], -1), keep
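
# --- Illustrative sketch (not part of the original file): per-class NMS via
# `batched_nms`. Requires the compiled `_ext` extension; the boxes and the
# `_demo_batched_nms` name are assumptions for illustration only.
def _demo_batched_nms():
    # Two overlapping class-0 boxes and one identical class-1 box; thanks to
    # the per-class offsets the class-1 box is kept despite the overlap.
    boxes = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [0., 0., 10., 10.]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    idxs = torch.tensor([0, 0, 1])
    dets, keep = batched_nms(boxes, scores, idxs,
                             dict(type='nms', iou_threshold=0.5))
    # dets is (num_kept, 5): the kept boxes with their scores appended.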
def nms_match(dets, iou_threshold):
    """Match dets into different groups by NMS.

    NMS match is similar to NMS but, when a bbox is suppressed, nms match
    will record the index of the suppressed bbox and form a group with the
    index of the kept bbox. In each group, indices are sorted in score order.

    Arguments:
        dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5).
        iou_thr (float): IoU threshold for NMS.

    Returns:
        List[torch.Tensor | np.ndarray]: The outer list corresponds to
        different matched groups; the inner Tensor holds the indices for a
        group in score order.
    """
    if dets.shape[0] == 0:
        matched = []
    else:
        assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \
                                    f'but get {dets.shape}'
        if isinstance(dets, torch.Tensor):
            dets_t = dets.detach().cpu()
        else:
            dets_t = torch.from_numpy(dets)
        indata_list = [dets_t]
        indata_dict = {'iou_threshold': float(iou_threshold)}
        matched = ext_module.nms_match(*indata_list, **indata_dict)
        if torch.__version__ == 'parrots':
            matched = matched.tolist()

    if isinstance(dets, torch.Tensor):
        return [dets.new_tensor(m, dtype=torch.long) for m in matched]
    else:
        return [np.array(m, dtype=np.int64) for m in matched]


def nms_rotated(dets, scores, iou_threshold, labels=None):
    """Performs non-maximum suppression (NMS) on the rotated boxes according
    to their intersection-over-union (IoU).

    Rotated NMS iteratively removes lower scoring rotated boxes which have an
    IoU greater than iou_threshold with another (higher scoring) rotated box.

    Args:
        boxes (Tensor): Rotated boxes in shape (N, 5). They are expected to
            be in (x_ctr, y_ctr, width, height, angle_radian) format.
        scores (Tensor): scores in shape (N, ).
        iou_threshold (float): IoU threshold for NMS.
        labels (Tensor): boxes' label in shape (N, ).

    Returns:
        tuple: kept dets (boxes and scores) and indices, which always have
        the same data type as the input.
    """
    if dets.shape[0] == 0:
        return dets, None
    multi_label = labels is not None
    if multi_label:
        dets_wl = torch.cat((dets, labels.unsqueeze(1)), 1)
    else:
        dets_wl = dets
    _, order = scores.sort(0, descending=True)
    dets_sorted = dets_wl.index_select(0, order)

    if torch.__version__ == 'parrots':
        keep_inds = ext_module.nms_rotated(
            dets_wl,
            scores,
            order,
            dets_sorted,
            iou_threshold=iou_threshold,
            multi_label=multi_label)
    else:
        keep_inds = ext_module.nms_rotated(dets_wl, scores, order,
                                           dets_sorted, iou_threshold,
                                           multi_label)
    dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
                     dim=1)
    return dets, keep_inds
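
# --- Illustrative sketch (not part of the original file): rotated NMS on
# two nearly coincident boxes. Requires the compiled `_ext` extension; the
# values and the `_demo_nms_rotated` name are illustrative assumptions.
def _demo_nms_rotated():
    dets = torch.tensor([[50., 50., 20., 10., 0.00],
                         [50., 50., 20., 10., 0.05],   # nearly the same box
                         [10., 10., 5., 5., 1.00]])
    scores = torch.tensor([0.9, 0.8, 0.7])
    kept, keep_inds = nms_rotated(dets, scores, iou_threshold=0.5)
    # kept is (num_kept, 6): the 5 rotated-box parameters plus the score.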
lavis/common/annotator/uniformer/mmcv/ops/pixel_group.py
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['pixel_group'])


def pixel_group(score, mask, embedding, kernel_label, kernel_contour,
                kernel_region_num, distance_threshold):
    """Group pixels into text instances, which is widely used in text
    detection methods.

    Arguments:
        score (np.array or Tensor): The foreground score with size hxw.
        mask (np.array or Tensor): The foreground mask with size hxw.
        embedding (np.array or Tensor): The embedding with size hxwxc to
            distinguish instances.
        kernel_label (np.array or Tensor): The instance kernel index with
            size hxw.
        kernel_contour (np.array or Tensor): The kernel contour with size hxw.
        kernel_region_num (int): The instance kernel region number.
        distance_threshold (float): The embedding distance threshold between
            kernel and pixel in one instance.

    Returns:
        pixel_assignment (List[List[float]]): The instance coordinate list.
            Each element consists of averaged confidence, pixel number, and
            coordinates (x_i, y_i for all pixels) in order.
    """
    assert isinstance(score, (torch.Tensor, np.ndarray))
    assert isinstance(mask, (torch.Tensor, np.ndarray))
    assert isinstance(embedding, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_label, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_contour, (torch.Tensor, np.ndarray))
    assert isinstance(kernel_region_num, int)
    assert isinstance(distance_threshold, float)

    if isinstance(score, np.ndarray):
        score = torch.from_numpy(score)
    if isinstance(mask, np.ndarray):
        mask = torch.from_numpy(mask)
    if isinstance(embedding, np.ndarray):
        embedding = torch.from_numpy(embedding)
    if isinstance(kernel_label, np.ndarray):
        kernel_label = torch.from_numpy(kernel_label)
    if isinstance(kernel_contour, np.ndarray):
        kernel_contour = torch.from_numpy(kernel_contour)

    if torch.__version__ == 'parrots':
        label = ext_module.pixel_group(
            score,
            mask,
            embedding,
            kernel_label,
            kernel_contour,
            kernel_region_num=kernel_region_num,
            distance_threshold=distance_threshold)
        label = label.tolist()
        label = label[0]
        list_index = kernel_region_num
        pixel_assignment = []
        for x in range(kernel_region_num):
            pixel_assignment.append(
                np.array(
                    label[list_index:list_index + int(label[x])],
                    dtype=np.float64))
            list_index = list_index + int(label[x])
    else:
        pixel_assignment = ext_module.pixel_group(score, mask, embedding,
                                                  kernel_label,
                                                  kernel_contour,
                                                  kernel_region_num,
                                                  distance_threshold)
    return pixel_assignment
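
# --- Illustrative sketch (not part of the original file): grouping pixels
# around a single kernel region. Requires the compiled `_ext` extension;
# shapes, dtypes and the `_demo_pixel_group` name are illustrative
# assumptions and may need adjusting to the op's exact expectations.
def _demo_pixel_group():
    h, w, c = 4, 4, 2
    score = torch.rand(h, w).float()
    mask = score > 0.5
    embedding = torch.rand(h, w, c).float()
    kernel_label = torch.zeros(h, w, dtype=torch.int32)
    kernel_label[1:3, 1:3] = 1                     # one kernel region
    kernel_contour = torch.zeros(h, w, dtype=torch.uint8)
    assignment = pixel_group(score, mask, embedding, kernel_label,
                             kernel_contour, 2, 0.8)
    # Each assignment entry holds the averaged confidence, the pixel count,
    # and then the (x_i, y_i) coordinates of that instance.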
lavis/common/annotator/uniformer/mmcv/ops/point_sample.py
# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa
from os import path as osp

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.utils import _pair
from torch.onnx.operators import shape_as_tensor


def bilinear_grid_sample(im, grid, align_corners=False):
    """Given an input and a flow-field grid, computes the output using input
    values and pixel locations from the grid. Only the bilinear interpolation
    method is supported for sampling the input pixels.

    Args:
        im (torch.Tensor): Input feature map, shape (N, C, H, W)
        grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2)
        align_corners (bool): If set to True, the extrema (-1 and 1) are
            considered as referring to the center points of the input's
            corner pixels. If set to False, they are instead considered as
            referring to the corner points of the input's corner pixels,
            making the sampling more resolution agnostic.

    Returns:
        torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg)
    """
    n, c, h, w = im.shape
    gn, gh, gw, _ = grid.shape
    assert n == gn

    x = grid[:, :, :, 0]
    y = grid[:, :, :, 1]

    if align_corners:
        x = ((x + 1) / 2) * (w - 1)
        y = ((y + 1) / 2) * (h - 1)
    else:
        x = ((x + 1) * w - 1) / 2
        y = ((y + 1) * h - 1) / 2

    x = x.view(n, -1)
    y = y.view(n, -1)

    x0 = torch.floor(x).long()
    y0 = torch.floor(y).long()
    x1 = x0 + 1
    y1 = y0 + 1

    wa = ((x1 - x) * (y1 - y)).unsqueeze(1)
    wb = ((x1 - x) * (y - y0)).unsqueeze(1)
    wc = ((x - x0) * (y1 - y)).unsqueeze(1)
    wd = ((x - x0) * (y - y0)).unsqueeze(1)

    # Apply default for grid_sample function zero padding
    im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0)
    padded_h = h + 2
    padded_w = w + 2
    # save points positions after padding
    x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1

    # Clip coordinates to padded image size
    x0 = torch.where(x0 < 0, torch.tensor(0), x0)
    x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0)
    x1 = torch.where(x1 < 0, torch.tensor(0), x1)
    x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1)
    y0 = torch.where(y0 < 0, torch.tensor(0), y0)
    y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0)
    y1 = torch.where(y1 < 0, torch.tensor(0), y1)
    y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1)

    im_padded = im_padded.view(n, c, -1)

    x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1)
    x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1)

    Ia = torch.gather(im_padded, 2, x0_y0)
    Ib = torch.gather(im_padded, 2, x0_y1)
    Ic = torch.gather(im_padded, 2, x1_y0)
    Id = torch.gather(im_padded, 2, x1_y1)

    return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw)
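
# --- Illustrative sketch (not part of the original file): the routine above
# exists as an ONNX-exportable stand-in for F.grid_sample with zero padding,
# so the two paths should agree numerically. The sizes and the
# `_demo_bilinear_grid_sample` name are illustrative assumptions.
def _demo_bilinear_grid_sample():
    im = torch.rand(1, 3, 8, 8)
    grid = torch.rand(1, 5, 5, 2) * 2 - 1   # coordinates in [-1, 1]
    ref = F.grid_sample(
        im, grid, mode='bilinear', padding_mode='zeros', align_corners=False)
    out = bilinear_grid_sample(im, grid, align_corners=False)
    assert torch.allclose(ref, out, atol=1e-5)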
def is_in_onnx_export_without_custom_ops():
    from annotator.uniformer.mmcv.ops import get_onnxruntime_op_path
    ort_custom_op_path = get_onnxruntime_op_path()
    return torch.onnx.is_in_onnx_export(
    ) and not osp.exists(ort_custom_op_path)


def normalize(grid):
    """Normalize input grid from [-1, 1] to [0, 1].

    Args:
        grid (Tensor): The grid to be normalized, range [-1, 1].

    Returns:
        Tensor: Normalized grid, range [0, 1].
    """
    return (grid + 1.0) / 2.0


def denormalize(grid):
    """Denormalize input grid from range [0, 1] to [-1, 1].

    Args:
        grid (Tensor): The grid to be denormalized, range [0, 1].

    Returns:
        Tensor: Denormalized grid, range [-1, 1].
    """
    return grid * 2.0 - 1.0


def generate_grid(num_grid, size, device):
    """Generate regular square grid of points in [0, 1] x [0, 1] coordinate
    space.

    Args:
        num_grid (int): The number of grids to sample, one for each region.
        size (tuple(int, int)): The side size of the regular grid.
        device (torch.device): Desired device of returned tensor.

    Returns:
        (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that
            contains coordinates for the regular grids.
    """

    affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device)
    grid = F.affine_grid(
        affine_trans, torch.Size((1, 1, *size)), align_corners=False)
    grid = normalize(grid)
    return grid.view(1, -1, 2).expand(num_grid, -1, -1)


def rel_roi_point_to_abs_img_point(rois, rel_roi_points):
    """Convert roi based relative point coordinates to image based absolute
    point coordinates.

    Args:
        rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            RoI, location, range (0, 1), shape (N, P, 2)

    Returns:
        Tensor: Image based absolute point coordinates, shape (N, P, 2)
    """

    with torch.no_grad():
        assert rel_roi_points.size(0) == rois.size(0)
        assert rois.dim() == 2
        assert rel_roi_points.dim() == 3
        assert rel_roi_points.size(2) == 2
        # remove batch idx
        if rois.size(1) == 5:
            rois = rois[:, 1:]
        abs_img_points = rel_roi_points.clone()
        # To avoid an error during exporting to onnx, use independent
        # variables instead of inplace computation.
        xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0])
        ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1])
        xs += rois[:, None, 0]
        ys += rois[:, None, 1]
        abs_img_points = torch.stack([xs, ys], dim=2)
    return abs_img_points


def get_shape_from_feature_map(x):
    """Get spatial resolution of input feature map considering exporting to
    onnx mode.

    Args:
        x (torch.Tensor): Input tensor, shape (N, C, H, W)

    Returns:
        torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2)
    """
    if torch.onnx.is_in_onnx_export():
        img_shape = shape_as_tensor(x)[2:].flip(0).view(1, 1, 2).to(
            x.device).float()
    else:
        img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1, 2).to(
            x.device).float()
    return img_shape


def abs_img_point_to_rel_img_point(abs_img_points, img, spatial_scale=1.):
    """Convert image based absolute point coordinates to image based relative
    coordinates for sampling.

    Args:
        abs_img_points (Tensor): Image based absolute point coordinates,
            shape (N, P, 2)
        img (tuple/Tensor): (height, width) of image or feature map.
        spatial_scale (float): Scale points by this factor. Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling,
            shape (N, P, 2)
    """

    assert (isinstance(img, tuple) and len(img) == 2) or \
           (isinstance(img, torch.Tensor) and len(img.shape) == 4)

    if isinstance(img, tuple):
        h, w = img
        scale = torch.tensor([w, h],
                             dtype=torch.float,
                             device=abs_img_points.device)
        scale = scale.view(1, 1, 2)
    else:
        scale = get_shape_from_feature_map(img)

    return abs_img_points / scale * spatial_scale


def rel_roi_point_to_rel_img_point(rois,
                                   rel_roi_points,
                                   img,
                                   spatial_scale=1.):
    """Convert roi based relative point coordinates to image based relative
    point coordinates.

    Args:
        rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5)
        rel_roi_points (Tensor): Point coordinates inside RoI, relative to
            RoI, location, range (0, 1), shape (N, P, 2)
        img (tuple/Tensor): (height, width) of image or feature map.
        spatial_scale (float): Scale points by this factor. Default: 1.

    Returns:
        Tensor: Image based relative point coordinates for sampling,
            shape (N, P, 2)
    """

    abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points)
    rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img,
                                                   spatial_scale)

    return rel_img_point


def point_sample(input, points, align_corners=False, **kwargs):
    """A wrapper around :func:`grid_sample` to support 3D point_coords
    tensors. Unlike :func:`torch.nn.functional.grid_sample` it assumes
    point_coords to lie inside the ``[0, 1] x [0, 1]`` square.

    Args:
        input (Tensor): Feature map, shape (N, C, H, W).
        points (Tensor): Image based absolute point coordinates (normalized),
            range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2).
        align_corners (bool): Whether align_corners. Default: False

    Returns:
        Tensor: Features of `point` on `input`, shape (N, C, P) or
            (N, C, Hgrid, Wgrid).
    """

    add_dim = False
    if points.dim() == 3:
        add_dim = True
        points = points.unsqueeze(2)
    if is_in_onnx_export_without_custom_ops():
        # If custom ops for onnx runtime are not compiled, use the python
        # implementation of the grid_sample function to make an onnx graph
        # with supported nodes.
        output = bilinear_grid_sample(
            input, denormalize(points), align_corners=align_corners)
    else:
        output = F.grid_sample(
            input, denormalize(points), align_corners=align_corners, **kwargs)
    if add_dim:
        output = output.squeeze(3)
    return output
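
# --- Illustrative sketch (not part of the original file): reading features
# at normalized [0, 1] locations with `point_sample`. The values and the
# `_demo_point_sample` name are illustrative assumptions.
def _demo_point_sample():
    feats = torch.arange(16.).view(1, 1, 4, 4)       # one 4x4 feature map
    pts = torch.tensor([[[0.5, 0.5], [0.0, 0.0]]])   # (N, P, 2) in [0, 1]
    sampled = point_sample(feats, pts)               # shape (1, 1, 2)
    # The first point samples the map center; the second samples near the
    # top-left corner.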
class SimpleRoIAlign(nn.Module):

    def __init__(self, output_size, spatial_scale, aligned=True):
        """Simple RoI align in PointRend, faster than standard RoIAlign.

        Args:
            output_size (tuple[int]): h, w
            spatial_scale (float): scale the input boxes by this number
            aligned (bool): if False, use the legacy implementation in
                MMDetection, and align_corners=True will be used in
                F.grid_sample. If True, align the results more perfectly.
        """
        super(SimpleRoIAlign, self).__init__()
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        # to be consistent with other RoI ops
        self.use_torchvision = False
        self.aligned = aligned

    def forward(self, features, rois):
        num_imgs = features.size(0)
        num_rois = rois.size(0)
        rel_roi_points = generate_grid(
            num_rois, self.output_size, device=rois.device)

        if torch.onnx.is_in_onnx_export():
            rel_img_points = rel_roi_point_to_rel_img_point(
                rois, rel_roi_points, features, self.spatial_scale)
            rel_img_points = rel_img_points.reshape(num_imgs, -1,
                                                    *rel_img_points.shape[1:])
            point_feats = point_sample(
                features, rel_img_points, align_corners=not self.aligned)
            point_feats = point_feats.transpose(1, 2)
        else:
            point_feats = []
            for batch_ind in range(num_imgs):
                # unravel batch dim
                feat = features[batch_ind].unsqueeze(0)
                inds = (rois[:, 0].long() == batch_ind)
                if inds.any():
                    rel_img_points = rel_roi_point_to_rel_img_point(
                        rois[inds], rel_roi_points[inds], feat,
                        self.spatial_scale).unsqueeze(0)
                    point_feat = point_sample(
                        feat, rel_img_points, align_corners=not self.aligned)
                    point_feat = point_feat.squeeze(0).transpose(0, 1)
                    point_feats.append(point_feat)

            point_feats = torch.cat(point_feats, dim=0)

        channels = features.size(1)
        roi_feats = point_feats.reshape(num_rois, channels,
                                        *self.output_size)

        return roi_feats

    def __repr__(self):
        format_str = self.__class__.__name__
        format_str += '(output_size={}, spatial_scale={})'.format(
            self.output_size, self.spatial_scale)
        return format_str
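
# --- Illustrative sketch (not part of the original file): extracting 7x7
# point-sampled RoI features. The sizes and the `_demo_simple_roi_align`
# name are illustrative assumptions.
def _demo_simple_roi_align():
    layer = SimpleRoIAlign(output_size=7, spatial_scale=1.0)
    feats = torch.rand(2, 16, 32, 32)        # (N, C, H, W)
    # rois are (batch_idx, x1, y1, x2, y2) in image coordinates
    rois = torch.tensor([[0., 4., 4., 20., 20.],
                         [1., 0., 0., 16., 16.]])
    roi_feats = layer(feats, rois)           # shape (2, 16, 7, 7)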
lavis/common/annotator/uniformer/mmcv/ops/points_in_boxes.py
import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward',
    'points_in_boxes_all_forward'
])


def points_in_boxes_part(points, boxes):
    """Find the box in which each point is (CUDA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in
            LiDAR/DEPTH coordinate, (x, y, z) is the bottom center

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M), default background = -1
    """
    assert points.shape[0] == boxes.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape

    box_idxs_of_pts = points.new_zeros((batch_size, num_points),
                                       dtype=torch.int).fill_(-1)

    # If the tensor 'points' or 'boxes' is manually put on a device
    # which is not the current device, some temporary variables
    # will be created on the current device in the cuda op,
    # and the output will be incorrect.
    # Therefore, we force the current device to be the same
    # as the device of the tensors if it was not.
    # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305
    # for the incorrect output before the fix.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

    ext_module.points_in_boxes_part_forward(boxes.contiguous(),
                                            points.contiguous(),
                                            box_idxs_of_pts)

    return box_idxs_of_pts


def points_in_boxes_cpu(points, boxes):
    """Find all boxes in which each point is (CPU). The CPU version of
    :meth:`points_in_boxes_all`.

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in
            LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert points.shape[0] == boxes.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {points.shape[0]} and {boxes.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    point_indices = points.new_zeros((batch_size, num_boxes, num_points),
                                     dtype=torch.int)
    for b in range(batch_size):
        ext_module.points_in_boxes_cpu_forward(boxes[b].float().contiguous(),
                                               points[b].float().contiguous(),
                                               point_indices[b])
    point_indices = point_indices.transpose(1, 2)

    return point_indices


def points_in_boxes_all(points, boxes):
    """Find all boxes in which each point is (CUDA).

    Args:
        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate
        boxes (torch.Tensor): [B, T, 7],
            num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz],
            (x, y, z) is the bottom center.

    Returns:
        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0.
    """
    assert boxes.shape[0] == points.shape[0], \
        'Points and boxes should have the same batch size, ' \
        f'but got {boxes.shape[0]} and {points.shape[0]}'
    assert boxes.shape[2] == 7, \
        'boxes dimension should be 7, ' \
        f'but got unexpected shape {boxes.shape[2]}'
    assert points.shape[2] == 3, \
        'points dimension should be 3, ' \
        f'but got unexpected shape {points.shape[2]}'
    batch_size, num_points, _ = points.shape
    num_boxes = boxes.shape[1]

    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
                                       dtype=torch.int).fill_(0)

    # Same device-alignment reason as in points_in_boxes_part above.
    points_device = points.get_device()
    assert points_device == boxes.get_device(), \
        'Points and boxes should be put on the same device'
    if torch.cuda.current_device() != points_device:
        torch.cuda.set_device(points_device)

    ext_module.points_in_boxes_all_forward(boxes.contiguous(),
                                           points.contiguous(),
                                           box_idxs_of_pts)

    return box_idxs_of_pts
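
# --- Illustrative sketch (not part of the original file): one axis-aligned
# box and two points, one inside and one outside. Requires the compiled
# `_ext` extension; the `_demo_points_in_boxes` name is an assumption.
def _demo_points_in_boxes():
    # A box centered at (0, 0) with its bottom at z=0, size 2x2x2, no yaw.
    boxes = torch.tensor([[[0., 0., 0., 2., 2., 2., 0.]]])
    points = torch.tensor([[[0., 0., 1.],     # inside the box
                            [5., 5., 5.]]])   # far outside
    idx = points_in_boxes_cpu(points, boxes)  # shape (1, 2, 1)
    # Expected: idx[0, 0, 0] == 1 and idx[0, 1, 0] == 0.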
lavis/common/annotator/uniformer/mmcv/ops/points_sampler.py
from typing import List

import torch
from torch import nn

from annotator.uniformer.mmcv.runner import force_fp32
from .furthest_point_sample import (furthest_point_sample,
                                    furthest_point_sample_with_dist)


def calc_square_dist(point_feat_a, point_feat_b, norm=True):
    """Calculate the square distance between a and b.

    Args:
        point_feat_a (Tensor): (B, N, C) Feature vector of each point.
        point_feat_b (Tensor): (B, M, C) Feature vector of each point.
        norm (Bool, optional): Whether to normalize the distance.
            Default: True.

    Returns:
        Tensor: (B, N, M) Distance between each pair of points.
    """
    num_channel = point_feat_a.shape[-1]
    # [bs, n, 1]
    a_square = torch.sum(point_feat_a.unsqueeze(dim=2).pow(2), dim=-1)
    # [bs, 1, m]
    b_square = torch.sum(point_feat_b.unsqueeze(dim=1).pow(2), dim=-1)

    corr_matrix = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2))

    dist = a_square + b_square - 2 * corr_matrix
    if norm:
        dist = torch.sqrt(dist) / num_channel
    return dist
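
# --- Illustrative sketch (not part of the original file): the function
# expands ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, so with norm=False it
# should match torch.cdist squared. The `_check_calc_square_dist` name is
# an assumption for illustration.
def _check_calc_square_dist():
    a = torch.rand(2, 5, 3)   # (B, N, C)
    b = torch.rand(2, 4, 3)   # (B, M, C)
    sq = calc_square_dist(a, b, norm=False)
    ref = torch.cdist(a, b) ** 2
    assert torch.allclose(sq, ref, atol=1e-5)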
def get_sampler_cls(sampler_type):
    """Get the type and mode of points sampler.

    Args:
        sampler_type (str): The type of points sampler.
            The valid values are "D-FPS", "F-FPS", or "FS".

    Returns:
        class: Points sampler type.
    """
    sampler_mappings = {
        'D-FPS': DFPSSampler,
        'F-FPS': FFPSSampler,
        'FS': FSSampler,
    }
    try:
        return sampler_mappings[sampler_type]
    except KeyError:
        raise KeyError(
            f'Supported `sampler_type` are {sampler_mappings.keys()}, '
            f'but got {sampler_type}')


class PointsSampler(nn.Module):
    """Points sampling.

    Args:
        num_point (list[int]): Number of sample points.
        fps_mod_list (list[str], optional): Type of FPS method, valid mods
            ['F-FPS', 'D-FPS', 'FS']. Default: ['D-FPS'].
            F-FPS: using feature distances for FPS.
            D-FPS: using Euclidean distances of points for FPS.
            FS: using F-FPS and D-FPS simultaneously.
        fps_sample_range_list (list[int], optional):
            Range of points to apply FPS. Default: [-1].
    """

    def __init__(self,
                 num_point: List[int],
                 fps_mod_list: List[str] = ['D-FPS'],
                 fps_sample_range_list: List[int] = [-1]):
        super().__init__()
        # FPS would be applied to each fps_mod in the list,
        # so the length of num_point should be equal to those of
        # fps_mod_list and fps_sample_range_list.
        assert len(num_point) == len(fps_mod_list) == len(
            fps_sample_range_list)
        self.num_point = num_point
        self.fps_sample_range_list = fps_sample_range_list
        self.samplers = nn.ModuleList()
        for fps_mod in fps_mod_list:
            self.samplers.append(get_sampler_cls(fps_mod)())
        self.fp16_enabled = False

    @force_fp32()
    def forward(self, points_xyz, features):
        """
        Args:
            points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            features (Tensor): (B, C, N) Descriptors of the features.

        Returns:
            Tensor: (B, npoint, sample_num) Indices of sampled points.
        """
        indices = []
        last_fps_end_index = 0

        for fps_sample_range, sampler, npoint in zip(
                self.fps_sample_range_list, self.samplers, self.num_point):
            assert fps_sample_range < points_xyz.shape[1]

            if fps_sample_range == -1:
                sample_points_xyz = points_xyz[:, last_fps_end_index:]
                if features is not None:
                    sample_features = features[:, :, last_fps_end_index:]
                else:
                    sample_features = None
            else:
                sample_points_xyz = \
                    points_xyz[:, last_fps_end_index:fps_sample_range]
                if features is not None:
                    sample_features = features[:, :, last_fps_end_index:
                                               fps_sample_range]
                else:
                    sample_features = None

            fps_idx = sampler(sample_points_xyz.contiguous(),
                              sample_features, npoint)

            indices.append(fps_idx + last_fps_end_index)
            last_fps_end_index += fps_sample_range

        indices = torch.cat(indices, dim=1)

        return indices


class DFPSSampler(nn.Module):
    """Using Euclidean distances of points for FPS."""

    def __init__(self):
        super().__init__()

    def forward(self, points, features, npoint):
        """Sampling points with D-FPS."""
        fps_idx = furthest_point_sample(points.contiguous(), npoint)
        return fps_idx


class FFPSSampler(nn.Module):
    """Using feature distances for FPS."""

    def __init__(self):
        super().__init__()

    def forward(self, points, features, npoint):
        """Sampling points with F-FPS."""
        assert features is not None, \
            'feature input to FFPS_Sampler should not be None'
        features_for_fps = torch.cat([points, features.transpose(1, 2)],
                                     dim=2)
        features_dist = calc_square_dist(
            features_for_fps, features_for_fps, norm=False)
        fps_idx = furthest_point_sample_with_dist(features_dist, npoint)
        return fps_idx


class FSSampler(nn.Module):
    """Using F-FPS and D-FPS simultaneously."""

    def __init__(self):
        super().__init__()

    def forward(self, points, features, npoint):
        """Sampling points with FS_Sampling."""
        assert features is not None, \
            'feature input to FS_Sampler should not be None'
        ffps_sampler = FFPSSampler()
        dfps_sampler = DFPSSampler()
        fps_idx_ffps = ffps_sampler(points, features, npoint)
        fps_idx_dfps = dfps_sampler(points, features, npoint)
        fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1)
        return fps_idx
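
# --- Illustrative sketch (not part of the original file): D-FPS sampling 16
# of 1024 points. Requires CUDA and the compiled furthest_point_sample op;
# the sizes and the `_demo_points_sampler` name are assumptions.
def _demo_points_sampler():
    sampler = PointsSampler(
        num_point=[16], fps_mod_list=['D-FPS'], fps_sample_range_list=[-1])
    points_xyz = torch.rand(2, 1024, 3).cuda()   # (B, N, 3)
    features = torch.rand(2, 32, 1024).cuda()    # (B, C, N)
    idx = sampler(points_xyz, features)          # (B, 16) sampled indices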
lavis/common/annotator/uniformer/mmcv/ops/psa_mask.py
# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa
from torch import nn
from torch.autograd import Function
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext',
                                 ['psamask_forward', 'psamask_backward'])


class PSAMaskFunction(Function):

    @staticmethod
    def symbolic(g, input, psa_type, mask_size):
        return g.op(
            'mmcv::MMCVPSAMask',
            input,
            psa_type_i=psa_type,
            mask_size_i=mask_size)

    @staticmethod
    def forward(ctx, input, psa_type, mask_size):
        ctx.psa_type = psa_type
        ctx.mask_size = _pair(mask_size)
        ctx.save_for_backward(input)

        h_mask, w_mask = ctx.mask_size
        batch_size, channels, h_feature, w_feature = input.size()
        assert channels == h_mask * w_mask
        output = input.new_zeros(
            (batch_size, h_feature * w_feature, h_feature, w_feature))

        ext_module.psamask_forward(
            input,
            output,
            psa_type=psa_type,
            num_=batch_size,
            h_feature=h_feature,
            w_feature=w_feature,
            h_mask=h_mask,
            w_mask=w_mask,
            half_h_mask=(h_mask - 1) // 2,
            half_w_mask=(w_mask - 1) // 2)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input = ctx.saved_tensors[0]
        psa_type = ctx.psa_type
        h_mask, w_mask = ctx.mask_size
        batch_size, channels, h_feature, w_feature = input.size()
        grad_input = grad_output.new_zeros(
            (batch_size, channels, h_feature, w_feature))
        ext_module.psamask_backward(
            grad_output,
            grad_input,
            psa_type=psa_type,
            num_=batch_size,
            h_feature=h_feature,
            w_feature=w_feature,
            h_mask=h_mask,
            w_mask=w_mask,
            half_h_mask=(h_mask - 1) // 2,
            half_w_mask=(w_mask - 1) // 2)
        return grad_input, None, None, None


psa_mask = PSAMaskFunction.apply


class PSAMask(nn.Module):

    def __init__(self, psa_type, mask_size=None):
        super(PSAMask, self).__init__()
        assert psa_type in ['collect', 'distribute']
        if psa_type == 'collect':
            psa_type_enum = 0
        else:
            psa_type_enum = 1
        self.psa_type_enum = psa_type_enum
        self.mask_size = mask_size
        self.psa_type = psa_type

    def forward(self, input):
        return psa_mask(input, self.psa_type_enum, self.mask_size)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(psa_type={self.psa_type}, '
        s += f'mask_size={self.mask_size})'
        return s
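
# --- Illustrative sketch (not part of the original file): for a mask_size
# of (h, w) the input channel count must equal h*w. Requires the compiled
# `_ext` extension; the `_demo_psa_mask` name is an assumption.
def _demo_psa_mask():
    import torch
    layer = PSAMask('collect', mask_size=(3, 3))
    x = torch.rand(1, 9, 16, 16)   # channels == 3 * 3
    out = layer(x)                 # shape (1, 16 * 16, 16, 16)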
lavis/common/annotator/uniformer/mmcv/ops/roi_align.py
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import deprecated_api_warning, ext_loader

ext_module = ext_loader.load_ext('_ext',
                                 ['roi_align_forward', 'roi_align_backward'])


class RoIAlignFunction(Function):

    @staticmethod
    def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio,
                 pool_mode, aligned):
        from ..onnx import is_custom_op_loaded
        has_custom_op = is_custom_op_loaded()
        if has_custom_op:
            return g.op(
                'mmcv::MMCVRoiAlign',
                input,
                rois,
                output_height_i=output_size[0],
                output_width_i=output_size[1],
                spatial_scale_f=spatial_scale,
                sampling_ratio_i=sampling_ratio,
                mode_s=pool_mode,
                aligned_i=aligned)
        else:
            from torch.onnx.symbolic_opset9 import sub, squeeze
            from torch.onnx.symbolic_helper import _slice_helper
            from torch.onnx import TensorProtoDataType
            # batch_indices = rois[:, 0].long()
            batch_indices = _slice_helper(
                g, rois, axes=[1], starts=[0], ends=[1])
            batch_indices = squeeze(g, batch_indices, 1)
            batch_indices = g.op(
                'Cast', batch_indices, to_i=TensorProtoDataType.INT64)
            # rois = rois[:, 1:]
            rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5])
            if aligned:
                # rois -= 0.5/spatial_scale
                aligned_offset = g.op(
                    'Constant',
                    value_t=torch.tensor([0.5 / spatial_scale],
                                         dtype=torch.float32))
                rois = sub(g, rois, aligned_offset)
            # roi align
            return g.op(
                'RoiAlign',
                input,
                rois,
                batch_indices,
                output_height_i=output_size[0],
                output_width_i=output_size[1],
                spatial_scale_f=spatial_scale,
                sampling_ratio_i=max(0, sampling_ratio),
                mode_s=pool_mode)

    @staticmethod
    def forward(ctx,
                input,
                rois,
                output_size,
                spatial_scale=1.0,
                sampling_ratio=0,
                pool_mode='avg',
                aligned=True):
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = spatial_scale
        ctx.sampling_ratio = sampling_ratio
        assert pool_mode in ('max', 'avg')
        ctx.pool_mode = 0 if pool_mode == 'max' else 1
        ctx.aligned = aligned
        ctx.input_shape = input.size()

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
                        ctx.output_size[1])
        output = input.new_zeros(output_shape)
        if ctx.pool_mode == 0:
            argmax_y = input.new_zeros(output_shape)
            argmax_x = input.new_zeros(output_shape)
        else:
            argmax_y = input.new_zeros(0)
            argmax_x = input.new_zeros(0)

        ext_module.roi_align_forward(
            input,
            rois,
            output,
            argmax_y,
            argmax_x,
            aligned_height=ctx.output_size[0],
            aligned_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            pool_mode=ctx.pool_mode,
            aligned=ctx.aligned)

        ctx.save_for_backward(rois, argmax_y, argmax_x)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        rois, argmax_y, argmax_x = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        # complex head architectures may make grad_output non-contiguous.
        grad_output = grad_output.contiguous()
        ext_module.roi_align_backward(
            grad_output,
            rois,
            argmax_y,
            argmax_x,
            grad_input,
            aligned_height=ctx.output_size[0],
            aligned_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            pool_mode=ctx.pool_mode,
            aligned=ctx.aligned)
        return grad_input, None, None, None, None, None, None


roi_align = RoIAlignFunction.apply


class RoIAlign(nn.Module):
    """RoI align pooling layer.

    Args:
        output_size (tuple): h, w
        spatial_scale (float): scale the input boxes by this number
        sampling_ratio (int): number of input samples to take for each
            output sample. 0 means to take samples densely for current
            models.
        pool_mode (str, 'avg' or 'max'): pooling mode in each bin.
        aligned (bool): if False, use the legacy implementation in
            MMDetection. If True, align the results more perfectly.
        use_torchvision (bool): whether to use roi_align from torchvision.

    Note:
        The implementation of RoIAlign when aligned=True is modified from
        https://github.com/facebookresearch/detectron2/

        The meaning of aligned=True:

        Given a continuous coordinate c, its two neighboring pixel
        indices (in our pixel model) are computed by floor(c - 0.5) and
        ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
        indices [0] and [1] (which are sampled from the underlying signal
        at continuous coordinates 0.5 and 1.5). But the original roi_align
        (aligned=False) does not subtract the 0.5 when computing
        neighboring pixel indices and therefore it uses pixels with a
        slightly incorrect alignment (relative to our pixel model) when
        performing bilinear interpolation.

        With `aligned=True`,
        we first appropriately scale the ROI and then shift it by -0.5
        prior to calling roi_align. This produces the correct neighbors.

        The difference does not make a difference to the model's
        performance if ROIAlign is used together with conv layers.
    """

    @deprecated_api_warning(
        {
            'out_size': 'output_size',
            'sample_num': 'sampling_ratio'
        },
        cls_name='RoIAlign')
    def __init__(self,
                 output_size,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 pool_mode='avg',
                 aligned=True,
                 use_torchvision=False):
        super(RoIAlign, self).__init__()

        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.pool_mode = pool_mode
        self.aligned = aligned
        self.use_torchvision = use_torchvision

    def forward(self, input, rois):
        """
        Args:
            input: NCHW images
            rois: Bx5 boxes. First column is the index into N.
                The other 4 columns are xyxy.
        """
        if self.use_torchvision:
            from torchvision.ops import roi_align as tv_roi_align
            if 'aligned' in tv_roi_align.__code__.co_varnames:
                return tv_roi_align(input, rois, self.output_size,
                                    self.spatial_scale, self.sampling_ratio,
                                    self.aligned)
            else:
                if self.aligned:
                    rois -= rois.new_tensor([0.] +
                                            [0.5 / self.spatial_scale] * 4)
                return tv_roi_align(input, rois, self.output_size,
                                    self.spatial_scale, self.sampling_ratio)
        else:
            return roi_align(input, rois, self.output_size,
                             self.spatial_scale, self.sampling_ratio,
                             self.pool_mode, self.aligned)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(output_size={self.output_size}, '
        s += f'spatial_scale={self.spatial_scale}, '
        s += f'sampling_ratio={self.sampling_ratio}, '
        s += f'pool_mode={self.pool_mode}, '
        s += f'aligned={self.aligned}, '
        s += f'use_torchvision={self.use_torchvision})'
        return s
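
# --- Illustrative sketch (not part of the original file): pooling two RoIs
# from a stride-4 feature map; use_torchvision=True avoids needing the
# compiled mmcv extension. The sizes and the `_demo_roi_align` name are
# assumptions for illustration.
def _demo_roi_align():
    layer = RoIAlign((7, 7), spatial_scale=0.25, sampling_ratio=2,
                     use_torchvision=True)
    feats = torch.rand(1, 256, 50, 50)
    # rois are (batch_idx, x1, y1, x2, y2) in original-image coordinates
    rois = torch.tensor([[0., 0., 0., 100., 100.],
                         [0., 40., 40., 160., 120.]])
    roi_feats = layer(feats, rois)   # shape (2, 256, 7, 7)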