ModelZoo / InstructBLIP_pytorch / Commits / c04f261a

Commit c04f261a, authored Aug 22, 2024 by dongchy920

Commit message: InstruceBLIP

Pipeline #1594 canceled with stages. Changes: 421. Pipelines: 1.
Showing 20 changed files with 3415 additions and 0 deletions (+3415, -0).
lavis/common/annotator/uniformer/mmcv/image/geometric.py          +728  -0
lavis/common/annotator/uniformer/mmcv/image/io.py                 +258  -0
lavis/common/annotator/uniformer/mmcv/image/misc.py               +44   -0
lavis/common/annotator/uniformer/mmcv/image/photometric.py        +428  -0
lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json   +6    -0
lavis/common/annotator/uniformer/mmcv/model_zoo/mmcls.json        +31   -0
lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json   +50   -0
lavis/common/annotator/uniformer/mmcv/ops/__init__.py             +81   -0
lavis/common/annotator/uniformer/mmcv/ops/assign_score_withk.py   +123  -0
lavis/common/annotator/uniformer/mmcv/ops/ball_query.py           +55   -0
lavis/common/annotator/uniformer/mmcv/ops/bbox.py                 +72   -0
lavis/common/annotator/uniformer/mmcv/ops/border_align.py         +109  -0
lavis/common/annotator/uniformer/mmcv/ops/box_iou_rotated.py      +45   -0
lavis/common/annotator/uniformer/mmcv/ops/carafe.py               +287  -0
lavis/common/annotator/uniformer/mmcv/ops/cc_attention.py         +83   -0
lavis/common/annotator/uniformer/mmcv/ops/contour_expand.py       +49   -0
lavis/common/annotator/uniformer/mmcv/ops/corner_pool.py          +161  -0
lavis/common/annotator/uniformer/mmcv/ops/correlation.py          +196  -0
lavis/common/annotator/uniformer/mmcv/ops/deform_conv.py          +405  -0
lavis/common/annotator/uniformer/mmcv/ops/deform_roi_pool.py      +204  -0
lavis/common/annotator/uniformer/mmcv/image/geometric.py (new file, 0 → 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import numbers

import cv2
import numpy as np

from ..utils import to_2tuple
from .io import imread_backend

try:
    from PIL import Image
except ImportError:
    Image = None


def _scale_size(size, scale):
    """Rescale a size by a ratio.

    Args:
        size (tuple[int]): (w, h).
        scale (float | tuple(float)): Scaling factor.

    Returns:
        tuple[int]: scaled size.
    """
    if isinstance(scale, (float, int)):
        scale = (scale, scale)
    w, h = size
    return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)


cv2_interp_codes = {
    'nearest': cv2.INTER_NEAREST,
    'bilinear': cv2.INTER_LINEAR,
    'bicubic': cv2.INTER_CUBIC,
    'area': cv2.INTER_AREA,
    'lanczos': cv2.INTER_LANCZOS4
}

if Image is not None:
    pillow_interp_codes = {
        'nearest': Image.NEAREST,
        'bilinear': Image.BILINEAR,
        'bicubic': Image.BICUBIC,
        'box': Image.BOX,
        'lanczos': Image.LANCZOS,
        'hamming': Image.HAMMING
    }


def imresize(img,
             size,
             return_scale=False,
             interpolation='bilinear',
             out=None,
             backend=None):
    """Resize image to a given size.

    Args:
        img (ndarray): The input image.
        size (tuple[int]): Target size (w, h).
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend.
        out (ndarray): The output destination.
        backend (str | None): The image resize backend type. Options are `cv2`,
            `pillow`, `None`. If backend is None, the global imread_backend
            specified by ``mmcv.use_backend()`` will be used. Default: None.

    Returns:
        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
            `resized_img`.
    """
    h, w = img.shape[:2]
    if backend is None:
        backend = imread_backend
    if backend not in ['cv2', 'pillow']:
        raise ValueError(f'backend: {backend} is not supported for resize.'
                         f"Supported backends are 'cv2', 'pillow'")

    if backend == 'pillow':
        assert img.dtype == np.uint8, 'Pillow backend only support uint8 type'
        pil_image = Image.fromarray(img)
        pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
        resized_img = np.array(pil_image)
    else:
        resized_img = cv2.resize(
            img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
    if not return_scale:
        return resized_img
    else:
        w_scale = size[0] / w
        h_scale = size[1] / h
        return resized_img, w_scale, h_scale


def imresize_to_multiple(img,
                         divisor,
                         size=None,
                         scale_factor=None,
                         keep_ratio=False,
                         return_scale=False,
                         interpolation='bilinear',
                         out=None,
                         backend=None):
    """Resize image according to a given size or scale factor and then rounds
    up the resized or rescaled image size to the nearest value that can be
    divided by the divisor.

    Args:
        img (ndarray): The input image.
        divisor (int | tuple): Resized image size will be a multiple of
            divisor. If divisor is a tuple, divisor should be
            (w_divisor, h_divisor).
        size (None | int | tuple[int]): Target size (w, h). Default: None.
        scale_factor (None | float | tuple[float]): Multiplier for spatial
            size. Should match input size if it is a tuple and the 2D style is
            (w_scale_factor, h_scale_factor). Default: None.
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image. Default: False.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Interpolation method, accepted values are
            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
            backend, "nearest", "bilinear" for 'pillow' backend.
        out (ndarray): The output destination.
        backend (str | None): The image resize backend type. Options are `cv2`,
            `pillow`, `None`. If backend is None, the global imread_backend
            specified by ``mmcv.use_backend()`` will be used. Default: None.

    Returns:
        tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
            `resized_img`.
    """
    h, w = img.shape[:2]
    if size is not None and scale_factor is not None:
        raise ValueError('only one of size or scale_factor should be defined')
    elif size is None and scale_factor is None:
        raise ValueError('one of size or scale_factor should be defined')
    elif size is not None:
        size = to_2tuple(size)
        if keep_ratio:
            size = rescale_size((w, h), size, return_scale=False)
    else:
        size = _scale_size((w, h), scale_factor)

    divisor = to_2tuple(divisor)
    size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)])
    resized_img, w_scale, h_scale = imresize(
        img,
        size,
        return_scale=True,
        interpolation=interpolation,
        out=out,
        backend=backend)
    if return_scale:
        return resized_img, w_scale, h_scale
    else:
        return resized_img


def imresize_like(img,
                  dst_img,
                  return_scale=False,
                  interpolation='bilinear',
                  backend=None):
    """Resize image to the same size of a given image.

    Args:
        img (ndarray): The input image.
        dst_img (ndarray): The target image.
        return_scale (bool): Whether to return `w_scale` and `h_scale`.
        interpolation (str): Same as :func:`resize`.
        backend (str | None): Same as :func:`resize`.

    Returns:
        tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
            `resized_img`.
    """
    h, w = dst_img.shape[:2]
    return imresize(img, (w, h), return_scale, interpolation, backend=backend)


def rescale_size(old_size, scale, return_scale=False):
    """Calculate the new size to be rescaled to.

    Args:
        old_size (tuple[int]): The old size (w, h) of image.
        scale (float | tuple[int]): The scaling factor or maximum size.
            If it is a float number, then the image will be rescaled by this
            factor, else if it is a tuple of 2 integers, then the image will
            be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.

    Returns:
        tuple[int]: The new rescaled image size.
    """
    w, h = old_size
    if isinstance(scale, (float, int)):
        if scale <= 0:
            raise ValueError(f'Invalid scale {scale}, must be positive.')
        scale_factor = scale
    elif isinstance(scale, tuple):
        max_long_edge = max(scale)
        max_short_edge = min(scale)
        scale_factor = min(max_long_edge / max(h, w),
                           max_short_edge / min(h, w))
    else:
        raise TypeError(
            f'Scale must be a number or tuple of int, but got {type(scale)}')

    new_size = _scale_size((w, h), scale_factor)

    if return_scale:
        return new_size, scale_factor
    else:
        return new_size


def imrescale(img,
              scale,
              return_scale=False,
              interpolation='bilinear',
              backend=None):
    """Resize image while keeping the aspect ratio.

    Args:
        img (ndarray): The input image.
        scale (float | tuple[int]): The scaling factor or maximum size.
            If it is a float number, then the image will be rescaled by this
            factor, else if it is a tuple of 2 integers, then the image will
            be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image.
        interpolation (str): Same as :func:`resize`.
        backend (str | None): Same as :func:`resize`.

    Returns:
        ndarray: The rescaled image.
    """
    h, w = img.shape[:2]
    new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
    rescaled_img = imresize(
        img, new_size, interpolation=interpolation, backend=backend)
    if return_scale:
        return rescaled_img, scale_factor
    else:
        return rescaled_img


def imflip(img, direction='horizontal'):
    """Flip an image horizontally or vertically.

    Args:
        img (ndarray): Image to be flipped.
        direction (str): The flip direction, either "horizontal" or
            "vertical" or "diagonal".

    Returns:
        ndarray: The flipped image.
    """
    assert direction in ['horizontal', 'vertical', 'diagonal']
    if direction == 'horizontal':
        return np.flip(img, axis=1)
    elif direction == 'vertical':
        return np.flip(img, axis=0)
    else:
        return np.flip(img, axis=(0, 1))


def imflip_(img, direction='horizontal'):
    """Inplace flip an image horizontally or vertically.

    Args:
        img (ndarray): Image to be flipped.
        direction (str): The flip direction, either "horizontal" or
            "vertical" or "diagonal".

    Returns:
        ndarray: The flipped image (inplace).
    """
    assert direction in ['horizontal', 'vertical', 'diagonal']
    if direction == 'horizontal':
        return cv2.flip(img, 1, img)
    elif direction == 'vertical':
        return cv2.flip(img, 0, img)
    else:
        return cv2.flip(img, -1, img)


def imrotate(img,
             angle,
             center=None,
             scale=1.0,
             border_value=0,
             interpolation='bilinear',
             auto_bound=False):
    """Rotate an image.

    Args:
        img (ndarray): Image to be rotated.
        angle (float): Rotation angle in degrees, positive values mean
            clockwise rotation.
        center (tuple[float], optional): Center point (w, h) of the rotation in
            the source image. If not specified, the center of the image will be
            used.
        scale (float): Isotropic scale factor.
        border_value (int): Border value.
        interpolation (str): Same as :func:`resize`.
        auto_bound (bool): Whether to adjust the image size to cover the whole
            rotated image.

    Returns:
        ndarray: The rotated image.
    """
    if center is not None and auto_bound:
        raise ValueError('`auto_bound` conflicts with `center`')
    h, w = img.shape[:2]
    if center is None:
        center = ((w - 1) * 0.5, (h - 1) * 0.5)
    assert isinstance(center, tuple)

    matrix = cv2.getRotationMatrix2D(center, -angle, scale)
    if auto_bound:
        cos = np.abs(matrix[0, 0])
        sin = np.abs(matrix[0, 1])
        new_w = h * sin + w * cos
        new_h = h * cos + w * sin
        matrix[0, 2] += (new_w - w) * 0.5
        matrix[1, 2] += (new_h - h) * 0.5
        w = int(np.round(new_w))
        h = int(np.round(new_h))
    rotated = cv2.warpAffine(
        img,
        matrix, (w, h),
        flags=cv2_interp_codes[interpolation],
        borderValue=border_value)
    return rotated


def bbox_clip(bboxes, img_shape):
    """Clip bboxes to fit the image shape.

    Args:
        bboxes (ndarray): Shape (..., 4*k)
        img_shape (tuple[int]): (height, width) of the image.

    Returns:
        ndarray: Clipped bboxes.
    """
    assert bboxes.shape[-1] % 4 == 0
    cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
    cmin[0::2] = img_shape[1] - 1
    cmin[1::2] = img_shape[0] - 1
    clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
    return clipped_bboxes


def bbox_scaling(bboxes, scale, clip_shape=None):
    """Scaling bboxes w.r.t the box center.

    Args:
        bboxes (ndarray): Shape(..., 4).
        scale (float): Scaling factor.
        clip_shape (tuple[int], optional): If specified, bboxes that exceed the
            boundary will be clipped according to the given shape (h, w).

    Returns:
        ndarray: Scaled bboxes.
    """
    if float(scale) == 1.0:
        scaled_bboxes = bboxes.copy()
    else:
        w = bboxes[..., 2] - bboxes[..., 0] + 1
        h = bboxes[..., 3] - bboxes[..., 1] + 1
        dw = (w * (scale - 1)) * 0.5
        dh = (h * (scale - 1)) * 0.5
        scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
    if clip_shape is not None:
        return bbox_clip(scaled_bboxes, clip_shape)
    else:
        return scaled_bboxes


def imcrop(img, bboxes, scale=1.0, pad_fill=None):
    """Crop image patches.

    3 steps: scale the bboxes -> clip bboxes -> crop and pad.

    Args:
        img (ndarray): Image to be cropped.
        bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
        scale (float, optional): Scale ratio of bboxes, the default value
            1.0 means no padding.
        pad_fill (Number | list[Number]): Value to be filled for padding.
            Default: None, which means no padding.

    Returns:
        list[ndarray] | ndarray: The cropped image patches.
    """
    chn = 1 if img.ndim == 2 else img.shape[2]
    if pad_fill is not None:
        if isinstance(pad_fill, (int, float)):
            pad_fill = [pad_fill for _ in range(chn)]
        assert len(pad_fill) == chn

    _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
    scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
    clipped_bbox = bbox_clip(scaled_bboxes, img.shape)

    patches = []
    for i in range(clipped_bbox.shape[0]):
        x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
        if pad_fill is None:
            patch = img[y1:y2 + 1, x1:x2 + 1, ...]
        else:
            _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
            if chn == 1:
                patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1)
            else:
                patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn)
            patch = np.array(
                pad_fill, dtype=img.dtype) * np.ones(
                    patch_shape, dtype=img.dtype)
            x_start = 0 if _x1 >= 0 else -_x1
            y_start = 0 if _y1 >= 0 else -_y1
            w = x2 - x1 + 1
            h = y2 - y1 + 1
            patch[y_start:y_start + h, x_start:x_start + w,
                  ...] = img[y1:y1 + h, x1:x1 + w, ...]
        patches.append(patch)

    if bboxes.ndim == 1:
        return patches[0]
    else:
        return patches


def impad(img,
          *,
          shape=None,
          padding=None,
          pad_val=0,
          padding_mode='constant'):
    """Pad the given image to a certain shape or pad on all sides with
    specified padding mode and padding value.

    Args:
        img (ndarray): Image to be padded.
        shape (tuple[int]): Expected padding shape (h, w). Default: None.
        padding (int or tuple[int]): Padding on each border. If a single int is
            provided this is used to pad all borders. If tuple of length 2 is
            provided this is the padding on left/right and top/bottom
            respectively. If a tuple of length 4 is provided this is the
            padding for the left, top, right and bottom borders respectively.
            Default: None. Note that `shape` and `padding` can not be both
            set.
        pad_val (Number | Sequence[Number]): Values to be filled in padding
            areas when padding_mode is 'constant'. Default: 0.
        padding_mode (str): Type of padding. Should be: constant, edge,
            reflect or symmetric. Default: constant.
            - constant: pads with a constant value, this value is specified
              with pad_val.
            - edge: pads with the last value at the edge of the image.
            - reflect: pads with reflection of image without repeating the
              last value on the edge. For example, padding [1, 2, 3, 4]
              with 2 elements on both sides in reflect mode will result
              in [3, 2, 1, 2, 3, 4, 3, 2].
            - symmetric: pads with reflection of image repeating the last
              value on the edge. For example, padding [1, 2, 3, 4] with
              2 elements on both sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3]

    Returns:
        ndarray: The padded image.
    """

    assert (shape is not None) ^ (padding is not None)
    if shape is not None:
        padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0])

    # check pad_val
    if isinstance(pad_val, tuple):
        assert len(pad_val) == img.shape[-1]
    elif not isinstance(pad_val, numbers.Number):
        raise TypeError('pad_val must be a int or a tuple. '
                        f'But received {type(pad_val)}')

    # check padding
    if isinstance(padding, tuple) and len(padding) in [2, 4]:
        if len(padding) == 2:
            padding = (padding[0], padding[1], padding[0], padding[1])
    elif isinstance(padding, numbers.Number):
        padding = (padding, padding, padding, padding)
    else:
        raise ValueError('Padding must be a int or a 2, or 4 element tuple.'
                         f'But received {padding}')

    # check padding mode
    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']

    border_type = {
        'constant': cv2.BORDER_CONSTANT,
        'edge': cv2.BORDER_REPLICATE,
        'reflect': cv2.BORDER_REFLECT_101,
        'symmetric': cv2.BORDER_REFLECT
    }
    img = cv2.copyMakeBorder(
        img,
        padding[1],
        padding[3],
        padding[0],
        padding[2],
        border_type[padding_mode],
        value=pad_val)

    return img


def impad_to_multiple(img, divisor, pad_val=0):
    """Pad an image to ensure each edge to be multiple to some number.

    Args:
        img (ndarray): Image to be padded.
        divisor (int): Padded image edges will be multiple to divisor.
        pad_val (Number | Sequence[Number]): Same as :func:`impad`.

    Returns:
        ndarray: The padded image.
    """
    pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
    pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
    return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)


def cutout(img, shape, pad_val=0):
    """Randomly cut out a rectangle from the original img.

    Args:
        img (ndarray): Image to be cutout.
        shape (int | tuple[int]): Expected cutout shape (h, w). If given as a
            int, the value will be used for both h and w.
        pad_val (int | float | tuple[int | float]): Values to be filled in the
            cut area. Defaults to 0.

    Returns:
        ndarray: The cutout image.
    """

    channels = 1 if img.ndim == 2 else img.shape[2]
    if isinstance(shape, int):
        cut_h, cut_w = shape, shape
    else:
        assert isinstance(shape, tuple) and len(shape) == 2, \
            f'shape must be a int or a tuple with length 2, but got type ' \
            f'{type(shape)} instead.'
        cut_h, cut_w = shape
    if isinstance(pad_val, (int, float)):
        pad_val = tuple([pad_val] * channels)
    elif isinstance(pad_val, tuple):
        assert len(pad_val) == channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(len(pad_val), channels)
    else:
        raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')

    img_h, img_w = img.shape[:2]
    y0 = np.random.uniform(img_h)
    x0 = np.random.uniform(img_w)
    y1 = int(max(0, y0 - cut_h / 2.))
    x1 = int(max(0, x0 - cut_w / 2.))
    y2 = min(img_h, y1 + cut_h)
    x2 = min(img_w, x1 + cut_w)

    if img.ndim == 2:
        patch_shape = (y2 - y1, x2 - x1)
    else:
        patch_shape = (y2 - y1, x2 - x1, channels)

    img_cutout = img.copy()
    patch = np.array(
        pad_val, dtype=img.dtype) * np.ones(
            patch_shape, dtype=img.dtype)
    img_cutout[y1:y2, x1:x2, ...] = patch

    return img_cutout


def _get_shear_matrix(magnitude, direction='horizontal'):
    """Generate the shear matrix for transformation.

    Args:
        magnitude (int | float): The magnitude used for shear.
        direction (str): The flip direction, either "horizontal"
            or "vertical".

    Returns:
        ndarray: The shear matrix with dtype float32.
    """
    if direction == 'horizontal':
        shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
    elif direction == 'vertical':
        shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
    return shear_matrix


def imshear(img,
            magnitude,
            direction='horizontal',
            border_value=0,
            interpolation='bilinear'):
    """Shear an image.

    Args:
        img (ndarray): Image to be sheared with format (h, w)
            or (h, w, c).
        magnitude (int | float): The magnitude used for shear.
        direction (str): The flip direction, either "horizontal"
            or "vertical".
        border_value (int | tuple[int]): Value used in case of a
            constant border.
        interpolation (str): Same as :func:`resize`.

    Returns:
        ndarray: The sheared image.
    """
    assert direction in ['horizontal',
                         'vertical'], f'Invalid direction: {direction}'
    height, width = img.shape[:2]
    if img.ndim == 2:
        channels = 1
    elif img.ndim == 3:
        channels = img.shape[-1]
    if isinstance(border_value, int):
        border_value = tuple([border_value] * channels)
    elif isinstance(border_value, tuple):
        assert len(border_value) == channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(
                len(border_value), channels)
    else:
        raise ValueError(
            f'Invalid type {type(border_value)} for `border_value`')
    shear_matrix = _get_shear_matrix(magnitude, direction)
    sheared = cv2.warpAffine(
        img,
        shear_matrix,
        (width, height),
        # Note: when the number of elements in `border_value` is greater
        # than 3 (e.g. shearing masks with more than 3 channels),
        # `cv2.warpAffine` will raise a TypeError. Here we simply slice
        # the first 3 values in `border_value`.
        borderValue=border_value[:3],
        flags=cv2_interp_codes[interpolation])
    return sheared


def _get_translate_matrix(offset, direction='horizontal'):
    """Generate the translate matrix.

    Args:
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either
            "horizontal" or "vertical".

    Returns:
        ndarray: The translate matrix with dtype float32.
    """
    if direction == 'horizontal':
        translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
    elif direction == 'vertical':
        translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
    return translate_matrix


def imtranslate(img,
                offset,
                direction='horizontal',
                border_value=0,
                interpolation='bilinear'):
    """Translate an image.

    Args:
        img (ndarray): Image to be translated with format
            (h, w) or (h, w, c).
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either "horizontal"
            or "vertical".
        border_value (int | tuple[int]): Value used in case of a
            constant border.
        interpolation (str): Same as :func:`resize`.

    Returns:
        ndarray: The translated image.
    """
    assert direction in ['horizontal',
                         'vertical'], f'Invalid direction: {direction}'
    height, width = img.shape[:2]
    if img.ndim == 2:
        channels = 1
    elif img.ndim == 3:
        channels = img.shape[-1]
    if isinstance(border_value, int):
        border_value = tuple([border_value] * channels)
    elif isinstance(border_value, tuple):
        assert len(border_value) == channels, \
            'Expected the num of elements in tuple equals the channels' \
            'of input image. Found {} vs {}'.format(
                len(border_value), channels)
    else:
        raise ValueError(
            f'Invalid type {type(border_value)} for `border_value`.')
    translate_matrix = _get_translate_matrix(offset, direction)
    translated = cv2.warpAffine(
        img,
        translate_matrix,
        (width, height),
        # Note: when the number of elements in `border_value` is greater
        # than 3 (e.g. translating masks with more than 3 channels),
        # `cv2.warpAffine` will raise a TypeError. Here we simply slice
        # the first 3 values in `border_value`.
        borderValue=border_value[:3],
        flags=cv2_interp_codes[interpolation])
    return translated
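A minimal usage sketch for the geometric helpers above, assuming this vendored mmcv copy is importable as `annotator.uniformer.mmcv` (the import path the other files in this commit use); the image path is a placeholder:

# Usage sketch (hedged): exercise imresize / imrescale / impad_to_multiple.
# 'demo.jpg' is a hypothetical image path.
import annotator.uniformer.mmcv as mmcv

img = mmcv.imread('demo.jpg')                                 # HWC, BGR, uint8
resized = mmcv.imresize(img, (256, 256))                      # exact (w, h)
rescaled, scale = mmcv.imrescale(img, (800, 600), return_scale=True)  # keeps ratio
padded = mmcv.impad_to_multiple(rescaled, divisor=32)         # edges multiple of 32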
lavis/common/annotator/uniformer/mmcv/image/io.py (new file, 0 → 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import io
import os.path as osp
from pathlib import Path

import cv2
import numpy as np
from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
                 IMREAD_UNCHANGED)

from annotator.uniformer.mmcv.utils import (check_file_exist, is_str,
                                            mkdir_or_exist)

try:
    from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
except ImportError:
    TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None

try:
    from PIL import Image, ImageOps
except ImportError:
    Image = None

try:
    import tifffile
except ImportError:
    tifffile = None

jpeg = None
supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']

imread_flags = {
    'color': IMREAD_COLOR,
    'grayscale': IMREAD_GRAYSCALE,
    'unchanged': IMREAD_UNCHANGED,
    'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
    'grayscale_ignore_orientation':
    IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
}

imread_backend = 'cv2'


def use_backend(backend):
    """Select a backend for image decoding.

    Args:
        backend (str): The image decoding backend type. Options are `cv2`,
            `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
            and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg`
            file format.
    """
    assert backend in supported_backends
    global imread_backend
    imread_backend = backend
    if imread_backend == 'turbojpeg':
        if TurboJPEG is None:
            raise ImportError('`PyTurboJPEG` is not installed')
        global jpeg
        if jpeg is None:
            jpeg = TurboJPEG()
    elif imread_backend == 'pillow':
        if Image is None:
            raise ImportError('`Pillow` is not installed')
    elif imread_backend == 'tifffile':
        if tifffile is None:
            raise ImportError('`tifffile` is not installed')


def _jpegflag(flag='color', channel_order='bgr'):
    channel_order = channel_order.lower()
    if channel_order not in ['rgb', 'bgr']:
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'color':
        if channel_order == 'bgr':
            return TJPF_BGR
        elif channel_order == 'rgb':
            return TJCS_RGB
    elif flag == 'grayscale':
        return TJPF_GRAY
    else:
        raise ValueError('flag must be "color" or "grayscale"')


def _pillow2array(img, flag='color', channel_order='bgr'):
    """Convert a pillow image to numpy array.

    Args:
        img (:obj:`PIL.Image.Image`): The image loaded using PIL
        flag (str): Flags specifying the color type of a loaded image,
            candidates are 'color', 'grayscale' and 'unchanged'.
            Default to 'color'.
        channel_order (str): The channel order of the output image array,
            candidates are 'bgr' and 'rgb'. Default to 'bgr'.

    Returns:
        np.ndarray: The converted numpy array
    """
    channel_order = channel_order.lower()
    if channel_order not in ['rgb', 'bgr']:
        raise ValueError('channel order must be either "rgb" or "bgr"')

    if flag == 'unchanged':
        array = np.array(img)
        if array.ndim >= 3 and array.shape[2] >= 3:  # color image
            array[:, :, :3] = array[:, :, (2, 1, 0)]  # RGB to BGR
    else:
        # Handle exif orientation tag
        if flag in ['color', 'grayscale']:
            img = ImageOps.exif_transpose(img)
        # If the image mode is not 'RGB', convert it to 'RGB' first.
        if img.mode != 'RGB':
            if img.mode != 'LA':
                # Most formats except 'LA' can be directly converted to RGB
                img = img.convert('RGB')
            else:
                # When the mode is 'LA', the default conversion will fill in
                # the canvas with black, which sometimes shadows black objects
                # in the foreground.
                #
                # Therefore, a random color (124, 117, 104) is used for canvas
                img_rgba = img.convert('RGBA')
                img = Image.new('RGB', img_rgba.size, (124, 117, 104))
                img.paste(img_rgba, mask=img_rgba.split()[3])  # 3 is alpha
        if flag in ['color', 'color_ignore_orientation']:
            array = np.array(img)
            if channel_order != 'rgb':
                array = array[:, :, ::-1]  # RGB to BGR
        elif flag in ['grayscale', 'grayscale_ignore_orientation']:
            img = img.convert('L')
            array = np.array(img)
        else:
            raise ValueError(
                'flag must be "color", "grayscale", "unchanged", '
                f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
                f' but got {flag}')
    return array


def imread(img_or_path, flag='color', channel_order='bgr', backend=None):
    """Read an image.

    Args:
        img_or_path (ndarray or str or Path): Either a numpy array or str or
            pathlib.Path. If it is a numpy array (loaded image), then
            it will be returned as is.
        flag (str): Flags specifying the color type of a loaded image,
            candidates are `color`, `grayscale`, `unchanged`,
            `color_ignore_orientation` and `grayscale_ignore_orientation`.
            By default, `cv2` and `pillow` backend would rotate the image
            according to its EXIF info unless called with `unchanged` or
            `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
            always ignore image's EXIF info regardless of the flag.
            The `turbojpeg` backend only supports `color` and `grayscale`.
        channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
        backend (str | None): The image decoding backend type. Options are
            `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
            If backend is None, the global imread_backend specified by
            ``mmcv.use_backend()`` will be used. Default: None.

    Returns:
        ndarray: Loaded image array.
    """
    if backend is None:
        backend = imread_backend
    if backend not in supported_backends:
        raise ValueError(f'backend: {backend} is not supported. Supported '
                         "backends are 'cv2', 'turbojpeg', 'pillow'")
    if isinstance(img_or_path, Path):
        img_or_path = str(img_or_path)

    if isinstance(img_or_path, np.ndarray):
        return img_or_path
    elif is_str(img_or_path):
        check_file_exist(img_or_path,
                         f'img file does not exist: {img_or_path}')
        if backend == 'turbojpeg':
            with open(img_or_path, 'rb') as in_file:
                img = jpeg.decode(in_file.read(),
                                  _jpegflag(flag, channel_order))
                if img.shape[-1] == 1:
                    img = img[:, :, 0]
            return img
        elif backend == 'pillow':
            img = Image.open(img_or_path)
            img = _pillow2array(img, flag, channel_order)
            return img
        elif backend == 'tifffile':
            img = tifffile.imread(img_or_path)
            return img
        else:
            flag = imread_flags[flag] if is_str(flag) else flag
            img = cv2.imread(img_or_path, flag)
            if flag == IMREAD_COLOR and channel_order == 'rgb':
                cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
            return img
    else:
        raise TypeError('"img" must be a numpy array or a str or '
                        'a pathlib.Path object')


def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
    """Read an image from bytes.

    Args:
        content (bytes): Image bytes got from files or other streams.
        flag (str): Same as :func:`imread`.
        backend (str | None): The image decoding backend type. Options are
            `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the
            global imread_backend specified by ``mmcv.use_backend()`` will be
            used. Default: None.

    Returns:
        ndarray: Loaded image array.
    """
    if backend is None:
        backend = imread_backend
    if backend not in supported_backends:
        raise ValueError(f'backend: {backend} is not supported. Supported '
                         "backends are 'cv2', 'turbojpeg', 'pillow'")
    if backend == 'turbojpeg':
        img = jpeg.decode(content, _jpegflag(flag, channel_order))
        if img.shape[-1] == 1:
            img = img[:, :, 0]
        return img
    elif backend == 'pillow':
        buff = io.BytesIO(content)
        img = Image.open(buff)
        img = _pillow2array(img, flag, channel_order)
        return img
    else:
        img_np = np.frombuffer(content, np.uint8)
        flag = imread_flags[flag] if is_str(flag) else flag
        img = cv2.imdecode(img_np, flag)
        if flag == IMREAD_COLOR and channel_order == 'rgb':
            cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
        return img


def imwrite(img, file_path, params=None, auto_mkdir=True):
    """Write image to file.

    Args:
        img (ndarray): Image array to be written.
        file_path (str): Image file path.
        params (None or list): Same as opencv :func:`imwrite` interface.
        auto_mkdir (bool): If the parent folder of `file_path` does not exist,
            whether to create it automatically.

    Returns:
        bool: Successful or not.
    """
    if auto_mkdir:
        dir_name = osp.abspath(osp.dirname(file_path))
        mkdir_or_exist(dir_name)
    return cv2.imwrite(file_path, img, params)
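A usage sketch for the I/O module above: switching the global decoding backend and round-tripping an image. The file paths are placeholders, and the pillow backend only works if Pillow is installed:

# Usage sketch (hedged): 'photo.jpg' and 'out.png' are hypothetical paths.
import annotator.uniformer.mmcv as mmcv

mmcv.use_backend('pillow')  # raises ImportError if Pillow is missing
img = mmcv.imread('photo.jpg', flag='color', channel_order='rgb')

with open('photo.jpg', 'rb') as f:
    same_img = mmcv.imfrombytes(f.read(), flag='color', channel_order='rgb')

mmcv.imwrite(img[..., ::-1], 'out.png')  # imwrite goes through cv2, which expects BGR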
lavis/common/annotator/uniformer/mmcv/image/misc.py (new file, 0 → 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np

import annotator.uniformer.mmcv as mmcv

try:
    import torch
except ImportError:
    torch = None


def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
    """Convert tensor to 3-channel images.

    Args:
        tensor (torch.Tensor): Tensor that contains multiple images, shape (
            N, C, H, W).
        mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0).
        std (tuple[float], optional): Standard deviation of images.
            Defaults to (1, 1, 1).
        to_rgb (bool, optional): Whether the tensor was converted to RGB
            format in the first place. If so, convert it back to BGR.
            Defaults to True.

    Returns:
        list[np.ndarray]: A list that contains multiple images.
    """

    if torch is None:
        raise RuntimeError('pytorch is not installed')
    assert torch.is_tensor(tensor) and tensor.ndim == 4
    assert len(mean) == 3
    assert len(std) == 3

    num_imgs = tensor.size(0)
    mean = np.array(mean, dtype=np.float32)
    std = np.array(std, dtype=np.float32)
    imgs = []
    for img_id in range(num_imgs):
        img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
        img = mmcv.imdenormalize(
            img, mean, std, to_bgr=to_rgb).astype(np.uint8)
        imgs.append(np.ascontiguousarray(img))
    return imgs
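A short sketch of how tensor2imgs is typically used to recover displayable BGR images from a normalized NCHW batch. The mean/std values below are the common ImageNet statistics, used here purely as an illustration:

# Usage sketch (hedged): the batch is random stand-in data.
import torch
import annotator.uniformer.mmcv as mmcv

batch = torch.randn(2, 3, 224, 224)  # stand-in for a normalized image batch
imgs = mmcv.tensor2imgs(
    batch,
    mean=(123.675, 116.28, 103.53),
    std=(58.395, 57.12, 57.375),
    to_rgb=True)
assert len(imgs) == 2 and imgs[0].shape == (224, 224, 3)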
lavis/common/annotator/uniformer/mmcv/image/photometric.py (new file, 0 → 100644)
# Copyright (c) OpenMMLab. All rights reserved.
import cv2
import numpy as np

from ..utils import is_tuple_of
from .colorspace import bgr2gray, gray2bgr


def imnormalize(img, mean, std, to_rgb=True):
    """Normalize an image with mean and std.

    Args:
        img (ndarray): Image to be normalized.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image.
    """
    img = img.copy().astype(np.float32)
    return imnormalize_(img, mean, std, to_rgb)


def imnormalize_(img, mean, std, to_rgb=True):
    """Inplace normalize an image with mean and std.

    Args:
        img (ndarray): Image to be normalized.
        mean (ndarray): The mean to be used for normalize.
        std (ndarray): The std to be used for normalize.
        to_rgb (bool): Whether to convert to rgb.

    Returns:
        ndarray: The normalized image.
    """
    # cv2 inplace normalization does not accept uint8
    assert img.dtype != np.uint8
    mean = np.float64(mean.reshape(1, -1))
    stdinv = 1 / np.float64(std.reshape(1, -1))
    if to_rgb:
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace
    cv2.subtract(img, mean, img)  # inplace
    cv2.multiply(img, stdinv, img)  # inplace
    return img


def imdenormalize(img, mean, std, to_bgr=True):
    assert img.dtype != np.uint8
    mean = mean.reshape(1, -1).astype(np.float64)
    std = std.reshape(1, -1).astype(np.float64)
    img = cv2.multiply(img, std)  # make a copy
    cv2.add(img, mean, img)  # inplace
    if to_bgr:
        cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img)  # inplace
    return img


def iminvert(img):
    """Invert (negate) an image.

    Args:
        img (ndarray): Image to be inverted.

    Returns:
        ndarray: The inverted image.
    """
    return np.full_like(img, 255) - img


def solarize(img, thr=128):
    """Solarize an image (invert all pixel values above a threshold)

    Args:
        img (ndarray): Image to be solarized.
        thr (int): Threshold for solarizing (0 - 255).

    Returns:
        ndarray: The solarized image.
    """
    img = np.where(img < thr, img, 255 - img)
    return img


def posterize(img, bits):
    """Posterize an image (reduce the number of bits for each color channel)

    Args:
        img (ndarray): Image to be posterized.
        bits (int): Number of bits (1 to 8) to use for posterizing.

    Returns:
        ndarray: The posterized image.
    """
    shift = 8 - bits
    img = np.left_shift(np.right_shift(img, shift), shift)
    return img


def adjust_color(img, alpha=1, beta=None, gamma=0):
    r"""It blends the source image and its gray image:

    .. math::
        output = img * alpha + gray\_img * beta + gamma

    Args:
        img (ndarray): The input source image.
        alpha (int | float): Weight for the source image. Default 1.
        beta (int | float): Weight for the converted gray image.
            If None, it's assigned the value (1 - `alpha`).
        gamma (int | float): Scalar added to each sum.
            Same as :func:`cv2.addWeighted`. Default 0.

    Returns:
        ndarray: Colored image which has the same size and dtype as input.
    """
    gray_img = bgr2gray(img)
    gray_img = np.tile(gray_img[..., None], [1, 1, 3])
    if beta is None:
        beta = 1 - alpha
    colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
    if not colored_img.dtype == np.uint8:
        # Note when the dtype of `img` is not the default `np.uint8`
        # (e.g. np.float32), the value in `colored_img` got from cv2
        # is not guaranteed to be in range [0, 255], so here clip
        # is needed.
        colored_img = np.clip(colored_img, 0, 255)
    return colored_img


def imequalize(img):
    """Equalize the image histogram.

    This function applies a non-linear mapping to the input image,
    in order to create a uniform distribution of grayscale values
    in the output image.

    Args:
        img (ndarray): Image to be equalized.

    Returns:
        ndarray: The equalized image.
    """

    def _scale_channel(im, c):
        """Scale the data in the corresponding channel."""
        im = im[:, :, c]
        # Compute the histogram of the image channel.
        histo = np.histogram(im, 256, (0, 255))[0]
        # For computing the step, filter out the nonzeros.
        nonzero_histo = histo[histo > 0]
        step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
        if not step:
            lut = np.array(range(256))
        else:
            # Compute the cumulative sum, shifted by step // 2
            # and then normalized by step.
            lut = (np.cumsum(histo) + (step // 2)) // step
            # Shift lut, prepending with 0.
            lut = np.concatenate([[0], lut[:-1]], 0)
            # handle potential integer overflow
            lut[lut > 255] = 255
        # If step is zero, return the original image.
        # Otherwise, index from lut.
        return np.where(np.equal(step, 0), im, lut[im])

    # Scales each channel independently and then stacks
    # the result.
    s1 = _scale_channel(img, 0)
    s2 = _scale_channel(img, 1)
    s3 = _scale_channel(img, 2)
    equalized_img = np.stack([s1, s2, s3], axis=-1)
    return equalized_img.astype(img.dtype)


def adjust_brightness(img, factor=1.):
    """Adjust image brightness.

    This function controls the brightness of an image. An
    enhancement factor of 0.0 gives a black image.
    A factor of 1.0 gives the original image. This function
    blends the source image and the degenerated black image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be brightened.
        factor (float): A value controls the enhancement.
            Factor 1.0 returns the original image, lower
            factors mean less color (brightness, contrast,
            etc), and higher values more. Default 1.

    Returns:
        ndarray: The brightened image.
    """
    degenerated = np.zeros_like(img)
    # Note manually convert the dtype to np.float32, to
    # achieve as close results as PIL.ImageEnhance.Brightness.
    # Set beta=1-factor, and gamma=0
    brightened_img = cv2.addWeighted(
        img.astype(np.float32), factor, degenerated.astype(np.float32),
        1 - factor, 0)
    brightened_img = np.clip(brightened_img, 0, 255)
    return brightened_img.astype(img.dtype)


def adjust_contrast(img, factor=1.):
    """Adjust image contrast.

    This function controls the contrast of an image. An
    enhancement factor of 0.0 gives a solid grey
    image. A factor of 1.0 gives the original image. It
    blends the source image and the degenerated mean image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be contrasted. BGR order.
        factor (float): Same as :func:`mmcv.adjust_brightness`.

    Returns:
        ndarray: The contrasted image.
    """
    gray_img = bgr2gray(img)
    hist = np.histogram(gray_img, 256, (0, 255))[0]
    mean = round(np.sum(gray_img) / np.sum(hist))
    degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
    degenerated = gray2bgr(degenerated)
    contrasted_img = cv2.addWeighted(
        img.astype(np.float32), factor, degenerated.astype(np.float32),
        1 - factor, 0)
    contrasted_img = np.clip(contrasted_img, 0, 255)
    return contrasted_img.astype(img.dtype)


def auto_contrast(img, cutoff=0):
    """Auto adjust image contrast.

    This function maximizes (normalizes) image contrast by first removing
    cutoff percent of the lightest and darkest pixels from the histogram and
    remapping the image so that the darkest pixel becomes black (0), and the
    lightest becomes white (255).

    Args:
        img (ndarray): Image to be contrasted. BGR order.
        cutoff (int | float | tuple): The cutoff percent of the lightest and
            darkest pixels to be removed. If given as tuple, it shall be
            (low, high). Otherwise, the single value will be used for both.
            Defaults to 0.

    Returns:
        ndarray: The contrasted image.
    """

    def _auto_contrast_channel(im, c, cutoff):
        im = im[:, :, c]
        # Compute the histogram of the image channel.
        histo = np.histogram(im, 256, (0, 255))[0]
        # Remove cut-off percent pixels from histo
        histo_sum = np.cumsum(histo)
        cut_low = histo_sum[-1] * cutoff[0] // 100
        cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
        histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
        histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)

        # Compute mapping
        low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
        # If all the values have been cut off, return the origin img
        if low >= high:
            return im
        scale = 255.0 / (high - low)
        offset = -low * scale
        lut = np.array(range(256))
        lut = lut * scale + offset
        lut = np.clip(lut, 0, 255)
        return lut[im]

    if isinstance(cutoff, (int, float)):
        cutoff = (cutoff, cutoff)
    else:
        assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
            f'float or tuple, but got {type(cutoff)} instead.'
    # Auto adjusts contrast for each channel independently and then stacks
    # the result.
    s1 = _auto_contrast_channel(img, 0, cutoff)
    s2 = _auto_contrast_channel(img, 1, cutoff)
    s3 = _auto_contrast_channel(img, 2, cutoff)
    contrasted_img = np.stack([s1, s2, s3], axis=-1)
    return contrasted_img.astype(img.dtype)


def adjust_sharpness(img, factor=1., kernel=None):
    """Adjust image sharpness.

    This function controls the sharpness of an image. An
    enhancement factor of 0.0 gives a blurred image. A
    factor of 1.0 gives the original image. And a factor
    of 2.0 gives a sharpened image. It blends the source
    image and the degenerated mean image:

    .. math::
        output = img * factor + degenerated * (1 - factor)

    Args:
        img (ndarray): Image to be sharpened. BGR order.
        factor (float): Same as :func:`mmcv.adjust_brightness`.
        kernel (np.ndarray, optional): Filter kernel to be applied on the img
            to obtain the degenerated img. Defaults to None.

    Note:
        No value sanity check is enforced on the kernel set by users. So with
        an inappropriate kernel, the ``adjust_sharpness`` may fail to perform
        the function its name indicates but end up performing whatever
        transform determined by the kernel.

    Returns:
        ndarray: The sharpened image.
    """
    if kernel is None:
        # adopted from PIL.ImageFilter.SMOOTH
        kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
    assert isinstance(kernel, np.ndarray), \
        f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
    assert kernel.ndim == 2, \
        f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'

    degenerated = cv2.filter2D(img, -1, kernel)
    sharpened_img = cv2.addWeighted(
        img.astype(np.float32), factor, degenerated.astype(np.float32),
        1 - factor, 0)
    sharpened_img = np.clip(sharpened_img, 0, 255)
    return sharpened_img.astype(img.dtype)


def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
    """AlexNet-style PCA jitter.

    This data augmentation is proposed in `ImageNet Classification with Deep
    Convolutional Neural Networks
    <https://dl.acm.org/doi/pdf/10.1145/3065386>`_.

    Args:
        img (ndarray): Image to be adjusted lighting. BGR order.
        eigval (ndarray): the eigenvalue of the covariance matrix of pixel
            values, respectively.
        eigvec (ndarray): the eigenvector of the covariance matrix of pixel
            values, respectively.
        alphastd (float): The standard deviation for distribution of alpha.
            Defaults to 0.1
        to_rgb (bool): Whether to convert img to rgb.

    Returns:
        ndarray: The adjusted image.
    """
    assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
        f'eigval and eigvec should both be of type np.ndarray, got ' \
        f'{type(eigval)} and {type(eigvec)} instead.'

    assert eigval.ndim == 1 and eigvec.ndim == 2
    assert eigvec.shape == (3, eigval.shape[0])
    n_eigval = eigval.shape[0]
    assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
        f'got {type(alphastd)} instead.'

    img = img.copy().astype(np.float32)
    if to_rgb:
        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)  # inplace

    alpha = np.random.normal(0, alphastd, n_eigval)
    alter = eigvec \
        * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
        * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
    alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
    img_adjusted = img + alter
    return img_adjusted


def lut_transform(img, lut_table):
    """Transform array by look-up table.

    The function lut_transform fills the output array with values from the
    look-up table. Indices of the entries are taken from the input array.

    Args:
        img (ndarray): Image to be transformed.
        lut_table (ndarray): look-up table of 256 elements; in case of
            multi-channel input array, the table should either have a single
            channel (in this case the same table is used for all channels) or
            the same number of channels as in the input array.

    Returns:
        ndarray: The transformed image.
    """
    assert isinstance(img, np.ndarray)
    assert 0 <= np.min(img) and np.max(img) <= 255
    assert isinstance(lut_table, np.ndarray)
    assert lut_table.shape == (256, )

    return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)


def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
    """Use CLAHE method to process the image.

    See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
    Graphics Gems, 1994:474-485.` for more information.

    Args:
        img (ndarray): Image to be processed.
        clip_limit (float): Threshold for contrast limiting. Default: 40.0.
        tile_grid_size (tuple[int]): Size of grid for histogram equalization.
            Input image will be divided into equally sized rectangular tiles.
            It defines the number of tiles in row and column. Default: (8, 8).

    Returns:
        ndarray: The processed image.
    """
    assert isinstance(img, np.ndarray)
    assert img.ndim == 2
    assert isinstance(clip_limit, (float, int))
    assert is_tuple_of(tile_grid_size, int)
    assert len(tile_grid_size) == 2

    clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
    return clahe.apply(np.array(img, dtype=np.uint8))
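The blend-based adjustments above all follow the same formula, output = img * factor + degenerated * (1 - factor), differing only in the degenerated image (black, mean, or smoothed). A minimal sketch exercising them on stand-in data:

# Usage sketch (hedged): the input is random stand-in BGR data.
import numpy as np
import annotator.uniformer.mmcv as mmcv

img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)
darker = mmcv.adjust_brightness(img, factor=0.5)  # blend toward black
flat = mmcv.adjust_contrast(img, factor=0.5)      # blend toward the mean image
sharp = mmcv.adjust_sharpness(img, factor=2.0)    # blend away from the smoothed image
inverted = mmcv.iminvert(img)                     # 255 - img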
lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json (new file, 0 → 100644)
{
    "resnet50_caffe": "detectron/resnet50_caffe",
    "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr",
    "resnet101_caffe": "detectron/resnet101_caffe",
    "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr"
}
lavis/common/annotator/uniformer/mmcv/model_zoo/mmcls.json (new file, 0 → 100644)
{
    "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth",
    "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth",
    "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth",
    "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth",
    "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth",
    "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth",
    "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth",
    "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth",
    "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth",
    "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth",
    "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth",
    "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth",
    "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth",
    "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth",
    "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth",
    "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth",
    "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth",
    "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth",
    "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth",
    "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth",
    "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth",
    "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth",
    "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth",
    "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth",
    "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth",
    "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth",
    "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth",
    "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth",
    "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth"
}
lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json (new file, 0 → 100644)
{
"vgg16_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth"
,
"detectron/resnet50_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth"
,
"detectron2/resnet50_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth"
,
"detectron/resnet101_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth"
,
"detectron2/resnet101_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth"
,
"detectron2/resnext101_32x8d"
:
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth"
,
"resnext50_32x4d"
:
"https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth"
,
"resnext101_32x4d"
:
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth"
,
"resnext101_64x4d"
:
"https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth"
,
"contrib/resnet50_gn"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth"
,
"detectron/resnet50_gn"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth"
,
"detectron/resnet101_gn"
:
"https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth"
,
"jhu/resnet50_gn_ws"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth"
,
"jhu/resnet101_gn_ws"
:
"https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth"
,
"jhu/resnext50_32x4d_gn_ws"
:
"https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth"
,
"jhu/resnext101_32x4d_gn_ws"
:
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth"
,
"jhu/resnext50_32x4d_gn"
:
"https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth"
,
"jhu/resnext101_32x4d_gn"
:
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth"
,
"msra/hrnetv2_w18_small"
:
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth"
,
"msra/hrnetv2_w18"
:
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth"
,
"msra/hrnetv2_w32"
:
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth"
,
"msra/hrnetv2_w40"
:
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth"
,
"msra/hrnetv2_w48"
:
"https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth"
,
"bninception_caffe"
:
"https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth"
,
"kin400/i3d_r50_f32s2_k400"
:
"https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth"
,
"kin400/nl3d_r50_f32s2_k400"
:
"https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth"
,
"res2net101_v1d_26w_4s"
:
"https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth"
,
"regnetx_400mf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth"
,
"regnetx_800mf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth"
,
"regnetx_1.6gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth"
,
"regnetx_3.2gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth"
,
"regnetx_4.0gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth"
,
"regnetx_6.4gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth"
,
"regnetx_8.0gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth"
,
"regnetx_12gf"
:
"https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth"
,
"resnet18_v1c"
:
"https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth"
,
"resnet50_v1c"
:
"https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth"
,
"resnet101_v1c"
:
"https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth"
,
"mmedit/vgg16"
:
"https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth"
,
"mmedit/res34_en_nomixup"
:
"https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth"
,
"mmedit/mobilenet_v2"
:
"https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth"
,
"contrib/mobilenet_v3_large"
:
"https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth"
,
"contrib/mobilenet_v3_small"
:
"https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth"
,
"resnest50"
:
"https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth"
,
"resnest101"
:
"https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth"
,
"resnest200"
:
"https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth"
,
"darknet53"
:
"https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth"
,
"mmdet/mobilenet_v2"
:
"https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth"
}
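These short keys are what the model-zoo loader resolves when a checkpoint is referenced by name instead of a full URL. A minimal sketch of consuming the mapping directly; the relative path is this file's location in the repo, and fetching via `torch.hub` is one illustrative option:

# A minimal sketch: look a short model-zoo key up in this JSON and fetch the
# mapped checkpoint. Assumes the repo root as the working directory.
import json
import torch

with open('lavis/common/annotator/uniformer/mmcv/model_zoo/open_mmlab.json') as f:
    model_urls = json.load(f)

url = model_urls['resnest50']
state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu')
print(url, len(state_dict))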
lavis/common/annotator/uniformer/mmcv/ops/__init__.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
from .bbox import bbox_overlaps
from .border_align import BorderAlign, border_align
from .box_iou_rotated import box_iou_rotated
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .contour_expand import contour_expand
from .corner_pool import CornerPool
from .correlation import Correlation
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
                              ModulatedDeformRoIPoolPack, deform_roi_pool)
from .deprecated_wrappers import Conv2d_deprecated as Conv2d
from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
from .deprecated_wrappers import Linear_deprecated as Linear
from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
                         sigmoid_focal_loss, softmax_focal_loss)
from .furthest_point_sample import (furthest_point_sample,
                                    furthest_point_sample_with_dist)
from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
from .gather_points import gather_points
from .group_points import GroupAll, QueryAndGroup, grouping_operation
from .info import (get_compiler_version, get_compiling_cuda_version,
                   get_onnxruntime_op_path)
from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev
from .knn import knn
from .masked_conv import MaskedConv2d, masked_conv2d
from .modulated_deform_conv import (ModulatedDeformConv2d,
                                    ModulatedDeformConv2dPack,
                                    modulated_deform_conv2d)
from .multi_scale_deform_attn import MultiScaleDeformableAttention
from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms
from .pixel_group import pixel_group
from .point_sample import (SimpleRoIAlign, point_sample,
                           rel_roi_point_to_rel_img_point)
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                              points_in_boxes_part)
from .points_sampler import PointsSampler
from .psa_mask import PSAMask
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .roiaware_pool3d import RoIAwarePool3d
from .roipoint_pool3d import RoIPointPool3d
from .saconv import SAConv2d
from .scatter_points import DynamicScatter, dynamic_scatter
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
from .tin_shift import TINShift, tin_shift
from .upfirdn2d import upfirdn2d
from .voxelize import Voxelization, voxelization

__all__ = [
    'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
    'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
    'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
    'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
    'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
    'get_compiler_version', 'get_compiling_cuda_version',
    'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d',
    'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
    'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
    'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
    'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
    'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
    'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
    'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
    'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
    'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
    'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
    'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
    'border_align', 'gather_points', 'furthest_point_sample',
    'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
    'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization',
    'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d',
    'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all'
]
lavis/common/annotator/uniformer/mmcv/ops/assign_score_withk.py
0 → 100644
View file @ c04f261a
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])


class AssignScoreWithK(Function):
    r"""Perform weighted sum to generate output features according to scores.
    Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
    scene_seg/lib/paconv_lib/src/gpu>`_.

    This is a memory-efficient CUDA implementation of assign_scores operation,
    which first transform all point features with weight bank, then assemble
    neighbor features with ``knn_idx`` and perform weighted sum of ``scores``.
    See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D
    for more detailed descriptions.

    Note:
        This implementation assumes using ``neighbor`` kernel input, which is
        (point_features - center_features, point_features). See
        https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
        pointnet2/paconv.py#L128 for more details.
    """

    @staticmethod
    def forward(ctx,
                scores,
                point_features,
                center_features,
                knn_idx,
                aggregate='sum'):
        """
        Args:
            scores (torch.Tensor): (B, npoint, K, M), predicted scores to
                aggregate weight matrices in the weight bank.
                ``npoint`` is the number of sampled centers.
                ``K`` is the number of queried neighbors.
                ``M`` is the number of weight matrices in the weight bank.
            point_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed point features to be aggregated.
            center_features (torch.Tensor): (B, N, M, out_dim)
                Pre-computed center features to be aggregated.
            knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
        """
        agg = {'sum': 0, 'avg': 1, 'max': 2}

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        output = point_features.new_zeros((B, out_dim, npoint, K))
        ext_module.assign_score_withk_forward(
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            output,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=agg[aggregate])

        ctx.save_for_backward(output, point_features, center_features, scores,
                              knn_idx)
        ctx.agg = agg[aggregate]

        return output

    @staticmethod
    def backward(ctx, grad_out):
        """
        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            grad_scores (torch.Tensor): (B, npoint, K, M)
            grad_point_features (torch.Tensor): (B, N, M, out_dim)
            grad_center_features (torch.Tensor): (B, N, M, out_dim)
        """
        _, point_features, center_features, scores, knn_idx = ctx.saved_tensors
        agg = ctx.agg

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        ext_module.assign_score_withk_backward(
            grad_out.contiguous(),
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            grad_point_features,
            grad_center_features,
            grad_scores,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=agg)

        return grad_scores, grad_point_features, \
            grad_center_features, None, None


assign_score_withk = AssignScoreWithK.apply
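A shape-level usage sketch of the op above; this assumes the compiled `_ext` CUDA extension is available and all tensors live on a GPU, with illustrative sizes:

import torch
from annotator.uniformer.mmcv.ops import assign_score_withk

B, N, npoint, K, M, out_dim = 2, 64, 16, 8, 4, 32
scores = torch.rand(B, npoint, K, M, device='cuda')            # (B, npoint, K, M)
point_features = torch.rand(B, N, M, out_dim, device='cuda')   # (B, N, M, out_dim)
center_features = torch.rand(B, N, M, out_dim, device='cuda')  # (B, N, M, out_dim)
# per the docstring, knn_idx[..., 0] is expected to index each center point
knn_idx = torch.randint(0, N, (B, npoint, K), device='cuda')
out = assign_score_withk(scores, point_features, center_features, knn_idx, 'sum')
assert out.shape == (B, out_dim, npoint, K)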
lavis/common/annotator/uniformer/mmcv/ops/ball_query.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['ball_query_forward'])


class BallQuery(Function):
    """Find nearby points in spherical space."""

    @staticmethod
    def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
                xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
        """
        Args:
            min_radius (float): minimum radius of the balls.
            max_radius (float): maximum radius of the balls.
            sample_num (int): maximum number of features in the balls.
            xyz (Tensor): (B, N, 3) xyz coordinates of the features.
            center_xyz (Tensor): (B, npoint, 3) centers of the ball query.

        Returns:
            Tensor: (B, npoint, nsample) tensor with the indices of
                the features that form the query balls.
        """
        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()
        assert min_radius < max_radius

        B, N, _ = xyz.size()
        npoint = center_xyz.size(1)
        idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int)
        ext_module.ball_query_forward(
            center_xyz,
            xyz,
            idx,
            b=B,
            n=N,
            m=npoint,
            min_radius=min_radius,
            max_radius=max_radius,
            nsample=sample_num)
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None):
        return None, None, None, None


ball_query = BallQuery.apply
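A minimal usage sketch, assuming the compiled `_ext` extension and a CUDA device; radii and counts are illustrative:

import torch
from annotator.uniformer.mmcv.ops import ball_query

xyz = torch.rand(2, 256, 3, device='cuda')       # (B, N, 3) point cloud
center_xyz = xyz[:, :32, :].contiguous()         # (B, npoint, 3) query centers
# up to 16 neighbors per center within radius (0.0, 0.2]
idx = ball_query(0.0, 0.2, 16, xyz, center_xyz)  # (B, npoint, nsample), int32
assert idx.shape == (2, 32, 16)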
lavis/common/annotator/uniformer/mmcv/ops/bbox.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])


def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
    """Calculate overlap between two sets of bboxes.

    If ``aligned`` is ``False``, calculate the ious between each bbox of
    bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
            If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).

    Returns:
        ious (Tensor): shape (m, n) if aligned == False else shape (m, 1)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
        tensor([[0.5000, 0.0000, 0.0000],
                [0.0000, 0.0000, 1.0000],
                [0.0000, 0.0000, 0.0000]])

    Example:
        >>> empty = torch.FloatTensor([])
        >>> nonempty = torch.FloatTensor([
        >>>     [0, 0, 10, 9],
        >>> ])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """
    mode_dict = {'iou': 0, 'iof': 1}
    assert mode in mode_dict.keys()
    mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
    assert offset == 1 or offset == 0

    rows = bboxes1.size(0)
    cols = bboxes2.size(0)
    if aligned:
        assert rows == cols

    if rows * cols == 0:
        return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)

    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros((rows, cols))
    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
    return ious
lavis/common/annotator/uniformer/mmcv/ops/border_align.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py

import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['border_align_forward', 'border_align_backward'])


class BorderAlignFunction(Function):

    @staticmethod
    def symbolic(g, input, boxes, pool_size):
        return g.op(
            'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)

    @staticmethod
    def forward(ctx, input, boxes, pool_size):
        ctx.pool_size = pool_size
        ctx.input_shape = input.size()

        assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'
        assert boxes.size(2) == 4, \
            'the last dimension of boxes must be (x1, y1, x2, y2)'
        assert input.size(1) % 4 == 0, \
            'the channel for input feature must be divisible by factor 4'

        # [B, C//4, H*W, 4]
        output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
        output = input.new_zeros(output_shape)
        # `argmax_idx` only used for backward
        argmax_idx = input.new_zeros(output_shape).to(torch.int)

        ext_module.border_align_forward(
            input, boxes, output, argmax_idx, pool_size=ctx.pool_size)

        ctx.save_for_backward(boxes, argmax_idx)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        boxes, argmax_idx = ctx.saved_tensors
        grad_input = grad_output.new_zeros(ctx.input_shape)
        # complex head architecture may cause grad_output to be uncontiguous
        grad_output = grad_output.contiguous()
        ext_module.border_align_backward(
            grad_output,
            boxes,
            argmax_idx,
            grad_input,
            pool_size=ctx.pool_size)
        return grad_input, None, None


border_align = BorderAlignFunction.apply


class BorderAlign(nn.Module):
    r"""Border align pooling layer.

    Applies border_align over the input feature based on predicted bboxes.
    The details were described in the paper
    `BorderDet: Border Feature for Dense Object Detection
    <https://arxiv.org/abs/2007.11056>`_.

    For each border line (e.g. top, left, bottom or right) of each box,
    border_align does the following:

        1. uniformly samples ``pool_size`` + 1 positions on this line,
           including the start and end points.
        2. the corresponding features at these points are computed by
           bilinear interpolation.
        3. max pooling over all the ``pool_size`` + 1 positions is used to
           compute the pooled feature.

    Args:
        pool_size (int): number of positions sampled over the boxes' borders
            (e.g. top, bottom, left, right).
    """

    def __init__(self, pool_size):
        super(BorderAlign, self).__init__()
        self.pool_size = pool_size

    def forward(self, input, boxes):
        """
        Args:
            input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
                [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,
                right features respectively.
            boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).

        Returns:
            Tensor: Pooled features with shape [N,C,H*W,4]. The order is
            (top,left,bottom,right) for the last dimension.
        """
        return border_align(input, boxes, self.pool_size)

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(pool_size={self.pool_size})'
        return s
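A minimal usage sketch of the channel contract described in the docstring, assuming the compiled `_ext` extension and a CUDA device; sizes and box coordinates are illustrative:

import torch
from annotator.uniformer.mmcv.ops import BorderAlign

N, C, H, W = 1, 8, 10, 10
# channels [0,C) top, [C,2C) left, [2C,3C) bottom, [3C,4C) right
feats = torch.rand(N, 4 * C, H, W, device='cuda')
# one (x1, y1, x2, y2) box per spatial location, here all identical
boxes = torch.tensor([2., 2., 7., 7.], device='cuda').repeat(N, H * W, 1)
pooled = BorderAlign(pool_size=10)(feats, boxes)
assert pooled.shape == (N, C, H * W, 4)  # last dim: (top, left, bottom, right)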
lavis/common/annotator/uniformer/mmcv/ops/box_iou_rotated.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])


def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
    """Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in
    (x_center, y_center, width, height, angle) format.

    If ``aligned`` is ``False``, calculate the ious between each bbox of
    bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): rotated bboxes 1. It has shape (N, 5), indicating
            (x, y, w, h, theta) for each row. Note that theta is in radians.
        bboxes2 (Tensor): rotated bboxes 2. It has shape (M, 5), indicating
            (x, y, w, h, theta) for each row. Note that theta is in radians.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).

    Returns:
        ious (Tensor): shape (N, M) if aligned == False else shape (N,)
    """
    assert mode in ['iou', 'iof']
    mode_dict = {'iou': 0, 'iof': 1}
    mode_flag = mode_dict[mode]
    rows = bboxes1.size(0)
    cols = bboxes2.size(0)
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros(rows * cols)
    bboxes1 = bboxes1.contiguous()
    bboxes2 = bboxes2.contiguous()
    ext_module.box_iou_rotated(
        bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
    if not aligned:
        ious = ious.view(rows, cols)
    return ious
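A small worked example, assuming the compiled `_ext` extension is available for this op:

import math
import torch
from annotator.uniformer.mmcv.ops import box_iou_rotated

# a 4x2 box and the same box rotated by pi/2 about the same center:
# intersection 2x2 = 4, union 8 + 8 - 4 = 12, so the IoU should be near 1/3
a = torch.tensor([[5.0, 5.0, 4.0, 2.0, 0.0]])
b = torch.tensor([[5.0, 5.0, 4.0, 2.0, math.pi / 2]])
ious = box_iou_rotated(a, b)   # shape (1, 1)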
lavis/common/annotator/uniformer/mmcv/ops/carafe.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.nn.modules.module import Module

from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
    'carafe_backward'
])


class CARAFENaiveFunction(Function):

    @staticmethod
    def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
        return g.op(
            'mmcv::MMCVCARAFENaive',
            features,
            masks,
            kernel_size_i=kernel_size,
            group_size_i=group_size,
            scale_factor_f=scale_factor)

    @staticmethod
    def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
        assert scale_factor >= 1
        assert masks.size(1) == kernel_size * kernel_size * group_size
        assert masks.size(-1) == features.size(-1) * scale_factor
        assert masks.size(-2) == features.size(-2) * scale_factor
        assert features.size(1) % group_size == 0
        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
        ctx.kernel_size = kernel_size
        ctx.group_size = group_size
        ctx.scale_factor = scale_factor
        ctx.feature_size = features.size()
        ctx.mask_size = masks.size()

        n, c, h, w = features.size()
        output = features.new_zeros((n, c, h * scale_factor,
                                     w * scale_factor))
        ext_module.carafe_naive_forward(
            features,
            masks,
            output,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        if features.requires_grad or masks.requires_grad:
            ctx.save_for_backward(features, masks)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        assert grad_output.is_cuda

        features, masks = ctx.saved_tensors
        kernel_size = ctx.kernel_size
        group_size = ctx.group_size
        scale_factor = ctx.scale_factor

        grad_input = torch.zeros_like(features)
        grad_masks = torch.zeros_like(masks)
        ext_module.carafe_naive_backward(
            grad_output.contiguous(),
            features,
            masks,
            grad_input,
            grad_masks,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        return grad_input, grad_masks, None, None, None


carafe_naive = CARAFENaiveFunction.apply


class CARAFENaive(Module):

    def __init__(self, kernel_size, group_size, scale_factor):
        super(CARAFENaive, self).__init__()

        assert isinstance(kernel_size, int) and isinstance(
            group_size, int) and isinstance(scale_factor, int)
        self.kernel_size = kernel_size
        self.group_size = group_size
        self.scale_factor = scale_factor

    def forward(self, features, masks):
        return carafe_naive(features, masks, self.kernel_size,
                            self.group_size, self.scale_factor)


class CARAFEFunction(Function):

    @staticmethod
    def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
        return g.op(
            'mmcv::MMCVCARAFE',
            features,
            masks,
            kernel_size_i=kernel_size,
            group_size_i=group_size,
            scale_factor_f=scale_factor)

    @staticmethod
    def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
        assert scale_factor >= 1
        assert masks.size(1) == kernel_size * kernel_size * group_size
        assert masks.size(-1) == features.size(-1) * scale_factor
        assert masks.size(-2) == features.size(-2) * scale_factor
        assert features.size(1) % group_size == 0
        assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
        ctx.kernel_size = kernel_size
        ctx.group_size = group_size
        ctx.scale_factor = scale_factor
        ctx.feature_size = features.size()
        ctx.mask_size = masks.size()

        n, c, h, w = features.size()
        output = features.new_zeros((n, c, h * scale_factor,
                                     w * scale_factor))
        routput = features.new_zeros(output.size(), requires_grad=False)
        rfeatures = features.new_zeros(features.size(), requires_grad=False)
        rmasks = masks.new_zeros(masks.size(), requires_grad=False)
        ext_module.carafe_forward(
            features,
            masks,
            rfeatures,
            routput,
            rmasks,
            output,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)

        if features.requires_grad or masks.requires_grad:
            ctx.save_for_backward(features, masks, rfeatures)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        assert grad_output.is_cuda

        features, masks, rfeatures = ctx.saved_tensors
        kernel_size = ctx.kernel_size
        group_size = ctx.group_size
        scale_factor = ctx.scale_factor

        rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
        rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
        rgrad_input = torch.zeros_like(features, requires_grad=False)
        rgrad_masks = torch.zeros_like(masks, requires_grad=False)
        grad_input = torch.zeros_like(features, requires_grad=False)
        grad_masks = torch.zeros_like(masks, requires_grad=False)
        ext_module.carafe_backward(
            grad_output.contiguous(),
            rfeatures,
            masks,
            rgrad_output,
            rgrad_input_hs,
            rgrad_input,
            rgrad_masks,
            grad_input,
            grad_masks,
            kernel_size=kernel_size,
            group_size=group_size,
            scale_factor=scale_factor)
        return grad_input, grad_masks, None, None, None


carafe = CARAFEFunction.apply


class CARAFE(Module):
    """CARAFE: Content-Aware ReAssembly of FEatures

    Please refer to https://arxiv.org/abs/1905.02188 for more details.

    Args:
        kernel_size (int): reassemble kernel size
        group_size (int): reassemble group size
        scale_factor (int): upsample ratio

    Returns:
        upsampled feature map
    """

    def __init__(self, kernel_size, group_size, scale_factor):
        super(CARAFE, self).__init__()

        assert isinstance(kernel_size, int) and isinstance(
            group_size, int) and isinstance(scale_factor, int)
        self.kernel_size = kernel_size
        self.group_size = group_size
        self.scale_factor = scale_factor

    def forward(self, features, masks):
        return carafe(features, masks, self.kernel_size, self.group_size,
                      self.scale_factor)


@UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module):
    """A unified package of CARAFE upsampler that contains: 1) channel
    compressor 2) content encoder 3) CARAFE op.

    Official implementation of ICCV 2019 paper
    CARAFE: Content-Aware ReAssembly of FEatures
    Please refer to https://arxiv.org/abs/1905.02188 for more details.

    Args:
        channels (int): input feature channels
        scale_factor (int): upsample ratio
        up_kernel (int): kernel size of CARAFE op
        up_group (int): group size of CARAFE op
        encoder_kernel (int): kernel size of content encoder
        encoder_dilation (int): dilation of content encoder
        compressed_channels (int): output channels of channels compressor

    Returns:
        upsampled feature map
    """

    def __init__(self,
                 channels,
                 scale_factor,
                 up_kernel=5,
                 up_group=1,
                 encoder_kernel=3,
                 encoder_dilation=1,
                 compressed_channels=64):
        super(CARAFEPack, self).__init__()
        self.channels = channels
        self.scale_factor = scale_factor
        self.up_kernel = up_kernel
        self.up_group = up_group
        self.encoder_kernel = encoder_kernel
        self.encoder_dilation = encoder_dilation
        self.compressed_channels = compressed_channels
        self.channel_compressor = nn.Conv2d(channels,
                                            self.compressed_channels, 1)
        self.content_encoder = nn.Conv2d(
            self.compressed_channels,
            self.up_kernel * self.up_kernel * self.up_group *
            self.scale_factor * self.scale_factor,
            self.encoder_kernel,
            padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
            dilation=self.encoder_dilation,
            groups=1)
        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                xavier_init(m, distribution='uniform')
        normal_init(self.content_encoder, std=0.001)

    def kernel_normalizer(self, mask):
        mask = F.pixel_shuffle(mask, self.scale_factor)
        n, mask_c, h, w = mask.size()
        # use float division explicitly, to avoid inconsistency while
        # exporting to onnx
        mask_channel = int(mask_c / float(self.up_kernel**2))
        mask = mask.view(n, mask_channel, -1, h, w)

        mask = F.softmax(mask, dim=2, dtype=mask.dtype)
        mask = mask.view(n, mask_c, h, w).contiguous()

        return mask

    def feature_reassemble(self, x, mask):
        x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
        return x

    def forward(self, x):
        compressed_x = self.channel_compressor(x)
        mask = self.content_encoder(compressed_x)
        mask = self.kernel_normalizer(mask)

        x = self.feature_reassemble(x, mask)
        return x
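A minimal sketch of CARAFEPack as a learned 2x upsampler; the forward pass calls the `carafe` CUDA op, so this assumes the compiled `_ext` extension and a GPU:

import torch
from annotator.uniformer.mmcv.ops import CARAFEPack

up = CARAFEPack(channels=64, scale_factor=2).cuda()
x = torch.rand(1, 64, 24, 24, device='cuda')
y = up(x)
assert y.shape == (1, 64, 48, 48)   # H and W each multiplied by scale_factor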
lavis/common/annotator/uniformer/mmcv/ops/cc_attention.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F

from annotator.uniformer.mmcv.cnn import PLUGIN_LAYERS, Scale


def NEG_INF_DIAG(n, device):
    """Returns a diagonal matrix of size [n, n].

    The diagonal entries are all "-inf". This is to avoid calculating the
    overlapped element in the Criss-Cross twice.
    """
    return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)


@PLUGIN_LAYERS.register_module()
class CrissCrossAttention(nn.Module):
    """Criss-Cross Attention Module.

    .. note::
        Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
        to a pure PyTorch and equivalent implementation. For more
        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.

        Speed comparison for one forward pass

        - Input size: [2,512,97,97]
        - Device: 1 NVIDIA GeForce RTX 2080 Ti

        +-----------------------+---------------+------------+---------------+
        |                       |PyTorch version|CUDA version|Relative speed |
        +=======================+===============+============+===============+
        |with torch.no_grad()   |0.00554402 s   |0.0299619 s |5.4x           |
        +-----------------------+---------------+------------+---------------+
        |no with torch.no_grad()|0.00562803 s   |0.0301349 s |5.4x           |
        +-----------------------+---------------+------------+---------------+

    Args:
        in_channels (int): Channels of the input feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
        self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
        self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
        self.gamma = Scale(0.)
        self.in_channels = in_channels

    def forward(self, x):
        """forward function of Criss-Cross Attention.

        Args:
            x (Tensor): Input feature with shape
                (batch_size, in_channels, height, width).

        Returns:
            Tensor: Output of the layer, with shape
                (batch_size, in_channels, height, width).
        """
        B, C, H, W = x.size()
        query = self.query_conv(x)
        key = self.key_conv(x)
        value = self.value_conv(x)
        energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(
            H, query.device)
        energy_H = energy_H.transpose(1, 2)
        energy_W = torch.einsum('bchw,bchj->bhwj', query, key)
        attn = F.softmax(
            torch.cat([energy_H, energy_W], dim=-1), dim=-1)  # [B,H,W,(H+W)]
        out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])
        out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])

        out = self.gamma(out) + x
        out = out.contiguous()
        return out

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(in_channels={self.in_channels})'
        return s
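Because the module above is the pure-PyTorch implementation, a smoke test runs on CPU. CCNet applies it recurrently (typically twice) so every position can attend beyond its own criss-cross paths; a minimal sketch:

import torch
from annotator.uniformer.mmcv.ops import CrissCrossAttention

attn = CrissCrossAttention(in_channels=32)
x = torch.rand(2, 32, 17, 17)
out = attn(attn(x))   # two recurrent steps, as in CCNet
assert out.shape == x.shape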
lavis/common/annotator/uniformer/mmcv/ops/contour_expand.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['contour_expand'])


def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
                   kernel_num):
    """Expand kernel contours so that foreground pixels are assigned into
    instances.

    Args:
        kernel_mask (np.array or Tensor): The instance kernel mask with
            size hxw.
        internal_kernel_label (np.array or Tensor): The instance internal
            kernel label with size hxw.
        min_kernel_area (int): The minimum kernel area.
        kernel_num (int): The instance kernel number.

    Returns:
        label (list): The instance index map with size hxw.
    """
    assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
    assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
    assert isinstance(min_kernel_area, int)
    assert isinstance(kernel_num, int)

    if isinstance(kernel_mask, np.ndarray):
        kernel_mask = torch.from_numpy(kernel_mask)
    if isinstance(internal_kernel_label, np.ndarray):
        internal_kernel_label = torch.from_numpy(internal_kernel_label)

    if torch.__version__ == 'parrots':
        if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
            label = []
        else:
            label = ext_module.contour_expand(
                kernel_mask,
                internal_kernel_label,
                min_kernel_area=min_kernel_area,
                kernel_num=kernel_num)
            label = label.tolist()
    else:
        label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
                                          min_kernel_area, kernel_num)
    return label
lavis/common/annotator/uniformer/mmcv/ops/corner_pool.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', [
    'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
    'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
    'right_pool_forward', 'right_pool_backward'
])

_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}


class TopPoolFunction(Function):

    @staticmethod
    def symbolic(g, input):
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.top_pool_forward(input)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.top_pool_backward(input, grad_output)
        return output


class BottomPoolFunction(Function):

    @staticmethod
    def symbolic(g, input):
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.bottom_pool_forward(input)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.bottom_pool_backward(input, grad_output)
        return output


class LeftPoolFunction(Function):

    @staticmethod
    def symbolic(g, input):
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.left_pool_forward(input)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.left_pool_backward(input, grad_output)
        return output


class RightPoolFunction(Function):

    @staticmethod
    def symbolic(g, input):
        output = g.op(
            'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
        return output

    @staticmethod
    def forward(ctx, input):
        output = ext_module.right_pool_forward(input)
        ctx.save_for_backward(input)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors
        output = ext_module.right_pool_backward(input, grad_output)
        return output


class CornerPool(nn.Module):
    """Corner Pooling.

    Corner Pooling is a new type of pooling layer that helps a
    convolutional network better localize corners of bounding boxes.

    Please refer to https://arxiv.org/abs/1808.01244 for more details.
    Code is modified from https://github.com/princeton-vl/CornerNet-Lite.

    Args:
        mode (str): Pooling orientation for the pooling layer

            - 'bottom': Bottom Pooling
            - 'left': Left Pooling
            - 'right': Right Pooling
            - 'top': Top Pooling

    Returns:
        Feature map after pooling.
    """

    pool_functions = {
        'bottom': BottomPoolFunction,
        'left': LeftPoolFunction,
        'right': RightPoolFunction,
        'top': TopPoolFunction,
    }

    cummax_dim_flip = {
        'bottom': (2, False),
        'left': (3, True),
        'right': (3, False),
        'top': (2, True),
    }

    def __init__(self, mode):
        super(CornerPool, self).__init__()
        assert mode in self.pool_functions
        self.mode = mode
        self.corner_pool = self.pool_functions[mode]

    def forward(self, x):
        if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
            if torch.onnx.is_in_onnx_export():
                assert torch.__version__ >= '1.7.0', \
                    'When `cummax` serves as an intermediate component ' \
                    'whose outputs are used as inputs for other modules, ' \
                    'the PyTorch version must be >= 1.7.0, otherwise an ' \
                    'error appears like: `RuntimeError: tuple appears in ' \
                    'op that does not forward tuples, unsupported kind: ' \
                    'prim::PythonOp`.'
            dim, flip = self.cummax_dim_flip[self.mode]
            if flip:
                x = x.flip(dim)
            pool_tensor, _ = torch.cummax(x, dim=dim)
            if flip:
                pool_tensor = pool_tensor.flip(dim)
            return pool_tensor
        else:
            return self.corner_pool.apply(x)
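With PyTorch >= 1.5 the forward pass above reduces to a flip plus `torch.cummax`, so it runs on CPU without the compiled extension. For 'left' pooling, each row keeps its running maximum scanning from right to left; a minimal sketch:

import torch
from annotator.uniformer.mmcv.ops import CornerPool

pool = CornerPool('left')
x = torch.tensor([[[[0., 3., 1., 2.]]]])   # (N, C, H, W)
print(pool(x))   # tensor([[[[3., 3., 2., 2.]]]])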
lavis/common/annotator/uniformer/mmcv/ops/correlation.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['correlation_forward', 'correlation_backward'])


class CorrelationFunction(Function):

    @staticmethod
    def forward(ctx,
                input1,
                input2,
                kernel_size=1,
                max_displacement=1,
                stride=1,
                padding=1,
                dilation=1,
                dilation_patch=1):

        ctx.save_for_backward(input1, input2)

        kH, kW = ctx.kernel_size = _pair(kernel_size)
        patch_size = max_displacement * 2 + 1
        ctx.patch_size = patch_size
        dH, dW = ctx.stride = _pair(stride)
        padH, padW = ctx.padding = _pair(padding)
        dilationH, dilationW = ctx.dilation = _pair(dilation)
        dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(
            dilation_patch)

        output_size = CorrelationFunction._output_size(ctx, input1)

        output = input1.new_zeros(output_size)

        ext_module.correlation_forward(
            input1,
            input2,
            output,
            kH=kH,
            kW=kW,
            patchH=patch_size,
            patchW=patch_size,
            padH=padH,
            padW=padW,
            dilationH=dilationH,
            dilationW=dilationW,
            dilation_patchH=dilation_patchH,
            dilation_patchW=dilation_patchW,
            dH=dH,
            dW=dW)

        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input1, input2 = ctx.saved_tensors

        kH, kW = ctx.kernel_size
        patch_size = ctx.patch_size
        padH, padW = ctx.padding
        dilationH, dilationW = ctx.dilation
        dilation_patchH, dilation_patchW = ctx.dilation_patch
        dH, dW = ctx.stride
        grad_input1 = torch.zeros_like(input1)
        grad_input2 = torch.zeros_like(input2)

        ext_module.correlation_backward(
            grad_output,
            input1,
            input2,
            grad_input1,
            grad_input2,
            kH=kH,
            kW=kW,
            patchH=patch_size,
            patchW=patch_size,
            padH=padH,
            padW=padW,
            dilationH=dilationH,
            dilationW=dilationW,
            dilation_patchH=dilation_patchH,
            dilation_patchW=dilation_patchW,
            dH=dH,
            dW=dW)
        return grad_input1, grad_input2, None, None, None, None, None, None

    @staticmethod
    def _output_size(ctx, input1):
        iH, iW = input1.size(2), input1.size(3)
        batch_size = input1.size(0)
        kH, kW = ctx.kernel_size
        patch_size = ctx.patch_size
        dH, dW = ctx.stride
        padH, padW = ctx.padding
        dilationH, dilationW = ctx.dilation
        dilatedKH = (kH - 1) * dilationH + 1
        dilatedKW = (kW - 1) * dilationW + 1

        oH = int((iH + 2 * padH - dilatedKH) / dH + 1)
        oW = int((iW + 2 * padW - dilatedKW) / dW + 1)

        output_size = (batch_size, patch_size, patch_size, oH, oW)
        return output_size


class Correlation(nn.Module):
    r"""Correlation operator.

    This correlation operator works for optical flow correlation computation.

    There are two batched tensors with shape :math:`(N, C, H, W)`,
    and the correlation output's shape is :math:`(N, max\_displacement \times
    2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})`

    where

    .. math::
        H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
            dilation \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    .. math::
        W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
            \times (kernel\_size - 1) - 1}
            {stride} + 1\right\rfloor

    the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding
    window convolution between input1 and shifted input2,

    .. math::
        Corr(N_i, dx, dy) =
        \sum_{c=0}^{C-1}
        input1(N_i, c) \star
        \mathcal{S}(input2(N_i, c), dy, dx)

    where :math:`\star` is the valid 2d sliding window convolution operator,
    and :math:`\mathcal{S}` means shifting the input features (auto-complete
    zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy
    \in [-max\_displacement \times dilation\_patch, max\_displacement \times
    dilation\_patch]`.

    Args:
        kernel_size (int): The size of the sliding window, i.e. the local
            neighborhood representing the center points and involved in the
            correlation computation. Defaults to 1.
        max_displacement (int): The radius for computing the correlation
            volume; the actual working space can be dilated by
            dilation_patch. Defaults to 1.
        stride (int): The stride of the sliding blocks in the input spatial
            dimensions. Defaults to 1.
        padding (int): Zero padding added to all four sides of the input1.
            Defaults to 0.
        dilation (int): The spacing of the local neighborhood that is
            involved in the correlation. Defaults to 1.
        dilation_patch (int): The spacing between positions at which the
            correlation is computed. Defaults to 1.
    """

    def __init__(self,
                 kernel_size: int = 1,
                 max_displacement: int = 1,
                 stride: int = 1,
                 padding: int = 0,
                 dilation: int = 1,
                 dilation_patch: int = 1) -> None:
        super().__init__()
        self.kernel_size = kernel_size
        self.max_displacement = max_displacement
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.dilation_patch = dilation_patch

    def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
        return CorrelationFunction.apply(input1, input2, self.kernel_size,
                                         self.max_displacement, self.stride,
                                         self.padding, self.dilation,
                                         self.dilation_patch)

    def __repr__(self) -> str:
        s = self.__class__.__name__
        s += f'(kernel_size={self.kernel_size}, '
        s += f'max_displacement={self.max_displacement}, '
        s += f'stride={self.stride}, '
        s += f'padding={self.padding}, '
        s += f'dilation={self.dilation}, '
        s += f'dilation_patch={self.dilation_patch})'
        return s
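A shape-level sketch against the H_out/W_out formula in the docstring, assuming the compiled `_ext` extension and a CUDA device. With kernel_size=1, padding=0 and stride=1, H_out equals H_in, and the two displacement axes each have size 2 * max_displacement + 1:

import torch
from annotator.uniformer.mmcv.ops import Correlation

corr = Correlation(kernel_size=1, max_displacement=3, stride=1, padding=0)
a = torch.rand(2, 16, 32, 32, device='cuda')
b = torch.rand(2, 16, 32, 32, device='cuda')
out = corr(a, b)
assert out.shape == (2, 7, 7, 32, 32)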
lavis/common/annotator/uniformer/mmcv/ops/deform_conv.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single

from annotator.uniformer.mmcv.utils import deprecated_api_warning
from ..cnn import CONV_LAYERS
from ..utils import ext_loader, print_log

ext_module = ext_loader.load_ext('_ext', [
    'deform_conv_forward', 'deform_conv_backward_input',
    'deform_conv_backward_parameters'
])


class DeformConv2dFunction(Function):

    @staticmethod
    def symbolic(g,
                 input,
                 offset,
                 weight,
                 stride,
                 padding,
                 dilation,
                 groups,
                 deform_groups,
                 bias=False,
                 im2col_step=32):
        return g.op(
            'mmcv::MMCVDeformConv2d',
            input,
            offset,
            weight,
            stride_i=stride,
            padding_i=padding,
            dilation_i=dilation,
            groups_i=groups,
            deform_groups_i=deform_groups,
            bias_i=bias,
            im2col_step_i=im2col_step)

    @staticmethod
    def forward(ctx,
                input,
                offset,
                weight,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                deform_groups=1,
                bias=False,
                im2col_step=32):
        if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        assert bias is False, 'Only support bias is False.'
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.deform_groups = deform_groups
        ctx.im2col_step = im2col_step

        # When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
        # amp won't cast the type of model (float32), but "offset" is cast
        # to float16 by nn.Conv2d automatically, leading to the type
        # mismatch with input (when it is float32) or weight.
        # The flag for whether to use fp16 or amp is the type of "offset",
        # we cast weight and input to temporarily support fp16 and amp
        # whatever the pytorch version is.
        input = input.type_as(offset)
        weight = weight.type_as(input)
        ctx.save_for_backward(input, offset, weight)

        output = input.new_empty(
            DeformConv2dFunction._output_size(ctx, input, weight))

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) %
                cur_im2col_step) == 0, 'im2col step must divide batchsize'
        ext_module.deform_conv_forward(
            input,
            weight,
            offset,
            output,
            ctx.bufs_[0],
            ctx.bufs_[1],
            kW=weight.size(3),
            kH=weight.size(2),
            dW=ctx.stride[1],
            dH=ctx.stride[0],
            padW=ctx.padding[1],
            padH=ctx.padding[0],
            dilationW=ctx.dilation[1],
            dilationH=ctx.dilation[0],
            group=ctx.groups,
            deformable_group=ctx.deform_groups,
            im2col_step=cur_im2col_step)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, offset, weight = ctx.saved_tensors

        grad_input = grad_offset = grad_weight = None

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'

        grad_output = grad_output.contiguous()
        if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
            grad_input = torch.zeros_like(input)
            grad_offset = torch.zeros_like(offset)
            ext_module.deform_conv_backward_input(
                input,
                offset,
                grad_output,
                grad_input,
                grad_offset,
                weight,
                ctx.bufs_[0],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                im2col_step=cur_im2col_step)

        if ctx.needs_input_grad[2]:
            grad_weight = torch.zeros_like(weight)
            ext_module.deform_conv_backward_parameters(
                input,
                offset,
                grad_output,
                grad_weight,
                ctx.bufs_[0],
                ctx.bufs_[1],
                kW=weight.size(3),
                kH=weight.size(2),
                dW=ctx.stride[1],
                dH=ctx.stride[0],
                padW=ctx.padding[1],
                padH=ctx.padding[0],
                dilationW=ctx.dilation[1],
                dilationH=ctx.dilation[0],
                group=ctx.groups,
                deformable_group=ctx.deform_groups,
                scale=1,
                im2col_step=cur_im2col_step)

        return grad_input, grad_offset, grad_weight, \
            None, None, None, None, None, None, None

    @staticmethod
    def _output_size(ctx, input, weight):
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(map(lambda s: s > 0, output_size)):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


deform_conv2d = DeformConv2dFunction.apply


class DeformConv2d(nn.Module):
    r"""Deformable 2D convolution.

    Applies a deformable 2D convolution over an input signal composed of
    several input planes. DeformConv2d was described in the paper
    `Deformable Convolutional Networks
    <https://arxiv.org/pdf/1703.06211.pdf>`_

    Note:
        The argument ``im2col_step`` was added in version 1.3.17, which means
        the number of samples processed by the ``im2col_cuda_kernel`` per
        call. It enables users to define ``batch_size`` and ``im2col_step``
        more flexibly and solved `issue mmcv#1440
        <https://github.com/open-mmlab/mmcv/issues/1440>`_.

    Args:
        in_channels (int): Number of channels in the input image.
        out_channels (int): Number of channels produced by the convolution.
        kernel_size (int, tuple): Size of the convolving kernel.
        stride (int, tuple): Stride of the convolution. Default: 1.
        padding (int or tuple): Zero-padding added to both sides of the input.
            Default: 0.
        dilation (int or tuple): Spacing between kernel elements. Default: 1.
        groups (int): Number of blocked connections from input channels to
            output channels. Default: 1.
        deform_groups (int): Number of deformable group partitions.
        bias (bool): If True, adds a learnable bias to the output.
            Default: False.
        im2col_step (int): Number of samples processed by im2col_cuda_kernel
            per call. It will work when ``batch_size`` > ``im2col_step``, but
            ``batch_size`` must be divisible by ``im2col_step``. Default: 32.
            `New in version 1.3.17.`
    """

    @deprecated_api_warning({'deformable_groups': 'deform_groups'},
                            cls_name='DeformConv2d')
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: Union[int, Tuple[int, ...]],
                 stride: Union[int, Tuple[int, ...]] = 1,
                 padding: Union[int, Tuple[int, ...]] = 0,
                 dilation: Union[int, Tuple[int, ...]] = 1,
                 groups: int = 1,
                 deform_groups: int = 1,
                 bias: bool = False,
                 im2col_step: int = 32) -> None:
        super(DeformConv2d, self).__init__()

        assert not bias, \
            f'bias={bias} is not supported in DeformConv2d.'
        assert in_channels % groups == 0, \
            f'in_channels {in_channels} is not divisible by groups {groups}'
        assert out_channels % groups == 0, \
            f'out_channels {out_channels} is not divisible by groups ' \
            f'{groups}'

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = _pair(kernel_size)
        self.stride = _pair(stride)
        self.padding = _pair(padding)
        self.dilation = _pair(dilation)
        self.groups = groups
        self.deform_groups = deform_groups
        self.im2col_step = im2col_step
        # enable compatibility with nn.Conv2d
        self.transposed = False
        self.output_padding = _single(0)

        # only weight, no bias
        self.weight = nn.Parameter(
            torch.Tensor(out_channels, in_channels // self.groups,
                         *self.kernel_size))

        self.reset_parameters()

    def reset_parameters(self):
        # switch the initialization of `self.weight` to the standard kaiming
        # method described in `Delving deep into rectifiers: Surpassing
        # human-level performance on ImageNet classification` - He, K. et al.
        # (2015), using a uniform distribution
        nn.init.kaiming_uniform_(self.weight, nonlinearity='relu')

    def forward(self, x: Tensor, offset: Tensor) -> Tensor:
        """Deformable Convolutional forward function.

        Args:
            x (Tensor): Input feature, shape (B, C_in, H_in, W_in)
            offset (Tensor): Offset for deformable convolution, shape
                (B, deform_groups*kernel_size[0]*kernel_size[1]*2,
                H_out, W_out), H_out, W_out are equal to the output's.

                An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
                The spatial arrangement is like:

                .. code:: text

                    (x0, y0) (x1, y1) (x2, y2)
                    (x3, y3) (x4, y4) (x5, y5)
                    (x6, y6) (x7, y7) (x8, y8)

        Returns:
            Tensor: Output of the layer.
        """
        # To fix an assert error in deform_conv_cuda.cpp:128
        # input image is smaller than kernel
        input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <
                                                          self.kernel_size[1])
        if input_pad:
            pad_h = max(self.kernel_size[0] - x.size(2), 0)
            pad_w = max(self.kernel_size[1] - x.size(3), 0)
            x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
            offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)
            offset = offset.contiguous()
        out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,
                            self.dilation, self.groups, self.deform_groups,
                            False, self.im2col_step)
        if input_pad:
            out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
                      pad_w].contiguous()
        return out

    def __repr__(self):
        s = self.__class__.__name__
        s += f'(in_channels={self.in_channels},\n'
        s += f'out_channels={self.out_channels},\n'
        s += f'kernel_size={self.kernel_size},\n'
        s += f'stride={self.stride},\n'
        s += f'padding={self.padding},\n'
        s += f'dilation={self.dilation},\n'
        s += f'groups={self.groups},\n'
        s += f'deform_groups={self.deform_groups},\n'
        # bias is not supported in DeformConv2d.
        s += 'bias=False)'
        return s


@CONV_LAYERS.register_module('DCN')
class DeformConv2dPack(DeformConv2d):
    """A Deformable Conv Encapsulation that acts as normal Conv layers.

    The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
    The spatial arrangement is like:

    .. code:: text

        (x0, y0) (x1, y1) (x2, y2)
        (x3, y3) (x4, y4) (x5, y5)
        (x6, y6) (x7, y7) (x8, y8)

    Args:
        in_channels (int): Same as nn.Conv2d.
        out_channels (int): Same as nn.Conv2d.
        kernel_size (int or tuple[int]): Same as nn.Conv2d.
        stride (int or tuple[int]): Same as nn.Conv2d.
        padding (int or tuple[int]): Same as nn.Conv2d.
        dilation (int or tuple[int]): Same as nn.Conv2d.
        groups (int): Same as nn.Conv2d.
        bias (bool or str): If specified as `auto`, it will be decided by the
            norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
            False.
    """

    _version = 2

    def __init__(self, *args, **kwargs):
        super(DeformConv2dPack, self).__init__(*args, **kwargs)
        self.conv_offset = nn.Conv2d(
            self.in_channels,
            self.deform_groups * 2 * self.kernel_size[0] *
            self.kernel_size[1],
            kernel_size=self.kernel_size,
            stride=_pair(self.stride),
            padding=_pair(self.padding),
            dilation=_pair(self.dilation),
            bias=True)
        self.init_offset()

    def init_offset(self):
        self.conv_offset.weight.data.zero_()
        self.conv_offset.bias.data.zero_()

    def forward(self, x):
        offset = self.conv_offset(x)
        return deform_conv2d(x, offset, self.weight, self.stride,
                             self.padding, self.dilation, self.groups,
                             self.deform_groups, False, self.im2col_step)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata,
                              strict, missing_keys, unexpected_keys,
                              error_msgs):
        version = local_metadata.get('version', None)

        if version is None or version < 2:
            # the key is different in early versions
            # In version < 2, DeformConvPack loads previous benchmark models.
            if (prefix + 'conv_offset.weight' not in state_dict
                    and prefix[:-1] + '_offset.weight' in state_dict):
                state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
                    prefix[:-1] + '_offset.weight')
            if (prefix + 'conv_offset.bias' not in state_dict
                    and prefix[:-1] + '_offset.bias' in state_dict):
                state_dict[prefix + 'conv_offset.bias'] = state_dict.pop(
                    prefix[:-1] + '_offset.bias')

        if version is not None and version > 1:
            print_log(
                f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
                'version 2.',
                logger='root')

        super()._load_from_state_dict(state_dict, prefix, local_metadata,
                                      strict, missing_keys, unexpected_keys,
                                      error_msgs)
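A minimal sketch of the packed variant, which predicts its own offsets with the internal `conv_offset` layer (out_channels = deform_groups * 2 * kH * kW, here 1 * 2 * 3 * 3 = 18). The forward pass calls the CUDA op, so this assumes the compiled `_ext` extension and a GPU:

import torch
from annotator.uniformer.mmcv.ops import DeformConv2dPack

dcn = DeformConv2dPack(16, 32, kernel_size=3, padding=1).cuda()
x = torch.rand(1, 16, 28, 28, device='cuda')
y = dcn(x)
assert y.shape == (1, 32, 28, 28)
assert dcn.conv_offset.out_channels == 18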
lavis/common/annotator/uniformer/mmcv/ops/deform_roi_pool.py
0 → 100644
View file @ c04f261a
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward'])


class DeformRoIPoolFunction(Function):

    @staticmethod
    def symbolic(g, input, rois, offset, output_size, spatial_scale,
                 sampling_ratio, gamma):
        return g.op(
            'mmcv::MMCVDeformRoIPool',
            input,
            rois,
            offset,
            pooled_height_i=output_size[0],
            pooled_width_i=output_size[1],
            spatial_scale_f=spatial_scale,
            sampling_ratio_f=sampling_ratio,
            gamma_f=gamma)

    @staticmethod
    def forward(ctx,
                input,
                rois,
                offset,
                output_size,
                spatial_scale=1.0,
                sampling_ratio=0,
                gamma=0.1):
        if offset is None:
            offset = input.new_zeros(0)
        ctx.output_size = _pair(output_size)
        ctx.spatial_scale = float(spatial_scale)
        ctx.sampling_ratio = int(sampling_ratio)
        ctx.gamma = float(gamma)

        assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'

        output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
                        ctx.output_size[1])
        output = input.new_zeros(output_shape)

        ext_module.deform_roi_pool_forward(
            input,
            rois,
            offset,
            output,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)

        ctx.save_for_backward(input, rois, offset)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        input, rois, offset = ctx.saved_tensors
        grad_input = grad_output.new_zeros(input.shape)
        grad_offset = grad_output.new_zeros(offset.shape)

        ext_module.deform_roi_pool_backward(
            grad_output,
            input,
            rois,
            offset,
            grad_input,
            grad_offset,
            pooled_height=ctx.output_size[0],
            pooled_width=ctx.output_size[1],
            spatial_scale=ctx.spatial_scale,
            sampling_ratio=ctx.sampling_ratio,
            gamma=ctx.gamma)
        if grad_offset.numel() == 0:
            grad_offset = None
        return grad_input, None, grad_offset, None, None, None, None


deform_roi_pool = DeformRoIPoolFunction.apply


class DeformRoIPool(nn.Module):

    def __init__(self,
                 output_size,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 gamma=0.1):
        super(DeformRoIPool, self).__init__()
        self.output_size = _pair(output_size)
        self.spatial_scale = float(spatial_scale)
        self.sampling_ratio = int(sampling_ratio)
        self.gamma = float(gamma)

    def forward(self, input, rois, offset=None):
        return deform_roi_pool(input, rois, offset, self.output_size,
                               self.spatial_scale, self.sampling_ratio,
                               self.gamma)


class DeformRoIPoolPack(DeformRoIPool):

    def __init__(self,
                 output_size,
                 output_channels,
                 deform_fc_channels=1024,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 gamma=0.1):
        super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale,
                                                sampling_ratio, gamma)

        self.output_channels = output_channels
        self.deform_fc_channels = deform_fc_channels

        self.offset_fc = nn.Sequential(
            nn.Linear(
                self.output_size[0] * self.output_size[1] *
                self.output_channels, self.deform_fc_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.deform_fc_channels,
                      self.output_size[0] * self.output_size[1] * 2))
        self.offset_fc[-1].weight.data.zero_()
        self.offset_fc[-1].bias.data.zero_()

    def forward(self, input, rois):
        assert input.size(1) == self.output_channels
        x = deform_roi_pool(input, rois, None, self.output_size,
                            self.spatial_scale, self.sampling_ratio,
                            self.gamma)
        rois_num = rois.size(0)
        offset = self.offset_fc(x.view(rois_num, -1))
        offset = offset.view(rois_num, 2, self.output_size[0],
                             self.output_size[1])
        return deform_roi_pool(input, rois, offset, self.output_size,
                               self.spatial_scale, self.sampling_ratio,
                               self.gamma)


class ModulatedDeformRoIPoolPack(DeformRoIPool):

    def __init__(self,
                 output_size,
                 output_channels,
                 deform_fc_channels=1024,
                 spatial_scale=1.0,
                 sampling_ratio=0,
                 gamma=0.1):
        super(ModulatedDeformRoIPoolPack,
              self).__init__(output_size, spatial_scale, sampling_ratio,
                             gamma)

        self.output_channels = output_channels
        self.deform_fc_channels = deform_fc_channels

        self.offset_fc = nn.Sequential(
            nn.Linear(
                self.output_size[0] * self.output_size[1] *
                self.output_channels, self.deform_fc_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.deform_fc_channels,
                      self.output_size[0] * self.output_size[1] * 2))
        self.offset_fc[-1].weight.data.zero_()
        self.offset_fc[-1].bias.data.zero_()

        self.mask_fc = nn.Sequential(
            nn.Linear(
                self.output_size[0] * self.output_size[1] *
                self.output_channels, self.deform_fc_channels),
            nn.ReLU(inplace=True),
            nn.Linear(self.deform_fc_channels,
                      self.output_size[0] * self.output_size[1] * 1),
            nn.Sigmoid())
        self.mask_fc[2].weight.data.zero_()
        self.mask_fc[2].bias.data.zero_()

    def forward(self, input, rois):
        assert input.size(1) == self.output_channels
        x = deform_roi_pool(input, rois, None, self.output_size,
                            self.spatial_scale, self.sampling_ratio,
                            self.gamma)
        rois_num = rois.size(0)
        offset = self.offset_fc(x.view(rois_num, -1))
        offset = offset.view(rois_num, 2, self.output_size[0],
                             self.output_size[1])
        mask = self.mask_fc(x.view(rois_num, -1))
        mask = mask.view(rois_num, 1, self.output_size[0],
                         self.output_size[1])
        d = deform_roi_pool(input, rois, offset, self.output_size,
                            self.spatial_scale, self.sampling_ratio,
                            self.gamma)
        return d * mask
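A minimal usage sketch, assuming the compiled `_ext` extension and a CUDA device. RoIs follow the (batch_idx, x1, y1, x2, y2) format asserted above, in input-image coordinates that `spatial_scale` maps onto the feature map:

import torch
from annotator.uniformer.mmcv.ops import DeformRoIPool

pool = DeformRoIPool(output_size=7, spatial_scale=0.5)
feats = torch.rand(1, 8, 32, 32, device='cuda')
rois = torch.tensor([[0., 4., 4., 40., 40.]], device='cuda')
out = pool(feats, rois)   # offset=None -> plain (non-deformed) sampling
assert out.shape == (1, 8, 7, 7)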