# Copyright (c) OpenMMLab. All rights reserved.
import numbers
import cv2
import numpy as np
from ..utils import to_2tuple
from .io import imread_backend
try:
from PIL import Image
except ImportError:
Image = None
def _scale_size(size, scale):
"""Rescale a size by a ratio.
Args:
size (tuple[int]): (w, h).
scale (float | tuple[float]): Scaling factor.
Returns:
tuple[int]: scaled size.
"""
if isinstance(scale, (float, int)):
scale = (scale, scale)
w, h = size
return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
cv2_interp_codes = {
'nearest': cv2.INTER_NEAREST,
'bilinear': cv2.INTER_LINEAR,
'bicubic': cv2.INTER_CUBIC,
'area': cv2.INTER_AREA,
'lanczos': cv2.INTER_LANCZOS4
}
if Image is not None:
pillow_interp_codes = {
'nearest': Image.NEAREST,
'bilinear': Image.BILINEAR,
'bicubic': Image.BICUBIC,
'box': Image.BOX,
'lanczos': Image.LANCZOS,
'hamming': Image.HAMMING
}
def imresize(img,
size,
return_scale=False,
interpolation='bilinear',
out=None,
backend=None):
"""Resize image to a given size.
Args:
img (ndarray): The input image.
size (tuple[int]): Target size (w, h).
return_scale (bool): Whether to return `w_scale` and `h_scale`.
interpolation (str): Interpolation method, accepted values are
"nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
backend, "nearest", "bilinear", "bicubic", "box", "lanczos",
"hamming" for 'pillow' backend.
out (ndarray): The output destination.
backend (str | None): The image resize backend type. Options are `cv2`,
`pillow`, `None`. If backend is None, the global imread_backend
specified by ``mmcv.use_backend()`` will be used. Default: None.
Returns:
tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
`resized_img`.
"""
h, w = img.shape[:2]
if backend is None:
backend = imread_backend
if backend not in ['cv2', 'pillow']:
raise ValueError(f'backend: {backend} is not supported for resize. '
f"Supported backends are 'cv2', 'pillow'")
if backend == 'pillow':
assert img.dtype == np.uint8, 'Pillow backend only supports uint8 type'
pil_image = Image.fromarray(img)
pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
resized_img = np.array(pil_image)
else:
resized_img = cv2.resize(
img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
if not return_scale:
return resized_img
else:
w_scale = size[0] / w
h_scale = size[1] / h
return resized_img, w_scale, h_scale
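# Illustrative usage (a sketch, not part of the original file): downscale a
# synthetic 640x480 BGR image and inspect the returned scale factors.
#   >>> import numpy as np
#   >>> img = np.zeros((480, 640, 3), dtype=np.uint8)
#   >>> out, w_scale, h_scale = imresize(img, (320, 240), return_scale=True)
#   >>> out.shape, w_scale, h_scale
#   ((240, 320, 3), 0.5, 0.5)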
def imresize_to_multiple(img,
divisor,
size=None,
scale_factor=None,
keep_ratio=False,
return_scale=False,
interpolation='bilinear',
out=None,
backend=None):
"""Resize image according to a given size or scale factor and then rounds
up the the resized or rescaled image size to the nearest value that can be
divided by the divisor.
Args:
img (ndarray): The input image.
divisor (int | tuple): Resized image size will be a multiple of
divisor. If divisor is a tuple, divisor should be
(w_divisor, h_divisor).
size (None | int | tuple[int]): Target size (w, h). Default: None.
scale_factor (None | float | tuple[float]): Multiplier for spatial
size. Should match input size if it is a tuple and the 2D style is
(w_scale_factor, h_scale_factor). Default: None.
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
image. Default: False.
return_scale (bool): Whether to return `w_scale` and `h_scale`.
interpolation (str): Interpolation method, accepted values are
"nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
backend, "nearest", "bilinear", "bicubic", "box", "lanczos",
"hamming" for 'pillow' backend.
out (ndarray): The output destination.
backend (str | None): The image resize backend type. Options are `cv2`,
`pillow`, `None`. If backend is None, the global imread_backend
specified by ``mmcv.use_backend()`` will be used. Default: None.
Returns:
tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
`resized_img`.
"""
h, w = img.shape[:2]
if size is not None and scale_factor is not None:
raise ValueError('only one of size or scale_factor should be defined')
elif size is None and scale_factor is None:
raise ValueError('one of size or scale_factor should be defined')
elif size is not None:
size = to_2tuple(size)
if keep_ratio:
size = rescale_size((w, h), size, return_scale=False)
else:
size = _scale_size((w, h), scale_factor)
divisor = to_2tuple(divisor)
size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)])
resized_img, w_scale, h_scale = imresize(
img,
size,
return_scale=True,
interpolation=interpolation,
out=out,
backend=backend)
if return_scale:
return resized_img, w_scale, h_scale
else:
return resized_img
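# Illustrative usage (a sketch): with scale_factor=0.5 the target size of a
# 640x480 image is (320, 240); the height 240 is then rounded up to the next
# multiple of 32.
#   >>> img = np.zeros((480, 640, 3), dtype=np.uint8)
#   >>> imresize_to_multiple(img, divisor=32, scale_factor=0.5).shape
#   (256, 320, 3)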
def imresize_like(img,
dst_img,
return_scale=False,
interpolation='bilinear',
backend=None):
"""Resize image to the same size of a given image.
Args:
img (ndarray): The input image.
dst_img (ndarray): The target image.
return_scale (bool): Whether to return `w_scale` and `h_scale`.
interpolation (str): Same as :func:`resize`.
backend (str | None): Same as :func:`resize`.
Returns:
tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
`resized_img`.
"""
h, w = dst_img.shape[:2]
return imresize(img, (w, h), return_scale, interpolation, backend=backend)
def rescale_size(old_size, scale, return_scale=False):
"""Calculate the new size to be rescaled to.
Args:
old_size (tuple[int]): The old size (w, h) of image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by this
factor, else if it is a tuple of 2 integers, then the image will
be rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image size.
Returns:
tuple[int]: The new rescaled image size.
"""
w, h = old_size
if isinstance(scale, (float, int)):
if scale <= 0:
raise ValueError(f'Invalid scale {scale}, must be positive.')
scale_factor = scale
elif isinstance(scale, tuple):
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
else:
raise TypeError(
f'Scale must be a number or tuple of int, but got {type(scale)}')
new_size = _scale_size((w, h), scale_factor)
if return_scale:
return new_size, scale_factor
else:
return new_size
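# Illustrative usage (a sketch): the common "1333x800" detection setting keeps
# the aspect ratio while fitting the long/short edges within the given bounds.
#   >>> rescale_size((1280, 720), (1333, 800))
#   (1333, 750)
#   >>> rescale_size((1280, 720), 0.5)
#   (640, 360)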
def imrescale(img,
scale,
return_scale=False,
interpolation='bilinear',
backend=None):
"""Resize image while keeping the aspect ratio.
Args:
img (ndarray): The input image.
scale (float | tuple[int]): The scaling factor or maximum size.
If it is a float number, then the image will be rescaled by this
factor, else if it is a tuple of 2 integers, then the image will
be rescaled as large as possible within the scale.
return_scale (bool): Whether to return the scaling factor besides the
rescaled image.
interpolation (str): Same as :func:`resize`.
backend (str | None): Same as :func:`resize`.
Returns:
ndarray: The rescaled image.
"""
h, w = img.shape[:2]
new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
rescaled_img = imresize(
img, new_size, interpolation=interpolation, backend=backend)
if return_scale:
return rescaled_img, scale_factor
else:
return rescaled_img
def imflip(img, direction='horizontal'):
"""Flip an image horizontally or vertically.
Args:
img (ndarray): Image to be flipped.
direction (str): The flip direction, either "horizontal" or
"vertical" or "diagonal".
Returns:
ndarray: The flipped image.
"""
assert direction in ['horizontal', 'vertical', 'diagonal']
if direction == 'horizontal':
return np.flip(img, axis=1)
elif direction == 'vertical':
return np.flip(img, axis=0)
else:
return np.flip(img, axis=(0, 1))
def imflip_(img, direction='horizontal'):
"""Inplace flip an image horizontally or vertically.
Args:
img (ndarray): Image to be flipped.
direction (str): The flip direction, either "horizontal" or
"vertical" or "diagonal".
Returns:
ndarray: The flipped image (inplace).
"""
assert direction in ['horizontal', 'vertical', 'diagonal']
if direction == 'horizontal':
return cv2.flip(img, 1, img)
elif direction == 'vertical':
return cv2.flip(img, 0, img)
else:
return cv2.flip(img, -1, img)
def imrotate(img,
angle,
center=None,
scale=1.0,
border_value=0,
interpolation='bilinear',
auto_bound=False):
"""Rotate an image.
Args:
img (ndarray): Image to be rotated.
angle (float): Rotation angle in degrees, positive values mean
clockwise rotation.
center (tuple[float], optional): Center point (w, h) of the rotation in
the source image. If not specified, the center of the image will be
used.
scale (float): Isotropic scale factor.
border_value (int): Border value.
interpolation (str): Same as :func:`resize`.
auto_bound (bool): Whether to adjust the image size to cover the whole
rotated image.
Returns:
ndarray: The rotated image.
"""
if center is not None and auto_bound:
raise ValueError('`auto_bound` conflicts with `center`')
h, w = img.shape[:2]
if center is None:
center = ((w - 1) * 0.5, (h - 1) * 0.5)
assert isinstance(center, tuple)
matrix = cv2.getRotationMatrix2D(center, -angle, scale)
if auto_bound:
cos = np.abs(matrix[0, 0])
sin = np.abs(matrix[0, 1])
new_w = h * sin + w * cos
new_h = h * cos + w * sin
matrix[0, 2] += (new_w - w) * 0.5
matrix[1, 2] += (new_h - h) * 0.5
w = int(np.round(new_w))
h = int(np.round(new_h))
rotated = cv2.warpAffine(
img,
matrix, (w, h),
flags=cv2_interp_codes[interpolation],
borderValue=border_value)
return rotated
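# Illustrative usage (a sketch): with auto_bound=True a 90-degree rotation
# swaps the output width and height so the whole rotated image is kept.
#   >>> img = np.zeros((480, 640, 3), dtype=np.uint8)
#   >>> imrotate(img, 90, auto_bound=True).shape
#   (640, 480, 3)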
def bbox_clip(bboxes, img_shape):
"""Clip bboxes to fit the image shape.
Args:
bboxes (ndarray): Shape (..., 4*k)
img_shape (tuple[int]): (height, width) of the image.
Returns:
ndarray: Clipped bboxes.
"""
assert bboxes.shape[-1] % 4 == 0
cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
cmin[0::2] = img_shape[1] - 1
cmin[1::2] = img_shape[0] - 1
clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
return clipped_bboxes
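# Illustrative usage (a sketch): coordinates are clipped to [0, w-1] and
# [0, h-1] for a (h, w) image shape.
#   >>> bbox_clip(np.array([[-10., 5., 700., 500.]]), (480, 640))
#   array([[  0.,   5., 639., 479.]])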
def bbox_scaling(bboxes, scale, clip_shape=None):
"""Scaling bboxes w.r.t the box center.
Args:
bboxes (ndarray): Shape(..., 4).
scale (float): Scaling factor.
clip_shape (tuple[int], optional): If specified, bboxes that exceed the
boundary will be clipped according to the given shape (h, w).
Returns:
ndarray: Scaled bboxes.
"""
if float(scale) == 1.0:
scaled_bboxes = bboxes.copy()
else:
w = bboxes[..., 2] - bboxes[..., 0] + 1
h = bboxes[..., 3] - bboxes[..., 1] + 1
dw = (w * (scale - 1)) * 0.5
dh = (h * (scale - 1)) * 0.5
scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
if clip_shape is not None:
return bbox_clip(scaled_bboxes, clip_shape)
else:
return scaled_bboxes
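# Illustrative usage (a sketch): scaling a 100x100 box by 2.0 grows it by 50
# pixels on each side around its center.
#   >>> bbox_scaling(np.array([100., 100., 199., 199.]), 2.0)
#   array([ 50.,  50., 249., 249.])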
def imcrop(img, bboxes, scale=1.0, pad_fill=None):
"""Crop image patches.
3 steps: scale the bboxes -> clip bboxes -> crop and pad.
Args:
img (ndarray): Image to be cropped.
bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
scale (float, optional): Scale ratio of bboxes, the default value
1.0 means no scaling.
pad_fill (Number | list[Number]): Value to be filled for padding.
Default: None, which means no padding.
Returns:
list[ndarray] | ndarray: The cropped image patches.
"""
chn = 1 if img.ndim == 2 else img.shape[2]
if pad_fill is not None:
if isinstance(pad_fill, (int, float)):
pad_fill = [pad_fill for _ in range(chn)]
assert len(pad_fill) == chn
_bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
clipped_bbox = bbox_clip(scaled_bboxes, img.shape)
patches = []
for i in range(clipped_bbox.shape[0]):
x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
if pad_fill is None:
patch = img[y1:y2 + 1, x1:x2 + 1, ...]
else:
_x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
if chn == 1:
patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1)
else:
patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn)
patch = np.array(
pad_fill, dtype=img.dtype) * np.ones(
patch_shape, dtype=img.dtype)
x_start = 0 if _x1 >= 0 else -_x1
y_start = 0 if _y1 >= 0 else -_y1
w = x2 - x1 + 1
h = y2 - y1 + 1
patch[y_start:y_start + h, x_start:x_start + w,
...] = img[y1:y1 + h, x1:x1 + w, ...]
patches.append(patch)
if bboxes.ndim == 1:
return patches[0]
else:
return patches
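# Illustrative usage (a sketch): a bbox reaching outside the image is padded
# with `pad_fill`, so the patch keeps the full requested extent.
#   >>> img = np.ones((100, 100, 3), dtype=np.uint8)
#   >>> imcrop(img, np.array([-10, -10, 19, 19]), pad_fill=0).shape
#   (30, 30, 3)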
def impad(img,
*,
shape=None,
padding=None,
pad_val=0,
padding_mode='constant'):
"""Pad the given image to a certain shape or pad on all sides with
specified padding mode and padding value.
Args:
img (ndarray): Image to be padded.
shape (tuple[int]): Expected padding shape (h, w). Default: None.
padding (int or tuple[int]): Padding on each border. If a single int is
provided, it is used to pad all borders. If a tuple of length 2 is
provided, it is the padding on left/right and top/bottom
respectively. If a tuple of length 4 is provided, it is the
padding for the left, top, right and bottom borders respectively.
Default: None. Note that `shape` and `padding` cannot both be
set.
pad_val (Number | Sequence[Number]): Values to be filled in padding
areas when padding_mode is 'constant'. Default: 0.
padding_mode (str): Type of padding. Should be: constant, edge,
reflect or symmetric. Default: constant.
- constant: pads with a constant value, this value is specified
with pad_val.
- edge: pads with the last value at the edge of the image.
- reflect: pads with reflection of image without repeating the
last value on the edge. For example, padding [1, 2, 3, 4]
with 2 elements on both sides in reflect mode will result
in [3, 2, 1, 2, 3, 4, 3, 2].
- symmetric: pads with reflection of image repeating the last
value on the edge. For example, padding [1, 2, 3, 4] with
2 elements on both sides in symmetric mode will result in
[2, 1, 1, 2, 3, 4, 4, 3]
Returns:
ndarray: The padded image.
"""
assert (shape is not None) ^ (padding is not None)
if shape is not None:
padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0])
# check pad_val
if isinstance(pad_val, tuple):
assert len(pad_val) == img.shape[-1]
elif not isinstance(pad_val, numbers.Number):
raise TypeError('pad_val must be an int or a tuple. '
f'But received {type(pad_val)}')
# check padding
if isinstance(padding, tuple) and len(padding) in [2, 4]:
if len(padding) == 2:
padding = (padding[0], padding[1], padding[0], padding[1])
elif isinstance(padding, numbers.Number):
padding = (padding, padding, padding, padding)
else:
raise ValueError('Padding must be an int or a 2- or 4-element tuple. '
f'But received {padding}')
# check padding mode
assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
border_type = {
'constant': cv2.BORDER_CONSTANT,
'edge': cv2.BORDER_REPLICATE,
'reflect': cv2.BORDER_REFLECT_101,
'symmetric': cv2.BORDER_REFLECT
}
img = cv2.copyMakeBorder(
img,
padding[1],
padding[3],
padding[0],
padding[2],
border_type[padding_mode],
value=pad_val)
return img
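# Illustrative usage (a sketch): pad on the bottom/right to reach a target
# shape, or pad all four borders explicitly.
#   >>> img = np.zeros((10, 20, 3), dtype=np.uint8)
#   >>> impad(img, shape=(16, 32), pad_val=255).shape
#   (16, 32, 3)
#   >>> impad(img, padding=(1, 2, 3, 4)).shape   # left, top, right, bottom
#   (16, 24, 3)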
def impad_to_multiple(img, divisor, pad_val=0):
"""Pad an image to ensure each edge to be multiple to some number.
Args:
img (ndarray): Image to be padded.
divisor (int): Padded image edges will be multiples of divisor.
pad_val (Number | Sequence[Number]): Same as :func:`impad`.
Returns:
ndarray: The padded image.
"""
pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
def cutout(img, shape, pad_val=0):
"""Randomly cut out a rectangle from the original img.
Args:
img (ndarray): Image to be cutout.
shape (int | tuple[int]): Expected cutout shape (h, w). If given as an
int, the value will be used for both h and w.
pad_val (int | float | tuple[int | float]): Values to be filled in the
cut area. Defaults to 0.
Returns:
ndarray: The cutout image.
"""
channels = 1 if img.ndim == 2 else img.shape[2]
if isinstance(shape, int):
cut_h, cut_w = shape, shape
else:
assert isinstance(shape, tuple) and len(shape) == 2, \
f'shape must be an int or a tuple with length 2, but got type ' \
f'{type(shape)} instead.'
cut_h, cut_w = shape
if isinstance(pad_val, (int, float)):
pad_val = tuple([pad_val] * channels)
elif isinstance(pad_val, tuple):
assert len(pad_val) == channels, \
'Expected the number of elements in tuple to equal the ' \
'channels of the input image. Found {} vs {}'.format(
len(pad_val), channels)
else:
raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')
img_h, img_w = img.shape[:2]
y0 = np.random.uniform(img_h)
x0 = np.random.uniform(img_w)
y1 = int(max(0, y0 - cut_h / 2.))
x1 = int(max(0, x0 - cut_w / 2.))
y2 = min(img_h, y1 + cut_h)
x2 = min(img_w, x1 + cut_w)
if img.ndim == 2:
patch_shape = (y2 - y1, x2 - x1)
else:
patch_shape = (y2 - y1, x2 - x1, channels)
img_cutout = img.copy()
patch = np.array(
pad_val, dtype=img.dtype) * np.ones(
patch_shape, dtype=img.dtype)
img_cutout[y1:y2, x1:x2, ...] = patch
return img_cutout
def _get_shear_matrix(magnitude, direction='horizontal'):
"""Generate the shear matrix for transformation.
Args:
magnitude (int | float): The magnitude used for shear.
direction (str): The flip direction, either "horizontal"
or "vertical".
Returns:
ndarray: The shear matrix with dtype float32.
"""
if direction == 'horizontal':
shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
elif direction == 'vertical':
shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
return shear_matrix
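# Illustrative usage (a sketch): a horizontal shear of magnitude 0.5 maps
# pixel (x, y) to (x + 0.5 * y, y).
#   >>> _get_shear_matrix(0.5)
#   array([[1. , 0.5, 0. ],
#          [0. , 1. , 0. ]], dtype=float32)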
def imshear(img,
magnitude,
direction='horizontal',
border_value=0,
interpolation='bilinear'):
"""Shear an image.
Args:
img (ndarray): Image to be sheared with format (h, w)
or (h, w, c).
magnitude (int | float): The magnitude used for shear.
direction (str): The flip direction, either "horizontal"
or "vertical".
border_value (int | tuple[int]): Value used in case of a
constant border.
interpolation (str): Same as :func:`resize`.
Returns:
ndarray: The sheared image.
"""
assert direction in ['horizontal',
'vertical'], f'Invalid direction: {direction}'
height, width = img.shape[:2]
if img.ndim == 2:
channels = 1
elif img.ndim == 3:
channels = img.shape[-1]
if isinstance(border_value, int):
border_value = tuple([border_value] * channels)
elif isinstance(border_value, tuple):
assert len(border_value) == channels, \
'Expected the number of elements in tuple to equal the ' \
'channels of the input image. Found {} vs {}'.format(
len(border_value), channels)
else:
raise ValueError(
f'Invalid type {type(border_value)} for `border_value`')
shear_matrix = _get_shear_matrix(magnitude, direction)
sheared = cv2.warpAffine(
img,
shear_matrix,
(width, height),
# Note that when the number of elements in `border_value` is
# greater than 3 (e.g. shearing masks with more than 3
# channels), `cv2.warpAffine` will raise a TypeError. Here we
# simply slice the first 3 values in `border_value`.
borderValue=border_value[:3],
flags=cv2_interp_codes[interpolation])
return sheared
def _get_translate_matrix(offset, direction='horizontal'):
"""Generate the translate matrix.
Args:
offset (int | float): The offset used for translate.
direction (str): The translate direction, either
"horizontal" or "vertical".
Returns:
ndarray: The translate matrix with dtype float32.
"""
if direction == 'horizontal':
translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
elif direction == 'vertical':
translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
return translate_matrix
def imtranslate(img,
offset,
direction='horizontal',
border_value=0,
interpolation='bilinear'):
"""Translate an image.
Args:
img (ndarray): Image to be translated with format
(h, w) or (h, w, c).
offset (int | float): The offset used for translate.
direction (str): The translate direction, either "horizontal"
or "vertical".
border_value (int | tuple[int]): Value used in case of a
constant border.
interpolation (str): Same as :func:`resize`.
Returns:
ndarray: The translated image.
"""
assert direction in ['horizontal',
'vertical'], f'Invalid direction: {direction}'
height, width = img.shape[:2]
if img.ndim == 2:
channels = 1
elif img.ndim == 3:
channels = img.shape[-1]
if isinstance(border_value, int):
border_value = tuple([border_value] * channels)
elif isinstance(border_value, tuple):
assert len(border_value) == channels, \
'Expected the number of elements in tuple to equal the ' \
'channels of the input image. Found {} vs {}'.format(
len(border_value), channels)
else:
raise ValueError(
f'Invalid type {type(border_value)} for `border_value`.')
translate_matrix = _get_translate_matrix(offset, direction)
translated = cv2.warpAffine(
img,
translate_matrix,
(width, height),
# Note that when the number of elements in `border_value` is
# greater than 3 (e.g. translating masks with more than 3
# channels), `cv2.warpAffine` will raise a TypeError. Here we
# simply slice the first 3 values in `border_value`.
borderValue=border_value[:3],
flags=cv2_interp_codes[interpolation])
return translated
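# Illustrative usage (a sketch): an integer horizontal offset shifts the image
# right by that many pixels, filling the exposed border with `border_value`.
#   >>> img = np.arange(9, dtype=np.uint8).reshape(3, 3)
#   >>> imtranslate(img, 1)
#   array([[0, 0, 1],
#          [0, 3, 4],
#          [0, 6, 7]], dtype=uint8)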
# Copyright (c) OpenMMLab. All rights reserved.
import io
import os.path as osp
from pathlib import Path
import cv2
import numpy as np
from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
IMREAD_UNCHANGED)
from annotator.uniformer.mmcv.utils import check_file_exist, is_str, mkdir_or_exist
try:
from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
except ImportError:
TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None
try:
from PIL import Image, ImageOps
except ImportError:
Image = None
try:
import tifffile
except ImportError:
tifffile = None
jpeg = None
supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']
imread_flags = {
'color': IMREAD_COLOR,
'grayscale': IMREAD_GRAYSCALE,
'unchanged': IMREAD_UNCHANGED,
'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
'grayscale_ignore_orientation':
IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
}
imread_backend = 'cv2'
def use_backend(backend):
"""Select a backend for image decoding.
Args:
backend (str): The image decoding backend type. Options are `cv2`,
`pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg`
file format.
"""
assert backend in supported_backends
global imread_backend
imread_backend = backend
if imread_backend == 'turbojpeg':
if TurboJPEG is None:
raise ImportError('`PyTurboJPEG` is not installed')
global jpeg
if jpeg is None:
jpeg = TurboJPEG()
elif imread_backend == 'pillow':
if Image is None:
raise ImportError('`Pillow` is not installed')
elif imread_backend == 'tifffile':
if tifffile is None:
raise ImportError('`tifffile` is not installed')
def _jpegflag(flag='color', channel_order='bgr'):
channel_order = channel_order.lower()
if channel_order not in ['rgb', 'bgr']:
raise ValueError('channel order must be either "rgb" or "bgr"')
if flag == 'color':
if channel_order == 'bgr':
return TJPF_BGR
elif channel_order == 'rgb':
return TJCS_RGB
elif flag == 'grayscale':
return TJPF_GRAY
else:
raise ValueError('flag must be "color" or "grayscale"')
def _pillow2array(img, flag='color', channel_order='bgr'):
"""Convert a pillow image to numpy array.
Args:
img (:obj:`PIL.Image.Image`): The image loaded using PIL
flag (str): Flags specifying the color type of a loaded image,
candidates are 'color', 'grayscale' and 'unchanged'.
Default to 'color'.
channel_order (str): The channel order of the output image array,
candidates are 'bgr' and 'rgb'. Default to 'bgr'.
Returns:
np.ndarray: The converted numpy array
"""
channel_order = channel_order.lower()
if channel_order not in ['rgb', 'bgr']:
raise ValueError('channel order must be either "rgb" or "bgr"')
if flag == 'unchanged':
array = np.array(img)
if array.ndim >= 3 and array.shape[2] >= 3: # color image
array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
else:
# Handle exif orientation tag
if flag in ['color', 'grayscale']:
img = ImageOps.exif_transpose(img)
# If the image mode is not 'RGB', convert it to 'RGB' first.
if img.mode != 'RGB':
if img.mode != 'LA':
# Most formats except 'LA' can be directly converted to RGB
img = img.convert('RGB')
else:
# When the mode is 'LA', the default conversion will fill in
# the canvas with black, which sometimes shadows black objects
# in the foreground.
#
# Therefore, an arbitrary color (124, 117, 104) is used for canvas
img_rgba = img.convert('RGBA')
img = Image.new('RGB', img_rgba.size, (124, 117, 104))
img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
if flag in ['color', 'color_ignore_orientation']:
array = np.array(img)
if channel_order != 'rgb':
array = array[:, :, ::-1] # RGB to BGR
elif flag in ['grayscale', 'grayscale_ignore_orientation']:
img = img.convert('L')
array = np.array(img)
else:
raise ValueError(
'flag must be "color", "grayscale", "unchanged", '
f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
f' but got {flag}')
return array
def imread(img_or_path, flag='color', channel_order='bgr', backend=None):
"""Read an image.
Args:
img_or_path (ndarray or str or Path): Either a numpy array or str or
pathlib.Path. If it is a numpy array (loaded image), then
it will be returned as is.
flag (str): Flags specifying the color type of a loaded image,
candidates are `color`, `grayscale`, `unchanged`,
`color_ignore_orientation` and `grayscale_ignore_orientation`.
By default, `cv2` and `pillow` backend would rotate the image
according to its EXIF info unless called with `unchanged` or
`*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
always ignore image's EXIF info regardless of the flag.
The `turbojpeg` backend only supports `color` and `grayscale`.
channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
backend (str | None): The image decoding backend type. Options are
`cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
If backend is None, the global imread_backend specified by
``mmcv.use_backend()`` will be used. Default: None.
Returns:
ndarray: Loaded image array.
"""
if backend is None:
backend = imread_backend
if backend not in supported_backends:
raise ValueError(f'backend: {backend} is not supported. Supported '
"backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
if isinstance(img_or_path, Path):
img_or_path = str(img_or_path)
if isinstance(img_or_path, np.ndarray):
return img_or_path
elif is_str(img_or_path):
check_file_exist(img_or_path,
f'img file does not exist: {img_or_path}')
if backend == 'turbojpeg':
with open(img_or_path, 'rb') as in_file:
img = jpeg.decode(in_file.read(),
_jpegflag(flag, channel_order))
if img.shape[-1] == 1:
img = img[:, :, 0]
return img
elif backend == 'pillow':
img = Image.open(img_or_path)
img = _pillow2array(img, flag, channel_order)
return img
elif backend == 'tifffile':
img = tifffile.imread(img_or_path)
return img
else:
flag = imread_flags[flag] if is_str(flag) else flag
img = cv2.imread(img_or_path, flag)
if flag == IMREAD_COLOR and channel_order == 'rgb':
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
return img
else:
raise TypeError('"img" must be a numpy array or a str or '
'a pathlib.Path object')
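# Illustrative usage (a sketch; 'demo.jpg' is a hypothetical path):
#   >>> img = imread('demo.jpg')                          # BGR, (h, w, 3)
#   >>> img_rgb = imread('demo.jpg', channel_order='rgb')
#   >>> gray = imread('demo.jpg', flag='grayscale')       # (h, w)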
def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
"""Read an image from bytes.
Args:
content (bytes): Image bytes got from files or other streams.
flag (str): Same as :func:`imread`.
backend (str | None): The image decoding backend type. Options are
`cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the
global imread_backend specified by ``mmcv.use_backend()`` will be
used. Default: None.
Returns:
ndarray: Loaded image array.
"""
if backend is None:
backend = imread_backend
if backend not in supported_backends:
raise ValueError(f'backend: {backend} is not supported. Supported '
"backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
if backend == 'turbojpeg':
img = jpeg.decode(content, _jpegflag(flag, channel_order))
if img.shape[-1] == 1:
img = img[:, :, 0]
return img
elif backend == 'pillow':
buff = io.BytesIO(content)
img = Image.open(buff)
img = _pillow2array(img, flag, channel_order)
return img
else:
img_np = np.frombuffer(content, np.uint8)
flag = imread_flags[flag] if is_str(flag) else flag
img = cv2.imdecode(img_np, flag)
if flag == IMREAD_COLOR and channel_order == 'rgb':
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
return img
def imwrite(img, file_path, params=None, auto_mkdir=True):
"""Write image to file.
Args:
img (ndarray): Image array to be written.
file_path (str): Image file path.
params (None or list): Same as opencv :func:`imwrite` interface.
auto_mkdir (bool): If the parent folder of `file_path` does not exist,
whether to create it automatically.
Returns:
bool: Successful or not.
"""
if auto_mkdir:
dir_name = osp.abspath(osp.dirname(file_path))
mkdir_or_exist(dir_name)
return cv2.imwrite(file_path, img, params)
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import annotator.uniformer.mmcv as mmcv
try:
import torch
except ImportError:
torch = None
def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True):
"""Convert tensor to 3-channel images.
Args:
tensor (torch.Tensor): Tensor that contains multiple images, shape (
N, C, H, W).
mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0).
std (tuple[float], optional): Standard deviation of images.
Defaults to (1, 1, 1).
to_rgb (bool, optional): Whether the tensor was converted to RGB
format in the first place. If so, convert it back to BGR.
Defaults to True.
Returns:
list[np.ndarray]: A list that contains multiple images.
"""
if torch is None:
raise RuntimeError('pytorch is not installed')
assert torch.is_tensor(tensor) and tensor.ndim == 4
assert len(mean) == 3
assert len(std) == 3
num_imgs = tensor.size(0)
mean = np.array(mean, dtype=np.float32)
std = np.array(std, dtype=np.float32)
imgs = []
for img_id in range(num_imgs):
img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
img = mmcv.imdenormalize(
img, mean, std, to_bgr=to_rgb).astype(np.uint8)
imgs.append(np.ascontiguousarray(img))
return imgs
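# Illustrative usage (a sketch; requires torch): convert an NCHW batch back
# to a list of HWC uint8 arrays.
#   >>> t = torch.rand(2, 3, 8, 8)
#   >>> imgs = tensor2imgs(t, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=False)
#   >>> len(imgs), imgs[0].shape
#   (2, (8, 8, 3))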
# Copyright (c) OpenMMLab. All rights reserved.
import cv2
import numpy as np
from ..utils import is_tuple_of
from .colorspace import bgr2gray, gray2bgr
def imnormalize(img, mean, std, to_rgb=True):
"""Normalize an image with mean and std.
Args:
img (ndarray): Image to be normalized.
mean (ndarray): The mean to be used for normalize.
std (ndarray): The std to be used for normalize.
to_rgb (bool): Whether to convert to rgb.
Returns:
ndarray: The normalized image.
"""
img = img.copy().astype(np.float32)
return imnormalize_(img, mean, std, to_rgb)
def imnormalize_(img, mean, std, to_rgb=True):
"""Inplace normalize an image with mean and std.
Args:
img (ndarray): Image to be normalized.
mean (ndarray): The mean to be used for normalize.
std (ndarray): The std to be used for normalize.
to_rgb (bool): Whether to convert to rgb.
Returns:
ndarray: The normalized image.
"""
# cv2 inplace normalization does not accept uint8
assert img.dtype != np.uint8
mean = np.float64(mean.reshape(1, -1))
stdinv = 1 / np.float64(std.reshape(1, -1))
if to_rgb:
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
cv2.subtract(img, mean, img) # inplace
cv2.multiply(img, stdinv, img) # inplace
return img
def imdenormalize(img, mean, std, to_bgr=True):
assert img.dtype != np.uint8
mean = mean.reshape(1, -1).astype(np.float64)
std = std.reshape(1, -1).astype(np.float64)
img = cv2.multiply(img, std) # make a copy
cv2.add(img, mean, img) # inplace
if to_bgr:
cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace
return img
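# Illustrative round trip (a sketch): normalization with ImageNet statistics
# followed by denormalization recovers the input up to float error.
#   >>> mean = np.array([123.675, 116.28, 103.53])
#   >>> std = np.array([58.395, 57.12, 57.375])
#   >>> img = np.random.randint(0, 256, (4, 4, 3)).astype(np.float32)
#   >>> norm = imnormalize(img, mean, std, to_rgb=True)
#   >>> restored = imdenormalize(norm, mean, std, to_bgr=True)
#   >>> bool(np.allclose(img, restored, atol=1e-3))
#   True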
def iminvert(img):
"""Invert (negate) an image.
Args:
img (ndarray): Image to be inverted.
Returns:
ndarray: The inverted image.
"""
return np.full_like(img, 255) - img
def solarize(img, thr=128):
"""Solarize an image (invert all pixel values above a threshold)
Args:
img (ndarray): Image to be solarized.
thr (int): Threshold for solarizing (0 - 255).
Returns:
ndarray: The solarized image.
"""
img = np.where(img < thr, img, 255 - img)
return img
def posterize(img, bits):
"""Posterize an image (reduce the number of bits for each color channel)
Args:
img (ndarray): Image to be posterized.
bits (int): Number of bits (1 to 8) to use for posterizing.
Returns:
ndarray: The posterized image.
"""
shift = 8 - bits
img = np.left_shift(np.right_shift(img, shift), shift)
return img
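# Illustrative usage (a sketch): solarize inverts values at or above the
# threshold; posterize keeps only the top `bits` bits of each value.
#   >>> x = np.array([[100, 200]], dtype=np.uint8)
#   >>> solarize(x)
#   array([[100,  55]], dtype=uint8)
#   >>> posterize(x, 2)
#   array([[ 64, 192]], dtype=uint8)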
def adjust_color(img, alpha=1, beta=None, gamma=0):
r"""It blends the source image and its gray image:
.. math::
output = img * alpha + gray\_img * beta + gamma
Args:
img (ndarray): The input source image.
alpha (int | float): Weight for the source image. Default 1.
beta (int | float): Weight for the converted gray image.
If None, it's assigned the value (1 - `alpha`).
gamma (int | float): Scalar added to each sum.
Same as :func:`cv2.addWeighted`. Default 0.
Returns:
ndarray: Colored image which has the same size and dtype as input.
"""
gray_img = bgr2gray(img)
gray_img = np.tile(gray_img[..., None], [1, 1, 3])
if beta is None:
beta = 1 - alpha
colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
if not colored_img.dtype == np.uint8:
# Note when the dtype of `img` is not the default `np.uint8`
# (e.g. np.float32), the value in `colored_img` got from cv2
# is not guaranteed to be in range [0, 255], so here clip
# is needed.
colored_img = np.clip(colored_img, 0, 255)
return colored_img
def imequalize(img):
"""Equalize the image histogram.
This function applies a non-linear mapping to the input image,
in order to create a uniform distribution of grayscale values
in the output image.
Args:
img (ndarray): Image to be equalized.
Returns:
ndarray: The equalized image.
"""
def _scale_channel(im, c):
"""Scale the data in the corresponding channel."""
im = im[:, :, c]
# Compute the histogram of the image channel.
histo = np.histogram(im, 256, (0, 255))[0]
# For computing the step, filter out the nonzeros.
nonzero_histo = histo[histo > 0]
step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
if not step:
lut = np.array(range(256))
else:
# Compute the cumulative sum, shifted by step // 2
# and then normalized by step.
lut = (np.cumsum(histo) + (step // 2)) // step
# Shift lut, prepending with 0.
lut = np.concatenate([[0], lut[:-1]], 0)
# handle potential integer overflow
lut[lut > 255] = 255
# If step is zero, return the original image.
# Otherwise, index from lut.
return np.where(np.equal(step, 0), im, lut[im])
# Scales each channel independently and then stacks
# the result.
s1 = _scale_channel(img, 0)
s2 = _scale_channel(img, 1)
s3 = _scale_channel(img, 2)
equalized_img = np.stack([s1, s2, s3], axis=-1)
return equalized_img.astype(img.dtype)
def adjust_brightness(img, factor=1.):
"""Adjust image brightness.
This function controls the brightness of an image. An
enhancement factor of 0.0 gives a black image.
A factor of 1.0 gives the original image. This function
blends the source image and the degenerated black image:
.. math::
output = img * factor + degenerated * (1 - factor)
Args:
img (ndarray): Image to be brightened.
factor (float): A value that controls the enhancement. A factor of
1.0 returns the original image, lower factors mean less
enhancement (brightness, contrast, etc.), and higher values
mean more. Default 1.
Returns:
ndarray: The brightened image.
"""
degenerated = np.zeros_like(img)
# Note manually convert the dtype to np.float32, to
# achieve as close results as PIL.ImageEnhance.Brightness.
# Set beta=1-factor, and gamma=0
brightened_img = cv2.addWeighted(
img.astype(np.float32), factor, degenerated.astype(np.float32),
1 - factor, 0)
brightened_img = np.clip(brightened_img, 0, 255)
return brightened_img.astype(img.dtype)
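# Illustrative usage (a sketch): factor 0.0 yields a black image; factors
# above 1.0 brighten linearly (clipped to [0, 255]).
#   >>> img = np.full((2, 2, 3), 100, dtype=np.uint8)
#   >>> int(adjust_brightness(img, 0.).max()), int(adjust_brightness(img, 2.).max())
#   (0, 200)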
def adjust_contrast(img, factor=1.):
"""Adjust image contrast.
This function controls the contrast of an image. An
enhancement factor of 0.0 gives a solid grey
image. A factor of 1.0 gives the original image. It
blends the source image and the degenerated mean image:
.. math::
output = img * factor + degenerated * (1 - factor)
Args:
img (ndarray): Image to be contrasted. BGR order.
factor (float): Same as :func:`mmcv.adjust_brightness`.
Returns:
ndarray: The contrasted image.
"""
gray_img = bgr2gray(img)
hist = np.histogram(gray_img, 256, (0, 255))[0]
mean = round(np.sum(gray_img) / np.sum(hist))
degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
degenerated = gray2bgr(degenerated)
contrasted_img = cv2.addWeighted(
img.astype(np.float32), factor, degenerated.astype(np.float32),
1 - factor, 0)
contrasted_img = np.clip(contrasted_img, 0, 255)
return contrasted_img.astype(img.dtype)
def auto_contrast(img, cutoff=0):
"""Auto adjust image contrast.
This function maximizes (normalizes) image contrast by first removing the
cutoff percent of the lightest and darkest pixels from the histogram and
then remapping the image so that the darkest pixel becomes black (0) and
the lightest becomes white (255).
Args:
img (ndarray): Image to be contrasted. BGR order.
cutoff (int | float | tuple): The cutoff percent of the lightest and
darkest pixels to be removed. If given as tuple, it shall be
(low, high). Otherwise, the single value will be used for both.
Defaults to 0.
Returns:
ndarray: The contrasted image.
"""
def _auto_contrast_channel(im, c, cutoff):
im = im[:, :, c]
# Compute the histogram of the image channel.
histo = np.histogram(im, 256, (0, 255))[0]
# Remove cut-off percent pixels from histo
histo_sum = np.cumsum(histo)
cut_low = histo_sum[-1] * cutoff[0] // 100
cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)
# Compute mapping
low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
# If all the values have been cut off, return the original image
if low >= high:
return im
scale = 255.0 / (high - low)
offset = -low * scale
lut = np.array(range(256))
lut = lut * scale + offset
lut = np.clip(lut, 0, 255)
return lut[im]
if isinstance(cutoff, (int, float)):
cutoff = (cutoff, cutoff)
else:
assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
f'float or tuple, but got {type(cutoff)} instead.'
# Auto adjusts contrast for each channel independently and then stacks
# the result.
s1 = _auto_contrast_channel(img, 0, cutoff)
s2 = _auto_contrast_channel(img, 1, cutoff)
s3 = _auto_contrast_channel(img, 2, cutoff)
contrasted_img = np.stack([s1, s2, s3], axis=-1)
return contrasted_img.astype(img.dtype)
def adjust_sharpness(img, factor=1., kernel=None):
"""Adjust image sharpness.
This function controls the sharpness of an image. An
enhancement factor of 0.0 gives a blurred image. A
factor of 1.0 gives the original image. And a factor
of 2.0 gives a sharpened image. It blends the source
image and the degenerated mean image:
.. math::
output = img * factor + degenerated * (1 - factor)
Args:
img (ndarray): Image to be sharpened. BGR order.
factor (float): Same as :func:`mmcv.adjust_brightness`.
kernel (np.ndarray, optional): Filter kernel to be applied on the img
to obtain the degenerated img. Defaults to None.
Note:
No value sanity check is enforced on the kernel set by users. So with
an inappropriate kernel, ``adjust_sharpness`` may fail to perform the
function its name indicates, and instead perform whatever transform
the kernel determines.
Returns:
ndarray: The sharpened image.
"""
if kernel is None:
# adopted from PIL.ImageFilter.SMOOTH
kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
assert isinstance(kernel, np.ndarray), \
f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
assert kernel.ndim == 2, \
f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'
degenerated = cv2.filter2D(img, -1, kernel)
sharpened_img = cv2.addWeighted(
img.astype(np.float32), factor, degenerated.astype(np.float32),
1 - factor, 0)
sharpened_img = np.clip(sharpened_img, 0, 255)
return sharpened_img.astype(img.dtype)
def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
"""AlexNet-style PCA jitter.
This data augmentation is proposed in `ImageNet Classification with Deep
Convolutional Neural Networks
<https://dl.acm.org/doi/pdf/10.1145/3065386>`_.
Args:
img (ndarray): Image to be adjusted lighting. BGR order.
eigval (ndarray): the eigenvalues of the covariance matrix of pixel
values.
eigvec (ndarray): the eigenvectors of the covariance matrix of pixel
values.
alphastd (float): The standard deviation for distribution of alpha.
Defaults to 0.1
to_rgb (bool): Whether to convert img to rgb.
Returns:
ndarray: The adjusted image.
"""
assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
f'eigval and eigvec should both be of type np.ndarray, got ' \
f'{type(eigval)} and {type(eigvec)} instead.'
assert eigval.ndim == 1 and eigvec.ndim == 2
assert eigvec.shape == (3, eigval.shape[0])
n_eigval = eigval.shape[0]
assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
f'got {type(alphastd)} instead.'
img = img.copy().astype(np.float32)
if to_rgb:
cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
alpha = np.random.normal(0, alphastd, n_eigval)
alter = eigvec \
* np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
* np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
img_adjusted = img + alter
return img_adjusted
def lut_transform(img, lut_table):
"""Transform array by look-up table.
The function lut_transform fills the output array with values from the
look-up table. Indices of the entries are taken from the input array.
Args:
img (ndarray): Image to be transformed.
lut_table (ndarray): look-up table of 256 elements; in case of
multi-channel input array, the table should either have a single
channel (in this case the same table is used for all channels) or
the same number of channels as in the input array.
Returns:
ndarray: The transformed image.
"""
assert isinstance(img, np.ndarray)
assert 0 <= np.min(img) and np.max(img) <= 255
assert isinstance(lut_table, np.ndarray)
assert lut_table.shape == (256, )
return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)
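# Illustrative usage (a sketch): an inverting look-up table maps 0 -> 255 and
# 255 -> 0, equivalent to `iminvert` for uint8 inputs.
#   >>> lut = (255 - np.arange(256)).astype(np.uint8)
#   >>> lut_transform(np.array([[0, 255]], dtype=np.uint8), lut)
#   array([[255,   0]], dtype=uint8)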
def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
"""Use CLAHE method to process the image.
See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
Graphics Gems, 1994:474-485.` for more information.
Args:
img (ndarray): Image to be processed.
clip_limit (float): Threshold for contrast limiting. Default: 40.0.
tile_grid_size (tuple[int]): Size of grid for histogram equalization.
Input image will be divided into equally sized rectangular tiles.
It defines the number of tiles in row and column. Default: (8, 8).
Returns:
ndarray: The processed image.
"""
assert isinstance(img, np.ndarray)
assert img.ndim == 2
assert isinstance(clip_limit, (float, int))
assert is_tuple_of(tile_grid_size, int)
assert len(tile_grid_size) == 2
clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
return clahe.apply(np.array(img, dtype=np.uint8))
{
"resnet50_caffe": "detectron/resnet50_caffe",
"resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr",
"resnet101_caffe": "detectron/resnet101_caffe",
"resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr"
}
{
"vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth",
"vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth",
"vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth",
"vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth",
"vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth",
"vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth",
"vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth",
"vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth",
"resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth",
"resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth",
"resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth",
"resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth",
"resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth",
"resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth",
"resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth",
"resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth",
"resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth",
"resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth",
"resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth",
"resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth",
"se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth",
"se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth",
"resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth",
"resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth",
"resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth",
"resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth",
"shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth",
"shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth",
"mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth"
}
{
"vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth",
"detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth",
"detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth",
"detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth",
"detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth",
"detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth",
"resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth",
"resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth",
"resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth",
"contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth",
"detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth",
"detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth",
"jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth",
"jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth",
"jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth",
"jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth",
"jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth",
"jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth",
"msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth",
"msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth",
"msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth",
"msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth",
"msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth",
"bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth",
"kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth",
"kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth",
"res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth",
"regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth",
"regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth",
"regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth",
"regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth",
"regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth",
"regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth",
"regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth",
"regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth",
"resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth",
"resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth",
"resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth",
"mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth",
"mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth",
"mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth",
"contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth",
"contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth",
"resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth",
"resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth",
"resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth",
"darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth",
"mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth"
}
# Copyright (c) OpenMMLab. All rights reserved.
from .assign_score_withk import assign_score_withk
from .ball_query import ball_query
from .bbox import bbox_overlaps
from .border_align import BorderAlign, border_align
from .box_iou_rotated import box_iou_rotated
from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
from .cc_attention import CrissCrossAttention
from .contour_expand import contour_expand
from .corner_pool import CornerPool
from .correlation import Correlation
from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
ModulatedDeformRoIPoolPack, deform_roi_pool)
from .deprecated_wrappers import Conv2d_deprecated as Conv2d
from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
from .deprecated_wrappers import Linear_deprecated as Linear
from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
sigmoid_focal_loss, softmax_focal_loss)
from .furthest_point_sample import (furthest_point_sample,
furthest_point_sample_with_dist)
from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
from .gather_points import gather_points
from .group_points import GroupAll, QueryAndGroup, grouping_operation
from .info import (get_compiler_version, get_compiling_cuda_version,
get_onnxruntime_op_path)
from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev
from .knn import knn
from .masked_conv import MaskedConv2d, masked_conv2d
from .modulated_deform_conv import (ModulatedDeformConv2d,
ModulatedDeformConv2dPack,
modulated_deform_conv2d)
from .multi_scale_deform_attn import MultiScaleDeformableAttention
from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms
from .pixel_group import pixel_group
from .point_sample import (SimpleRoIAlign, point_sample,
rel_roi_point_to_rel_img_point)
from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
points_in_boxes_part)
from .points_sampler import PointsSampler
from .psa_mask import PSAMask
from .roi_align import RoIAlign, roi_align
from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
from .roi_pool import RoIPool, roi_pool
from .roiaware_pool3d import RoIAwarePool3d
from .roipoint_pool3d import RoIPointPool3d
from .saconv import SAConv2d
from .scatter_points import DynamicScatter, dynamic_scatter
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
from .tin_shift import TINShift, tin_shift
from .upfirdn2d import upfirdn2d
from .voxelize import Voxelization, voxelization
__all__ = [
'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
'get_compiler_version', 'get_compiling_cuda_version',
'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d',
'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
'border_align', 'gather_points', 'furthest_point_sample',
'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization',
'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d',
'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all'
]
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
class AssignScoreWithK(Function):
r"""Perform weighted sum to generate output features according to scores.
Modified from `PAConv <https://github.com/CVMI-Lab/PAConv/tree/main/
scene_seg/lib/paconv_lib/src/gpu>`_.
This is a memory-efficient CUDA implementation of the assign_scores
operation, which first transforms all point features with the weight bank,
then assembles neighbor features with ``knn_idx`` and performs a weighted
sum with ``scores``.
See the `paper <https://arxiv.org/pdf/2103.14635.pdf>`_ appendix Sec. D for
more detailed descriptions.
Note:
This implementation assumes using ``neighbor`` kernel input, which is
(point_features - center_features, point_features).
See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
pointnet2/paconv.py#L128 for more details.
"""
@staticmethod
def forward(ctx,
scores,
point_features,
center_features,
knn_idx,
aggregate='sum'):
"""
Args:
scores (torch.Tensor): (B, npoint, K, M), predicted scores to
aggregate weight matrices in the weight bank.
``npoint`` is the number of sampled centers.
``K`` is the number of queried neighbors.
``M`` is the number of weight matrices in the weight bank.
point_features (torch.Tensor): (B, N, M, out_dim)
Pre-computed point features to be aggregated.
center_features (torch.Tensor): (B, N, M, out_dim)
Pre-computed center features to be aggregated.
knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Default: 'sum'.
Returns:
torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
"""
agg = {'sum': 0, 'avg': 1, 'max': 2}
B, N, M, out_dim = point_features.size()
_, npoint, K, _ = scores.size()
output = point_features.new_zeros((B, out_dim, npoint, K))
ext_module.assign_score_withk_forward(
point_features.contiguous(),
center_features.contiguous(),
scores.contiguous(),
knn_idx.contiguous(),
output,
B=B,
N0=N,
N1=npoint,
M=M,
K=K,
O=out_dim,
aggregate=agg[aggregate])
ctx.save_for_backward(output, point_features, center_features, scores,
knn_idx)
ctx.agg = agg[aggregate]
return output
@staticmethod
def backward(ctx, grad_out):
"""
Args:
grad_out (torch.Tensor): (B, out_dim, npoint, K)
Returns:
grad_scores (torch.Tensor): (B, npoint, K, M)
grad_point_features (torch.Tensor): (B, N, M, out_dim)
grad_center_features (torch.Tensor): (B, N, M, out_dim)
"""
_, point_features, center_features, scores, knn_idx = ctx.saved_tensors
agg = ctx.agg
B, N, M, out_dim = point_features.size()
_, npoint, K, _ = scores.size()
grad_point_features = point_features.new_zeros(point_features.shape)
grad_center_features = center_features.new_zeros(center_features.shape)
grad_scores = scores.new_zeros(scores.shape)
ext_module.assign_score_withk_backward(
grad_out.contiguous(),
point_features.contiguous(),
center_features.contiguous(),
scores.contiguous(),
knn_idx.contiguous(),
grad_point_features,
grad_center_features,
grad_scores,
B=B,
N0=N,
N1=npoint,
M=M,
K=K,
O=out_dim,
aggregate=agg)
return grad_scores, grad_point_features, \
grad_center_features, None, None
assign_score_withk = AssignScoreWithK.apply
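# A minimal usage sketch for ``assign_score_withk``, assuming the compiled
# ``_ext`` CUDA extension is available and all tensors live on a CUDA
# device. Shapes follow the docstring above; the ``.long()`` dtype for
# ``knn_idx`` is an assumption about the kernel's expected index type.
def _demo_assign_score_withk():
    import torch
    B, N, npoint, K, M, out_dim = 2, 64, 16, 8, 4, 32
    scores = torch.rand(B, npoint, K, M).cuda()
    point_feats = torch.rand(B, N, M, out_dim).cuda()
    center_feats = torch.rand(B, N, M, out_dim).cuda()
    knn_idx = torch.randint(0, N, (B, npoint, K)).long().cuda()
    out = assign_score_withk(scores, point_feats, center_feats, knn_idx)
    assert out.shape == (B, out_dim, npoint, K)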
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['ball_query_forward'])
class BallQuery(Function):
"""Find nearby points in spherical space."""
@staticmethod
def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
"""
Args:
min_radius (float): minimum radius of the balls.
max_radius (float): maximum radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indices of
the features that form the query balls.
"""
assert center_xyz.is_contiguous()
assert xyz.is_contiguous()
assert min_radius < max_radius
B, N, _ = xyz.size()
npoint = center_xyz.size(1)
idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int)
ext_module.ball_query_forward(
center_xyz,
xyz,
idx,
b=B,
n=N,
m=npoint,
min_radius=min_radius,
max_radius=max_radius,
nsample=sample_num)
if torch.__version__ != 'parrots':
ctx.mark_non_differentiable(idx)
return idx
@staticmethod
def backward(ctx, a=None):
return None, None, None, None
ball_query = BallQuery.apply
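# A minimal usage sketch for ``ball_query``, assuming the compiled ``_ext``
# CUDA extension and CUDA tensors. Each center gathers the indices of up to
# ``sample_num`` points whose distance lies between ``min_radius`` and
# ``max_radius``; the radii and sizes below are illustrative.
def _demo_ball_query():
    B, N, npoint, nsample = 2, 1024, 128, 16
    xyz = torch.rand(B, N, 3).cuda()
    center_xyz = xyz[:, :npoint, :].contiguous()
    idx = ball_query(0.0, 0.2, nsample, xyz, center_xyz)
    assert idx.shape == (B, npoint, nsample)
    assert idx.dtype == torch.int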
# Copyright (c) OpenMMLab. All rights reserved.
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0):
"""Calculate overlap between two set of bboxes.
If ``aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).
        aligned (bool): If True, calculate the ious between each aligned
            pair of bboxes1 and bboxes2. Default: False.
        offset (int): Either 0 or 1. If 1, box widths and heights are
            computed as ``x2 - x1 + 1``. Default: 0.
Returns:
ious(Tensor): shape (m, n) if aligned == False else shape (m, 1)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 10, 10],
>>> [10, 10, 20, 20],
>>> [32, 32, 38, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 10, 20],
>>> [0, 10, 10, 19],
>>> [10, 10, 20, 20],
>>> ])
>>> bbox_overlaps(bboxes1, bboxes2)
tensor([[0.5000, 0.0000, 0.0000],
[0.0000, 0.0000, 1.0000],
[0.0000, 0.0000, 0.0000]])
Example:
>>> empty = torch.FloatTensor([])
>>> nonempty = torch.FloatTensor([
>>> [0, 0, 10, 9],
>>> ])
>>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
>>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
>>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
"""
mode_dict = {'iou': 0, 'iof': 1}
assert mode in mode_dict.keys()
mode_flag = mode_dict[mode]
# Either the boxes are empty or the length of boxes' last dimension is 4
assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
assert offset == 1 or offset == 0
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if aligned:
assert rows == cols
if rows * cols == 0:
return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows, cols))
ext_module.bbox_overlaps(
bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)
return ious
# Copyright (c) OpenMMLab. All rights reserved.
# modified from
# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
import torch
import torch.nn as nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['border_align_forward', 'border_align_backward'])
class BorderAlignFunction(Function):
@staticmethod
def symbolic(g, input, boxes, pool_size):
return g.op(
'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
@staticmethod
def forward(ctx, input, boxes, pool_size):
ctx.pool_size = pool_size
ctx.input_shape = input.size()
        assert boxes.ndim == 3, 'boxes must have shape [B, H*W, 4]'
assert boxes.size(2) == 4, \
'the last dimension of boxes must be (x1, y1, x2, y2)'
        assert input.size(1) % 4 == 0, \
            'the number of input feature channels must be divisible by 4'
# [B, C//4, H*W, 4]
output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
output = input.new_zeros(output_shape)
# `argmax_idx` only used for backward
argmax_idx = input.new_zeros(output_shape).to(torch.int)
ext_module.border_align_forward(
input, boxes, output, argmax_idx, pool_size=ctx.pool_size)
ctx.save_for_backward(boxes, argmax_idx)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
boxes, argmax_idx = ctx.saved_tensors
grad_input = grad_output.new_zeros(ctx.input_shape)
        # a complex head architecture may produce a non-contiguous grad_output
grad_output = grad_output.contiguous()
ext_module.border_align_backward(
grad_output,
boxes,
argmax_idx,
grad_input,
pool_size=ctx.pool_size)
return grad_input, None, None
border_align = BorderAlignFunction.apply
class BorderAlign(nn.Module):
r"""Border align pooling layer.
Applies border_align over the input feature based on predicted bboxes.
The details were described in the paper
`BorderDet: Border Feature for Dense Object Detection
<https://arxiv.org/abs/2007.11056>`_.
For each border line (e.g. top, left, bottom or right) of each box,
border_align does the following:
    1. uniformly samples `pool_size`+1 positions on this line, including \
    the start and end points.
    2. the corresponding features at these positions are computed by \
    bilinear interpolation.
    3. max pooling over all the `pool_size`+1 positions is used to \
    compute the pooled feature.
Args:
pool_size (int): number of positions sampled over the boxes' borders
(e.g. top, bottom, left, right).
"""
def __init__(self, pool_size):
super(BorderAlign, self).__init__()
self.pool_size = pool_size
def forward(self, input, boxes):
"""
Args:
input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
[C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,
right features respectively.
boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
Returns:
Tensor: Pooled features with shape [N,C,H*W,4]. The order is
(top,left,bottom,right) for the last dimension.
"""
return border_align(input, boxes, self.pool_size)
def __repr__(self):
s = self.__class__.__name__
s += f'(pool_size={self.pool_size})'
return s
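# A minimal usage sketch for ``BorderAlign``, assuming the compiled
# ``_ext`` CUDA extension and CUDA tensors. The input carries 4*C channels
# (one C-sized chunk per border) and one box per spatial location; the
# full-image box used here is purely illustrative.
def _demo_border_align():
    N, C, H, W = 1, 4, 8, 8
    feat = torch.rand(N, 4 * C, H, W).cuda()
    boxes = torch.tensor([0., 0., W - 1., H - 1.]).cuda()
    boxes = boxes.repeat(N, H * W, 1)  # (N, H*W, 4) in (x1, y1, x2, y2)
    pooled = BorderAlign(pool_size=10)(feat, boxes)
    assert pooled.shape == (N, C, H * W, 4)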
# Copyright (c) OpenMMLab. All rights reserved.
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False):
"""Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
(x_center, y_center, width, height, angle) format.
If ``aligned`` is ``False``, then calculate the ious between each bbox
of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
bboxes1 and bboxes2.
    Args:
        bboxes1 (Tensor): rotated bboxes 1. \
            It has shape (N, 5), indicating (x, y, w, h, theta) for each row.
            Note that theta is in radians.
        bboxes2 (Tensor): rotated bboxes 2. \
            It has shape (M, 5), indicating (x, y, w, h, theta) for each row.
            Note that theta is in radians.
        mode (str): "iou" (intersection over union) or "iof" (intersection
            over foreground).
Returns:
ious(Tensor): shape (N, M) if aligned == False else shape (N,)
"""
assert mode in ['iou', 'iof']
mode_dict = {'iou': 0, 'iof': 1}
mode_flag = mode_dict[mode]
rows = bboxes1.size(0)
cols = bboxes2.size(0)
if aligned:
ious = bboxes1.new_zeros(rows)
else:
ious = bboxes1.new_zeros((rows * cols))
bboxes1 = bboxes1.contiguous()
bboxes2 = bboxes2.contiguous()
ext_module.box_iou_rotated(
bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
if not aligned:
ious = ious.view(rows, cols)
return ious
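# A minimal usage sketch for ``box_iou_rotated`` with boxes in
# (x_center, y_center, w, h, theta) format, theta in radians. Running this
# on CPU assumes your mmcv build compiles the CPU implementation of the op.
def _demo_box_iou_rotated():
    import torch
    b1 = torch.tensor([[5., 5., 4., 2., 0.0]])
    b2 = torch.tensor([[5., 5., 4., 2., 0.5]])
    ious = box_iou_rotated(b1, b2)                        # shape (1, 1)
    ious_aligned = box_iou_rotated(b1, b2, aligned=True)  # shape (1,)
    assert ious.shape == (1, 1) and ious_aligned.shape == (1,)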
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
from torch.nn.modules.module import Module
from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
'carafe_backward'
])
class CARAFENaiveFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'mmcv::MMCVCARAFENaive',
features,
masks,
kernel_size_i=kernel_size,
group_size_i=group_size,
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
ext_module.carafe_naive_forward(
features,
masks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
grad_input = torch.zeros_like(features)
grad_masks = torch.zeros_like(masks)
ext_module.carafe_naive_backward(
grad_output.contiguous(),
features,
masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFENaive, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe_naive(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
class CARAFEFunction(Function):
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
return g.op(
'mmcv::MMCVCARAFE',
features,
masks,
kernel_size_i=kernel_size,
group_size_i=group_size,
scale_factor_f=scale_factor)
@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
assert scale_factor >= 1
assert masks.size(1) == kernel_size * kernel_size * group_size
assert masks.size(-1) == features.size(-1) * scale_factor
assert masks.size(-2) == features.size(-2) * scale_factor
assert features.size(1) % group_size == 0
assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
ctx.kernel_size = kernel_size
ctx.group_size = group_size
ctx.scale_factor = scale_factor
ctx.feature_size = features.size()
ctx.mask_size = masks.size()
n, c, h, w = features.size()
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
routput = features.new_zeros(output.size(), requires_grad=False)
rfeatures = features.new_zeros(features.size(), requires_grad=False)
rmasks = masks.new_zeros(masks.size(), requires_grad=False)
ext_module.carafe_forward(
features,
masks,
rfeatures,
routput,
rmasks,
output,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
if features.requires_grad or masks.requires_grad:
ctx.save_for_backward(features, masks, rfeatures)
return output
@staticmethod
def backward(ctx, grad_output):
assert grad_output.is_cuda
features, masks, rfeatures = ctx.saved_tensors
kernel_size = ctx.kernel_size
group_size = ctx.group_size
scale_factor = ctx.scale_factor
rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
rgrad_input = torch.zeros_like(features, requires_grad=False)
rgrad_masks = torch.zeros_like(masks, requires_grad=False)
grad_input = torch.zeros_like(features, requires_grad=False)
grad_masks = torch.zeros_like(masks, requires_grad=False)
ext_module.carafe_backward(
grad_output.contiguous(),
rfeatures,
masks,
rgrad_output,
rgrad_input_hs,
rgrad_input,
rgrad_masks,
grad_input,
grad_masks,
kernel_size=kernel_size,
group_size=group_size,
scale_factor=scale_factor)
return grad_input, grad_masks, None, None, None
carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
kernel_size (int): reassemble kernel size
group_size (int): reassemble group size
scale_factor (int): upsample ratio
Returns:
upsampled feature map
"""
def __init__(self, kernel_size, group_size, scale_factor):
super(CARAFE, self).__init__()
assert isinstance(kernel_size, int) and isinstance(
group_size, int) and isinstance(scale_factor, int)
self.kernel_size = kernel_size
self.group_size = group_size
self.scale_factor = scale_factor
def forward(self, features, masks):
return carafe(features, masks, self.kernel_size, self.group_size,
self.scale_factor)
@UPSAMPLE_LAYERS.register_module(name='carafe')
class CARAFEPack(nn.Module):
"""A unified package of CARAFE upsampler that contains: 1) channel
compressor 2) content encoder 3) CARAFE op.
Official implementation of ICCV 2019 paper
CARAFE: Content-Aware ReAssembly of FEatures
Please refer to https://arxiv.org/abs/1905.02188 for more details.
Args:
channels (int): input feature channels
scale_factor (int): upsample ratio
up_kernel (int): kernel size of CARAFE op
up_group (int): group size of CARAFE op
encoder_kernel (int): kernel size of content encoder
encoder_dilation (int): dilation of content encoder
compressed_channels (int): output channels of channels compressor
Returns:
upsampled feature map
"""
def __init__(self,
channels,
scale_factor,
up_kernel=5,
up_group=1,
encoder_kernel=3,
encoder_dilation=1,
compressed_channels=64):
super(CARAFEPack, self).__init__()
self.channels = channels
self.scale_factor = scale_factor
self.up_kernel = up_kernel
self.up_group = up_group
self.encoder_kernel = encoder_kernel
self.encoder_dilation = encoder_dilation
self.compressed_channels = compressed_channels
self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
1)
self.content_encoder = nn.Conv2d(
self.compressed_channels,
self.up_kernel * self.up_kernel * self.up_group *
self.scale_factor * self.scale_factor,
self.encoder_kernel,
padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
dilation=self.encoder_dilation,
groups=1)
self.init_weights()
def init_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
xavier_init(m, distribution='uniform')
normal_init(self.content_encoder, std=0.001)
def kernel_normalizer(self, mask):
mask = F.pixel_shuffle(mask, self.scale_factor)
n, mask_c, h, w = mask.size()
        # use float division explicitly,
        # to avoid inconsistency while exporting to onnx
mask_channel = int(mask_c / float(self.up_kernel**2))
mask = mask.view(n, mask_channel, -1, h, w)
mask = F.softmax(mask, dim=2, dtype=mask.dtype)
mask = mask.view(n, mask_c, h, w).contiguous()
return mask
def feature_reassemble(self, x, mask):
x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
return x
def forward(self, x):
compressed_x = self.channel_compressor(x)
mask = self.content_encoder(compressed_x)
mask = self.kernel_normalizer(mask)
x = self.feature_reassemble(x, mask)
return x
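# A minimal usage sketch for ``CARAFEPack`` as a learned 2x upsampler,
# assuming the compiled ``_ext`` CUDA extension and a CUDA device. The
# channel count and scale factor below are illustrative.
def _demo_carafe_pack():
    upsampler = CARAFEPack(channels=64, scale_factor=2).cuda()
    x = torch.rand(2, 64, 16, 16).cuda()
    y = upsampler(x)
    assert y.shape == (2, 64, 32, 32)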
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.nn.functional as F
from annotator.uniformer.mmcv.cnn import PLUGIN_LAYERS, Scale
def NEG_INF_DIAG(n, device):
"""Returns a diagonal matrix of size [n, n].
    The diagonal elements are all "-inf". This avoids counting the
    overlapping element of the criss-cross paths twice.
"""
return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
@PLUGIN_LAYERS.register_module()
class CrissCrossAttention(nn.Module):
"""Criss-Cross Attention Module.
.. note::
Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
to a pure PyTorch and equivalent implementation. For more
details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.
Speed comparison for one forward pass
- Input size: [2,512,97,97]
- Device: 1 NVIDIA GeForce RTX 2080 Ti
+-----------------------+---------------+------------+---------------+
| |PyTorch version|CUDA version|Relative speed |
+=======================+===============+============+===============+
|with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x |
+-----------------------+---------------+------------+---------------+
|no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x |
+-----------------------+---------------+------------+---------------+
Args:
in_channels (int): Channels of the input feature map.
"""
def __init__(self, in_channels):
super().__init__()
self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
self.gamma = Scale(0.)
self.in_channels = in_channels
def forward(self, x):
"""forward function of Criss-Cross Attention.
Args:
x (Tensor): Input feature. \
shape (batch_size, in_channels, height, width)
Returns:
Tensor: Output of the layer, with shape of \
(batch_size, in_channels, height, width)
"""
B, C, H, W = x.size()
query = self.query_conv(x)
key = self.key_conv(x)
value = self.value_conv(x)
energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(
H, query.device)
energy_H = energy_H.transpose(1, 2)
energy_W = torch.einsum('bchw,bchj->bhwj', query, key)
attn = F.softmax(
torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)]
out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])
out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])
out = self.gamma(out) + x
out = out.contiguous()
return out
def __repr__(self):
s = self.__class__.__name__
s += f'(in_channels={self.in_channels})'
return s
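# A minimal usage sketch for ``CrissCrossAttention``. Unlike the CUDA ops
# above, this module is pure PyTorch (since v1.3.13) and also runs on CPU.
# Because ``gamma`` is a ``Scale`` initialized to 0, a freshly constructed
# module returns its input unchanged.
def _demo_criss_cross_attention():
    cca = CrissCrossAttention(in_channels=64)
    x = torch.rand(2, 64, 16, 16)
    out = cca(x)
    assert out.shape == x.shape
    assert torch.allclose(out, x)  # gamma == 0 at initialization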
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
import torch
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area,
kernel_num):
"""Expand kernel contours so that foreground pixels are assigned into
instances.
    Args:
kernel_mask (np.array or Tensor): The instance kernel mask with
size hxw.
internal_kernel_label (np.array or Tensor): The instance internal
kernel label with size hxw.
min_kernel_area (int): The minimum kernel area.
kernel_num (int): The instance kernel number.
Returns:
label (list): The instance index map with size hxw.
"""
assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
assert isinstance(min_kernel_area, int)
assert isinstance(kernel_num, int)
if isinstance(kernel_mask, np.ndarray):
kernel_mask = torch.from_numpy(kernel_mask)
if isinstance(internal_kernel_label, np.ndarray):
internal_kernel_label = torch.from_numpy(internal_kernel_label)
if torch.__version__ == 'parrots':
if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
label = []
else:
label = ext_module.contour_expand(
kernel_mask,
internal_kernel_label,
min_kernel_area=min_kernel_area,
kernel_num=kernel_num)
label = label.tolist()
else:
label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
min_kernel_area, kernel_num)
return label
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn
from torch.autograd import Function
from ..utils import ext_loader
ext_module = ext_loader.load_ext('_ext', [
'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward',
'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward',
'right_pool_forward', 'right_pool_backward'
])
_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
class TopPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.top_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.top_pool_backward(input, grad_output)
return output
class BottomPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.bottom_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.bottom_pool_backward(input, grad_output)
return output
class LeftPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.left_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.left_pool_backward(input, grad_output)
return output
class RightPoolFunction(Function):
@staticmethod
def symbolic(g, input):
output = g.op(
'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
return output
@staticmethod
def forward(ctx, input):
output = ext_module.right_pool_forward(input)
ctx.save_for_backward(input)
return output
@staticmethod
def backward(ctx, grad_output):
input, = ctx.saved_tensors
output = ext_module.right_pool_backward(input, grad_output)
return output
class CornerPool(nn.Module):
"""Corner Pooling.
Corner Pooling is a new type of pooling layer that helps a
convolutional network better localize corners of bounding boxes.
Please refer to https://arxiv.org/abs/1808.01244 for more details.
Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
Args:
        mode (str): Pooling orientation for the pooling layer
- 'bottom': Bottom Pooling
- 'left': Left Pooling
- 'right': Right Pooling
- 'top': Top Pooling
Returns:
Feature map after pooling.
"""
pool_functions = {
'bottom': BottomPoolFunction,
'left': LeftPoolFunction,
'right': RightPoolFunction,
'top': TopPoolFunction,
}
cummax_dim_flip = {
'bottom': (2, False),
'left': (3, True),
'right': (3, False),
'top': (2, True),
}
def __init__(self, mode):
super(CornerPool, self).__init__()
assert mode in self.pool_functions
self.mode = mode
self.corner_pool = self.pool_functions[mode]
def forward(self, x):
if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
if torch.onnx.is_in_onnx_export():
                assert torch.__version__ >= '1.7.0', \
                    'When `cummax` serves as an intermediate component '\
                    'whose outputs are used as inputs for other modules, '\
                    'the pytorch version must be >= 1.7.0, otherwise an '\
                    'error appears like: `RuntimeError: tuple appears in '\
                    'op that does not forward tuples, unsupported kind: '\
                    'prim::PythonOp`.'
dim, flip = self.cummax_dim_flip[self.mode]
if flip:
x = x.flip(dim)
pool_tensor, _ = torch.cummax(x, dim=dim)
if flip:
pool_tensor = pool_tensor.flip(dim)
return pool_tensor
else:
return self.corner_pool.apply(x)
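# A minimal usage sketch for ``CornerPool``. For PyTorch >= 1.5 the forward
# pass is a plain ``cummax`` and runs on CPU; top pooling is a cumulative
# maximum taken along H from bottom to top (flip, cummax, flip back).
def _demo_corner_pool():
    pool = CornerPool('top')
    x = torch.rand(2, 8, 5, 5)
    y = pool(x)
    ref = x.flip(2).cummax(dim=2)[0].flip(2)
    assert torch.allclose(y, ref)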
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import Tensor, nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['correlation_forward', 'correlation_backward'])
class CorrelationFunction(Function):
@staticmethod
def forward(ctx,
input1,
input2,
kernel_size=1,
max_displacement=1,
stride=1,
padding=1,
dilation=1,
dilation_patch=1):
ctx.save_for_backward(input1, input2)
kH, kW = ctx.kernel_size = _pair(kernel_size)
patch_size = max_displacement * 2 + 1
ctx.patch_size = patch_size
dH, dW = ctx.stride = _pair(stride)
padH, padW = ctx.padding = _pair(padding)
dilationH, dilationW = ctx.dilation = _pair(dilation)
dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(
dilation_patch)
output_size = CorrelationFunction._output_size(ctx, input1)
output = input1.new_zeros(output_size)
ext_module.correlation_forward(
input1,
input2,
output,
kH=kH,
kW=kW,
patchH=patch_size,
patchW=patch_size,
padH=padH,
padW=padW,
dilationH=dilationH,
dilationW=dilationW,
dilation_patchH=dilation_patchH,
dilation_patchW=dilation_patchW,
dH=dH,
dW=dW)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input1, input2 = ctx.saved_tensors
kH, kW = ctx.kernel_size
patch_size = ctx.patch_size
padH, padW = ctx.padding
dilationH, dilationW = ctx.dilation
dilation_patchH, dilation_patchW = ctx.dilation_patch
dH, dW = ctx.stride
grad_input1 = torch.zeros_like(input1)
grad_input2 = torch.zeros_like(input2)
ext_module.correlation_backward(
grad_output,
input1,
input2,
grad_input1,
grad_input2,
kH=kH,
kW=kW,
patchH=patch_size,
patchW=patch_size,
padH=padH,
padW=padW,
dilationH=dilationH,
dilationW=dilationW,
dilation_patchH=dilation_patchH,
dilation_patchW=dilation_patchW,
dH=dH,
dW=dW)
return grad_input1, grad_input2, None, None, None, None, None, None
@staticmethod
def _output_size(ctx, input1):
iH, iW = input1.size(2), input1.size(3)
batch_size = input1.size(0)
kH, kW = ctx.kernel_size
patch_size = ctx.patch_size
dH, dW = ctx.stride
padH, padW = ctx.padding
dilationH, dilationW = ctx.dilation
dilatedKH = (kH - 1) * dilationH + 1
dilatedKW = (kW - 1) * dilationW + 1
oH = int((iH + 2 * padH - dilatedKH) / dH + 1)
oW = int((iW + 2 * padW - dilatedKW) / dW + 1)
output_size = (batch_size, patch_size, patch_size, oH, oW)
return output_size
class Correlation(nn.Module):
r"""Correlation operator
This correlation operator works for optical flow correlation computation.
There are two batched tensors with shape :math:`(N, C, H, W)`,
    and the correlation output's shape is :math:`(N, max\_displacement \times
    2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})`
where
.. math::
H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
dilation \times (kernel\_size - 1) - 1}
{stride} + 1\right\rfloor
.. math::
W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
\times (kernel\_size - 1) - 1}
{stride} + 1\right\rfloor
    the correlation item :math:`(N_i, dx, dy)` is formed by taking the sliding
    window convolution between input1 and shifted input2,
.. math::
Corr(N_i, dx, dy) =
\sum_{c=0}^{C-1}
input1(N_i, c) \star
\mathcal{S}(input2(N_i, c), dy, dx)
    where :math:`\star` is the valid 2d sliding window convolution operator,
    :math:`\mathcal{S}` means shifting the input features (zero-padding the
    vacated margins), and :math:`dx, dy` are the shifting distances,
    :math:`dx, dy \in [-max\_displacement \times dilation\_patch,
    max\_displacement \times dilation\_patch]`.
Args:
        kernel_size (int): The size of the sliding window, i.e. the local
            neighborhood around the center points that is involved in the
            correlation computation. Defaults to 1.
max_displacement (int): The radius for computing correlation volume,
but the actual working space can be dilated by dilation_patch.
Defaults to 1.
stride (int): The stride of the sliding blocks in the input spatial
dimensions. Defaults to 1.
        padding (int): Zero padding added to all four sides of input1.
            Defaults to 0.
        dilation (int): The spacing of the local neighborhood that will be
            involved in the correlation. Defaults to 1.
        dilation_patch (int): The spacing between positions at which the
            correlation is computed. Defaults to 1.
"""
def __init__(self,
kernel_size: int = 1,
max_displacement: int = 1,
stride: int = 1,
padding: int = 0,
dilation: int = 1,
dilation_patch: int = 1) -> None:
super().__init__()
self.kernel_size = kernel_size
self.max_displacement = max_displacement
self.stride = stride
self.padding = padding
self.dilation = dilation
self.dilation_patch = dilation_patch
def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
return CorrelationFunction.apply(input1, input2, self.kernel_size,
self.max_displacement, self.stride,
self.padding, self.dilation,
self.dilation_patch)
def __repr__(self) -> str:
s = self.__class__.__name__
s += f'(kernel_size={self.kernel_size}, '
s += f'max_displacement={self.max_displacement}, '
s += f'stride={self.stride}, '
s += f'padding={self.padding}, '
s += f'dilation={self.dilation}, '
s += f'dilation_patch={self.dilation_patch})'
return s
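# A minimal usage sketch for ``Correlation``, assuming the compiled
# ``_ext`` CUDA extension and CUDA tensors. With ``max_displacement=4``
# and the other arguments left at their defaults, each output location
# holds a 9x9 grid of displacement correlations.
def _demo_correlation():
    corr = Correlation(max_displacement=4)
    x1 = torch.rand(2, 16, 32, 32).cuda()
    x2 = torch.rand(2, 16, 32, 32).cuda()
    out = corr(x1, x2)
    assert out.shape == (2, 9, 9, 32, 32)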
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair, _single
from annotator.uniformer.mmcv.utils import deprecated_api_warning
from ..cnn import CONV_LAYERS
from ..utils import ext_loader, print_log
ext_module = ext_loader.load_ext('_ext', [
'deform_conv_forward', 'deform_conv_backward_input',
'deform_conv_backward_parameters'
])
class DeformConv2dFunction(Function):
@staticmethod
def symbolic(g,
input,
offset,
weight,
stride,
padding,
dilation,
groups,
deform_groups,
bias=False,
im2col_step=32):
return g.op(
'mmcv::MMCVDeformConv2d',
input,
offset,
weight,
stride_i=stride,
padding_i=padding,
dilation_i=dilation,
groups_i=groups,
deform_groups_i=deform_groups,
bias_i=bias,
im2col_step_i=im2col_step)
@staticmethod
def forward(ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deform_groups=1,
bias=False,
im2col_step=32):
if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
assert bias is False, 'Only support bias is False.'
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deform_groups = deform_groups
ctx.im2col_step = im2col_step
# When pytorch version >= 1.6.0, amp is adopted for fp16 mode;
# amp won't cast the type of model (float32), but "offset" is cast
# to float16 by nn.Conv2d automatically, leading to the type
# mismatch with input (when it is float32) or weight.
# The flag for whether to use fp16 or amp is the type of "offset",
# we cast weight and input to temporarily support fp16 and amp
# whatever the pytorch version is.
input = input.type_as(offset)
weight = weight.type_as(input)
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
DeformConv2dFunction._output_size(ctx, input, weight))
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
cur_im2col_step = min(ctx.im2col_step, input.size(0))
assert (input.size(0) %
cur_im2col_step) == 0, 'im2col step must divide batchsize'
ext_module.deform_conv_forward(
input,
weight,
offset,
output,
ctx.bufs_[0],
ctx.bufs_[1],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
im2col_step=cur_im2col_step)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
cur_im2col_step = min(ctx.im2col_step, input.size(0))
assert (input.size(0) % cur_im2col_step
) == 0, 'batch size must be divisible by im2col_step'
grad_output = grad_output.contiguous()
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
ext_module.deform_conv_backward_input(
input,
offset,
grad_output,
grad_input,
grad_offset,
weight,
ctx.bufs_[0],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
im2col_step=cur_im2col_step)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
ext_module.deform_conv_backward_parameters(
input,
offset,
grad_output,
grad_weight,
ctx.bufs_[0],
ctx.bufs_[1],
kW=weight.size(3),
kH=weight.size(2),
dW=ctx.stride[1],
dH=ctx.stride[0],
padW=ctx.padding[1],
padH=ctx.padding[0],
dilationW=ctx.dilation[1],
dilationH=ctx.dilation[0],
group=ctx.groups,
deformable_group=ctx.deform_groups,
scale=1,
im2col_step=cur_im2col_step)
return grad_input, grad_offset, grad_weight, \
None, None, None, None, None, None, None
@staticmethod
def _output_size(ctx, input, weight):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = ctx.padding[d]
kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = ctx.stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
'convolution input is too small (output would be ' +
'x'.join(map(str, output_size)) + ')')
return output_size
deform_conv2d = DeformConv2dFunction.apply
class DeformConv2d(nn.Module):
r"""Deformable 2D convolution.
Applies a deformable 2D convolution over an input signal composed of
several input planes. DeformConv2d was described in the paper
`Deformable Convolutional Networks
<https://arxiv.org/pdf/1703.06211.pdf>`_
Note:
        The argument ``im2col_step`` was added in version 1.3.17, which
        denotes the number of samples processed by the
        ``im2col_cuda_kernel`` per call. It enables users to define
        ``batch_size`` and ``im2col_step`` more flexibly and solves `issue
        mmcv#1440 <https://github.com/open-mmlab/mmcv/issues/1440>`_.
Args:
in_channels (int): Number of channels in the input image.
out_channels (int): Number of channels produced by the convolution.
kernel_size(int, tuple): Size of the convolving kernel.
stride(int, tuple): Stride of the convolution. Default: 1.
padding (int or tuple): Zero-padding added to both sides of the input.
Default: 0.
dilation (int or tuple): Spacing between kernel elements. Default: 1.
        groups (int): Number of blocked connections from input channels to
            output channels. Default: 1.
deform_groups (int): Number of deformable group partitions.
bias (bool): If True, adds a learnable bias to the output.
Default: False.
im2col_step (int): Number of samples processed by im2col_cuda_kernel
per call. It will work when ``batch_size`` > ``im2col_step``, but
``batch_size`` must be divisible by ``im2col_step``. Default: 32.
`New in version 1.3.17.`
"""
@deprecated_api_warning({'deformable_groups': 'deform_groups'},
cls_name='DeformConv2d')
def __init__(self,
in_channels: int,
out_channels: int,
kernel_size: Union[int, Tuple[int, ...]],
stride: Union[int, Tuple[int, ...]] = 1,
padding: Union[int, Tuple[int, ...]] = 0,
dilation: Union[int, Tuple[int, ...]] = 1,
groups: int = 1,
deform_groups: int = 1,
bias: bool = False,
im2col_step: int = 32) -> None:
super(DeformConv2d, self).__init__()
assert not bias, \
f'bias={bias} is not supported in DeformConv2d.'
        assert in_channels % groups == 0, \
            f'in_channels {in_channels} is not divisible by groups {groups}'
        assert out_channels % groups == 0, \
            f'out_channels {out_channels} is not divisible by ' \
            f'groups {groups}'
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deform_groups = deform_groups
self.im2col_step = im2col_step
# enable compatibility with nn.Conv2d
self.transposed = False
self.output_padding = _single(0)
# only weight, no bias
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups,
*self.kernel_size))
self.reset_parameters()
def reset_parameters(self):
# switch the initialization of `self.weight` to the standard kaiming
# method described in `Delving deep into rectifiers: Surpassing
# human-level performance on ImageNet classification` - He, K. et al.
# (2015), using a uniform distribution
nn.init.kaiming_uniform_(self.weight, nonlinearity='relu')
def forward(self, x: Tensor, offset: Tensor) -> Tensor:
"""Deformable Convolutional forward function.
Args:
x (Tensor): Input feature, shape (B, C_in, H_in, W_in)
offset (Tensor): Offset for deformable convolution, shape
(B, deform_groups*kernel_size[0]*kernel_size[1]*2,
H_out, W_out), H_out, W_out are equal to the output's.
An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
The spatial arrangement is like:
.. code:: text
(x0, y0) (x1, y1) (x2, y2)
(x3, y3) (x4, y4) (x5, y5)
(x6, y6) (x7, y7) (x8, y8)
Returns:
Tensor: Output of the layer.
"""
# To fix an assert error in deform_conv_cuda.cpp:128
# input image is smaller than kernel
input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) <
self.kernel_size[1])
if input_pad:
pad_h = max(self.kernel_size[0] - x.size(2), 0)
pad_w = max(self.kernel_size[1] - x.size(3), 0)
x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous()
offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0)
offset = offset.contiguous()
out = deform_conv2d(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deform_groups,
False, self.im2col_step)
if input_pad:
out = out[:, :, :out.size(2) - pad_h, :out.size(3) -
pad_w].contiguous()
return out
def __repr__(self):
s = self.__class__.__name__
s += f'(in_channels={self.in_channels},\n'
s += f'out_channels={self.out_channels},\n'
s += f'kernel_size={self.kernel_size},\n'
s += f'stride={self.stride},\n'
s += f'padding={self.padding},\n'
s += f'dilation={self.dilation},\n'
s += f'groups={self.groups},\n'
s += f'deform_groups={self.deform_groups},\n'
# bias is not supported in DeformConv2d.
s += 'bias=False)'
return s
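# A minimal usage sketch for ``DeformConv2d`` with an explicit offset,
# assuming the compiled ``_ext`` CUDA extension. With an all-zero offset
# the op samples the regular grid and reduces to an ordinary convolution;
# in practice the offset is predicted by a small conv head (see
# ``DeformConv2dPack`` below).
def _demo_deform_conv2d():
    dcn = DeformConv2d(16, 32, kernel_size=3, padding=1).cuda()
    x = torch.rand(2, 16, 24, 24).cuda()
    # offset shape: (B, deform_groups * 2 * kH * kW, H_out, W_out)
    offset = torch.zeros(2, 1 * 2 * 3 * 3, 24, 24).cuda()
    out = dcn(x, offset)
    assert out.shape == (2, 32, 24, 24)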
@CONV_LAYERS.register_module('DCN')
class DeformConv2dPack(DeformConv2d):
"""A Deformable Conv Encapsulation that acts as normal Conv layers.
The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`.
The spatial arrangement is like:
.. code:: text
(x0, y0) (x1, y1) (x2, y2)
(x3, y3) (x4, y4) (x5, y5)
(x6, y6) (x7, y7) (x8, y8)
Args:
in_channels (int): Same as nn.Conv2d.
out_channels (int): Same as nn.Conv2d.
kernel_size (int or tuple[int]): Same as nn.Conv2d.
stride (int or tuple[int]): Same as nn.Conv2d.
padding (int or tuple[int]): Same as nn.Conv2d.
dilation (int or tuple[int]): Same as nn.Conv2d.
groups (int): Same as nn.Conv2d.
bias (bool or str): If specified as `auto`, it will be decided by the
norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
False.
"""
_version = 2
def __init__(self, *args, **kwargs):
super(DeformConv2dPack, self).__init__(*args, **kwargs)
self.conv_offset = nn.Conv2d(
self.in_channels,
self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1],
kernel_size=self.kernel_size,
stride=_pair(self.stride),
padding=_pair(self.padding),
dilation=_pair(self.dilation),
bias=True)
self.init_offset()
def init_offset(self):
self.conv_offset.weight.data.zero_()
self.conv_offset.bias.data.zero_()
def forward(self, x):
offset = self.conv_offset(x)
return deform_conv2d(x, offset, self.weight, self.stride, self.padding,
self.dilation, self.groups, self.deform_groups,
False, self.im2col_step)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
version = local_metadata.get('version', None)
if version is None or version < 2:
# the key is different in early versions
# In version < 2, DeformConvPack loads previous benchmark models.
if (prefix + 'conv_offset.weight' not in state_dict
and prefix[:-1] + '_offset.weight' in state_dict):
state_dict[prefix + 'conv_offset.weight'] = state_dict.pop(
prefix[:-1] + '_offset.weight')
if (prefix + 'conv_offset.bias' not in state_dict
and prefix[:-1] + '_offset.bias' in state_dict):
state_dict[prefix +
'conv_offset.bias'] = state_dict.pop(prefix[:-1] +
'_offset.bias')
if version is not None and version > 1:
print_log(
f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to '
'version 2.',
logger='root')
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
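# A minimal usage sketch for ``DeformConv2dPack``, assuming the compiled
# ``_ext`` CUDA extension. The module predicts its own offsets, so it is a
# drop-in replacement for ``nn.Conv2d(..., bias=False)``; since
# ``conv_offset`` is zero-initialized, a fresh module initially behaves
# like a regular convolution.
def _demo_deform_conv2d_pack():
    dcn = DeformConv2dPack(16, 32, kernel_size=3, padding=1).cuda()
    x = torch.rand(2, 16, 24, 24).cuda()
    out = dcn(x)
    assert out.shape == (2, 32, 24, 24)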
# Copyright (c) OpenMMLab. All rights reserved.
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from ..utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward'])
class DeformRoIPoolFunction(Function):
@staticmethod
def symbolic(g, input, rois, offset, output_size, spatial_scale,
sampling_ratio, gamma):
return g.op(
'mmcv::MMCVDeformRoIPool',
input,
rois,
offset,
pooled_height_i=output_size[0],
pooled_width_i=output_size[1],
spatial_scale_f=spatial_scale,
sampling_ratio_f=sampling_ratio,
gamma_f=gamma)
@staticmethod
def forward(ctx,
input,
rois,
offset,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
if offset is None:
offset = input.new_zeros(0)
ctx.output_size = _pair(output_size)
ctx.spatial_scale = float(spatial_scale)
ctx.sampling_ratio = int(sampling_ratio)
ctx.gamma = float(gamma)
assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!'
output_shape = (rois.size(0), input.size(1), ctx.output_size[0],
ctx.output_size[1])
output = input.new_zeros(output_shape)
ext_module.deform_roi_pool_forward(
input,
rois,
offset,
output,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
gamma=ctx.gamma)
ctx.save_for_backward(input, rois, offset)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, rois, offset = ctx.saved_tensors
grad_input = grad_output.new_zeros(input.shape)
grad_offset = grad_output.new_zeros(offset.shape)
ext_module.deform_roi_pool_backward(
grad_output,
input,
rois,
offset,
grad_input,
grad_offset,
pooled_height=ctx.output_size[0],
pooled_width=ctx.output_size[1],
spatial_scale=ctx.spatial_scale,
sampling_ratio=ctx.sampling_ratio,
gamma=ctx.gamma)
if grad_offset.numel() == 0:
grad_offset = None
return grad_input, None, grad_offset, None, None, None, None
deform_roi_pool = DeformRoIPoolFunction.apply
class DeformRoIPool(nn.Module):
def __init__(self,
output_size,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPool, self).__init__()
self.output_size = _pair(output_size)
self.spatial_scale = float(spatial_scale)
self.sampling_ratio = int(sampling_ratio)
self.gamma = float(gamma)
def forward(self, input, rois, offset=None):
return deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
class DeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale,
sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
self.offset_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
def forward(self, input, rois):
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
rois_num = rois.size(0)
offset = self.offset_fc(x.view(rois_num, -1))
offset = offset.view(rois_num, 2, self.output_size[0],
self.output_size[1])
return deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
class ModulatedDeformRoIPoolPack(DeformRoIPool):
def __init__(self,
output_size,
output_channels,
deform_fc_channels=1024,
spatial_scale=1.0,
sampling_ratio=0,
gamma=0.1):
super(ModulatedDeformRoIPoolPack,
self).__init__(output_size, spatial_scale, sampling_ratio, gamma)
self.output_channels = output_channels
self.deform_fc_channels = deform_fc_channels
self.offset_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 2))
self.offset_fc[-1].weight.data.zero_()
self.offset_fc[-1].bias.data.zero_()
self.mask_fc = nn.Sequential(
nn.Linear(
self.output_size[0] * self.output_size[1] *
self.output_channels, self.deform_fc_channels),
nn.ReLU(inplace=True),
nn.Linear(self.deform_fc_channels,
self.output_size[0] * self.output_size[1] * 1),
nn.Sigmoid())
self.mask_fc[2].weight.data.zero_()
self.mask_fc[2].bias.data.zero_()
def forward(self, input, rois):
assert input.size(1) == self.output_channels
x = deform_roi_pool(input, rois, None, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
rois_num = rois.size(0)
offset = self.offset_fc(x.view(rois_num, -1))
offset = offset.view(rois_num, 2, self.output_size[0],
self.output_size[1])
mask = self.mask_fc(x.view(rois_num, -1))
mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1])
d = deform_roi_pool(input, rois, offset, self.output_size,
self.spatial_scale, self.sampling_ratio,
self.gamma)
return d * mask
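# A minimal usage sketch for ``DeformRoIPool``, assuming the compiled
# ``_ext`` CUDA extension and CUDA tensors. Each RoI follows the
# (batch_idx, x1, y1, x2, y2) layout asserted in the forward pass; with
# ``offset=None`` no learned deformation is applied.
def _demo_deform_roi_pool():
    import torch
    feat = torch.rand(1, 16, 32, 32).cuda()
    rois = torch.tensor([[0., 4., 4., 20., 20.]]).cuda()
    pool = DeformRoIPool(output_size=7, spatial_scale=1.0)
    out = pool(feat, rois)
    assert out.shape == (1, 16, 7, 7)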