Refactor the baseclass related to transformer (#978)

* minor changes * change to modulist * change to Sequential * replace dropout with attn_drop and proj_drop in MultiheadAttention * add operation_name for attn * add drop path and move all ffn args to ffncfgs * fix typo * fix a bug when use default value of ffn_cfgs * fix ffns * add deprecate warning * fix deprecate warning * change to pop kwargs * support register FFN of transformer * support batch first * fix batch first wapper * fix forward wapper * fix typo * fix lint * add unitest for transformer * fix unitest * fix equal * use allclose * fix comments * fix comments * change configdict to dict * move drop to a file * add comments for drop path * add noqa 501 * move bnc wapper to MultiheadAttention * move bnc wapper to MultiheadAttention * use dep warning * resolve comments * add unitest: * rename residual to identity * revert runner * msda residual to identity * rename inp_identity to identity * fix name * fix transformer * remove key in msda * remove assert for key Co-authored-by: HIT-cwh <2892770585@qq.com> Co-authored-by: bkhuang <congee524@gmail.com> Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>

Refactor the baseclass related to transformer (#978)
* minor changes * change to modulist * change to Sequential * replace dropout with attn_drop and proj_drop in MultiheadAttention * add operation_name for attn * add drop path and move all ffn args to ffncfgs * fix typo * fix a bug when use default value of ffn_cfgs * fix ffns * add deprecate warning * fix deprecate warning * change to pop kwargs * support register FFN of transformer * support batch first * fix batch first wapper * fix forward wapper * fix typo * fix lint * add unitest for transformer * fix unitest * fix equal * use allclose * fix comments * fix comments * change configdict to dict * move drop to a file * add comments for drop path * add noqa 501 * move bnc wapper to MultiheadAttention * move bnc wapper to MultiheadAttention * use dep warning * resolve comments * add unitest: * rename residual to identity * revert runner * msda residual to identity * rename inp_identity to identity * fix name * fix transformer * remove key in msda * remove assert for key Co-authored-by: HIT-cwh <2892770585@qq.com> Co-authored-by: bkhuang <congee524@gmail.com> Co-authored-by: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com>
e05fb560 · Shilong Zhang · GitHub · 11629d52 · e05fb560 · e05fb560
Unverified Commit e05fb560 authored Jun 11, 2021 by Shilong Zhang Committed by GitHub Jun 11, 2021
8 changed files
--- a/mmcv/cnn/bricks/__init__.py
+++ b/mmcv/cnn/bricks/__init__.py
@@ -5,6 +5,7 @@ from .conv2d_adaptive_padding import Conv2dAdaptivePadding
 from .conv_module import ConvModule
 from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
 from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
+from .drop import Dropout, DropPath
 from .generalized_attention import GeneralizedAttention
 from .hsigmoid import HSigmoid
 from .hswish import HSwish
@@ -29,5 +30,5 @@ __all__ = [
    'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d',
    'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear',
    'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d',
-    'ConvTranspose3d', 'MaxPool3d', 'Conv3d'
+    'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath'
 ]
--- a/mmcv/cnn/bricks/drop.py
+++ b/mmcv/cnn/bricks/drop.py
+import torch
+import torch.nn as nn
+from mmcv import build_from_cfg
+from .registry import DROPOUT_LAYERS
+def drop_path(x, drop_prob=0., training=False):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of
+    residual blocks).
+    Every sample along dim 0 is either zeroed as a whole or kept and rescaled
+    by ``1 / (1 - drop_prob)``.
+    We follow the implementation
+    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501
+    Args:
+        x (Tensor): Input tensor; dim 0 indexes the samples.
+        drop_prob (float): Probability of a sample being zeroed. Default: 0.
+        training (bool): Paths are only dropped when True. Default: False.
+    Returns:
+        Tensor: ``x`` itself when ``drop_prob`` is 0 or ``training`` is
+            False, otherwise a new tensor with the same shape as ``x``.
+    """
+    if drop_prob == 0. or not training:
+        return x
+    keep_prob = 1 - drop_prob
+    if keep_prob <= 0.:
+        # Every path is dropped. Dividing by a non-positive ``keep_prob``
+        # would fill the output with NaN/inf instead of zeros.
+        return torch.zeros_like(x)
+    # handle tensors with different dimensions, not just 4D tensors.
+    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+    # ``keep_prob + U[0, 1)`` floors to 1 with probability ``keep_prob``
+    # and to 0 otherwise, giving one binary mask value per sample.
+    random_tensor = keep_prob + torch.rand(
+        shape, dtype=x.dtype, device=x.device)
+    output = x.div(keep_prob) * random_tensor.floor()
+    return output
+@DROPOUT_LAYERS.register_module()
+class DropPath(nn.Module):
+    """Stochastic Depth layer: module wrapper around ``drop_path`` that drops
+    paths per sample (when applied in main path of residual blocks).
+    We follow the implementation
+    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501
+    Args:
+        drop_prob (float): Probability of the path to be zeroed. Default: 0.1
+    """
+    def __init__(self, drop_prob=0.1):
+        super().__init__()
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        # the module's own ``training`` flag is forwarded to ``drop_path``
+        return drop_path(x, drop_prob=self.drop_prob, training=self.training)
+@DROPOUT_LAYERS.register_module()
+class Dropout(nn.Dropout):
+    """Thin subclass of ``torch.nn.Dropout`` whose constructor takes
+    ``drop_prob`` instead of ``p``, so that it shares its argument name
+    with ``DropPath``.
+    Args:
+        drop_prob (float): Probability of the elements to be
+            zeroed. Default: 0.5.
+        inplace (bool):  Do the operation inplace or not. Default: False.
+    """
+    def __init__(self, drop_prob=0.5, inplace=False):
+        # ``drop_prob`` is handed to the parent as its ``p`` argument
+        super(Dropout, self).__init__(drop_prob, inplace)
+def build_dropout(cfg, default_args=None):
+    """Build a drop out layer from ``cfg`` using the ``DROPOUT_LAYERS``
+    registry.
+    Args:
+        cfg: Config passed to ``build_from_cfg``.
+        default_args: Optional default arguments passed to
+            ``build_from_cfg``. Default: None.
+    Returns:
+        Whatever ``build_from_cfg`` returns for ``cfg``.
+    """
+    layer = build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
+    return layer
--- a/mmcv/cnn/bricks/registry.py
+++ b/mmcv/cnn/bricks/registry.py
@@ -7,7 +7,9 @@ PADDING_LAYERS = Registry('padding layer')
 UPSAMPLE_LAYERS = Registry('upsample layer')
 PLUGIN_LAYERS = Registry('plugin layer')
-POSITIONAL_ENCODING = Registry('Position encoding')
+DROPOUT_LAYERS = Registry('drop out layers')
-ATTENTION = Registry('Attention')
+POSITIONAL_ENCODING = Registry('position encoding')
-TRANSFORMER_LAYER = Registry('TransformerLayer')
+ATTENTION = Registry('attention')
-TRANSFORMER_LAYER_SEQUENCE = Registry('TransformerLayerSequence')
+FEEDFORWARD_NETWORK = Registry('feed-forward Network')
+TRANSFORMER_LAYER = Registry('transformerLayer')
+TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
--- a/mmcv/cnn/bricks/transformer.py
+++ b/mmcv/cnn/bricks/transformer.py
--- a/mmcv/ops/__init__.py
+++ b/mmcv/ops/__init__.py
@@ -21,6 +21,7 @@ from .masked_conv import MaskedConv2d, masked_conv2d
 from .modulated_deform_conv import (ModulatedDeformConv2d,
                                    ModulatedDeformConv2dPack,
                                    modulated_deform_conv2d)
+from .multi_scale_deform_attn import MultiScaleDeformableAttention
 from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms
 from .pixel_group import pixel_group
 from .point_sample import (SimpleRoIAlign, point_sample,
@@ -50,5 +51,5 @@ __all__ = [
    'SAConv2d', 'TINShift', 'tin_shift', 'box_iou_rotated', 'nms_rotated',
    'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
    'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'contour_expand',
-    'BorderAlign', 'border_align'
+    'MultiScaleDeformableAttention', 'BorderAlign', 'border_align'
 ]
--- a/mmcv/ops/multi_scale_deform_attn.py
+++ b/mmcv/ops/multi_scale_deform_attn.py
+import math
+import warnings
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd.function import Function, once_differentiable
+from mmcv import deprecated_api_warning
+from mmcv.cnn import constant_init, xavier_init
+from mmcv.cnn.bricks.registry import ATTENTION
+from mmcv.runner import BaseModule
 from ..utils import ext_loader
 ext_module = ext_loader.load_ext(
@@ -140,3 +148,211 @@ def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes,
              attention_weights).sum(-1).view(bs, num_heads * embed_dims,
                                              num_queries)
    return output.transpose(1, 2).contiguous()
+@ATTENTION.register_module()
+class MultiScaleDeformableAttention(BaseModule):
+    """An attention module used in Deformable-Detr. `Deformable DETR:
+    Deformable Transformers for End-to-End Object Detection.
+      <https://arxiv.org/pdf/2010.04159.pdf>`_.
+    Args:
+        embed_dims (int): The embedding dimension of Attention.
+            Default: 256.
+        num_heads (int): Parallel attention heads. Default: 8.
+        num_levels (int): The number of feature map used in
+            Attention. Default: 4.
+        num_points (int): The number of sampling points for
+            each query in each head. Default: 4.
+        im2col_step (int): The step used in image_to_column.
+            Default: 64.
+        dropout (float): Probability of the Dropout layer applied to the
+            output before it is added to `identity`.
+            Default: 0.1.
+        batch_first (bool): Key, Query and Value are shape of
+            (batch, n, embed_dim)
+            or (n, batch, embed_dim). Default to False.
+        norm_cfg (dict): Config dict for normalization layer. It is only
+            stored on the module here.
+            Default: None.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    Raises:
+        ValueError: If `embed_dims` is not divisible by `num_heads`.
+    """
+    def __init__(self,
+                 embed_dims=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 im2col_step=64,
+                 dropout=0.1,
+                 batch_first=False,
+                 norm_cfg=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+        if embed_dims % num_heads != 0:
+            raise ValueError(f'embed_dims must be divisible by num_heads, '
+                             f'but got {embed_dims} and {num_heads}')
+        dim_per_head = embed_dims // num_heads
+        self.norm_cfg = norm_cfg
+        self.dropout = nn.Dropout(dropout)
+        self.batch_first = batch_first
+        # you'd better set dim_per_head to a power of 2
+        # which is more efficient in the CUDA implementation
+        def _is_power_of_2(n):
+            if (not isinstance(n, int)) or (n < 0):
+                raise ValueError(
+                    'invalid input for _is_power_of_2: {} (type: {})'.format(
+                        n, type(n)))
+            return (n & (n - 1) == 0) and n != 0
+        if not _is_power_of_2(dim_per_head):
+            warnings.warn(
+                "You'd better set embed_dims in "
+                'MultiScaleDeformAttention to make '
+                'the dimension of each attention head a power of 2 '
+                'which is more efficient in our CUDA implementation.')
+        self.im2col_step = im2col_step
+        self.embed_dims = embed_dims
+        self.num_levels = num_levels
+        self.num_heads = num_heads
+        self.num_points = num_points
+        # two values (an offset pair) per head, level and sampling point
+        self.sampling_offsets = nn.Linear(
+            embed_dims, num_heads * num_levels * num_points * 2)
+        # one weight per head, level and sampling point
+        self.attention_weights = nn.Linear(embed_dims,
+                                           num_heads * num_levels * num_points)
+        self.value_proj = nn.Linear(embed_dims, embed_dims)
+        self.output_proj = nn.Linear(embed_dims, embed_dims)
+        self.init_weights()
+    def init_weights(self):
+        """Default initialization for Parameters of Module."""
+        constant_init(self.sampling_offsets, 0.)
+        # one direction per head, evenly spaced on the circle
+        thetas = torch.arange(
+            self.num_heads,
+            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        # rescale each direction so that its largest absolute component is 1
+        grid_init = (grid_init /
+                     grid_init.abs().max(-1, keepdim=True)[0]).view(
+                         self.num_heads, 1, 1,
+                         2).repeat(1, self.num_levels, self.num_points, 1)
+        # the i-th sampling point starts (i + 1) steps along its direction
+        for i in range(self.num_points):
+            grid_init[:, :, i, :] *= i + 1
+        self.sampling_offsets.bias.data = grid_init.view(-1)
+        constant_init(self.attention_weights, val=0., bias=0.)
+        xavier_init(self.value_proj, distribution='uniform', bias=0.)
+        xavier_init(self.output_proj, distribution='uniform', bias=0.)
+        self._is_init = True
+    @deprecated_api_warning({'residual': 'identity'},
+                            cls_name='MultiScaleDeformableAttention')
+    def forward(self,
+                query,
+                key=None,
+                value=None,
+                identity=None,
+                query_pos=None,
+                key_padding_mask=None,
+                reference_points=None,
+                spatial_shapes=None,
+                level_start_index=None,
+                **kwargs):
+        """Forward Function of MultiScaleDeformAttention.
+        Args:
+            query (Tensor): Query of Transformer with shape
+                (num_query, bs, embed_dims).
+            key (Tensor): The key tensor with shape
+                `(num_key, bs, embed_dims)`. It is not read by this
+                method.
+            value (Tensor): The value tensor with shape
+                `(num_key, bs, embed_dims)`. Default None. If None,
+                `query` will be used.
+            identity (Tensor): The tensor used for addition, with the
+                same shape as `query`. Default None. If None,
+                `query` will be used.
+            query_pos (Tensor): The positional encoding for `query`.
+                Default: None.
+            key_padding_mask (Tensor): Mask with shape [bs, num_key].
+                Masked positions are set to 0 in the projected `value`.
+            reference_points (Tensor):  The normalized reference
+                points with shape (bs, num_query, num_levels, 2),
+                all elements is range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area.
+                or (N, Length_{query}, num_levels, 4), add
+                additional two dimensions is (w, h) to
+                form reference boxes.
+            spatial_shapes (Tensor): Spatial shape of features in
+                different levels. With shape (num_levels, 2),
+                last dimension represents (h, w).
+            level_start_index (Tensor): The start index of each level.
+                A tensor has shape ``(num_levels, )`` and can be represented
+                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
+            **kwargs: Extra keyword arguments; they are not read by this
+                method.
+        Returns:
+             Tensor: forwarded results with shape [num_query, bs, embed_dims].
+        Raises:
+            ValueError: If the last dim of `reference_points` is neither
+                2 nor 4.
+        """
+        if value is None:
+            value = query
+        if identity is None:
+            # taken before ``query_pos`` is added, so the skip connection
+            # carries the un-encoded query
+            identity = query
+        if query_pos is not None:
+            query = query + query_pos
+        if not self.batch_first:
+            # change to (bs, num_query ,embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], 0.0)
+        value = value.view(bs, num_value, self.num_heads, -1)
+        sampling_offsets = self.sampling_offsets(query).view(
+            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points)
+        # normalize jointly over all levels and points of a head
+        attention_weights = attention_weights.softmax(-1)
+        attention_weights = attention_weights.view(bs, num_query,
+                                                   self.num_heads,
+                                                   self.num_levels,
+                                                   self.num_points)
+        if reference_points.shape[-1] == 2:
+            # (w, h) of every level, used to normalize the offsets
+            offset_normalizer = torch.stack(
+                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = reference_points[:, :, None, :, None, :] \
+                + sampling_offsets \
+                / offset_normalizer[None, None, None, :, None, :]
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = reference_points[:, :, None, :, None, :2] \
+                + sampling_offsets / self.num_points \
+                * reference_points[:, :, None, :, None, 2:] \
+                * 0.5
+        else:
+            raise ValueError(
+                f'Last dim of reference_points must be'
+                f' 2 or 4, but get {reference_points.shape[-1]} instead.')
+        # Only dispatch to the CUDA kernel when the data actually lives on a
+        # CUDA device; CPU tensors on a CUDA-capable host take the fallback.
+        if torch.cuda.is_available() and value.is_cuda:
+            output = MultiScaleDeformableAttnFunction.apply(
+                value, spatial_shapes, level_start_index, sampling_locations,
+                attention_weights, self.im2col_step)
+        else:
+            # NOTE(review): confirm that multi_scale_deformable_attn_pytorch
+            # accepts level_start_index and im2col_step; its full signature
+            # is not visible from this block.
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, level_start_index, sampling_locations,
+                attention_weights, self.im2col_step)
+        output = self.output_proj(output)
+        if not self.batch_first:
+            # (num_query, bs ,embed_dims)
+            output = output.permute(1, 0, 2)
+        return self.dropout(output) + identity
--- a/tests/test_cnn/test_transformer.py
+++ b/tests/test_cnn/test_transformer.py
+import pytest
+import torch
+from mmcv.cnn.bricks.drop import DropPath
+from mmcv.cnn.bricks.transformer import (FFN, BaseTransformerLayer,
+                                         MultiheadAttention,
+                                         TransformerLayerSequence)
+def test_multiheadattention():
+    """Check that batch-first and query-first ``MultiheadAttention`` agree,
+    and that the deprecated ``residual`` keyword behaves like ``identity``."""
+    batch_dim = 2
+    embed_dim = 5
+    num_query = 100
+    # smoke test: building with a plain ``Dropout`` layer must not fail
+    MultiheadAttention(
+        embed_dims=embed_dim,
+        num_heads=5,
+        attn_drop=0,
+        proj_drop=0,
+        dropout_layer=dict(type='Dropout', drop_prob=0.),
+        batch_first=True)
+    attn_batch_first = MultiheadAttention(
+        embed_dims=embed_dim,
+        num_heads=5,
+        attn_drop=0,
+        proj_drop=0,
+        dropout_layer=dict(type='DropPath', drop_prob=0.),
+        batch_first=True)
+    attn_query_first = MultiheadAttention(
+        embed_dims=embed_dim,
+        num_heads=5,
+        attn_drop=0,
+        proj_drop=0,
+        dropout_layer=dict(type='DropPath', drop_prob=0.),
+        batch_first=False)
+    # give both modules identical weights so that their outputs can only
+    # differ through the layout handling
+    param_dict = dict(attn_query_first.named_parameters())
+    for n, v in attn_batch_first.named_parameters():
+        param_dict[n].data = v.data
+    input_batch_first = torch.rand(batch_dim, num_query, embed_dim)
+    input_query_first = input_batch_first.transpose(0, 1)
+    assert torch.allclose(
+        attn_query_first(input_query_first).sum(),
+        attn_batch_first(input_batch_first).sum())
+    key_batch_first = torch.rand(batch_dim, num_query, embed_dim)
+    key_query_first = key_batch_first.transpose(0, 1)
+    assert torch.allclose(
+        attn_query_first(input_query_first, key_query_first).sum(),
+        attn_batch_first(input_batch_first, key_batch_first).sum())
+    identity = torch.ones_like(input_query_first)
+    # check deprecated arguments can be used normally
+    assert torch.allclose(
+        attn_query_first(
+            input_query_first, key_query_first, residual=identity).sum(),
+        attn_batch_first(input_batch_first, key_batch_first).sum() +
+        identity.sum() - input_batch_first.sum())
+    assert torch.allclose(
+        attn_query_first(
+            input_query_first, key_query_first, identity=identity).sum(),
+        attn_batch_first(input_batch_first, key_batch_first).sum() +
+        identity.sum() - input_batch_first.sum())
+def test_ffn():
+    """Check ``FFN`` argument validation, layout independence of the summed
+    output, and that ``residual`` (deprecated) and ``identity`` replace the
+    input in the skip connection."""
+    with pytest.raises(AssertionError):
+        # num_fcs should be no less than 2
+        FFN(num_fcs=1)
+    # smoke test: the deprecated ``add_residual`` keyword is still accepted
+    FFN(dropout=0, add_residual=True)
+    ffn = FFN(dropout=0, add_identity=True)
+    input_tensor = torch.rand(2, 20, 256)
+    input_tensor_nbc = input_tensor.transpose(0, 1)
+    assert torch.allclose(ffn(input_tensor).sum(), ffn(input_tensor_nbc).sum())
+    residual = torch.rand_like(input_tensor)
+    # the comparisons below must be asserted, otherwise their results are
+    # silently discarded and nothing is tested
+    assert torch.allclose(
+        ffn(input_tensor, residual=residual).sum(),
+        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())
+    assert torch.allclose(
+        ffn(input_tensor, identity=residual).sum(),
+        ffn(input_tensor).sum() + residual.sum() - input_tensor.sum())
+def test_basetransformerlayer():
+    """Build ``BaseTransformerLayer`` through the deprecated FFN keywords,
+    with and without ``batch_first``, and run one forward pass."""
+    ffn_channels = 2048
+    order = ('self_attn', 'norm', 'ffn', 'norm')
+    # attn_cfgs is passed as a 1-tuple wrapping the config dict.
+    # NOTE(review): confirm that a bare dict was not intended here.
+    # test deprecated_args
+    layer = BaseTransformerLayer(
+        attn_cfgs=(dict(
+            type='MultiheadAttention', embed_dims=256, num_heads=8), ),
+        feedforward_channels=ffn_channels,
+        ffn_dropout=0.1,
+        operation_order=order)
+    assert layer.batch_first is False
+    assert layer.ffns[0].feedforward_channels == ffn_channels
+    layer = BaseTransformerLayer(
+        attn_cfgs=(dict(
+            type='MultiheadAttention', num_heads=8, embed_dims=256), ),
+        feedforward_channels=ffn_channels,
+        ffn_dropout=0.1,
+        operation_order=order,
+        batch_first=True)
+    assert layer.attentions[0].batch_first
+    layer(torch.rand(2, 10, 256))
+def test_transformerlayersequence():
+    """Build a ``TransformerLayerSequence`` from a single layer config and
+    check that a config list of the wrong length is rejected."""
+    order = ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
+    layer_cfg = dict(
+        type='BaseTransformerLayer',
+        attn_cfgs=[
+            dict(
+                type='MultiheadAttention',
+                embed_dims=256,
+                num_heads=8,
+                dropout=0.1),
+            dict(type='MultiheadAttention', embed_dims=256, num_heads=4)
+        ],
+        feedforward_channels=1024,
+        ffn_dropout=0.1,
+        operation_order=order)
+    sequence = TransformerLayerSequence(
+        num_layers=6, transformerlayers=layer_cfg)
+    assert len(sequence.layers) == 6
+    assert sequence.pre_norm is False
+    # a separate, freshly built config: one entry only, although six layers
+    # are requested
+    single_layer_list = [
+        dict(
+            type='BaseTransformerLayer',
+            attn_cfgs=[
+                dict(
+                    type='MultiheadAttention',
+                    embed_dims=256,
+                    num_heads=8,
+                    dropout=0.1),
+                dict(type='MultiheadAttention', embed_dims=256)
+            ],
+            feedforward_channels=1024,
+            ffn_dropout=0.1,
+            operation_order=order)
+    ]
+    with pytest.raises(AssertionError):
+        # if transformerlayers is a list, len(transformerlayers)
+        # should be equal to num_layers
+        TransformerLayerSequence(
+            num_layers=6, transformerlayers=single_layer_list)
+def test_drop_path():
+    """``DropPath`` hands back the very same tensor object when nothing can
+    be dropped, and a different object once dropping is active."""
+    # zero drop probability: the input object itself is returned
+    layer = DropPath(drop_prob=0)
+    inputs = torch.rand(2, 3, 4, 5)
+    assert layer(inputs) is inputs
+    # non-zero probability, but not in training mode
+    layer = DropPath(drop_prob=0.1)
+    layer.eval()
+    inputs = torch.rand(2, 3, 4, 5)
+    assert layer(inputs) is inputs
+    # training mode with non-zero probability
+    layer.train()
+    assert layer(inputs) is not inputs
--- a/tests/test_ops/test_ms_deformable_attn.py
+++ b/tests/test_ops/test_ms_deformable_attn.py
@@ -2,7 +2,8 @@ import pytest
 import torch
 from mmcv.ops.multi_scale_deform_attn import (
-    MultiScaleDeformableAttnFunction, multi_scale_deformable_attn_pytorch)
+    MultiScaleDeformableAttention, MultiScaleDeformableAttnFunction,
+    multi_scale_deformable_attn_pytorch)
 _USING_PARROTS = True
 try:
@@ -98,7 +99,14 @@ def test_forward_equal_with_pytorch_float():
 @pytest.mark.skipif(
    not torch.cuda.is_available(), reason='requires CUDA support')
-@pytest.mark.parametrize('channels', [4, 30, 32, 64, 71, 1025, 2048, 3096])
+@pytest.mark.parametrize('channels', [
+    4,
+    30,
+    32,
+    64,
+    71,
+    1025,
+])
 def test_gradient_numerical(channels,
                            grad_value=True,
                            grad_sampling_loc=True,
@@ -134,3 +142,20 @@ def test_gradient_numerical(channels,
        assert gradcheck(func, (value.double(), shapes, level_start_index,
                                sampling_locations.double(),
                                attention_weights.double(), im2col_step))
+def test_multiscale_deformable_attention():
+    """Check the constructor validation of
+    ``MultiScaleDeformableAttention``."""
+    with pytest.raises(ValueError):
+        # embed_dims must be divisible by num_heads,
+        MultiScaleDeformableAttention(
+            embed_dims=256,
+            num_heads=7,
+        )
+    with pytest.warns(UserWarning):
+        # 24 / 8 = 3 channels per head, which is not a power of 2, so the
+        # constructor warns about the less efficient CUDA path
+        MultiScaleDeformableAttention(
+            embed_dims=24,
+            num_heads=8,
+        )
+    MultiScaleDeformableAttention(embed_dims=256, num_heads=8)