Commit 41b18fd8 authored by Zhe Chen
Browse files

Use pre-commit to reformat code

parent ff20ea39
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
TRANSFORMER_LAYER,
TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import build_attention
import math
from mmcv.runner import force_fp32, auto_fp16
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
MultiScaleDeformableAttnFunction_fp16
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Aggregates multi-camera image features into BEV queries. Each camera
    only interacts with the BEV queries whose reference points project
    into its view, which greatly reduces GPU memory.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras. Default: 6.
        pc_range (list | None): Point-cloud range; stored for
            configuration compatibility, not used directly here.
        dropout (float): A Dropout layer on `inp_residual`.
            Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Whether inputs are (bs, n, embed_dims).
            Default: False.
        deformable_attention (dict): The config for the deformable
            attention used in SCA.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 deformable_attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs
                 ):
        super(SpatialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.deformable_attention = build_attention(deformable_attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        # Output projection lives here rather than in the inner
        # deformable attention (whose `output_proj` is None).
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos',
                          'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Forward Function of SpatialCrossAttention.

        Args:
            query (Tensor): BEV queries with shape
                (bs, num_query, embed_dims).
            key (Tensor): Flattened multi-camera image features with
                shape (num_cams, num_key, bs, embed_dims).
            value (Tensor): Same layout as `key`.
            residual (Tensor): The tensor used for the residual
                connection, same shape as `query`. Default None; if
                None, `query` itself is used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Accepted for interface
                compatibility; unused here.
            reference_points (Tensor): Accepted for interface
                compatibility; unused here.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2), last
                dimension represents (h, w).
            reference_points_cam (Tensor): Per-camera projected
                reference points with shape
                (num_cams, bs, num_query, D, 2), where D is the number
                of Z anchors.
            bev_mask (Tensor): Validity mask with shape
                (num_cams, bs, num_query, D): nonzero where a query's
                reference point projects inside a camera view.
            level_start_index (Tensor): The start index of each level.
                A tensor has shape (num_levels,) and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            flag (str): Tag forwarded by the caller; unused here.

        Returns:
            Tensor: forwarded results with shape
                (bs, num_query, embed_dims).
        """
        if key is None:
            key = query
        if value is None:
            value = key

        # BUGFIX: previously `inp_residual` and `slots` were assigned only
        # when `residual is None`, so passing an explicit residual raised
        # a NameError. Use the provided residual when given.
        inp_residual = query if residual is None else residual
        slots = torch.zeros_like(query)

        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()
        D = reference_points_cam.size(3)
        indexes = []
        # NOTE(review): the hit-indexes are computed from the FIRST sample
        # of the batch (mask_per_img[0]); this assumes bev_mask is
        # identical across the batch — confirm with caller.
        for i, mask_per_img in enumerate(bev_mask):
            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            indexes.append(index_query_per_img)
        max_len = max([len(each) for each in indexes])

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                queries_rebatch[j, i, :len(index_query_per_img)] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :len(index_query_per_img)] = \
                    reference_points_per_img[j, index_query_per_img]

        num_cams, l, bs, embed_dims = key.shape

        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)

        queries = self.deformable_attention(
            query=queries_rebatch.view(
                bs * self.num_cams, max_len, self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index).view(
                bs, self.num_cams, max_len, self.embed_dims)

        # Scatter the per-camera results back to the full query set;
        # queries hit by several cameras accumulate and are averaged below.
        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += \
                    queries[j, i, :len(index_query_per_img)]

        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual
@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Unused in this variant; kept for interface
            compatibility. Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        # No output projection here: the enclosing SpatialCrossAttention
        # owns it.
        self.output_proj = None
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # Initialize the offset bias so the heads start by sampling on a
        # ring of evenly-spaced directions, scaled per sampling point.
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1
        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        if self.output_proj is not None:
            # `output_proj` is None in this class (see __init__); guard
            # instead of relying on xavier_init silently ignoring None.
            xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(bs, num_key, embed_dims)`. Unused; kept for interface
                compatibility.
            value (Tensor): The value tensor with shape
                `(bs, num_key, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None, `query`
                will be used. NOTE: this variant returns the attention
                output WITHOUT adding `identity`; the caller applies the
                residual connection.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `value`, with
                shape [bs, num_key].
            reference_points (Tensor): The normalized reference points
                with shape (bs, num_query, num_Z_anchors, 2), all
                elements in range [0, 1], top-left (0,0), bottom-right
                (1, 1), including padding area.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2), last
                dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be
                represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape
                (bs, num_query, embed_dims) if `batch_first`, else
                (num_query, bs, embed_dims).
        """
        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)
        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)

        if reference_points.shape[-1] == 2:
            # Each BEV query owns `num_Z_anchors` 3D anchors at different
            # heights; after projection it has `num_Z_anchors` reference
            # points per image. For each reference point we sample
            # `num_points` points, i.e. `num_points * num_Z_anchors` in
            # total per query.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

            bs, num_query, num_Z_anchors, xy = reference_points.shape
            reference_points = reference_points[:, :, None, None, None, :, :]
            sampling_offsets = sampling_offsets / \
                offset_normalizer[None, None, None, :, None, :]
            bs, num_query, num_heads, num_levels, num_all_points, xy = \
                sampling_offsets.shape
            sampling_offsets = sampling_offsets.view(
                bs, num_query, num_heads, num_levels,
                num_all_points // num_Z_anchors, num_Z_anchors, xy)
            sampling_locations = reference_points + sampling_offsets
            bs, num_query, num_heads, num_levels, num_points, \
                num_Z_anchors, xy = sampling_locations.shape
            assert num_all_points == num_points * num_Z_anchors

            sampling_locations = sampling_locations.view(
                bs, num_query, num_heads, num_levels, num_all_points, xy)
        elif reference_points.shape[-1] == 4:
            # BUGFIX: was `assert False`, which is stripped under `-O`
            # and would then fall through with `sampling_locations`
            # undefined. Raise explicitly instead.
            raise NotImplementedError(
                '4-dim reference_points are not supported in '
                'MSDeformableAttention3D.')
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        # sampling_locations: (bs, num_query, num_heads, num_levels,
        #                      num_all_points, 2)
        # attention_weights:  (bs, num_query, num_heads, num_levels,
        #                      num_all_points)
        if torch.cuda.is_available() and value.is_cuda:
            # The fp32 autograd function is used for every dtype: both
            # branches of the original dtype check selected the same
            # fp32 implementation (kept for numerical stability).
            output = MultiScaleDeformableAttnFunction_fp32.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import math
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import constant_init, xavier_init
from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.cnn.bricks.transformer import build_attention
from mmcv.ops.multi_scale_deform_attn import \
multi_scale_deformable_attn_pytorch
from mmcv.runner import force_fp32
from mmcv.runner.base_module import BaseModule
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import \
MultiScaleDeformableAttnFunction_fp32
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
    """An attention module used in BEVFormer.

    Aggregates multi-camera image features into BEV queries. Each camera
    only interacts with the BEV queries whose reference points project
    into its view, which greatly reduces GPU memory.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_cams (int): The number of cameras. Default: 6.
        pc_range (list | None): Point-cloud range; stored for
            configuration compatibility, not used directly here.
        dropout (float): A Dropout layer on `inp_residual`.
            Default: 0.1.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Whether inputs are (bs, n, embed_dims).
            Default: False.
        deformable_attention (dict): The config for the deformable
            attention used in SCA.
    """

    def __init__(self,
                 embed_dims=256,
                 num_cams=6,
                 pc_range=None,
                 dropout=0.1,
                 init_cfg=None,
                 batch_first=False,
                 deformable_attention=dict(
                     type='MSDeformableAttention3D',
                     embed_dims=256,
                     num_levels=4),
                 **kwargs
                 ):
        super(SpatialCrossAttention, self).__init__(init_cfg)

        self.init_cfg = init_cfg
        self.dropout = nn.Dropout(dropout)
        self.pc_range = pc_range
        self.fp16_enabled = False
        self.deformable_attention = build_attention(deformable_attention)
        self.embed_dims = embed_dims
        self.num_cams = num_cams
        # Output projection lives here rather than in the inner
        # deformable attention (whose `output_proj` is None).
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.batch_first = batch_first
        self.init_weight()

    def init_weight(self):
        """Default initialization for Parameters of Module."""
        xavier_init(self.output_proj, distribution='uniform', bias=0.)

    @force_fp32(apply_to=('query', 'key', 'value', 'query_pos',
                          'reference_points_cam'))
    def forward(self,
                query,
                key,
                value,
                residual=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                reference_points_cam=None,
                bev_mask=None,
                level_start_index=None,
                flag='encoder',
                **kwargs):
        """Forward Function of SpatialCrossAttention.

        Args:
            query (Tensor): BEV queries with shape
                (bs, num_query, embed_dims).
            key (Tensor): Flattened multi-camera image features with
                shape (num_cams, num_key, bs, embed_dims).
            value (Tensor): Same layout as `key`.
            residual (Tensor): The tensor used for the residual
                connection, same shape as `query`. Default None; if
                None, `query` itself is used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): Accepted for interface
                compatibility; unused here.
            reference_points (Tensor): Accepted for interface
                compatibility; unused here.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2), last
                dimension represents (h, w).
            reference_points_cam (Tensor): Per-camera projected
                reference points with shape
                (num_cams, bs, num_query, D, 2), where D is the number
                of Z anchors.
            bev_mask (Tensor): Validity mask with shape
                (num_cams, bs, num_query, D): nonzero where a query's
                reference point projects inside a camera view.
            level_start_index (Tensor): The start index of each level.
                A tensor has shape (num_levels,) and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            flag (str): Tag forwarded by the caller; unused here.

        Returns:
            Tensor: forwarded results with shape
                (bs, num_query, embed_dims).
        """
        if key is None:
            key = query
        if value is None:
            value = key

        # BUGFIX: previously `inp_residual` and `slots` were assigned only
        # when `residual is None`, so passing an explicit residual raised
        # a NameError. Use the provided residual when given.
        inp_residual = query if residual is None else residual
        slots = torch.zeros_like(query)

        if query_pos is not None:
            query = query + query_pos

        bs, num_query, _ = query.size()
        D = reference_points_cam.size(3)
        indexes = []
        # NOTE(review): the hit-indexes are computed from the FIRST sample
        # of the batch (mask_per_img[0]); this assumes bev_mask is
        # identical across the batch — confirm with caller.
        for i, mask_per_img in enumerate(bev_mask):
            index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
            indexes.append(index_query_per_img)
        max_len = max([len(each) for each in indexes])

        # each camera only interacts with its corresponding BEV queries.
        # This step can greatly save GPU memory.
        queries_rebatch = query.new_zeros(
            [bs, self.num_cams, max_len, self.embed_dims])
        reference_points_rebatch = reference_points_cam.new_zeros(
            [bs, self.num_cams, max_len, D, 2])

        for j in range(bs):
            for i, reference_points_per_img in enumerate(reference_points_cam):
                index_query_per_img = indexes[i]
                queries_rebatch[j, i, :len(index_query_per_img)] = \
                    query[j, index_query_per_img]
                reference_points_rebatch[j, i, :len(index_query_per_img)] = \
                    reference_points_per_img[j, index_query_per_img]

        num_cams, l, bs, embed_dims = key.shape

        key = key.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)
        value = value.permute(2, 0, 1, 3).reshape(
            bs * self.num_cams, l, self.embed_dims)

        queries = self.deformable_attention(
            query=queries_rebatch.view(
                bs * self.num_cams, max_len, self.embed_dims),
            key=key,
            value=value,
            reference_points=reference_points_rebatch.view(
                bs * self.num_cams, max_len, D, 2),
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index).view(
                bs, self.num_cams, max_len, self.embed_dims)

        # Scatter the per-camera results back to the full query set;
        # queries hit by several cameras accumulate and are averaged below.
        for j in range(bs):
            for i, index_query_per_img in enumerate(indexes):
                slots[j, index_query_per_img] += \
                    queries[j, i, :len(index_query_per_img)]

        count = bev_mask.sum(-1) > 0
        count = count.permute(1, 2, 0).sum(-1)
        count = torch.clamp(count, min=1.0)
        slots = slots / count[..., None]
        slots = self.output_proj(slots)

        return self.dropout(slots) + inp_residual
@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): Unused in this variant; kept for interface
            compatibility. Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=8,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.batch_first = batch_first
        # No output projection here: the enclosing SpatialCrossAttention
        # owns it.
        self.output_proj = None
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.sampling_offsets = nn.Linear(
            embed_dims, num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims, num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # Initialize the offset bias so the heads start by sampling on a
        # ring of evenly-spaced directions, scaled per sampling point.
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
                         self.num_heads, 1, 1,
                         2).repeat(1, self.num_levels, self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1
        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        if self.output_proj is not None:
            # `output_proj` is None in this class (see __init__); guard
            # instead of relying on xavier_init silently ignoring None.
            xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(bs, num_key, embed_dims)`. Unused; kept for interface
                compatibility.
            value (Tensor): The value tensor with shape
                `(bs, num_key, embed_dims)`.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None, `query`
                will be used. NOTE: this variant returns the attention
                output WITHOUT adding `identity`; the caller applies the
                residual connection.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `value`, with
                shape [bs, num_key].
            reference_points (Tensor): The normalized reference points
                with shape (bs, num_query, num_Z_anchors, 2), all
                elements in range [0, 1], top-left (0,0), bottom-right
                (1, 1), including padding area.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2), last
                dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be
                represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].

        Returns:
            Tensor: forwarded results with shape
                (bs, num_query, embed_dims) if `batch_first`, else
                (num_query, bs, embed_dims).
        """
        if value is None:
            value = query
        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)

        bs, num_query, _ = query.shape
        bs, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value

        value = self.value_proj(value)
        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.view(bs, num_value, self.num_heads, -1)
        sampling_offsets = self.sampling_offsets(query).view(
            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)
        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_levels,
                                                   self.num_points)

        if reference_points.shape[-1] == 2:
            # Each BEV query owns `num_Z_anchors` 3D anchors at different
            # heights; after projection it has `num_Z_anchors` reference
            # points per image. For each reference point we sample
            # `num_points` points, i.e. `num_points * num_Z_anchors` in
            # total per query.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)

            bs, num_query, num_Z_anchors, xy = reference_points.shape
            reference_points = reference_points[:, :, None, None, None, :, :]
            sampling_offsets = sampling_offsets / \
                offset_normalizer[None, None, None, :, None, :]
            bs, num_query, num_heads, num_levels, num_all_points, xy = \
                sampling_offsets.shape
            sampling_offsets = sampling_offsets.view(
                bs, num_query, num_heads, num_levels,
                num_all_points // num_Z_anchors, num_Z_anchors, xy)
            sampling_locations = reference_points + sampling_offsets
            bs, num_query, num_heads, num_levels, num_points, \
                num_Z_anchors, xy = sampling_locations.shape
            assert num_all_points == num_points * num_Z_anchors

            sampling_locations = sampling_locations.view(
                bs, num_query, num_heads, num_levels, num_all_points, xy)
        elif reference_points.shape[-1] == 4:
            # BUGFIX: was `assert False`, which is stripped under `-O`
            # and would then fall through with `sampling_locations`
            # undefined. Raise explicitly instead.
            raise NotImplementedError(
                '4-dim reference_points are not supported in '
                'MSDeformableAttention3D.')
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        # sampling_locations: (bs, num_query, num_heads, num_levels,
        #                      num_all_points, 2)
        # attention_weights:  (bs, num_query, num_heads, num_levels,
        #                      num_all_points)
        if torch.cuda.is_available() and value.is_cuda:
            # The fp32 autograd function is used for every dtype: both
            # branches of the original dtype check selected the same
            # fp32 implementation (kept for numerical stability).
            output = MultiScaleDeformableAttnFunction_fp32.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)
        if not self.batch_first:
            output = output.permute(1, 0, 2)

        return output
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import ATTENTION
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
to_2tuple)
from mmcv.utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
"""An attention module used in BEVFormer based on Deformable-Detr.
`Deformable DETR: Deformable Transformers for End-to-End Object Detection.
<https://arxiv.org/pdf/2010.04159.pdf>`_.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
num_heads (int): Parallel attention heads. Default: 64.
num_levels (int): The number of feature map used in
Attention. Default: 4.
num_points (int): The number of sampling points for
each query in each head. Default: 4.
im2col_step (int): The step used in image_to_column.
Default: 64.
dropout (float): A Dropout layer on `inp_identity`.
Default: 0.1.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default to True.
norm_cfg (dict): Config dict for normalization layer.
Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV.
the length of BEV queue is 2.
"""
def __init__(self,
embed_dims=256,
num_heads=8,
num_levels=4,
num_points=4,
num_bev_queue=2,
im2col_step=64,
dropout=0.1,
batch_first=True,
norm_cfg=None,
init_cfg=None):
super().__init__(init_cfg)
if embed_dims % num_heads != 0:
raise ValueError(f'embed_dims must be divisible by num_heads, '
f'but got {embed_dims} and {num_heads}')
dim_per_head = embed_dims // num_heads
self.norm_cfg = norm_cfg
self.dropout = nn.Dropout(dropout)
self.batch_first = batch_first
self.fp16_enabled = False
# you'd better set dim_per_head to a power of 2
# which is more efficient in the CUDA implementation
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
'invalid input for _is_power_of_2: {} (type: {})'.format(
n, type(n)))
return (n & (n - 1) == 0) and n != 0
if not _is_power_of_2(dim_per_head):
warnings.warn(
"You'd better set embed_dims in "
'MultiScaleDeformAttention to make '
'the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.im2col_step = im2col_step
self.embed_dims = embed_dims
self.num_levels = num_levels
self.num_heads = num_heads
self.num_points = num_points
self.num_bev_queue = num_bev_queue
self.sampling_offsets = nn.Linear(
embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2)
self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue,
num_bev_queue*num_heads * num_levels * num_points)
self.value_proj = nn.Linear(embed_dims, embed_dims)
self.output_proj = nn.Linear(embed_dims, embed_dims)
self.init_weights()
def init_weights(self):
"""Default initialization for Parameters of Module."""
constant_init(self.sampling_offsets, 0.)
thetas = torch.arange(
self.num_heads,
dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init /
grid_init.abs().max(-1, keepdim=True)[0]).view(
self.num_heads, 1, 1,
2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1)
for i in range(self.num_points):
grid_init[:, :, i, :] *= i + 1
self.sampling_offsets.bias.data = grid_init.view(-1)
constant_init(self.attention_weights, val=0., bias=0.)
xavier_init(self.value_proj, distribution='uniform', bias=0.)
xavier_init(self.output_proj, distribution='uniform', bias=0.)
self._is_init = True
def forward(self,
query,
key=None,
value=None,
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
flag='decoder',
**kwargs):
"""Forward Function of MultiScaleDeformAttention.
Args:
query (Tensor): Query of Transformer with shape
(num_query, bs, embed_dims).
key (Tensor): The key tensor with shape
`(num_key, bs, embed_dims)`.
value (Tensor): The value tensor with shape
`(num_key, bs, embed_dims)`.
identity (Tensor): The tensor used for addition, with the
same shape as `query`. Default None. If None,
`query` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, num_levels, 2),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different levels. With shape (num_levels, 2),
last dimension represents (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape ``(num_levels, )`` and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
Returns:
Tensor: forwarded results with shape [num_query, bs, embed_dims].
"""
if value is None:
assert self.batch_first
bs, len_bev, c = query.shape
value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c)
# value = torch.cat([query, query], 0)
if identity is None:
identity = query
if query_pos is not None:
query = query + query_pos
if not self.batch_first:
# change to (bs, num_query ,embed_dims)
query = query.permute(1, 0, 2)
value = value.permute(1, 0, 2)
bs, num_query, embed_dims = query.shape
_, num_value, _ = value.shape
assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
assert self.num_bev_queue == 2
query = torch.cat([value[::2], query], -1)
value_ = value.clone()
value_[:bs] = value[::2]
value_[bs:] = value[1::2]
value = self.value_proj(value)
value = self.value_proj(value)
if key_padding_mask is not None:
value = value.masked_fill(key_padding_mask[..., None], 0.0)
value = value.reshape(bs*self.num_bev_queue,
num_value, self.num_heads, -1)
sampling_offsets = self.sampling_offsets(query)
sampling_offsets = sampling_offsets.view(
bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2)
attention_weights = self.attention_weights(query).view(
bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points)
attention_weights = attention_weights.softmax(-1)
attention_weights = attention_weights.view(bs, num_query,
self.num_heads,
self.num_bev_queue,
self.num_levels,
self.num_points)
attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\
.reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous()
sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\
.reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2)
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack(
[spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets \
/ offset_normalizer[None, None, None, :, None, :]
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.num_points \
* reference_points[:, :, None, :, None, 2:] \
* 0.5
else:
raise ValueError(
f'Last dim of reference_points must be'
f' 2 or 4, but get {reference_points.shape[-1]} instead.')
if torch.cuda.is_available() and value.is_cuda:
# using fp16 deformable attention is unstable because it performs many sum operations
if value.dtype == torch.float16:
MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
else:
MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
output = MultiScaleDeformableAttnFunction.apply(
value, spatial_shapes, level_start_index, sampling_locations,
attention_weights, self.im2col_step)
else:
output = multi_scale_deformable_attn_pytorch(
value, spatial_shapes, sampling_locations, attention_weights)
# output shape (bs*num_bev_queue, num_query, embed_dims)
# (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue)
output = output.permute(1, 2, 0)
# fuse history value and current value
# (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue)
output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
output = output.mean(-1)
# (num_query, embed_dims, bs)-> (bs, num_query, embed_dims)
output = output.permute(2, 0, 1)
output = self.output_proj(output)
if not self.batch_first:
output = output.permute(1, 0, 2)
return self.dropout(output) + identity
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import math
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import constant_init, xavier_init
from mmcv.cnn.bricks.registry import ATTENTION
from mmcv.ops.multi_scale_deform_attn import \
multi_scale_deformable_attn_pytorch
from mmcv.runner.base_module import BaseModule
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import \
MultiScaleDeformableAttnFunction_fp32
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
    """An attention module used in BEVFormer based on Deformable-Detr.

    `Deformable DETR: Deformable Transformers for End-to-End Object Detection.
    <https://arxiv.org/pdf/2010.04159.pdf>`_.

    Attends current BEV queries to a short BEV queue (one history frame
    plus the current frame) with multi-scale deformable attention and
    averages the two results.

    Args:
        embed_dims (int): The embedding dimension of Attention.
            Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 4.
        num_bev_queue (int): Length of the BEV queue. This version uses one
            history BEV and one current BEV, so the queue length must be 2.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
        norm_cfg (dict): Config dict for normalization layer.
            Default: None.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
    """

    def __init__(self,
                 embed_dims=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 num_bev_queue=2,
                 im2col_step=64,
                 dropout=0.1,
                 batch_first=True,
                 norm_cfg=None,
                 init_cfg=None):
        super().__init__(init_cfg)
        if embed_dims % num_heads != 0:
            raise ValueError(f'embed_dims must be divisible by num_heads, '
                             f'but got {embed_dims} and {num_heads}')
        dim_per_head = embed_dims // num_heads
        self.norm_cfg = norm_cfg
        self.dropout = nn.Dropout(dropout)
        self.batch_first = batch_first
        self.fp16_enabled = False

        # you'd better set dim_per_head to a power of 2
        # which is more efficient in the CUDA implementation
        def _is_power_of_2(n):
            if (not isinstance(n, int)) or (n < 0):
                raise ValueError(
                    'invalid input for _is_power_of_2: {} (type: {})'.format(
                        n, type(n)))
            return (n & (n - 1) == 0) and n != 0

        if not _is_power_of_2(dim_per_head):
            warnings.warn(
                "You'd better set embed_dims in "
                'MultiScaleDeformAttention to make '
                'the dimension of each attention head a power of 2 '
                'which is more efficient in our CUDA implementation.')

        self.im2col_step = im2col_step
        self.embed_dims = embed_dims
        self.num_levels = num_levels
        self.num_heads = num_heads
        self.num_points = num_points
        self.num_bev_queue = num_bev_queue
        # Offsets/weights are predicted from [history BEV; current query],
        # hence the input width embed_dims * num_bev_queue.
        self.sampling_offsets = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points * 2)
        self.attention_weights = nn.Linear(
            embed_dims * self.num_bev_queue,
            num_bev_queue * num_heads * num_levels * num_points)
        self.value_proj = nn.Linear(embed_dims, embed_dims)
        self.output_proj = nn.Linear(embed_dims, embed_dims)
        self.init_weights()

    def init_weights(self):
        """Default initialization for Parameters of Module."""
        constant_init(self.sampling_offsets, 0.)
        # One angle per head, evenly spread over the circle; the i-th
        # sampling point starts at radius i + 1 (Deformable-DETR scheme).
        thetas = torch.arange(
            self.num_heads,
            dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = (grid_init /
                     grid_init.abs().max(-1, keepdim=True)[0]).view(
            self.num_heads, 1, 1,
            2).repeat(1, self.num_levels * self.num_bev_queue,
                      self.num_points, 1)
        for i in range(self.num_points):
            grid_init[:, :, i, :] *= i + 1

        self.sampling_offsets.bias.data = grid_init.view(-1)
        constant_init(self.attention_weights, val=0., bias=0.)
        xavier_init(self.value_proj, distribution='uniform', bias=0.)
        xavier_init(self.output_proj, distribution='uniform', bias=0.)
        self._is_init = True

    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_padding_mask=None,
                reference_points=None,
                spatial_shapes=None,
                level_start_index=None,
                flag='decoder',
                **kwargs):
        """Forward Function of MultiScaleDeformAttention.

        Args:
            query (Tensor): Query of Transformer with shape
                (num_query, bs, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_key, bs, embed_dims)`.
            value (Tensor): The value tensor. Here it stacks the history
                BEV and the current BEV interleaved along the batch dim,
                i.e. (bs * num_bev_queue, num_key, embed_dims) when
                ``batch_first`` is True.
            identity (Tensor): The tensor used for addition, with the
                same shape as `query`. Default None. If None,
                `query` will be used.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_key].
            reference_points (Tensor): The normalized reference
                points with shape (bs, num_query, num_levels, 2),
                all elements is range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area.
                or (N, Length_{query}, num_levels, 4), add
                additional two dimensions is (w, h) to
                form reference boxes.
            spatial_shapes (Tensor): Spatial shape of features in
                different levels. With shape (num_levels, 2),
                last dimension represents (h, w).
            level_start_index (Tensor): The start index of each level.
                A tensor has shape ``(num_levels, )`` and can be represented
                as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
            flag (str): Unused tag kept for interface compatibility.

        Returns:
            Tensor: forwarded results with shape [num_query, bs, embed_dims].
        """
        if value is None:
            # No history BEV supplied: degenerate to self-attention by using
            # the current BEV for both queue slots (interleaved along batch).
            assert self.batch_first
            bs, len_bev, c = query.shape
            value = torch.stack([query, query], 1).reshape(bs * 2, len_bev, c)

        if identity is None:
            identity = query
        if query_pos is not None:
            query = query + query_pos
        if not self.batch_first:
            # change to (bs, num_query, embed_dims)
            query = query.permute(1, 0, 2)
            value = value.permute(1, 0, 2)
        bs, num_query, embed_dims = query.shape
        _, num_value, _ = value.shape
        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
        assert self.num_bev_queue == 2

        # Condition offsets/weights on the history BEV: concatenate the
        # even-indexed (history) batch entries onto the current query.
        query = torch.cat([value[::2], query], -1)
        # BUGFIX: `value = self.value_proj(value)` was duplicated, applying
        # the linear projection twice in a row; project exactly once.
        # NOTE(review): a reordered copy `value_` (history entries first,
        # current entries second) was also built but never consumed; removed
        # as dead code. If the projection was meant to run on the reordered
        # tensor, restore that deliberately.
        value = self.value_proj(value)

        if key_padding_mask is not None:
            value = value.masked_fill(key_padding_mask[..., None], 0.0)
        value = value.reshape(bs * self.num_bev_queue,
                              num_value, self.num_heads, -1)

        sampling_offsets = self.sampling_offsets(query)
        sampling_offsets = sampling_offsets.view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels, self.num_points, 2)
        attention_weights = self.attention_weights(query).view(
            bs, num_query, self.num_heads, self.num_bev_queue,
            self.num_levels * self.num_points)
        attention_weights = attention_weights.softmax(-1)

        attention_weights = attention_weights.view(bs, num_query,
                                                   self.num_heads,
                                                   self.num_bev_queue,
                                                   self.num_levels,
                                                   self.num_points)

        # Fold the BEV-queue dimension into the batch dim for the kernel.
        attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points).contiguous()
        sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6) \
            .reshape(bs * self.num_bev_queue, num_query, self.num_heads,
                     self.num_levels, self.num_points, 2)

        if reference_points.shape[-1] == 2:
            # Offsets are in pixels; normalize by (w, h) of each level.
            offset_normalizer = torch.stack(
                [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets \
                / offset_normalizer[None, None, None, :, None, :]
        elif reference_points.shape[-1] == 4:
            # Box-style reference: offsets are scaled by half the box size.
            sampling_locations = reference_points[:, :, None, :, None, :2] \
                + sampling_offsets / self.num_points \
                * reference_points[:, :, None, :, None, 2:] \
                * 0.5
        else:
            raise ValueError(
                f'Last dim of reference_points must be'
                f' 2 or 4, but get {reference_points.shape[-1]} instead.')

        if torch.cuda.is_available() and value.is_cuda:
            # fp16 deformable attention is unstable because it performs many
            # sum operations, so the fp32 kernel is used for both dtypes
            # (the original if/else selected fp32 in both branches).
            output = MultiScaleDeformableAttnFunction_fp32.apply(
                value, spatial_shapes, level_start_index, sampling_locations,
                attention_weights, self.im2col_step)
        else:
            output = multi_scale_deformable_attn_pytorch(
                value, spatial_shapes, sampling_locations, attention_weights)

        # output shape (bs*num_bev_queue, num_query, embed_dims)
        # -> (num_query, embed_dims, bs*num_bev_queue)
        output = output.permute(1, 2, 0)
        # fuse history and current value by averaging over the queue dim
        output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
        output = output.mean(-1)
        # (num_query, embed_dims, bs) -> (bs, num_query, embed_dims)
        output = output.permute(2, 0, 1)
        output = self.output_proj(output)

        if not self.batch_first:
            output = output.permute(1, 0, 2)
        return self.dropout(output) + identity
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init
from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
from mmcv.runner.base_module import BaseModule
from mmdet.models.utils.builder import TRANSFORMER
from torch.nn.init import normal_
from mmcv.runner.base_module import BaseModule
from torchvision.transforms.functional import rotate
from .temporal_self_attention import TemporalSelfAttention
from .spatial_cross_attention import MSDeformableAttention3D
from .decoder import CustomMSDeformableAttention
from mmcv.runner import force_fp32, auto_fp16
import pdb
@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
    """Implements the Detr3D transformer.

    Args:
        decoder (dict): Config for the transformer decoder layer sequence.
            Default: None.
        embed_dims (int): Embedding dimension of the queries. Default: 256.
    """

    def __init__(self,
                 decoder=None,
                 embed_dims=256,
                 **kwargs):
        super(PerceptionTransformer, self).__init__(**kwargs)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.fp16_enabled = False
        self.init_layers()

    def init_layers(self):
        """Initialize layers of the Detr3DTransformer."""
        # Maps each query positional embedding to a 3D reference point.
        self.reference_points = nn.Linear(self.embed_dims, 3)

    def init_weights(self):
        """Initialize the transformer weights."""
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
        attention_types = (MSDeformableAttention3D, TemporalSelfAttention,
                           CustomMSDeformableAttention)
        for module in self.modules():
            if isinstance(module, attention_types):
                # Some attention variants expose `init_weight`, others
                # `init_weights`; prefer the former, fall back to the latter.
                try:
                    module.init_weight()
                except AttributeError:
                    module.init_weights()
        xavier_init(self.reference_points, distribution='uniform', bias=0.)

    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed',
                         'prev_bev', 'bev_pos'))
    def forward(self,
                mlvl_feats,
                bev_embed,
                object_query_embed,
                bev_h,
                bev_w,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Forward function for `Detr3DTransformer`.

        Args:
            mlvl_feats (list(Tensor)): Input queries from different levels,
                each of shape [bs, num_cams, embed_dims, h, w].
            bev_embed (Tensor): BEV features used as decoder value.
            object_query_embed (Tensor): The query embedding for decoder,
                with shape [num_query, 2 * c]; split into positional and
                content halves.
            bev_h (int): Height of the BEV grid.
            bev_w (int): Width of the BEV grid.
            reg_branches (obj:`nn.ModuleList`): Regression heads for feature
                maps from each decoder layer. Only passed when
                `with_box_refine` is True. Default to None.
            cls_branches (obj:`nn.ModuleList`): Classification heads,
                forwarded to the decoder. Default to None.

        Returns:
            tuple[Tensor]:
                - inter_states: Decoder outputs, shape
                  (num_dec_layers, bs, num_query, embed_dims) when
                  intermediate outputs are returned, else
                  (1, bs, num_query, embed_dims).
                - init_reference_out: Initial reference points,
                  shape (bs, num_query, 3).
                - inter_references_out: Reference points refined by each
                  decoder layer.
        """
        batch_size = mlvl_feats[0].size(0)
        # Split the learned embedding into positional / content halves.
        query_pos, query = torch.split(
            object_query_embed, self.embed_dims, dim=1)
        query_pos = query_pos.unsqueeze(0).expand(batch_size, -1, -1)
        query = query.unsqueeze(0).expand(batch_size, -1, -1)
        # Reference points are predicted from the positional part only.
        reference_points = self.reference_points(query_pos).sigmoid()
        init_reference_out = reference_points

        # The decoder expects (num_query, bs, embed_dims).
        query = query.permute(1, 0, 2)
        query_pos = query_pos.permute(1, 0, 2)
        bev_embed = bev_embed.permute(1, 0, 2)

        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=bev_embed,
            query_pos=query_pos,
            reference_points=reference_points,
            reg_branches=reg_branches,
            cls_branches=cls_branches,
            spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
            level_start_index=torch.tensor([0], device=query.device),
            **kwargs)

        return inter_states, init_reference_out, inter_references
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init
from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
from mmcv.runner import auto_fp16
from mmcv.runner.base_module import BaseModule
from mmdet.models.utils.builder import TRANSFORMER
from .decoder import CustomMSDeformableAttention
from .spatial_cross_attention import MSDeformableAttention3D
from .temporal_self_attention import TemporalSelfAttention
@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
    """Implements the Detr3D transformer.

    Args:
        decoder (dict): Config for the transformer decoder layer sequence.
            Default: None.
        embed_dims (int): Embedding dimension of the queries. Default: 256.
    """

    def __init__(self,
                 decoder=None,
                 embed_dims=256,
                 **kwargs):
        super(PerceptionTransformer, self).__init__(**kwargs)
        self.decoder = build_transformer_layer_sequence(decoder)
        self.embed_dims = embed_dims
        self.fp16_enabled = False
        self.init_layers()

    def init_layers(self):
        """Initialize layers of the Detr3DTransformer."""
        # Linear head that turns a positional embedding into an (x, y, z)
        # reference point.
        self.reference_points = nn.Linear(self.embed_dims, 3)

    def init_weights(self):
        """Initialize the transformer weights."""
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
        for m in self.modules():
            if not isinstance(m, (MSDeformableAttention3D,
                                  TemporalSelfAttention,
                                  CustomMSDeformableAttention)):
                continue
            # Attention modules may name their initializer `init_weight`
            # or `init_weights`; try the singular form first.
            try:
                m.init_weight()
            except AttributeError:
                m.init_weights()
        xavier_init(self.reference_points, distribution='uniform', bias=0.)

    @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed',
                         'prev_bev', 'bev_pos'))
    def forward(self,
                mlvl_feats,
                bev_embed,
                object_query_embed,
                bev_h,
                bev_w,
                reg_branches=None,
                cls_branches=None,
                **kwargs):
        """Forward function for `Detr3DTransformer`.

        Args:
            mlvl_feats (list(Tensor)): Input queries from different levels,
                each of shape [bs, num_cams, embed_dims, h, w].
            bev_embed (Tensor): BEV features used as decoder value.
            object_query_embed (Tensor): The query embedding for decoder,
                with shape [num_query, 2 * c]; the first half is positional,
                the second half is content.
            bev_h (int): Height of the BEV grid.
            bev_w (int): Width of the BEV grid.
            reg_branches (obj:`nn.ModuleList`): Regression heads for feature
                maps from each decoder layer. Only passed when
                `with_box_refine` is True. Default to None.
            cls_branches (obj:`nn.ModuleList`): Classification heads,
                forwarded to the decoder. Default to None.

        Returns:
            tuple[Tensor]: (inter_states, init_reference_out,
                inter_references_out) — decoder layer outputs, the initial
                reference points (bs, num_query, 3) and the per-layer refined
                reference points.
        """
        bs = mlvl_feats[0].size(0)
        pos_embed, content_embed = torch.split(
            object_query_embed, self.embed_dims, dim=1)
        query_pos = pos_embed.unsqueeze(0).expand(bs, -1, -1)
        query = content_embed.unsqueeze(0).expand(bs, -1, -1)

        # Initial 3D reference points come from the positional embedding.
        reference_points = self.reference_points(query_pos)
        reference_points = reference_points.sigmoid()
        init_reference_out = reference_points

        # Reorder everything to the (num_query, bs, embed_dims) layout the
        # decoder consumes.
        query, query_pos, bev_embed = (
            t.permute(1, 0, 2) for t in (query, query_pos, bev_embed))

        device = query.device
        spatial_shapes = torch.tensor([[bev_h, bev_w]], device=device)
        level_start_index = torch.tensor([0], device=device)

        inter_states, inter_references = self.decoder(
            query=query,
            key=None,
            value=bev_embed,
            query_pos=query_pos,
            reference_points=reference_points,
            reg_branches=reg_branches,
            cls_branches=cls_branches,
            spatial_shapes=spatial_shapes,
            level_start_index=level_start_index,
            **kwargs)

        inter_references_out = inter_references
        return inter_states, init_reference_out, inter_references_out
from .custom_fpn import *
from .custom_ipm_view_transformer import *
# ==============================================================================
# Binaries and/or source for the following packages or projects
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_fpn.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
......@@ -22,7 +22,6 @@
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS
......@@ -34,7 +33,7 @@ class CustomFPN(BaseModule):
Notes
-----
Adapted from https://github.com/HuangJunJie2017/BEVDet/blob/dev2.0/mmdet3d/models/necks/fpn.py#L11.
Feature Pyramid Network.
This is an implementation of paper `Feature Pyramid Networks for Object
Detection <https://arxiv.org/abs/1612.03144>`_.
......
# ==============================================================================
# Binaries and/or source for the following packages or projects
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_ipm_view_transformer.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
......@@ -20,12 +20,9 @@
# limitations under the License.
# ==============================================================================
import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS
......@@ -33,7 +30,7 @@ from mmdet3d.models import NECKS
def get_campos(reference_points, ego2cam, img_shape):
'''
Find the each refence point's corresponding pixel in each camera
Args:
Args:
reference_points: [B, num_query, 3]
ego2cam: (B, num_cam, 4, 4)
Outs:
......@@ -63,7 +60,7 @@ def get_campos(reference_points, ego2cam, img_shape):
eps = 1e-9
mask = (reference_points_cam[..., 2:3] > eps)
reference_points_cam =\
reference_points_cam = \
reference_points_cam[..., 0:2] / \
reference_points_cam[..., 2:3] + eps
......@@ -74,16 +71,17 @@ def get_campos(reference_points, ego2cam, img_shape):
reference_points_cam = (reference_points_cam - 0.5) * 2
mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
& (reference_points_cam[..., 0:1] < 1.0)
& (reference_points_cam[..., 1:2] > -1.0)
& (reference_points_cam[..., 1:2] < 1.0))
& (reference_points_cam[..., 0:1] < 1.0)
& (reference_points_cam[..., 1:2] > -1.0)
& (reference_points_cam[..., 1:2] < 1.0))
# (B, num_cam, num_query)
mask = mask.view(B, num_cam, num_query)
reference_points_cam = reference_points_cam.view(B*num_cam, num_query, 2)
reference_points_cam = reference_points_cam.view(B * num_cam, num_query, 2)
return reference_points_cam, mask
def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
'''
Returns:
......@@ -108,6 +106,7 @@ def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
return plane
@NECKS.register_module()
class CustomIPMViewTransformer(BaseModule):
r"""
......@@ -116,8 +115,9 @@ class CustomIPMViewTransformer(BaseModule):
Adapted from https://github.com/Mrmoore98/VectorMapNet_code/blob/mian/plugin/models/backbones/ipm_backbone.py#L238.
"""
def __init__(self,
num_cam,
def __init__(self,
num_cam,
xbound,
ybound,
zbound,
......@@ -126,24 +126,24 @@ class CustomIPMViewTransformer(BaseModule):
super().__init__()
self.x_bound = xbound
self.y_bound = ybound
heights = [zbound[0]+i*zbound[2] for i in range(int((zbound[1]-zbound[0])//zbound[2])+1)]
heights = [zbound[0] + i * zbound[2] for i in range(int((zbound[1] - zbound[0]) // zbound[2]) + 1)]
self.heights = heights
self.num_cam = num_cam
self.outconvs =\
nn.Conv2d((out_channels+3)*len(heights), out_channels,
kernel_size=3, stride=1, padding=1) # same
self.outconvs = \
nn.Conv2d((out_channels + 3) * len(heights), out_channels,
kernel_size=3, stride=1, padding=1) # same
# bev_plane
bev_planes = [construct_plane_grid(
xbound, ybound, h) for h in self.heights]
self.register_buffer('bev_planes', torch.stack(
bev_planes),) # nlvl,bH,bW,2
bev_planes), ) # nlvl,bH,bW,2
def forward(self, cam_feat, ego2cam, img_shape):
'''
inverse project
inverse project
Args:
cam_feat: B*ncam, C, cH, cW
img_shape: tuple(H, W)
......@@ -161,7 +161,7 @@ class CustomIPMViewTransformer(BaseModule):
# bev_grid_pos: B*ncam, nlvl*bH*bW, 2
bev_grid_pos, bev_cam_mask = get_campos(bev_grid, ego2cam, img_shape)
# B*cam, nlvl*bH, bW, 2
bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl*bH, bW))
bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl * bH, bW))
# project feat from 2D to bev plane
projected_feature = F.grid_sample(
......@@ -173,11 +173,11 @@ class CustomIPMViewTransformer(BaseModule):
# eliminate the ncam
# The bev feature is the sum of the 6 cameras
bev_feat_mask = bev_feat_mask.unsqueeze(2)
projected_feature = (projected_feature*bev_feat_mask).sum(1)
projected_feature = (projected_feature * bev_feat_mask).sum(1)
num_feat = bev_feat_mask.sum(1)
projected_feature = projected_feature / \
num_feat.masked_fill(num_feat == 0, 1)
num_feat.masked_fill(num_feat == 0, 1)
# concatenate a position information
# projected_feature: B, bH, bW, nlvl, C+3
......
custom_imports = dict(imports=['projects.openlanev2.baseline'])
method_para = dict(n_control=5) # #point for each curve
method_para = dict(n_control=5) # #point for each curve
_dim_ = 128
......@@ -19,26 +19,26 @@ model = dict(
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
img_neck=dict(
type='CustomFPN',
in_channels=[_dim_*2, _dim_*4],
in_channels=[_dim_ * 2, _dim_ * 4],
out_channels=_dim_,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='CustomIPMViewTransformer',
num_cam=7,
num_cam=7,
xbound=[-50.0, 50.0, 1.0],
ybound=[-25.0, 25.0, 1.0],
zbound=[-3.0, 2.0, 0.5],
out_channels=_dim_),
lc_head=dict(
type='CustomDETRHead',
num_classes=1,
num_classes=1,
in_channels=_dim_,
num_query=50,
object_type='lane',
num_layers=1,
num_reg_dim=method_para['n_control']*3,
num_reg_dim=method_para['n_control'] * 3,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
......@@ -46,17 +46,17 @@ model = dict(
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=2.5),
loss_iou=dict(type='GIoULoss', loss_weight=0.0), # dummy
loss_iou=dict(type='GIoULoss', loss_weight=0.0), # dummy
train_cfg=dict(
assigner=dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.0),
reg_cost=dict(type='LaneL1Cost', weight=2.5),
iou_cost=dict(type='IoUCost', weight=0.0))), # dummy
iou_cost=dict(type='IoUCost', weight=0.0))), # dummy
bev_range=[-50.0, -25.0, -3.0, 50.0, 25.0, 2.0]),
te_head=dict(
type='CustomDETRHead',
num_classes=13,
num_classes=13,
in_channels=_dim_,
num_query=30,
object_type='bbox',
......@@ -120,7 +120,7 @@ train_pipeline = [
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......@@ -138,7 +138,7 @@ test_pipeline = [
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......
......@@ -18,14 +18,13 @@ input_modality = dict(
use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5) # #point for each curve
method_para = dict(n_control=5) # #point for each curve
code_size = 3 * method_para['n_control']
_dim_ = 256
_pos_dim_ = _dim_//2
_ffn_dim_ = _dim_*2
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_ffn_cfg_ = dict(
type='FFN',
embed_dims=_dim_,
......@@ -71,7 +70,7 @@ model = dict(
pc_range=point_cloud_range,
bev_h=bev_h_,
bev_w=bev_w_,
rotate_center=[bev_h_//2, bev_w_//2],
rotate_center=[bev_h_ // 2, bev_w_ // 2],
encoder=dict(
type='BEVFormerEncoder',
num_layers=3,
......@@ -99,7 +98,7 @@ model = dict(
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
'ffn', 'norm'))),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
......@@ -169,7 +168,7 @@ model = dict(
with_box_refine=False,
with_shared_param=False,
code_size=code_size,
code_weights= [1.0 for i in range(code_size)],
code_weights=[1.0 for i in range(code_size)],
pc_range=point_cloud_range,
transformer=dict(
type='PerceptionTransformer',
......@@ -186,7 +185,7 @@ model = dict(
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1),
......@@ -240,9 +239,8 @@ model = dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.5),
reg_cost=dict(type='LaneL1Cost', weight=0.0075),
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
))))
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
))))
train_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
......@@ -261,7 +259,7 @@ train_pipeline = [
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......@@ -279,7 +277,7 @@ test_pipeline = [
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......@@ -350,4 +348,4 @@ dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
workflow = [('train', 1)]
......@@ -18,14 +18,13 @@ input_modality = dict(
use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5) # #point for each curve
method_para = dict(n_control=5) # #point for each curve
code_size = 3 * method_para['n_control']
_dim_ = 256
_pos_dim_ = _dim_//2
_ffn_dim_ = _dim_*2
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_ffn_cfg_ = dict(
type='FFN',
embed_dims=_dim_,
......@@ -78,7 +77,7 @@ model = dict(
pc_range=point_cloud_range,
bev_h=bev_h_,
bev_w=bev_w_,
rotate_center=[bev_h_//2, bev_w_//2],
rotate_center=[bev_h_ // 2, bev_w_ // 2],
encoder=dict(
type='BEVFormerEncoder',
num_layers=3,
......@@ -106,7 +105,7 @@ model = dict(
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
'ffn', 'norm'))),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
......@@ -176,7 +175,7 @@ model = dict(
with_box_refine=False,
with_shared_param=False,
code_size=code_size,
code_weights= [1.0 for i in range(code_size)],
code_weights=[1.0 for i in range(code_size)],
pc_range=point_cloud_range,
transformer=dict(
type='PerceptionTransformer',
......@@ -193,7 +192,7 @@ model = dict(
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1),
......@@ -247,9 +246,8 @@ model = dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.5),
reg_cost=dict(type='LaneL1Cost', weight=0.0075),
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
))))
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
))))
train_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
......@@ -268,7 +266,7 @@ train_pipeline = [
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......@@ -286,7 +284,7 @@ test_pipeline = [
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
......@@ -357,4 +355,4 @@ dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
workflow = [('train', 1)]
tqdm
ninja
chardet
iso3166
jupyter
openmim
matplotlib
ninja
numpy >=1.22.0, <1.24.0
scikit-learn
similaritymeasures
opencv-python
scipy ==1.8.0
openmim
ortools ==9.2.9972
iso3166
chardet
scikit-learn
scipy ==1.8.0
similaritymeasures
tqdm
# ==============================================================================
# Binaries and/or source for the following packages or projects
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# setup.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
......@@ -20,8 +20,7 @@
# limitations under the License.
# ==============================================================================
from setuptools import setup, find_packages
from setuptools import find_packages, setup
setup(
name='openlanev2',
......
......@@ -66,7 +66,7 @@ def plot_curve(log_dicts, args):
else:
# find the first epoch that do eval
x0 = min(epochs) + args.interval - \
min(epochs) % args.interval
min(epochs) % args.interval
xs = np.arange(x0, max(epochs) + 1, args.interval)
ys = []
for epoch in epochs[args.interval - 1::args.interval]:
......@@ -86,7 +86,7 @@ def plot_curve(log_dicts, args):
xs = []
ys = []
num_iters_per_epoch = \
log_dict[epochs[args.interval-1]]['iter'][-1]
log_dict[epochs[args.interval - 1]]['iter'][-1]
for epoch in epochs[args.interval - 1::args.interval]:
iters = log_dict[epoch]['iter']
if log_dict[epoch]['mode'][-1] == 'val':
......@@ -153,7 +153,7 @@ def add_time_parser(subparsers):
'--include-outliers',
action='store_true',
help='include the first value of every epoch when computing '
'the average time')
'the average time')
def parse_args():
......
......@@ -6,7 +6,6 @@ import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet3d.models import build_detector
from tools.misc.fuse_conv_bn import fuse_module
......@@ -23,7 +22,7 @@ def parse_args():
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn, this will slightly increase'
'the inference speed')
'the inference speed')
args = parser.parse_args()
return args
......
......@@ -3,7 +3,6 @@ import argparse
import torch
from mmcv import Config, DictAction
from mmdet3d.models import build_model
try:
......@@ -32,17 +31,16 @@ def parse_args():
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def main():
args = parse_args()
if args.modality == 'point':
......@@ -52,7 +50,7 @@ def main():
if len(args.shape) == 1:
input_shape = (3, args.shape[0], args.shape[0])
elif len(args.shape) == 2:
input_shape = (3, ) + tuple(args.shape)
input_shape = (3,) + tuple(args.shape)
else:
raise ValueError('invalid input shape')
elif args.modality == 'multi':
......
......@@ -6,12 +6,11 @@ import mmcv
import numpy as np
from mmcv import track_iter_progress
from mmcv.ops import roi_align
from pycocotools import mask as maskUtils
from pycocotools.coco import COCO
from mmdet3d.core.bbox import box_np_ops as box_np_ops
from mmdet3d.datasets import build_dataset
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from pycocotools import mask as maskUtils
from pycocotools.coco import COCO
def _poly2mask(mask_ann, img_h, img_w):
......
......@@ -3,7 +3,6 @@ import os
import mmcv
import numpy as np
from tools.data_converter.s3dis_data_utils import S3DISData, S3DISSegData
from tools.data_converter.scannet_data_utils import ScanNetData, ScanNetSegData
from tools.data_converter.sunrgbd_data_utils import SUNRGBDData
......
......@@ -4,9 +4,9 @@ from pathlib import Path
import mmcv
import numpy as np
from mmdet3d.core.bbox import box_np_ops, points_cam2img
from nuscenes.utils.geometry_utils import view_points
from mmdet3d.core.bbox import box_np_ops, points_cam2img
from .kitti_data_utils import WaymoInfoGatherer, get_kitti_image_info
from .nuscenes_converter import post_process_coords
......@@ -507,7 +507,7 @@ def get_2d_boxes(info, occluded, mono3d=True):
src = np.array([0.5, 1.0, 0.5])
loc = loc + dim * (dst - src)
offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
/ info['calib']['P2'][0, 0]
/ info['calib']['P2'][0, 0]
loc_3d = np.copy(loc)
loc_3d[0, 0] += offset
gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
......
......@@ -151,7 +151,7 @@ def get_label_anno(label_path):
if len(content) != 0 and len(content[0]) == 16: # have score
annotations['score'] = np.array([float(x[15]) for x in content])
else:
annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
annotations['score'] = np.zeros((annotations['bbox'].shape[0],))
index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
annotations['index'] = np.array(index, dtype=np.int32)
annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
......@@ -547,9 +547,9 @@ def add_difficulty_to_annos(info):
occlusion = annos['occluded']
truncation = annos['truncated']
diff = []
easy_mask = np.ones((len(dims), ), dtype=np.bool)
moderate_mask = np.ones((len(dims), ), dtype=np.bool)
hard_mask = np.ones((len(dims), ), dtype=np.bool)
easy_mask = np.ones((len(dims),), dtype=np.bool)
moderate_mask = np.ones((len(dims),), dtype=np.bool)
hard_mask = np.ones((len(dims),), dtype=np.bool)
i = 0
for h, o, t in zip(height, occlusion, truncation):
if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
......
......@@ -6,9 +6,9 @@ from os import path as osp
import mmcv
import numpy as np
from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
from mmdet3d.datasets import LyftDataset
from pyquaternion import Quaternion
from mmdet3d.datasets import LyftDataset
from .nuscenes_converter import (get_2d_boxes, get_available_scenes,
obtain_sensor2top)
......
......@@ -11,7 +11,7 @@ def fix_lyft(root_folder='./data/lyft', version='v1.01'):
root_folder = os.path.join(root_folder, f'{version}-train')
lidar_path = os.path.join(root_folder, lidar_path)
assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
f'dataset and make sure {lidar_path} is present.'
f'dataset and make sure {lidar_path} is present.'
points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
try:
points.reshape([-1, 5])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment