Unverified commit cce49ba9 authored by Chengyu Wang, committed by GitHub

Add openlane v2 (#121)

parent dbf29e61
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import torch
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.autograd.function import Function, once_differentiable
from mmcv.utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
class MultiScaleDeformableAttnFunction_fp16(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx, value, value_spatial_shapes, value_level_start_index,
sampling_locations, attention_weights, im2col_step):
"""GPU version of multi-scale deformable attention.
Args:
value (Tensor): The value has shape
(bs, num_keys, mum_heads, embed_dims//num_heads)
value_spatial_shapes (Tensor): Spatial shape of
each feature map, has shape (num_levels, 2),
last dimension 2 represent (h, w)
sampling_locations (Tensor): The location of sampling points,
has shape
(bs ,num_queries, num_heads, num_levels, num_points, 2),
the last dimension 2 represent (x, y).
attention_weights (Tensor): The weight of sampling points used
when calculate the attention, has shape
(bs ,num_queries, num_heads, num_levels, num_points),
im2col_step (Tensor): The step used in image to column.
Returns:
Tensor: has shape (bs, num_queries, embed_dims)
"""
ctx.im2col_step = im2col_step
output = ext_module.ms_deform_attn_forward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
im2col_step=ctx.im2col_step)
ctx.save_for_backward(value, value_spatial_shapes,
value_level_start_index, sampling_locations,
attention_weights)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
"""GPU version of backward function.
Args:
grad_output (Tensor): Gradient
of output tensor of forward.
Returns:
Tuple[Tensor]: Gradient
of input tensors in forward.
"""
value, value_spatial_shapes, value_level_start_index, \
sampling_locations, attention_weights = ctx.saved_tensors
grad_value = torch.zeros_like(value)
grad_sampling_loc = torch.zeros_like(sampling_locations)
grad_attn_weight = torch.zeros_like(attention_weights)
ext_module.ms_deform_attn_backward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
grad_output.contiguous(),
grad_value,
grad_sampling_loc,
grad_attn_weight,
im2col_step=ctx.im2col_step)
return grad_value, None, None, \
grad_sampling_loc, grad_attn_weight, None
class MultiScaleDeformableAttnFunction_fp32(Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, value, value_spatial_shapes, value_level_start_index,
sampling_locations, attention_weights, im2col_step):
"""GPU version of multi-scale deformable attention.
Args:
value (Tensor): The value has shape
(bs, num_keys, mum_heads, embed_dims//num_heads)
value_spatial_shapes (Tensor): Spatial shape of
each feature map, has shape (num_levels, 2),
last dimension 2 represent (h, w)
sampling_locations (Tensor): The location of sampling points,
has shape
(bs ,num_queries, num_heads, num_levels, num_points, 2),
the last dimension 2 represent (x, y).
attention_weights (Tensor): The weight of sampling points used
when calculate the attention, has shape
(bs ,num_queries, num_heads, num_levels, num_points),
im2col_step (Tensor): The step used in image to column.
Returns:
Tensor: has shape (bs, num_queries, embed_dims)
"""
ctx.im2col_step = im2col_step
output = ext_module.ms_deform_attn_forward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
im2col_step=ctx.im2col_step)
ctx.save_for_backward(value, value_spatial_shapes,
value_level_start_index, sampling_locations,
attention_weights)
return output
@staticmethod
@once_differentiable
@custom_bwd
def backward(ctx, grad_output):
"""GPU version of backward function.
Args:
grad_output (Tensor): Gradient
of output tensor of forward.
Returns:
Tuple[Tensor]: Gradient
of input tensors in forward.
"""
value, value_spatial_shapes, value_level_start_index, \
sampling_locations, attention_weights = ctx.saved_tensors
grad_value = torch.zeros_like(value)
grad_sampling_loc = torch.zeros_like(sampling_locations)
grad_attn_weight = torch.zeros_like(attention_weights)
ext_module.ms_deform_attn_backward(
value,
value_spatial_shapes,
value_level_start_index,
sampling_locations,
attention_weights,
grad_output.contiguous(),
grad_value,
grad_sampling_loc,
grad_attn_weight,
im2col_step=ctx.im2col_step)
return grad_value, None, None, \
grad_sampling_loc, grad_attn_weight, None
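

# A minimal smoke test for the fp32 Function above (an illustrative sketch,
# not part of the original file; shapes and sizes below are assumptions).
# It needs a CUDA build of mmcv's `_ext` extension.
if __name__ == '__main__':
    if torch.cuda.is_available():
        bs, num_query, num_heads, head_dims = 2, 100, 8, 32
        spatial_shapes = torch.tensor([[16, 16], [8, 8]], device='cuda')
        level_start_index = torch.tensor([0, 256], device='cuda')
        num_keys = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())
        value = torch.rand(bs, num_keys, num_heads, head_dims, device='cuda')
        sampling_locations = torch.rand(
            bs, num_query, num_heads, 2, 4, 2, device='cuda')
        attention_weights = torch.rand(
            bs, num_query, num_heads, 2, 4, device='cuda').softmax(-1)
        output = MultiScaleDeformableAttnFunction_fp32.apply(
            value, spatial_shapes, level_start_index,
            sampling_locations, attention_weights, 64)
        # each query aggregates sampled values across levels and points
        assert output.shape == (bs, num_query, num_heads * head_dims)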
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import (ATTENTION,
TRANSFORMER_LAYER,
TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.transformer import build_attention
import math
from mmcv.runner import force_fp32, auto_fp16
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import ext_loader
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \
MultiScaleDeformableAttnFunction_fp16
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class SpatialCrossAttention(BaseModule):
"""An attention module used in BEVFormer.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
num_cams (int): The number of cameras
dropout (float): A Dropout layer on `inp_residual`.
Default: 0..
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
deformable_attention: (dict): The config for the deformable attention used in SCA.
"""
def __init__(self,
embed_dims=256,
num_cams=6,
pc_range=None,
dropout=0.1,
init_cfg=None,
batch_first=False,
deformable_attention=dict(
type='MSDeformableAttention3D',
embed_dims=256,
num_levels=4),
**kwargs
):
super(SpatialCrossAttention, self).__init__(init_cfg)
self.init_cfg = init_cfg
self.dropout = nn.Dropout(dropout)
self.pc_range = pc_range
self.fp16_enabled = False
self.deformable_attention = build_attention(deformable_attention)
self.embed_dims = embed_dims
self.num_cams = num_cams
self.output_proj = nn.Linear(embed_dims, embed_dims)
self.batch_first = batch_first
self.init_weight()
def init_weight(self):
"""Default initialization for Parameters of Module."""
xavier_init(self.output_proj, distribution='uniform', bias=0.)
@force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam'))
def forward(self,
query,
key,
value,
residual=None,
query_pos=None,
key_padding_mask=None,
reference_points=None,
spatial_shapes=None,
reference_points_cam=None,
bev_mask=None,
level_start_index=None,
flag='encoder',
**kwargs):
"""Forward Function of Detr3DCrossAtten.
Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(num_cams, num_key, bs, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(num_cams, num_key, bs, embed_dims)`.
residual (Tensor): The tensor used for addition, with the
same shape as `x`. Default None. If None, `x` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, 4),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different level. With shape (num_levels, 2),
last dimension represent (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape (num_levels) and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
Returns:
            Tensor: forwarded results with shape [bs, num_query, embed_dims].
"""
if key is None:
key = query
if value is None:
value = key
if residual is None:
inp_residual = query
slots = torch.zeros_like(query)
if query_pos is not None:
query = query + query_pos
bs, num_query, _ = query.size()
D = reference_points_cam.size(3)
indexes = []
for i, mask_per_img in enumerate(bev_mask):
index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1)
indexes.append(index_query_per_img)
max_len = max([len(each) for each in indexes])
        # Each camera only interacts with its corresponding BEV queries;
        # this rebatching greatly saves GPU memory.
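        # e.g. with two cameras hitting [3, 5] BEV queries, max_len is 5 and
        # camera 0's row keeps two all-zero padding queries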
queries_rebatch = query.new_zeros(
[bs, self.num_cams, max_len, self.embed_dims])
reference_points_rebatch = reference_points_cam.new_zeros(
[bs, self.num_cams, max_len, D, 2])
for j in range(bs):
for i, reference_points_per_img in enumerate(reference_points_cam):
index_query_per_img = indexes[i]
queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img]
reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img]
num_cams, l, bs, embed_dims = key.shape
key = key.permute(2, 0, 1, 3).reshape(
bs * self.num_cams, l, self.embed_dims)
value = value.permute(2, 0, 1, 3).reshape(
bs * self.num_cams, l, self.embed_dims)
queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value,
reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes,
level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims)
for j in range(bs):
for i, index_query_per_img in enumerate(indexes):
slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)]
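        # average each BEV query over the cameras / height anchors that
        # actually see it; count is clamped to avoid division by zero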
count = bev_mask.sum(-1) > 0
count = count.permute(1, 2, 0).sum(-1)
count = torch.clamp(count, min=1.0)
slots = slots / count[..., None]
slots = self.output_proj(slots)
return self.dropout(slots) + inp_residual
@ATTENTION.register_module()
class MSDeformableAttention3D(BaseModule):
"""An attention module used in BEVFormer based on Deformable-Detr.
`Deformable DETR: Deformable Transformers for End-to-End Object Detection.
<https://arxiv.org/pdf/2010.04159.pdf>`_.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
        num_levels (int): The number of feature map used in
            Attention. Default: 4.
        num_points (int): The number of sampling points for
            each query in each head. Default: 8.
        im2col_step (int): The step used in image_to_column.
            Default: 64.
        dropout (float): A Dropout layer on `inp_identity`.
            Default: 0.1.
        batch_first (bool): Key, Query and Value are shape of
            (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to True.
norm_cfg (dict): Config dict for normalization layer.
Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def __init__(self,
embed_dims=256,
num_heads=8,
num_levels=4,
num_points=8,
im2col_step=64,
dropout=0.1,
batch_first=True,
norm_cfg=None,
init_cfg=None):
super().__init__(init_cfg)
if embed_dims % num_heads != 0:
raise ValueError(f'embed_dims must be divisible by num_heads, '
f'but got {embed_dims} and {num_heads}')
dim_per_head = embed_dims // num_heads
self.norm_cfg = norm_cfg
self.batch_first = batch_first
self.output_proj = None
self.fp16_enabled = False
# you'd better set dim_per_head to a power of 2
# which is more efficient in the CUDA implementation
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
'invalid input for _is_power_of_2: {} (type: {})'.format(
n, type(n)))
return (n & (n - 1) == 0) and n != 0
if not _is_power_of_2(dim_per_head):
warnings.warn(
"You'd better set embed_dims in "
'MultiScaleDeformAttention to make '
'the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.im2col_step = im2col_step
self.embed_dims = embed_dims
self.num_levels = num_levels
self.num_heads = num_heads
self.num_points = num_points
self.sampling_offsets = nn.Linear(
embed_dims, num_heads * num_levels * num_points * 2)
self.attention_weights = nn.Linear(embed_dims,
num_heads * num_levels * num_points)
self.value_proj = nn.Linear(embed_dims, embed_dims)
self.init_weights()
def init_weights(self):
"""Default initialization for Parameters of Module."""
constant_init(self.sampling_offsets, 0.)
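        # Deformable-DETR style bias init: each head starts with offsets
        # pointing along one of `num_heads` evenly spaced directions, and the
        # i-th sampling point steps (i + 1) units along that direction.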
thetas = torch.arange(
self.num_heads,
dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init /
grid_init.abs().max(-1, keepdim=True)[0]).view(
self.num_heads, 1, 1,
2).repeat(1, self.num_levels, self.num_points, 1)
for i in range(self.num_points):
grid_init[:, :, i, :] *= i + 1
self.sampling_offsets.bias.data = grid_init.view(-1)
constant_init(self.attention_weights, val=0., bias=0.)
xavier_init(self.value_proj, distribution='uniform', bias=0.)
xavier_init(self.output_proj, distribution='uniform', bias=0.)
self._is_init = True
def forward(self,
query,
key=None,
value=None,
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
**kwargs):
"""Forward Function of MultiScaleDeformAttention.
Args:
query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
key (Tensor): The key tensor with shape
`(bs, num_key, embed_dims)`.
value (Tensor): The value tensor with shape
`(bs, num_key, embed_dims)`.
identity (Tensor): The tensor used for addition, with the
same shape as `query`. Default None. If None,
`query` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, num_levels, 2),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different levels. With shape (num_levels, 2),
last dimension represents (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape ``(num_levels, )`` and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
        Returns:
            Tensor: forwarded results with shape (bs, num_query, embed_dims)
                if `batch_first` is True, else (num_query, bs, embed_dims).
"""
if value is None:
value = query
if identity is None:
identity = query
if query_pos is not None:
query = query + query_pos
if not self.batch_first:
# change to (bs, num_query ,embed_dims)
query = query.permute(1, 0, 2)
value = value.permute(1, 0, 2)
bs, num_query, _ = query.shape
bs, num_value, _ = value.shape
assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
value = self.value_proj(value)
if key_padding_mask is not None:
value = value.masked_fill(key_padding_mask[..., None], 0.0)
value = value.view(bs, num_value, self.num_heads, -1)
sampling_offsets = self.sampling_offsets(query).view(
bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)
attention_weights = self.attention_weights(query).view(
bs, num_query, self.num_heads, self.num_levels * self.num_points)
attention_weights = attention_weights.softmax(-1)
attention_weights = attention_weights.view(bs, num_query,
self.num_heads,
self.num_levels,
self.num_points)
if reference_points.shape[-1] == 2:
"""
For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights.
After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image.
For each referent point, we sample `num_points` sampling points.
For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points.
"""
offset_normalizer = torch.stack(
[spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
bs, num_query, num_Z_anchors, xy = reference_points.shape
reference_points = reference_points[:, :, None, None, None, :, :]
sampling_offsets = sampling_offsets / \
offset_normalizer[None, None, None, :, None, :]
bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape
sampling_offsets = sampling_offsets.view(
bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy)
sampling_locations = reference_points + sampling_offsets
bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape
assert num_all_points == num_points * num_Z_anchors
sampling_locations = sampling_locations.view(
bs, num_query, num_heads, num_levels, num_all_points, xy)
        elif reference_points.shape[-1] == 4:
            # reference boxes with (w, h) are not supported in this variant
            assert False
else:
raise ValueError(
f'Last dim of reference_points must be'
f' 2 or 4, but get {reference_points.shape[-1]} instead.')
# sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2
# attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points
#
        if torch.cuda.is_available() and value.is_cuda:
            # using fp16 deformable attention is unstable because it performs
            # many sum operations; the fp32 kernel is therefore used for
            # both dtypes
            if value.dtype == torch.float16:
                MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
            else:
                MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
output = MultiScaleDeformableAttnFunction.apply(
value, spatial_shapes, level_start_index, sampling_locations,
attention_weights, self.im2col_step)
else:
output = multi_scale_deformable_attn_pytorch(
value, spatial_shapes, sampling_locations, attention_weights)
if not self.batch_first:
output = output.permute(1, 0, 2)
return output
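

# A minimal CPU sketch (illustrative, not part of the original file; sizes
# are assumptions): without a CUDA tensor the module falls back to
# `multi_scale_deformable_attn_pytorch`.
if __name__ == '__main__':
    attn = MSDeformableAttention3D(
        embed_dims=256, num_heads=8, num_levels=1, num_points=8)
    bs, num_Z_anchors = 1, 4
    spatial_shapes = torch.tensor([[3, 4]])  # one level; num_value = 3 * 4
    level_start_index = torch.tensor([0])
    query = torch.rand(bs, 12, 256)  # batch_first=True by default
    # each BEV query owns num_Z_anchors projected 2D reference points
    reference_points = torch.rand(bs, 12, num_Z_anchors, 2)
    output = attn(query, reference_points=reference_points,
                  spatial_shapes=spatial_shapes,
                  level_start_index=level_start_index)
    assert output.shape == (bs, 12, 256)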
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch
import warnings
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init, constant_init
from mmcv.cnn.bricks.registry import ATTENTION
import math
from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
to_2tuple)
from mmcv.utils import ext_loader
ext_module = ext_loader.load_ext(
'_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward'])
@ATTENTION.register_module()
class TemporalSelfAttention(BaseModule):
"""An attention module used in BEVFormer based on Deformable-Detr.
`Deformable DETR: Deformable Transformers for End-to-End Object Detection.
<https://arxiv.org/pdf/2010.04159.pdf>`_.
Args:
embed_dims (int): The embedding dimension of Attention.
Default: 256.
        num_heads (int): Parallel attention heads. Default: 8.
num_levels (int): The number of feature map used in
Attention. Default: 4.
num_points (int): The number of sampling points for
each query in each head. Default: 4.
im2col_step (int): The step used in image_to_column.
Default: 64.
dropout (float): A Dropout layer on `inp_identity`.
Default: 0.1.
batch_first (bool): Key, Query and Value are shape of
(batch, n, embed_dim)
or (n, batch, embed_dim). Default to True.
norm_cfg (dict): Config dict for normalization layer.
Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
        num_bev_queue (int): In this version, we only use one history BEV
            and one current BEV, so the length of the BEV queue is 2.
            Default: 2.
"""
def __init__(self,
embed_dims=256,
num_heads=8,
num_levels=4,
num_points=4,
num_bev_queue=2,
im2col_step=64,
dropout=0.1,
batch_first=True,
norm_cfg=None,
init_cfg=None):
super().__init__(init_cfg)
if embed_dims % num_heads != 0:
raise ValueError(f'embed_dims must be divisible by num_heads, '
f'but got {embed_dims} and {num_heads}')
dim_per_head = embed_dims // num_heads
self.norm_cfg = norm_cfg
self.dropout = nn.Dropout(dropout)
self.batch_first = batch_first
self.fp16_enabled = False
# you'd better set dim_per_head to a power of 2
# which is more efficient in the CUDA implementation
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError(
'invalid input for _is_power_of_2: {} (type: {})'.format(
n, type(n)))
return (n & (n - 1) == 0) and n != 0
if not _is_power_of_2(dim_per_head):
warnings.warn(
"You'd better set embed_dims in "
'MultiScaleDeformAttention to make '
'the dimension of each attention head a power of 2 '
'which is more efficient in our CUDA implementation.')
self.im2col_step = im2col_step
self.embed_dims = embed_dims
self.num_levels = num_levels
self.num_heads = num_heads
self.num_points = num_points
self.num_bev_queue = num_bev_queue
self.sampling_offsets = nn.Linear(
embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2)
self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue,
num_bev_queue*num_heads * num_levels * num_points)
self.value_proj = nn.Linear(embed_dims, embed_dims)
self.output_proj = nn.Linear(embed_dims, embed_dims)
self.init_weights()
def init_weights(self):
"""Default initialization for Parameters of Module."""
constant_init(self.sampling_offsets, 0.)
thetas = torch.arange(
self.num_heads,
dtype=torch.float32) * (2.0 * math.pi / self.num_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init /
grid_init.abs().max(-1, keepdim=True)[0]).view(
self.num_heads, 1, 1,
2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1)
for i in range(self.num_points):
grid_init[:, :, i, :] *= i + 1
self.sampling_offsets.bias.data = grid_init.view(-1)
constant_init(self.attention_weights, val=0., bias=0.)
xavier_init(self.value_proj, distribution='uniform', bias=0.)
xavier_init(self.output_proj, distribution='uniform', bias=0.)
self._is_init = True
def forward(self,
query,
key=None,
value=None,
identity=None,
query_pos=None,
key_padding_mask=None,
reference_points=None,
spatial_shapes=None,
level_start_index=None,
flag='decoder',
**kwargs):
"""Forward Function of MultiScaleDeformAttention.
Args:
            query (Tensor): Query of Transformer with shape
                (bs, num_query, embed_dims).
            key (Tensor): The key tensor with shape
                `(bs, num_key, embed_dims)`.
            value (Tensor): The value tensor with shape
                `(bs*num_bev_queue, num_key, embed_dims)`.
identity (Tensor): The tensor used for addition, with the
same shape as `query`. Default None. If None,
`query` will be used.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`. Default
None.
reference_points (Tensor): The normalized reference
points with shape (bs, num_query, num_levels, 2),
all elements is range in [0, 1], top-left (0,0),
bottom-right (1, 1), including padding area.
or (N, Length_{query}, num_levels, 4), add
additional two dimensions is (w, h) to
form reference boxes.
key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_key].
spatial_shapes (Tensor): Spatial shape of features in
different levels. With shape (num_levels, 2),
last dimension represents (h, w).
level_start_index (Tensor): The start index of each level.
A tensor has shape ``(num_levels, )`` and can be represented
as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...].
        Returns:
            Tensor: forwarded results with shape (bs, num_query, embed_dims)
                if `batch_first` is True, else (num_query, bs, embed_dims).
"""
if value is None:
assert self.batch_first
bs, len_bev, c = query.shape
value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c)
# value = torch.cat([query, query], 0)
if identity is None:
identity = query
if query_pos is not None:
query = query + query_pos
if not self.batch_first:
# change to (bs, num_query ,embed_dims)
query = query.permute(1, 0, 2)
value = value.permute(1, 0, 2)
bs, num_query, embed_dims = query.shape
_, num_value, _ = value.shape
assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
assert self.num_bev_queue == 2
query = torch.cat([value[::2], query], -1)
value_ = value.clone()
value_[:bs] = value[::2]
value_[bs:] = value[1::2]
        value = self.value_proj(value)
if key_padding_mask is not None:
value = value.masked_fill(key_padding_mask[..., None], 0.0)
value = value.reshape(bs*self.num_bev_queue,
num_value, self.num_heads, -1)
sampling_offsets = self.sampling_offsets(query)
sampling_offsets = sampling_offsets.view(
bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2)
attention_weights = self.attention_weights(query).view(
bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points)
attention_weights = attention_weights.softmax(-1)
attention_weights = attention_weights.view(bs, num_query,
self.num_heads,
self.num_bev_queue,
self.num_levels,
self.num_points)
attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\
.reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous()
sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\
.reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2)
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack(
[spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets \
/ offset_normalizer[None, None, None, :, None, :]
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.num_points \
* reference_points[:, :, None, :, None, 2:] \
* 0.5
else:
raise ValueError(
f'Last dim of reference_points must be'
f' 2 or 4, but get {reference_points.shape[-1]} instead.')
if torch.cuda.is_available() and value.is_cuda:
# using fp16 deformable attention is unstable because it performs many sum operations
if value.dtype == torch.float16:
MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
else:
MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32
output = MultiScaleDeformableAttnFunction.apply(
value, spatial_shapes, level_start_index, sampling_locations,
attention_weights, self.im2col_step)
else:
output = multi_scale_deformable_attn_pytorch(
value, spatial_shapes, sampling_locations, attention_weights)
# output shape (bs*num_bev_queue, num_query, embed_dims)
# (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue)
output = output.permute(1, 2, 0)
# fuse history value and current value
# (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue)
output = output.view(num_query, embed_dims, bs, self.num_bev_queue)
output = output.mean(-1)
# (num_query, embed_dims, bs)-> (bs, num_query, embed_dims)
output = output.permute(2, 0, 1)
output = self.output_proj(output)
if not self.batch_first:
output = output.permute(1, 0, 2)
return self.dropout(output) + identity
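

# A minimal CPU sketch (illustrative, not part of the original file; sizes
# are assumptions): when `value` is None, the current BEV queries stand in
# for both entries of the (history, current) BEV queue.
if __name__ == '__main__':
    attn = TemporalSelfAttention(
        embed_dims=64, num_heads=4, num_levels=1, num_points=4)
    bs, bev_h, bev_w = 1, 3, 4
    num_query = bev_h * bev_w
    spatial_shapes = torch.tensor([[bev_h, bev_w]])
    level_start_index = torch.tensor([0])
    query = torch.rand(bs, num_query, 64)
    # reference points are stacked for both entries of the BEV queue
    reference_points = torch.rand(bs * 2, num_query, 1, 2)
    output = attn(query, reference_points=reference_points,
                  spatial_shapes=spatial_shapes,
                  level_start_index=level_start_index)
    assert output.shape == (bs, num_query, 64)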
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from mmcv.cnn import xavier_init
from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence
from mmcv.runner.base_module import BaseModule
from mmdet.models.utils.builder import TRANSFORMER
from torch.nn.init import normal_
from torchvision.transforms.functional import rotate
from .temporal_self_attention import TemporalSelfAttention
from .spatial_cross_attention import MSDeformableAttention3D
from .decoder import CustomMSDeformableAttention
from mmcv.runner import force_fp32, auto_fp16
@TRANSFORMER.register_module()
class PerceptionTransformer(BaseModule):
"""Implements the Detr3D transformer.
Args:
as_two_stage (bool): Generate query from encoder features.
Default: False.
num_feature_levels (int): Number of feature maps from FPN:
Default: 4.
two_stage_num_proposals (int): Number of proposals when set
`as_two_stage` as True. Default: 300.
"""
def __init__(self,
decoder=None,
embed_dims=256,
**kwargs):
super(PerceptionTransformer, self).__init__(**kwargs)
self.decoder = build_transformer_layer_sequence(decoder)
self.embed_dims = embed_dims
self.fp16_enabled = False
self.init_layers()
def init_layers(self):
"""Initialize layers of the Detr3DTransformer."""
self.reference_points = nn.Linear(self.embed_dims, 3)
def init_weights(self):
"""Initialize the transformer weights."""
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
for m in self.modules():
if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \
or isinstance(m, CustomMSDeformableAttention):
try:
m.init_weight()
except AttributeError:
m.init_weights()
xavier_init(self.reference_points, distribution='uniform', bias=0.)
@auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos'))
def forward(self,
mlvl_feats,
bev_embed,
object_query_embed,
bev_h,
bev_w,
reg_branches=None,
cls_branches=None,
**kwargs):
"""Forward function for `Detr3DTransformer`.
Args:
mlvl_feats (list(Tensor)): Input queries from
different level. Each element has shape
[bs, num_cams, embed_dims, h, w].
bev_queries (Tensor): (bev_h*bev_w, c)
bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w)
object_query_embed (Tensor): The query embedding for decoder,
with shape [num_query, c].
reg_branches (obj:`nn.ModuleList`): Regression heads for
feature maps from each decoder layer. Only would
be passed when `with_box_refine` is True. Default to None.
Returns:
tuple[Tensor]: results of decoder containing the following tensor.
- bev_embed: BEV features
- inter_states: Outputs from decoder. If
return_intermediate_dec is True output has shape \
(num_dec_layers, bs, num_query, embed_dims), else has \
shape (1, bs, num_query, embed_dims).
- init_reference_out: The initial value of reference \
points, has shape (bs, num_queries, 4).
- inter_references_out: The internal value of reference \
points in decoder, has shape \
(num_dec_layers, bs,num_query, embed_dims)
- enc_outputs_class: The classification score of \
proposals generated from \
encoder's feature maps, has shape \
(batch, h*w, num_classes). \
Only would be returned when `as_two_stage` is True, \
otherwise None.
- enc_outputs_coord_unact: The regression results \
generated from encoder's feature maps., has shape \
(batch, h*w, 4). Only would \
be returned when `as_two_stage` is True, \
otherwise None.
"""
bs = mlvl_feats[0].size(0)
query_pos, query = torch.split(
object_query_embed, self.embed_dims, dim=1)
query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
query = query.unsqueeze(0).expand(bs, -1, -1)
reference_points = self.reference_points(query_pos)
reference_points = reference_points.sigmoid()
init_reference_out = reference_points
query = query.permute(1, 0, 2)
query_pos = query_pos.permute(1, 0, 2)
bev_embed = bev_embed.permute(1, 0, 2)
inter_states, inter_references = self.decoder(
query=query,
key=None,
value=bev_embed,
query_pos=query_pos,
reference_points=reference_points,
reg_branches=reg_branches,
cls_branches=cls_branches,
spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device),
level_start_index=torch.tensor([0], device=query.device),
**kwargs)
inter_references_out = inter_references
return inter_states, init_reference_out, inter_references_out
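

# Illustrative sketch of the query preparation in forward() above (assumed
# sizes, not part of the original file): the learned embedding splits into
# positional and content halves, and 3D reference points are regressed from
# the positional half.
if __name__ == '__main__':
    embed_dims, num_query, bs = 256, 50, 2
    object_query_embed = torch.rand(num_query, embed_dims * 2)
    query_pos, query = torch.split(object_query_embed, embed_dims, dim=1)
    query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
    reference_points = nn.Linear(embed_dims, 3)(query_pos).sigmoid()
    assert reference_points.shape == (bs, num_query, 3)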
from .custom_fpn import *
from .custom_ipm_view_transformer import *
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_fpn.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS
@NECKS.register_module()
class CustomFPN(BaseModule):
r"""
Notes
-----
Adapted from https://github.com/HuangJunJie2017/BEVDet/blob/dev2.0/mmdet3d/models/necks/fpn.py#L11.
Feature Pyramid Network.
This is an implementation of paper `Feature Pyramid Networks for Object
Detection <https://arxiv.org/abs/1612.03144>`_.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
num_outs (int): Number of output scales.
start_level (int): Index of the start input backbone level used to
build the feature pyramid. Default: 0.
end_level (int): Index of the end input backbone level (exclusive) to
build the feature pyramid. Default: -1, which means the last level.
add_extra_convs (bool | str): If bool, it decides whether to add conv
layers on top of the original feature maps. Default to False.
If True, it is equivalent to `add_extra_convs='on_input'`.
If str, it specifies the source feature map of the extra convs.
Only the following options are allowed
- 'on_input': Last feat map of neck inputs (i.e. backbone feature).
- 'on_lateral': Last feature map after lateral convs.
- 'on_output': The last output feature map after fpn convs.
relu_before_extra_convs (bool): Whether to apply relu before the extra
conv. Default: False.
no_norm_on_lateral (bool): Whether to apply norm on lateral.
Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (str): Config dict for activation layer in ConvModule.
Default: None.
upsample_cfg (dict): Config dict for interpolate layer.
Default: `dict(mode='nearest')`
init_cfg (dict or list[dict], optional): Initialization config dict.
    Example (inherited from the base FPN; note that ``CustomFPN.forward``
    returns only the map(s) selected by ``out_ids``):
>>> import torch
>>> in_channels = [2, 3, 5, 7]
>>> scales = [340, 170, 84, 43]
>>> inputs = [torch.rand(1, c, s, s)
... for c, s in zip(in_channels, scales)]
>>> self = FPN(in_channels, 11, len(in_channels)).eval()
>>> outputs = self.forward(inputs)
>>> for i in range(len(outputs)):
... print(f'outputs[{i}].shape = {outputs[i].shape}')
outputs[0].shape = torch.Size([1, 11, 340, 340])
outputs[1].shape = torch.Size([1, 11, 170, 170])
outputs[2].shape = torch.Size([1, 11, 84, 84])
outputs[3].shape = torch.Size([1, 11, 43, 43])
"""
def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
out_ids=[],
add_extra_convs=False,
relu_before_extra_convs=False,
no_norm_on_lateral=False,
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
upsample_cfg=dict(mode='nearest'),
init_cfg=dict(
type='Xavier', layer='Conv2d', distribution='uniform')):
super(CustomFPN, self).__init__(init_cfg)
assert isinstance(in_channels, list)
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.relu_before_extra_convs = relu_before_extra_convs
self.no_norm_on_lateral = no_norm_on_lateral
self.fp16_enabled = False
self.upsample_cfg = upsample_cfg.copy()
self.out_ids = out_ids
if end_level == -1:
self.backbone_end_level = self.num_ins
# assert num_outs >= self.num_ins - start_level
else:
# if end_level < inputs, no extra level is allowed
self.backbone_end_level = end_level
assert end_level <= len(in_channels)
assert num_outs == end_level - start_level
self.start_level = start_level
self.end_level = end_level
self.add_extra_convs = add_extra_convs
assert isinstance(add_extra_convs, (str, bool))
if isinstance(add_extra_convs, str):
# Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
elif add_extra_convs: # True
self.add_extra_convs = 'on_input'
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for i in range(self.start_level, self.backbone_end_level):
l_conv = ConvModule(
in_channels[i],
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
act_cfg=act_cfg,
inplace=False)
self.lateral_convs.append(l_conv)
if i in self.out_ids:
fpn_conv = ConvModule(
out_channels,
out_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(fpn_conv)
# add extra conv layers (e.g., RetinaNet)
extra_levels = num_outs - self.backbone_end_level + self.start_level
if self.add_extra_convs and extra_levels >= 1:
for i in range(extra_levels):
if i == 0 and self.add_extra_convs == 'on_input':
in_channels = self.in_channels[self.backbone_end_level - 1]
else:
in_channels = out_channels
extra_fpn_conv = ConvModule(
in_channels,
out_channels,
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(extra_fpn_conv)
def forward(self, inputs):
"""Forward function."""
assert len(inputs) == len(self.in_channels)
# build laterals
laterals = [
lateral_conv(inputs[i + self.start_level])
for i, lateral_conv in enumerate(self.lateral_convs)
]
# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
# In some cases, fixing `scale factor` (e.g. 2) is preferred, but
# it cannot co-exist with `size` in `F.interpolate`.
if 'scale_factor' in self.upsample_cfg:
laterals[i - 1] += F.interpolate(laterals[i],
**self.upsample_cfg)
else:
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] += F.interpolate(
laterals[i], size=prev_shape, **self.upsample_cfg)
# build outputs
# part 1: from original levels
outs = [self.fpn_convs[i](laterals[i]) for i in self.out_ids]
# part 2: add extra levels
if self.num_outs > len(outs):
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
if not self.add_extra_convs:
for i in range(self.num_outs - used_backbone_levels):
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
# add conv layers on top of original feature maps (RetinaNet)
else:
if self.add_extra_convs == 'on_input':
extra_source = inputs[self.backbone_end_level - 1]
elif self.add_extra_convs == 'on_lateral':
extra_source = laterals[-1]
elif self.add_extra_convs == 'on_output':
extra_source = outs[-1]
else:
raise NotImplementedError
outs.append(self.fpn_convs[used_backbone_levels](extra_source))
for i in range(used_backbone_levels + 1, self.num_outs):
if self.relu_before_extra_convs:
outs.append(self.fpn_convs[i](F.relu(outs[-1])))
else:
outs.append(self.fpn_convs[i](outs[-1]))
        # unlike the standard FPN, return only the first selected output map
        return outs[0]
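

# A minimal sketch (illustrative, not part of the original file; channel
# sizes are assumptions): with out_ids=[0] and num_outs=1, CustomFPN returns
# the single highest-resolution output map instead of a tuple.
if __name__ == '__main__':
    import torch
    neck = CustomFPN(in_channels=[256, 512], out_channels=64,
                     num_outs=1, start_level=0, out_ids=[0])
    feats = [torch.rand(1, 256, 32, 32), torch.rand(1, 512, 16, 16)]
    out = neck(feats)
    assert out.shape == (1, 64, 32, 32)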
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# custom_ipm_view_transformer.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issue.
#
# Copyright (c) 2023 The OpenLane-v2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.runner import BaseModule
from mmdet3d.models import NECKS
def get_campos(reference_points, ego2cam, img_shape):
    '''
    Find each reference point's corresponding pixel in each camera.
    Args:
        reference_points: [B, num_query, 3]
        ego2cam: (B, num_cam, 4, 4)
        img_shape: (H, W) of the input images
    Outs:
        reference_points_cam: (B*num_cam, num_query, 2)
        mask: (B, num_cam, num_query)
        num_query == W*H
    '''
ego2cam = reference_points.new_tensor(ego2cam) # (B, N, 4, 4)
reference_points = reference_points.clone()
B, num_query = reference_points.shape[:2]
num_cam = ego2cam.shape[1]
# reference_points (B, num_queries, 4)
reference_points = torch.cat(
(reference_points, torch.ones_like(reference_points[..., :1])), -1)
reference_points = reference_points.view(
B, 1, num_query, 4).repeat(1, num_cam, 1, 1).unsqueeze(-1)
ego2cam = ego2cam.view(
B, num_cam, 1, 4, 4).repeat(1, 1, num_query, 1, 1)
# reference_points_cam (B, num_cam, num_queries, 4)
reference_points_cam = (ego2cam @ reference_points).squeeze(-1)
    eps = 1e-9
    mask = (reference_points_cam[..., 2:3] > eps)
    # add eps to the denominator to avoid division by zero
    reference_points_cam = reference_points_cam[..., 0:2] / \
        (reference_points_cam[..., 2:3] + eps)
reference_points_cam[..., 0] /= img_shape[1]
reference_points_cam[..., 1] /= img_shape[0]
# from 0~1 to -1~1
reference_points_cam = (reference_points_cam - 0.5) * 2
mask = (mask & (reference_points_cam[..., 0:1] > -1.0)
& (reference_points_cam[..., 0:1] < 1.0)
& (reference_points_cam[..., 1:2] > -1.0)
& (reference_points_cam[..., 1:2] < 1.0))
# (B, num_cam, num_query)
mask = mask.view(B, num_cam, num_query)
reference_points_cam = reference_points_cam.view(B*num_cam, num_query, 2)
return reference_points_cam, mask
def construct_plane_grid(xbound, ybound, height: float, dtype=torch.float32):
    '''
    Build a horizontal grid of 3D points at the given height.
    Args:
        xbound, ybound: (min, max, step) of the grid in ego coordinates
    Returns:
        plane: H, W, 3
    '''
xmin, xmax = xbound[0], xbound[1]
num_x = int((xbound[1] - xbound[0]) / xbound[2])
ymin, ymax = ybound[0], ybound[1]
num_y = int((ybound[1] - ybound[0]) / ybound[2])
x = torch.linspace(xmin, xmax, num_x, dtype=dtype)
y = torch.linspace(ymin, ymax, num_y, dtype=dtype)
# [num_y, num_x]
y, x = torch.meshgrid(y, x)
z = torch.ones_like(x) * height
# [num_y, num_x, 3]
plane = torch.stack([x, y, z], dim=-1)
return plane
@NECKS.register_module()
class CustomIPMViewTransformer(BaseModule):
r"""
Notes
-----
Adapted from https://github.com/Mrmoore98/VectorMapNet_code/blob/mian/plugin/models/backbones/ipm_backbone.py#L238.
"""
def __init__(self,
num_cam,
xbound,
ybound,
zbound,
out_channels,
):
super().__init__()
self.x_bound = xbound
self.y_bound = ybound
heights = [zbound[0]+i*zbound[2] for i in range(int((zbound[1]-zbound[0])//zbound[2])+1)]
self.heights = heights
self.num_cam = num_cam
self.outconvs =\
nn.Conv2d((out_channels+3)*len(heights), out_channels,
kernel_size=3, stride=1, padding=1) # same
# bev_plane
bev_planes = [construct_plane_grid(
xbound, ybound, h) for h in self.heights]
        self.register_buffer('bev_planes', torch.stack(
            bev_planes))  # nlvl, bH, bW, 3
def forward(self, cam_feat, ego2cam, img_shape):
        '''
        Inverse-project multi-camera features onto the BEV planes.
        Args:
            cam_feat: B*ncam, C, cH, cW
            ego2cam: B, ncam, 4, 4
            img_shape: tuple(H, W)
        Returns:
            bev_feat: B, C, bH, bW
        '''
B = ego2cam.shape[0]
C = cam_feat.shape[1]
bev_grid = self.bev_planes.unsqueeze(0).repeat(B, 1, 1, 1, 1)
nlvl, bH, bW = bev_grid.shape[1:4]
bev_grid = bev_grid.flatten(1, 3) # B, nlvl*W*H, 3
# Find points in cam coords
# bev_grid_pos: B*ncam, nlvl*bH*bW, 2
bev_grid_pos, bev_cam_mask = get_campos(bev_grid, ego2cam, img_shape)
# B*cam, nlvl*bH, bW, 2
bev_grid_pos = bev_grid_pos.unflatten(-2, (nlvl*bH, bW))
# project feat from 2D to bev plane
projected_feature = F.grid_sample(
cam_feat, bev_grid_pos).view(B, -1, C, nlvl, bH, bW) # B,cam,C,nlvl,bH,bW
# B,cam,nlvl,bH,bW
bev_feat_mask = bev_cam_mask.unflatten(-1, (nlvl, bH, bW))
# eliminate the ncam
# The bev feature is the sum of the 6 cameras
bev_feat_mask = bev_feat_mask.unsqueeze(2)
projected_feature = (projected_feature*bev_feat_mask).sum(1)
num_feat = bev_feat_mask.sum(1)
projected_feature = projected_feature / \
num_feat.masked_fill(num_feat == 0, 1)
        # concatenate position information (the BEV grid coordinates)
        # projected_feature: B, C+3, nlvl, bH, bW
bev_grid = bev_grid.view(B, nlvl, bH, bW,
3).permute(0, 4, 1, 2, 3)
projected_feature = torch.cat(
(projected_feature, bev_grid), dim=1)
bev_feat, bev_feat_mask = projected_feature, bev_feat_mask.sum(1) > 0
        # merge the multi-height levels into a single BEV feature map
bev_feat = bev_feat.flatten(1, 2)
bev_feat = self.outconvs(bev_feat)
return bev_feat
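

# Illustrative sketch of the helpers above (assumed bounds and identity
# extrinsics, not part of the original file).
if __name__ == '__main__':
    plane = construct_plane_grid(
        xbound=[-5.0, 5.0, 1.0], ybound=[-2.0, 2.0, 1.0], height=0.0)
    assert plane.shape == (4, 10, 3)  # (num_y, num_x, xyz)
    reference_points = torch.rand(1, 6, 3) + torch.tensor([0.0, 0.0, 1.0])
    ego2cam = torch.eye(4).repeat(1, 2, 1, 1).tolist()  # two identity cams
    ref_cam, mask = get_campos(reference_points, ego2cam, img_shape=(64, 128))
    assert ref_cam.shape == (2, 6, 2) and mask.shape == (1, 2, 6)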
custom_imports = dict(imports=['projects.openlanev2.baseline'])
method_para = dict(n_control=5)  # number of control points for each curve
_dim_ = 128
model = dict(
type='Baseline',
img_backbone=dict(
type='ResNet',
depth=18,
num_stages=4,
out_indices=(2, 3),
frozen_stages=-1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=False,
with_cp=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18')),
img_neck=dict(
type='CustomFPN',
in_channels=[_dim_*2, _dim_*4],
out_channels=_dim_,
num_outs=1,
start_level=0,
out_ids=[0]),
img_view_transformer=dict(
type='CustomIPMViewTransformer',
num_cam=7,
xbound=[-50.0, 50.0, 1.0],
ybound=[-25.0, 25.0, 1.0],
zbound=[-3.0, 2.0, 0.5],
out_channels=_dim_),
lc_head=dict(
type='CustomDETRHead',
num_classes=1,
in_channels=_dim_,
num_query=50,
object_type='lane',
num_layers=1,
num_reg_dim=method_para['n_control']*3,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=2.5),
loss_iou=dict(type='GIoULoss', loss_weight=0.0), # dummy
train_cfg=dict(
assigner=dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.0),
reg_cost=dict(type='LaneL1Cost', weight=2.5),
iou_cost=dict(type='IoUCost', weight=0.0))), # dummy
bev_range=[-50.0, -25.0, -3.0, 50.0, 25.0, 2.0]),
te_head=dict(
type='CustomDETRHead',
num_classes=13,
in_channels=_dim_,
num_query=30,
object_type='bbox',
num_layers=1,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=2.5),
loss_iou=dict(type='GIoULoss', loss_weight=1.0),
train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.0),
reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)))),
lclc_head=dict(
type='TopologyHead',
in_channels=128,
hidden_channels=_dim_,
out_channels=1,
num_layers=3,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0)),
lcte_head=dict(
type='TopologyHead',
in_channels=128,
hidden_channels=_dim_,
out_channels=1,
num_layers=3,
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=2.0)))
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomParameterizeLane', method='bezier_Endpointfixed', method_para=method_para),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
'gt_lc', 'gt_lc_labels',
'gt_te', 'gt_te_labels',
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
test_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'OpenLane-V2/data/OpenLane-V2'
meta_root = 'OpenLane-V2/data/OpenLane-V2'
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_train',
pipeline=train_pipeline,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'))
optimizer = dict(
type='AdamW',
lr=1e-4,
weight_decay=1e-4)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
runner = dict(type='EpochBasedRunner', max_epochs=20)
evaluation = dict(interval=1, pipeline=test_pipeline)
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
# yapf:disable
log_config = dict(
interval=10,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
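
# A loading sketch for a config file like this one (illustrative; the path
# is an assumption, check the repo for the actual file name):
#   from mmcv import Config
#   cfg = Config.fromfile('projects/openlanev2/baseline/config.py')
#   print(cfg.model['type'])  # 'Baseline'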
custom_imports = dict(imports=['plugin.mmdet3d.baseline'])
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -25.6, -2.3, 51.2, 25.6, 1.7]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
class_names = ['centerline']
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5)  # number of control points for each curve
code_size = 3 * method_para['n_control']
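# i.e. each lane is parameterized by n_control = 5 Bezier control points in
# 3D, so code_size = 3 * 5 = 15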
_dim_ = 256
_pos_dim_ = _dim_//2
_ffn_dim_ = _dim_*2
_ffn_cfg_ = dict(
    type='FFN',
    embed_dims=_dim_,
    feedforward_channels=_ffn_dim_,
    num_fcs=2,
    ffn_drop=0.1,
    act_cfg=dict(type='ReLU', inplace=True),
)
_num_levels_ = 4
bev_h_ = 100
bev_w_ = 200
model = dict(
type='ROAD_BEVFormer',
video_test_mode=False,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='pytorch',
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
img_neck=dict(
type='FPN',
in_channels=[512, 1024, 2048],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=_num_levels_,
relu_before_extra_convs=True),
bev_constructor=dict(
type='BEVFormerConstructer',
num_feature_levels=_num_levels_,
num_cams=num_cams,
embed_dims=_dim_,
rotate_prev_bev=True,
use_shift=True,
use_can_bus=True,
pc_range=point_cloud_range,
bev_h=bev_h_,
bev_w=bev_w_,
rotate_center=[bev_h_//2, bev_w_//2],
encoder=dict(
type='BEVFormerEncoder',
num_layers=3,
pc_range=point_cloud_range,
num_points_in_pillar=4,
return_intermediate=False,
transformerlayers=dict(
type='BEVFormerLayer',
attn_cfgs=[
dict(
type='TemporalSelfAttention',
embed_dims=_dim_,
num_levels=1),
dict(
type='SpatialCrossAttention',
embed_dims=_dim_,
num_cams=num_cams,
pc_range=point_cloud_range,
deformable_attention=dict(
type='MSDeformableAttention3D',
embed_dims=_dim_,
num_points=8,
num_levels=_num_levels_)
)
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
row_num_embed=bev_h_,
col_num_embed=bev_w_),
),
bbox_head=dict(
type='TEDeformableDETRHead',
num_query=100,
num_classes=13,
in_channels=_dim_,
sync_cls_avg_factor=True,
with_box_refine=True,
as_two_stage=False,
transformer=dict(
type='DeformableDetrTransformer',
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention', embed_dims=_dim_),
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DeformableDetrTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='CustomDetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='MultiScaleDeformableAttention',
embed_dims=_dim_)
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=_pos_dim_,
normalize=True,
offset=-0.5),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=2.5),
loss_iou=dict(type='GIoULoss', loss_weight=1.0),
test_cfg=dict(max_per_img=50)),
pts_bbox_head=dict(
type='LCDeformableDETRHead',
num_classes=1,
in_channels=_dim_,
num_query=100,
bev_h=bev_h_,
bev_w=bev_w_,
sync_cls_avg_factor=False,
with_box_refine=False,
with_shared_param=False,
code_size=code_size,
        code_weights=[1.0 for _ in range(code_size)],
pc_range=point_cloud_range,
transformer=dict(
type='PerceptionTransformer',
embed_dims=_dim_,
decoder=dict(
type='LaneDetectionTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='CustomDetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1),
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.5),
loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
lclc_head=dict(
type='RelationshipHead',
in_channels_o1=_dim_,
in_channels_o2=_dim_,
shared_param=False,
loss_rel=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=5)),
lcte_head=dict(
type='RelationshipHead',
in_channels_o1=_dim_,
in_channels_o2=_dim_,
shared_param=False,
loss_rel=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=5)),
# model training and testing settings
bbox_train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.0),
reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
train_cfg=dict(pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.5),
reg_cost=dict(type='LaneL1Cost', weight=0.0075),
iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head.
))))
train_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomParameterizeLane', method='bezier_Endpointfixed', method_para=method_para),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
'gt_lc', 'gt_lc_labels',
'gt_te', 'gt_te_labels',
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
test_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'
data = dict(
samples_per_gpu=1,
workers_per_gpu=8,
train=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_train',
pipeline=train_pipeline,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'))
optimizer = dict(
type='AdamW',
lr=2e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
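# Cosine annealing decays the lr from 2e-4 toward 2e-7 (min_lr_ratio=1e-3)
# over 24 epochs, after a 500-iteration linear warmup that starts at lr/3.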
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook')
])
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
custom_imports = dict(imports=['plugin.mmdet3d.baseline'])
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -25.6, -2.3, 51.2, 25.6, 1.7]
voxel_size = [0.2, 0.2, 8]
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
class_names = ['centerline']
input_modality = dict(
use_lidar=False,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
num_cams = 7
Map_size = [(-50, 50), (-25, 25)]
method_para = dict(n_control=5)  # number of control points per curve
code_size = 3 * method_para['n_control']
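# Each lane query regresses n_control 3D control points (x, y, z),
# hence 3 * n_control targets.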
_dim_ = 256
_pos_dim_ = _dim_//2
_ffn_dim_ = _dim_*2
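# _pos_dim_ is per-axis: LearnedPositionalEncoding concatenates row and
# column embeddings of _pos_dim_ channels each, so the result matches _dim_.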
_ffn_cfg_ = dict(
type='FFN',
embed_dims=_dim_,
feedforward_channels=_ffn_dim_,
num_fcs=2,
ffn_drop=0.1,
act_cfg=dict(type='ReLU', inplace=True),
)
_num_levels_ = 4
bev_h_ = 100
bev_w_ = 200
pretrained = 'https://huggingface.co/OpenGVLab/InternImage/resolve/main/internimage_s_1k_224.pth'
model = dict(
type='ROAD_BEVFormer',
video_test_mode=False,
img_backbone=dict(
type='InternImage',
core_op='DCNv3',
channels=80,
depths=[4, 4, 21, 4],
groups=[5, 10, 20, 40],
mlp_ratio=4.,
drop_path_rate=0.3,
norm_layer='LN',
layer_scale=1.0,
offset_scale=1.0,
post_norm=True,
with_cp=False,
out_indices=(0, 1, 2, 3),
init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
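    # InternImage-S doubles its width each stage (80/160/320/640 channels),
    # which the FPN below consumes to emit _num_levels_ feature maps.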
img_neck=dict(
type='FPN',
in_channels=[80, 160, 320, 640],
out_channels=_dim_,
start_level=0,
add_extra_convs='on_output',
num_outs=_num_levels_,
relu_before_extra_convs=True),
bev_constructor=dict(
type='BEVFormerConstructer',
num_feature_levels=_num_levels_,
num_cams=num_cams,
embed_dims=_dim_,
rotate_prev_bev=True,
use_shift=True,
use_can_bus=True,
pc_range=point_cloud_range,
bev_h=bev_h_,
bev_w=bev_w_,
rotate_center=[bev_h_//2, bev_w_//2],
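        # With rotate_prev_bev, the previous frame's BEV features are rotated
        # about the grid center (using can_bus ego motion) before temporal
        # self-attention fuses them with the current frame.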
encoder=dict(
type='BEVFormerEncoder',
num_layers=3,
pc_range=point_cloud_range,
num_points_in_pillar=4,
return_intermediate=False,
transformerlayers=dict(
type='BEVFormerLayer',
attn_cfgs=[
dict(
type='TemporalSelfAttention',
embed_dims=_dim_,
num_levels=1),
dict(
type='SpatialCrossAttention',
embed_dims=_dim_,
num_cams=num_cams,
pc_range=point_cloud_range,
deformable_attention=dict(
type='MSDeformableAttention3D',
embed_dims=_dim_,
num_points=8,
num_levels=_num_levels_)
)
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm'))),
positional_encoding=dict(
type='LearnedPositionalEncoding',
num_feats=_pos_dim_,
row_num_embed=bev_h_,
col_num_embed=bev_w_),
),
bbox_head=dict(
type='TEDeformableDETRHead',
num_query=100,
num_classes=13,
in_channels=_dim_,
sync_cls_avg_factor=True,
with_box_refine=True,
as_two_stage=False,
transformer=dict(
type='DeformableDetrTransformer',
encoder=dict(
type='DetrTransformerEncoder',
num_layers=6,
transformerlayers=dict(
type='BaseTransformerLayer',
attn_cfgs=dict(
type='MultiScaleDeformableAttention', embed_dims=_dim_),
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
decoder=dict(
type='DeformableDetrTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='CustomDetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='MultiScaleDeformableAttention',
embed_dims=_dim_)
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
positional_encoding=dict(
type='SinePositionalEncoding',
num_feats=_pos_dim_,
normalize=True,
offset=-0.5),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=2.5),
loss_iou=dict(type='GIoULoss', loss_weight=1.0),
test_cfg=dict(max_per_img=50)),
pts_bbox_head=dict(
type='LCDeformableDETRHead',
num_classes=1,
in_channels=_dim_,
num_query=100,
bev_h=bev_h_,
bev_w=bev_w_,
sync_cls_avg_factor=False,
with_box_refine=False,
with_shared_param=False,
code_size=code_size,
        code_weights=[1.0 for _ in range(code_size)],
pc_range=point_cloud_range,
transformer=dict(
type='PerceptionTransformer',
embed_dims=_dim_,
decoder=dict(
type='LaneDetectionTransformerDecoder',
num_layers=6,
return_intermediate=True,
transformerlayers=dict(
type='CustomDetrTransformerDecoderLayer',
attn_cfgs=[
dict(
type='MultiheadAttention',
embed_dims=_dim_,
num_heads=8,
dropout=0.1),
dict(
type='CustomMSDeformableAttention',
embed_dims=_dim_,
num_levels=1),
],
ffn_cfgs=_ffn_cfg_,
operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
'ffn', 'norm')))),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.5),
loss_bbox=dict(type='L1Loss', loss_weight=0.0075),
loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
lclc_head=dict(
type='RelationshipHead',
in_channels_o1=_dim_,
in_channels_o2=_dim_,
shared_param=False,
loss_rel=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=5)),
lcte_head=dict(
type='RelationshipHead',
in_channels_o1=_dim_,
in_channels_o2=_dim_,
shared_param=False,
loss_rel=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=5)),
# model training and testing settings
bbox_train_cfg=dict(
assigner=dict(
type='HungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.0),
reg_cost=dict(type='BBoxL1Cost', weight=2.5, box_format='xywh'),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0))),
train_cfg=dict(pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=4,
assigner=dict(
type='LaneHungarianAssigner',
cls_cost=dict(type='FocalLossCost', weight=1.5),
reg_cost=dict(type='LaneL1Cost', weight=0.0075),
            iou_cost=dict(type='IoUCost', weight=0.0), # Zero-weighted placeholder cost, present only for interface compatibility with the DETR head.
))))
train_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='PhotoMetricDistortionMultiViewImage'),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomParameterizeLane', method='bezier_Endpointfixed', method_para=method_para),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
'gt_lc', 'gt_lc_labels',
'gt_te', 'gt_te_labels',
'gt_topology_lclc', 'gt_topology_lcte',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
test_pipeline = [
dict(type='CustomLoadMultiViewImageFromFiles', to_float32=True),
dict(type='NormalizeMultiviewImage', **img_norm_cfg),
dict(type='ResizeFrontView'),
dict(type='CustomPadMultiViewImage', size_divisor=32),
dict(type='CustomDefaultFormatBundle'),
dict(
type='Collect',
keys=[
'img',
],
meta_keys=[
'scene_token', 'sample_idx', 'img_paths',
'img_shape', 'scale_factor', 'pad_shape',
'lidar2img', 'can_bus',
],
)
]
dataset_type = 'OpenLaneV2SubsetADataset'
data_root = 'data/OpenLane-V2'
meta_root = 'data/OpenLane-V2'
data = dict(
samples_per_gpu=1,
workers_per_gpu=8,
train=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_train',
pipeline=train_pipeline,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
meta_root=meta_root,
collection='data_dict_subset_A_val',
pipeline=test_pipeline,
test_mode=True),
shuffler_sampler=dict(type='DistributedGroupSampler'),
nonshuffler_sampler=dict(type='DistributedSampler'))
optimizer = dict(
type='AdamW',
lr=2e-4,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
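            # the pretrained backbone fine-tunes at 0.1x the global lr
            # (an effective 2e-5)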
}),
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=500,
warmup_ratio=1.0 / 3,
min_lr_ratio=1e-3)
total_epochs = 24
evaluation = dict(interval=1, pipeline=test_pipeline)
runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook')
])
checkpoint_config = dict(interval=1, max_keep_ckpts=1)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
\ No newline at end of file
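For orientation, a minimal sketch of how an mmdet3d-style config such as the ones above is typically consumed; the file path is hypothetical, and everything else is standard mmcv/mmdet3d usage rather than anything specific to this repository:

    # Minimal config-loading sketch; the path below is a placeholder.
    from mmcv import Config
    from mmcv.utils import import_modules_from_strings
    from mmdet3d.models import build_model

    cfg = Config.fromfile('plugin/mmdet3d/baseline/config.py')  # hypothetical path
    if cfg.get('custom_imports'):
        # Registers the plugin modules (e.g. ROAD_BEVFormer) with the registries.
        import_modules_from_strings(**cfg['custom_imports'])
    model = build_model(cfg.model,
                        train_cfg=cfg.get('train_cfg'),
                        test_cfg=cfg.get('test_cfg'))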
tqdm
ninja
jupyter
openmim
matplotlib
numpy >=1.22.0, <1.24.0
scikit-learn
similaritymeasures
opencv-python
scipy ==1.8.0
ortools ==9.2.9972
iso3166
chardet
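# Note: the pins above (numpy <1.24, scipy ==1.8.0, ortools ==9.2.9972)
# presumably track the versions the evaluation code was validated against.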
# ==============================================================================
# Binaries and/or source for the following packages or projects
# are presented under one or more of the following open source licenses:
# setup.py The OpenLane-V2 Dataset Authors Apache License, Version 2.0
#
# Contact wanghuijie@pjlab.org.cn if you have any issues.
#
# Copyright (c) 2023 The OpenLane-V2 Dataset Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from setuptools import setup, find_packages
setup(
name='openlanev2',
version='0.1.0',
author='The OpenLane-V2 Dataset Authors',
author_email='wanghuijie@pjlab.org.cn',
description='The official devkit of the OpenLane-V2 dataset.',
url='https://github.com/OpenDriveLab/OpenLane-V2',
packages=find_packages(),
license='Apache License 2.0',
)
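# Typical usage: `pip install -e .` from the repository root for an editable
# install, after which `import openlanev2` resolves the devkit package.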