Commit 4cd43886 authored by lishj6

init

parent a9a1fe81
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import os.path as osp
import torch
import mmcv
from mmcv.runner.base_runner import BaseRunner
from mmcv.runner.epoch_based_runner import EpochBasedRunner
from mmcv.runner.builder import RUNNERS
from mmcv.runner.checkpoint import save_checkpoint
from mmcv.runner.utils import get_host_info
from pprint import pprint
from mmcv.parallel.data_container import DataContainer
@RUNNERS.register_module()
class EpochBasedRunner_video(EpochBasedRunner):
'''
# basic logic
input_sequence = [a, b, c] # given a sequence of samples
prev_bev = None
for each in input_sequence[:-1]:
prev_bev = eval_model(each, prev_bev) # inference only.
model(input_sequence[-1], prev_bev) # train on the last sample.
'''
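# Note: data_batch['img'].data[0] is expected to hold the whole temporal
# queue, i.e. a tensor of shape (bs, num_frames, num_cams, C, H, W); the
# second (frame) dimension is sliced frame by frame in run_iter below.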
def __init__(self,
model,
eval_model=None,
batch_processor=None,
optimizer=None,
work_dir=None,
logger=None,
meta=None,
keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'],
max_iters=None,
max_epochs=None):
super().__init__(model,
batch_processor,
optimizer,
work_dir,
logger,
meta,
max_iters,
max_epochs)
# avoid mutating the shared default list passed in `keys`
self.keys = list(keys) + ['img_metas']
self.eval_model = eval_model
self.eval_model.eval()
def run_iter(self, data_batch, train_mode, **kwargs):
if self.batch_processor is not None:
assert False
# outputs = self.batch_processor(
# self.model, data_batch, train_mode=train_mode, **kwargs)
elif train_mode:
num_samples = data_batch['img'].data[0].size(1)
data_list = []
prev_bev = None
for i in range(num_samples):
data = {}
for key in self.keys:
if key not in ['img_metas', 'img', 'points']:
data[key] = data_batch[key]
else:
if key == 'img':
data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True)
elif key == 'img_metas':
data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only)
else:
assert False
data_list.append(data)
with torch.no_grad():
for i in range(num_samples-1):
if data_list[i]['img_metas'].data[0][0]['prev_bev_exists']:
data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False)
prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs)
if data_list[-1]['img_metas'].data[0][0]['prev_bev_exists']:
data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False)
outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs)
else:
assert False
# outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
if not isinstance(outputs, dict):
raise TypeError('"batch_processor()" or "model.train_step()" '
'and "model.val_step()" must return a dict')
if 'log_vars' in outputs:
self.log_buffer.update(outputs['log_vars'], outputs['num_samples'])
self.outputs = outputs
from .hungarian_assigner_3d import HungarianAssigner3D
__all__ = ['HungarianAssigner3D']
import torch
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.assigners import AssignResult
from mmdet.core.bbox.assigners import BaseAssigner
from mmdet.core.bbox.match_costs import build_match_cost
from mmdet.models.utils.transformer import inverse_sigmoid
from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox
try:
from scipy.optimize import linear_sum_assignment
except ImportError:
linear_sum_assignment = None
@BBOX_ASSIGNERS.register_module()
class HungarianAssigner3D(BaseAssigner):
"""Computes one-to-one matching between predictions and ground truth.
This class computes an assignment between the targets and the predictions
based on the costs. The costs are weighted sum of three components:
classification cost, regression L1 cost and regression iou cost. The
targets don't include the no_object, so generally there are more
predictions than targets. After the one-to-one matching, the un-matched
are treated as backgrounds. Thus each query prediction will be assigned
with `0` or a positive integer indicating the ground truth index:
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
cls_cost (dict, optional): Config of the classification match cost.
Default: dict(type='ClassificationCost', weight=1.).
reg_cost (dict, optional): Config of the regression L1 match cost.
Default: dict(type='BBoxL1Cost', weight=1.0).
iou_cost (dict, optional): Config of the IoU match cost (unused when
its weight is 0). Default: dict(type='IoUCost', weight=0.0).
pc_range (list[float], optional): Point cloud range used to normalize
the ground truth boxes. Default: None.
"""
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=1.0),
iou_cost=dict(type='IoUCost', weight=0.0),
pc_range=None):
self.cls_cost = build_match_cost(cls_cost)
self.reg_cost = build_match_cost(reg_cost)
self.iou_cost = build_match_cost(iou_cost)
self.pc_range = pc_range
def assign(self,
bbox_pred,
cls_pred,
gt_bboxes,
gt_labels,
gt_bboxes_ignore=None,
eps=1e-7):
"""Computes one-to-one matching based on the weighted costs.
This method assign each query prediction to a ground truth or
background. The `assigned_gt_inds` with -1 means don't care,
0 means negative sample, and positive number is the index (1-based)
of assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every prediction to -1
2. compute the weighted costs
3. do Hungarian matching on CPU based on the costs
4. assign all to 0 (background) first, then for each matched pair
between predictions and gts, treat this prediction as foreground
and assign the corresponding gt index (plus 1) to it.
Args:
bbox_pred (Tensor): Predicted 3D boxes in the normalized format
(cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
Shape [num_query, code_size].
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_bboxes (Tensor): Ground truth 3D boxes with unnormalized
coordinates (cx, cy, cz, w, l, h, rot[, vx, vy]). Shape [num_gt, box_dim].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`. Default None.
eps (int | float, optional): A value added to the denominator for
numerical stability. Default 1e-7.
Returns:
:obj:`AssignResult`: The assigned result.
"""
assert gt_bboxes_ignore is None, \
'Only case when gt_bboxes_ignore is None is supported.'
num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
# 1. assign -1 by default
assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
assigned_labels = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
# 2. compute the weighted costs
# classification and bboxcost.
cls_cost = self.cls_cost(cls_pred, gt_labels)
# regression L1 cost
normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range)
reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8])
# weighted sum of above two costs
cost = cls_cost + reg_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(
bbox_pred.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(
bbox_pred.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
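# Illustration of the matching step (a minimal standalone sketch, assuming
# scipy is installed; not part of the assigner itself):
#   import numpy as np
#   from scipy.optimize import linear_sum_assignment
#   cost = np.array([[0.9, 0.1],
#                    [0.4, 0.8],
#                    [0.2, 0.7]])  # 3 queries x 2 ground truths
#   rows, cols = linear_sum_assignment(cost)  # rows = [0, 2], cols = [1, 0]
#   # query 0 is matched to gt 1 and query 2 to gt 0; the assigner then
#   # stores cols + 1 (1-based gt indices) and leaves query 1 as background.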
from .nms_free_coder import NMSFreeCoder
__all__ = ['NMSFreeCoder']
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox
import numpy as np
@BBOX_CODERS.register_module()
class NMSFreeCoder(BaseBBoxCoder):
"""Bbox coder for NMS-free detector.
Args:
pc_range (list[float]): Range of point cloud.
voxel_size (list[float], optional): Voxel size. Default: None.
post_center_range (list[float]): Limit of the center.
Default: None.
max_num (int): Max number to be kept. Default: 100.
score_threshold (float): Threshold to filter boxes based on score.
Default: None.
num_classes (int): Number of classes. Default: 10.
"""
def __init__(self,
pc_range,
voxel_size=None,
post_center_range=None,
max_num=100,
score_threshold=None,
num_classes=10):
self.pc_range = pc_range
self.voxel_size = voxel_size
self.post_center_range = post_center_range
self.max_num = max_num
self.score_threshold = score_threshold
self.num_classes = num_classes
def encode(self):
pass
def decode_single(self, cls_scores, bbox_preds):
"""Decode bboxes.
Args:
cls_scores (Tensor): Outputs from the classification head, \
shape [num_query, cls_out_channels]. Note \
cls_out_channels should include background.
bbox_preds (Tensor): Outputs from the regression \
head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
Shape [num_query, 9].
Returns:
list[dict]: Decoded boxes.
"""
max_num = self.max_num
cls_scores = cls_scores.sigmoid()
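# The top-k below is taken over the flattened (num_query * num_classes)
# score matrix, so each selected index encodes both the query
# (index // num_classes) and the class label (index % num_classes).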
scores, indexs = cls_scores.view(-1).topk(max_num)
labels = indexs % self.num_classes
bbox_index = indexs // self.num_classes
bbox_preds = bbox_preds[bbox_index]
final_box_preds = denormalize_bbox(bbox_preds, self.pc_range)
final_scores = scores
final_preds = labels
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold
tmp_score = self.score_threshold
while thresh_mask.sum() == 0:
tmp_score *= 0.9
if tmp_score < 0.01:
thresh_mask = final_scores > -1
break
thresh_mask = final_scores >= tmp_score
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=scores.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(1)
mask &= (final_box_preds[..., :3] <=
self.post_center_range[3:]).all(1)
if self.score_threshold:
mask &= thresh_mask
boxes3d = final_box_preds[mask]
scores = final_scores[mask]
labels = final_preds[mask]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
else:
raise NotImplementedError(
'Need to reorganize output as a batch, only '
'support post_center_range is not None for now!')
return predictions_dict
def decode(self, preds_dicts):
"""Decode bboxes.
Args:
all_cls_scores (Tensor): Outputs from the classification head, \
shape [nb_dec, bs, num_query, cls_out_channels]. Note \
cls_out_channels should include background.
all_bbox_preds (Tensor): Sigmoid outputs from the regression \
head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \
Shape [nb_dec, bs, num_query, 9].
Returns:
list[dict]: Decoded boxes.
"""
all_cls_scores = preds_dicts['all_cls_scores'][-1]
all_bbox_preds = preds_dicts['all_bbox_preds'][-1]
batch_size = all_cls_scores.size()[0]
predictions_list = []
for i in range(batch_size):
predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i]))
return predictions_list
from mmdet.core.bbox.match_costs import build_match_cost
from .match_cost import BBox3DL1Cost, SmoothL1Cost
__all__ = ['build_match_cost', 'BBox3DL1Cost', 'SmoothL1Cost']
import torch
import mmcv
from mmdet.core.bbox.match_costs.builder import MATCH_COST
@MATCH_COST.register_module()
class BBox3DL1Cost(object):
"""BBox3DL1Cost.
Args:
weight (int | float, optional): loss_weight
"""
def __init__(self, weight=1.):
self.weight = weight
def __call__(self, bbox_pred, gt_bboxes):
"""
Args:
bbox_pred (Tensor): Predicted 3D boxes in the normalized format
(cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy).
Shape [num_query, code_size].
gt_bboxes (Tensor): Normalized ground truth 3D boxes in the same
format. Shape [num_gt, code_size].
Returns:
torch.Tensor: bbox_cost value with weight
"""
bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
return bbox_cost * self.weight
@mmcv.jit(derivate=True, coderize=True)
#@weighted_loss
def smooth_l1_loss(pred, target, beta=1.0):
"""Smooth L1 loss.
Args:
pred (torch.Tensor): The prediction.
target (torch.Tensor): The learning target of the prediction.
beta (float, optional): The threshold in the piecewise function.
Defaults to 1.0.
Returns:
torch.Tensor: Calculated loss
"""
assert beta > 0
if target.numel() == 0:
return pred.sum() * 0
# assert pred.size() == target.size()
diff = torch.abs(pred - target)
loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
diff - 0.5 * beta)
return loss.sum(-1)
@MATCH_COST.register_module()
class SmoothL1Cost(object):
"""SmoothL1Cost.
Args:
weight (int | float, optional): loss weight
Examples:
>>> import torch
>>> self = SmoothL1Cost()
>>> preds = torch.Tensor([[0., 0.]])
>>> gts = torch.Tensor([[0., 2.], [1., 1.]])
>>> self(preds, gts)
tensor([[1.5000, 1.0000]])
"""
def __init__(self, weight=1.):
self.weight = weight
def __call__(self, input, target):
"""
Args:
input (Tensor): Predicted features/boxes. Shape [num_query, C].
target (Tensor): Ground truth features/boxes. Shape [num_gt, C].
Returns:
torch.Tensor: Pairwise smooth L1 cost with weight,
shape [num_query, num_gt].
"""
N1, C = input.shape
N2, C = target.shape
input = input.contiguous().view(N1, C)[:, None, :]
target = target.contiguous().view(N2, C)[None, :, :]
cost = smooth_l1_loss(input, target)
return cost * self.weight
import torch
def normalize_bbox(bboxes, pc_range):
cx = bboxes[..., 0:1]
cy = bboxes[..., 1:2]
cz = bboxes[..., 2:3]
w = bboxes[..., 3:4].log()
l = bboxes[..., 4:5].log()
h = bboxes[..., 5:6].log()
rot = bboxes[..., 6:7]
if bboxes.size(-1) > 7:
vx = bboxes[..., 7:8]
vy = bboxes[..., 8:9]
normalized_bboxes = torch.cat(
(cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1
)
else:
normalized_bboxes = torch.cat(
(cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1
)
return normalized_bboxes
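# Layout of the normalized encoding produced above (code_size == 10):
# index 0: cx, 1: cy, 2: log(w), 3: log(l), 4: cz, 5: log(h),
# 6: sin(rot), 7: cos(rot), 8: vx, 9: vy (velocity only if present).
# denormalize_bbox below inverts this mapping back to
# (cx, cy, cz, w, l, h, rot[, vx, vy]).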
def denormalize_bbox(normalized_bboxes, pc_range):
# rotation
rot_sine = normalized_bboxes[..., 6:7]
rot_cosine = normalized_bboxes[..., 7:8]
rot = torch.atan2(rot_sine, rot_cosine)
# center in the bev
cx = normalized_bboxes[..., 0:1]
cy = normalized_bboxes[..., 1:2]
cz = normalized_bboxes[..., 4:5]
# size
w = normalized_bboxes[..., 2:3]
l = normalized_bboxes[..., 3:4]
h = normalized_bboxes[..., 5:6]
w = w.exp()
l = l.exp()
h = h.exp()
if normalized_bboxes.size(-1) > 8:
# velocity
vx = normalized_bboxes[:, 8:9]
vy = normalized_bboxes[:, 9:10]
denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
else:
denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
return denormalized_bboxes
from .eval_hooks import CustomDistEvalHook
# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16,
# in order to avoid a strong version dependency, we do not directly
# inherit from EvalHook but from BaseDistEvalHook.
import bisect
import os.path as osp
import mmcv
import torch.distributed as dist
from mmcv.runner import DistEvalHook as BaseDistEvalHook
from mmcv.runner import EvalHook as BaseEvalHook
from torch.nn.modules.batchnorm import _BatchNorm
from mmdet.core.evaluation.eval_hooks import DistEvalHook
def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
assert mmcv.is_list_of(dynamic_interval_list, tuple)
dynamic_milestones = [0]
dynamic_milestones.extend(
[dynamic_interval[0] for dynamic_interval in dynamic_interval_list])
dynamic_intervals = [start_interval]
dynamic_intervals.extend(
[dynamic_interval[1] for dynamic_interval in dynamic_interval_list])
return dynamic_milestones, dynamic_intervals
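# Example (a hypothetical config, for illustration): with an initial
# interval of 12 and dynamic_intervals=[(40, 2)], this returns
# milestones [0, 40] and intervals [12, 2], i.e. evaluate every 12 epochs
# before epoch 40 and every 2 epochs afterwards.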
class CustomDistEvalHook(BaseDistEvalHook):
def __init__(self, *args, dynamic_intervals=None, **kwargs):
super(CustomDistEvalHook, self).__init__(*args, **kwargs)
self.use_dynamic_intervals = dynamic_intervals is not None
if self.use_dynamic_intervals:
self.dynamic_milestones, self.dynamic_intervals = \
_calc_dynamic_intervals(self.interval, dynamic_intervals)
def _decide_interval(self, runner):
if self.use_dynamic_intervals:
progress = runner.epoch if self.by_epoch else runner.iter
step = bisect.bisect(self.dynamic_milestones, (progress + 1))
# Dynamically modify the evaluation interval
self.interval = self.dynamic_intervals[step - 1]
def before_train_epoch(self, runner):
"""Evaluate the model only at the start of training by epoch."""
self._decide_interval(runner)
super().before_train_epoch(runner)
def before_train_iter(self, runner):
self._decide_interval(runner)
super().before_train_iter(runner)
def _do_evaluate(self, runner):
"""perform evaluation and save ckpt."""
# Synchronization of BatchNorm's buffer (running_mean
# and running_var) is not supported in the DDP of pytorch,
# which may cause the inconsistent performance of models in
# different ranks, so we broadcast BatchNorm's buffers
# of rank 0 to other ranks to avoid this.
if self.broadcast_bn_buffer:
model = runner.model
for name, module in model.named_modules():
if isinstance(module,
_BatchNorm) and module.track_running_stats:
dist.broadcast(module.running_var, 0)
dist.broadcast(module.running_mean, 0)
if not self._should_evaluate(runner):
return
tmpdir = self.tmpdir
if tmpdir is None:
tmpdir = osp.join(runner.work_dir, '.eval_hook')
from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test  # to solve circular import
results = custom_multi_gpu_test(
runner.model,
self.dataloader,
tmpdir=tmpdir,
gpu_collect=self.gpu_collect)
if runner.rank == 0:
print('\n')
runner.log_buffer.output['eval_iter_num'] = len(self.dataloader)
key_score = self.evaluate(runner, results)
if self.save_best:
self._save_ckpt(runner, key_score)
# Copyright (c) OpenMMLab. All rights reserved.
r"""Adapted from `Waymo to KITTI converter
<https://github.com/caizhongang/waymo_kitti_converter>`_.
"""
try:
from waymo_open_dataset import dataset_pb2 as open_dataset
import mmcv
import numpy as np
import tensorflow as tf
from glob import glob
from os.path import join
from waymo_open_dataset import label_pb2
from waymo_open_dataset.protos import metrics_pb2
except ImportError:
#pass
raise ImportError(
'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
'to install the official devkit first.')
class KITTI2Waymo(object):
"""KITTI predictions to Waymo converter.
This class serves as the converter to change predictions from KITTI to
Waymo format.
Args:
kitti_result_files (list[dict]): Predictions in KITTI format.
waymo_tfrecords_dir (str): Directory to load waymo raw data.
waymo_results_save_dir (str): Directory to save converted predictions
in waymo format (.bin files).
waymo_results_final_path (str): Path to save combined
predictions in waymo format (.bin file), like 'a/b/c.bin'.
prefix (str): Prefix of filename. In general, 0 for training, 1 for
validation and 2 for testing.
workers (int): Number of parallel processes. Default: 64.
"""
def __init__(self,
kitti_result_files,
waymo_tfrecords_dir,
waymo_results_save_dir,
waymo_results_final_path,
prefix,
workers=64):
self.kitti_result_files = kitti_result_files
self.waymo_tfrecords_dir = waymo_tfrecords_dir
self.waymo_results_save_dir = waymo_results_save_dir
self.waymo_results_final_path = waymo_results_final_path
self.prefix = prefix
self.workers = int(workers)
self.name2idx = {}
for idx, result in enumerate(kitti_result_files):
if len(result['sample_idx']) > 0:
self.name2idx[str(result['sample_idx'][0])] = idx
# turn on eager execution for older tensorflow versions
if int(tf.__version__.split('.')[0]) < 2:
tf.enable_eager_execution()
self.k2w_cls_map = {
'Car': label_pb2.Label.TYPE_VEHICLE,
'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
'Sign': label_pb2.Label.TYPE_SIGN,
'Cyclist': label_pb2.Label.TYPE_CYCLIST,
}
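# T_ref_to_front_cam below maps points from the KITTI reference camera
# frame (x right, y down, z forward) into a front-camera frame with
# x forward, y left, z up, which is the convention assumed for Waymo here.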
self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
[-1.0, 0.0, 0.0, 0.0],
[0.0, -1.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 1.0]])
self.get_file_names()
self.create_folder()
def get_file_names(self):
"""Get file names of waymo raw data."""
self.waymo_tfrecord_pathnames = sorted(
glob(join(self.waymo_tfrecords_dir, '*.tfrecord')))
print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')
def create_folder(self):
"""Create folder for data conversion."""
mmcv.mkdir_or_exist(self.waymo_results_save_dir)
def parse_objects(self, kitti_result, T_k2w, context_name,
frame_timestamp_micros):
"""Parse one prediction with several instances in kitti format and
convert them to `Object` proto.
Args:
kitti_result (dict): Predictions in kitti format.
- name (np.ndarray): Class labels of predictions.
- dimensions (np.ndarray): Height, width, length of boxes.
- location (np.ndarray): Bottom center of boxes (x, y, z).
- rotation_y (np.ndarray): Orientation of boxes.
- score (np.ndarray): Scores of predictions.
T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
context_name (str): Context name of the frame.
frame_timestamp_micros (int): Frame timestamp.
Returns:
:obj:`Object`: Predictions in waymo dataset Object proto.
"""
def parse_one_object(instance_idx):
"""Parse one instance in kitti format and convert them to `Object`
proto.
Args:
instance_idx (int): Index of the instance to be converted.
Returns:
:obj:`Object`: Predicted instance in waymo dataset \
Object proto.
"""
cls = kitti_result['name'][instance_idx]
length = round(kitti_result['dimensions'][instance_idx, 0], 4)
height = round(kitti_result['dimensions'][instance_idx, 1], 4)
width = round(kitti_result['dimensions'][instance_idx, 2], 4)
x = round(kitti_result['location'][instance_idx, 0], 4)
y = round(kitti_result['location'][instance_idx, 1], 4)
z = round(kitti_result['location'][instance_idx, 2], 4)
rotation_y = round(kitti_result['rotation_y'][instance_idx], 4)
score = round(kitti_result['score'][instance_idx], 4)
# y: downwards; move box origin from bottom center (kitti) to
# true center (waymo)
y -= height / 2
# frame transformation: kitti -> waymo
x, y, z = self.transform(T_k2w, x, y, z)
# different conventions
heading = -(rotation_y + np.pi / 2)
while heading < -np.pi:
heading += 2 * np.pi
while heading > np.pi:
heading -= 2 * np.pi
box = label_pb2.Label.Box()
box.center_x = x
box.center_y = y
box.center_z = z
box.length = length
box.width = width
box.height = height
box.heading = heading
o = metrics_pb2.Object()
o.object.box.CopyFrom(box)
o.object.type = self.k2w_cls_map[cls]
o.score = score
o.context_name = context_name
o.frame_timestamp_micros = frame_timestamp_micros
return o
objects = metrics_pb2.Objects()
for instance_idx in range(len(kitti_result['name'])):
o = parse_one_object(instance_idx)
objects.objects.append(o)
return objects
def convert_one(self, file_idx):
"""Convert action for single file.
Args:
file_idx (int): Index of the file to be converted.
"""
file_pathname = self.waymo_tfrecord_pathnames[file_idx]
file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')
for frame_num, frame_data in enumerate(file_data):
frame = open_dataset.Frame()
frame.ParseFromString(bytearray(frame_data.numpy()))
filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'
for camera in frame.context.camera_calibrations:
# FRONT = 1, see dataset.proto for details
if camera.name == 1:
T_front_cam_to_vehicle = np.array(
camera.extrinsic.transform).reshape(4, 4)
T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam
context_name = frame.context.name
frame_timestamp_micros = frame.timestamp_micros
if filename in self.name2idx:
kitti_result = \
self.kitti_result_files[self.name2idx[filename]]
objects = self.parse_objects(kitti_result, T_k2w, context_name,
frame_timestamp_micros)
else:
print(filename, 'not found.(bevformer)')
objects = metrics_pb2.Objects()
with open(
join(self.waymo_results_save_dir, f'{filename}.bin'),
'wb') as f:
f.write(objects.SerializeToString())
def convert(self):
"""Convert action."""
print('Start converting ...')
mmcv.track_parallel_progress(self.convert_one, range(len(self)),
self.workers)
print('\nFinished ...')
# combine all files into one .bin
pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
combined = self.combine(pathnames)
with open(self.waymo_results_final_path, 'wb') as f:
f.write(combined.SerializeToString())
def __len__(self):
"""Length of the filename list."""
return len(self.waymo_tfrecord_pathnames)
def transform(self, T, x, y, z):
"""Transform the coordinates with matrix T.
Args:
T (np.ndarray): Transformation matrix.
x(float): Coordinate in x axis.
y(float): Coordinate in y axis.
z(float): Coordinate in z axis.
Returns:
list: Coordinates after transformation.
"""
pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1)
pt_aft = np.matmul(T, pt_bef)
return pt_aft[:3].flatten().tolist()
def combine(self, pathnames):
"""Combine predictions in waymo format for each sample together.
Args:
pathnames (str): Paths to save predictions.
Returns:
:obj:`Objects`: Combined predictions in Objects proto.
"""
combined = metrics_pb2.Objects()
for pathname in pathnames:
objects = metrics_pb2.Objects()
with open(pathname, 'rb') as f:
objects.ParseFromString(f.read())
for o in objects.objects:
combined.objects.append(o)
return combined
from .nuscenes_dataset import CustomNuScenesDataset
from .nuscenes_dataset_v2 import CustomNuScenesDatasetV2
from .builder import custom_build_dataset
__all__ = [
'CustomNuScenesDataset',
'CustomNuScenesDatasetV2',
'custom_build_dataset',
]
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import platform
import random
from functools import partial
import numpy as np
from mmcv.parallel import collate
from mmcv.runner import get_dist_info
from mmcv.utils import Registry, build_from_cfg
from torch.utils.data import DataLoader
from mmdet.datasets.samplers import GroupSampler
from projects.mmdet3d_plugin.datasets.samplers.group_sampler import DistributedGroupSampler
from projects.mmdet3d_plugin.datasets.samplers.distributed_sampler import DistributedSampler
from projects.mmdet3d_plugin.datasets.samplers.sampler import build_sampler
# import torch
def build_dataloader(dataset,
samples_per_gpu,
workers_per_gpu,
num_gpus=1,
dist=True,
shuffle=True,
seed=None,
shuffler_sampler=None,
nonshuffler_sampler=None,
**kwargs):
"""Build PyTorch DataLoader.
In distributed training, each GPU/process has a dataloader.
In non-distributed training, there is only one dataloader for all GPUs.
Args:
dataset (Dataset): A PyTorch dataset.
samples_per_gpu (int): Number of training samples on each GPU, i.e.,
batch size of each GPU.
workers_per_gpu (int): How many subprocesses to use for data loading
for each GPU.
num_gpus (int): Number of GPUs. Only used in non-distributed training.
dist (bool): Distributed training/test or not. Default: True.
shuffle (bool): Whether to shuffle the data at every epoch.
Default: True.
seed (int, optional): Random seed used by the sampler and the
dataloader workers. Default: None.
shuffler_sampler (dict, optional): Config of the sampler used when
``shuffle`` is True. Default: None (falls back to
DistributedGroupSampler).
nonshuffler_sampler (dict, optional): Config of the sampler used when
``shuffle`` is False. Default: None (falls back to
DistributedSampler).
kwargs: any keyword argument to be used to initialize DataLoader
Returns:
DataLoader: A PyTorch dataloader.
"""
rank, world_size = get_dist_info()
if dist:
# DistributedGroupSampler will definitely shuffle the data to satisfy
# that images on each GPU are in the same group
if shuffle:
sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'),
dict(
dataset=dataset,
samples_per_gpu=samples_per_gpu,
num_replicas=world_size,
rank=rank,
seed=seed)
)
else:
sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'),
dict(
dataset=dataset,
num_replicas=world_size,
rank=rank,
shuffle=shuffle,
seed=seed)
)
batch_size = samples_per_gpu
num_workers = workers_per_gpu
else:
# assert False, 'not support in bevformer'
print('WARNING: non-distributed mode should only be used to measure inference speed!')
sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
batch_size = num_gpus * samples_per_gpu
num_workers = num_gpus * workers_per_gpu
init_fn = partial(
worker_init_fn, num_workers=num_workers, rank=rank,
seed=seed) if seed is not None else None
data_loader = DataLoader(
dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
pin_memory=False,
worker_init_fn=init_fn,
persistent_workers=(num_workers > 0),
**kwargs)
# if to_channels_last:
# original_collate_fn = data_loader.collate_fn
# def channels_last_collate(batch):
# data = original_collate_fn(batch)
# print("===============================channels_last=================================================")
# if isinstance(data, dict):
# if 'img' in data and isinstance(data['img'], torch.Tensor):
# data['img'] = data['img'].contiguous(memory_format=torch.channels_last)
# elif isinstance(data, list):
# for item in data:
# if 'img' in item and isinstance(item['img'], torch.Tensor):
# item['img'] = item['img'].contiguous(memory_format=torch.channels_last)
# return data
# data_loader.collate_fn = channels_last_collate
return data_loader
def worker_init_fn(worker_id, num_workers, rank, seed):
# The seed of each worker equals
# num_workers * rank + worker_id + user_seed
worker_seed = num_workers * rank + worker_id + seed
np.random.seed(worker_seed)
random.seed(worker_seed)
# Copyright (c) OpenMMLab. All rights reserved.
import platform
from mmcv.utils import Registry, build_from_cfg
from mmdet.datasets import DATASETS
from mmdet.datasets.builder import _concat_dataset
if platform.system() != 'Windows':
# https://github.com/pytorch/pytorch/issues/973
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
base_soft_limit = rlimit[0]
hard_limit = rlimit[1]
soft_limit = min(max(4096, base_soft_limit), hard_limit)
resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
OBJECTSAMPLERS = Registry('Object sampler')
def custom_build_dataset(cfg, default_args=None):
from mmdet3d.datasets.dataset_wrappers import CBGSDataset
from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
ConcatDataset, RepeatDataset)
if isinstance(cfg, (list, tuple)):
dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg])
elif cfg['type'] == 'ConcatDataset':
dataset = ConcatDataset(
[custom_build_dataset(c, default_args) for c in cfg['datasets']],
cfg.get('separate_eval', True))
elif cfg['type'] == 'RepeatDataset':
dataset = RepeatDataset(
custom_build_dataset(cfg['dataset'], default_args), cfg['times'])
elif cfg['type'] == 'ClassBalancedDataset':
dataset = ClassBalancedDataset(
custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
elif cfg['type'] == 'CBGSDataset':
dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args))
elif isinstance(cfg.get('ann_file'), (list, tuple)):
dataset = _concat_dataset(cfg, default_args)
else:
dataset = build_from_cfg(cfg, DATASETS, default_args)
return dataset
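# Example usage (hypothetical config, for illustration): wrapper types are
# unwrapped recursively, so something like
#   cfg = dict(type='CBGSDataset',
#              dataset=dict(type='CustomNuScenesDataset', ...))
#   dataset = custom_build_dataset(cfg)
# first builds the inner CustomNuScenesDataset and then wraps it in CBGSDataset.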
import copy
import random
import mmcv
import numpy as np
import torch
from os import path as osp
from mmdet.datasets import DATASETS
from mmdet3d.datasets import NuScenesDataset
from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
from mmcv.parallel import DataContainer as DC
from .nuscnes_eval import NuScenesEval_custom
from projects.mmdet3d_plugin.models.utils.visual import save_tensor
@DATASETS.register_module()
class CustomNuScenesDataset(NuScenesDataset):
r"""NuScenes Dataset.
This dataset only adds camera intrinsics and extrinsics to the results.
"""
def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs):
super().__init__(*args, **kwargs)
self.queue_length = queue_length
self.overlap_test = overlap_test
self.bev_size = bev_size
def prepare_train_data(self, index):
"""
Training data preparation.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Training data dict of the corresponding index.
"""
queue = []
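# Build the temporal queue: take the queue_length frames preceding `index`,
# randomly drop one of them (shuffle, then keep a sorted subset), and append
# the current frame as the last element to be trained on.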
index_list = list(range(index-self.queue_length, index))
random.shuffle(index_list)
index_list = sorted(index_list[1:])
index_list.append(index)
for i in index_list:
i = max(0, i)
input_dict = self.get_data_info(i)
if input_dict is None:
return None
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
if self.filter_empty_gt and \
(example is None or ~(example['gt_labels_3d']._data != -1).any()):
return None
queue.append(example)
return self.union2one(queue)
def union2one(self, queue):
imgs_list = [each['img'].data for each in queue]
metas_map = {}
prev_scene_token = None
prev_pos = None
prev_angle = None
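# Convert the absolute can_bus pose of every frame into a delta relative to
# the previous frame in the queue; the first frame of a scene gets zeroed
# deltas and prev_bev_exists=False so no temporal BEV is reused across scenes.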
for i, each in enumerate(queue):
metas_map[i] = each['img_metas'].data
if metas_map[i]['scene_token'] != prev_scene_token:
metas_map[i]['prev_bev_exists'] = False
prev_scene_token = metas_map[i]['scene_token']
prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
metas_map[i]['can_bus'][:3] = 0
metas_map[i]['can_bus'][-1] = 0
else:
metas_map[i]['prev_bev_exists'] = True
tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3])
tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1])
metas_map[i]['can_bus'][:3] -= prev_pos
metas_map[i]['can_bus'][-1] -= prev_angle
prev_pos = copy.deepcopy(tmp_pos)
prev_angle = copy.deepcopy(tmp_angle)
queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True)
queue[-1]['img_metas'] = DC(metas_map, cpu_only=True)
queue = queue[-1]
return queue
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- sweeps (list[dict]): Infos of sweeps.
- timestamp (float): Sample timestamp.
- img_filename (str, optional): Image filename.
- lidar2img (list[np.ndarray], optional): Transformations \
from lidar to different cameras.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
ego2global_translation=info['ego2global_translation'],
ego2global_rotation=info['ego2global_rotation'],
prev_idx=info['prev'],
next_idx=info['next'],
scene_token=info['scene_token'],
can_bus=info['can_bus'],
frame_idx=info['frame_idx'],
timestamp=info['timestamp'] / 1e6,
)
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
for cam_type, cam_info in info['cams'].items():
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
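# lidar2cam_rt is built in a row-vector convention (points multiply from
# the left), hence the transposes below and the final viewpad @ lidar2cam_rt.T
# when composing the lidar-to-image projection.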
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=cam_intrinsics,
lidar2cam=lidar2cam_rts,
))
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
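# Overwrite the can_bus signal with the ego pose of this sample: global
# translation, rotation quaternion, and the global yaw stored twice,
# as radians at index -2 and as degrees at index -1.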
rotation = Quaternion(input_dict['ego2global_rotation'])
translation = input_dict['ego2global_translation']
can_bus = input_dict['can_bus']
can_bus[:3] = translation
can_bus[3:7] = rotation
patch_angle = quaternion_yaw(rotation) / np.pi * 180
if patch_angle < 0:
patch_angle += 360
can_bus[-2] = patch_angle / 180 * np.pi
can_bus[-1] = patch_angle
return input_dict
def __getitem__(self, idx):
"""Get item from infos according to the given index.
Returns:
dict: Data dictionary of the corresponding index.
"""
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
continue
return data
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='pts_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'pts_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
verbose=True)
output_dir = osp.join(*osp.split(result_path)[:-1])
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
self.nusc_eval = NuScenesEval_custom(
self.nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=True,
overlap_test=self.overlap_test,
data_infos=self.data_infos
)
self.nusc_eval.main(plot_examples=0, render_curves=False)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['tp_errors'].items():
val = float('{:.4f}'.format(v))
detail['{}/{}'.format(metric_prefix,
self.ErrNameMapping[k])] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
import copy
from mmdet3d.datasets import NuScenesDataset
import mmcv
from os import path as osp
from mmdet.datasets import DATASETS
import torch
import numpy as np
from nuscenes.eval.common.utils import quaternion_yaw, Quaternion
from .nuscnes_eval import NuScenesEval_custom
from mmcv.parallel import DataContainer as DC
from collections import defaultdict, OrderedDict
from projects.mmdet3d_plugin.dd3d.datasets.nuscenes import NuscenesDataset as DD3DNuscenesDataset
@DATASETS.register_module()
class CustomNuScenesDatasetV2(NuScenesDataset):
def __init__(self, frames=(),mono_cfg=None, overlap_test=False,*args, **kwargs):
super().__init__(*args, **kwargs)
self.frames = frames
self.queue_length = len(frames)
self.overlap_test = overlap_test
self.mono_cfg = mono_cfg
if not self.test_mode and mono_cfg is not None:
self.mono_dataset = DD3DNuscenesDataset(**mono_cfg)
def prepare_test_data(self, index):
"""Prepare data for testing.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Testing data dict of the corresponding index.
"""
data_queue = OrderedDict()
input_dict = self.get_data_info(index)
cur_scene_token = input_dict['scene_token']
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
data_queue[0] = example
for frame_idx in self.frames:
chosen_idx = index + frame_idx
if frame_idx ==0 or chosen_idx <0 or chosen_idx >= len(self.data_infos):
continue
info = self.data_infos[chosen_idx]
input_dict = self.prepare_input_dict(info)
if input_dict['scene_token'] == cur_scene_token:
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
data_queue[frame_idx] = example
data_queue = OrderedDict(sorted(data_queue.items()))
ret = defaultdict(list)
for i in range(len(data_queue[0]['img'])):
single_aug_data_queue = {}
for t in data_queue.keys():
single_example = {}
for key ,value in data_queue[t].items():
single_example[key] = value[i]
single_aug_data_queue[t] = single_example
single_aug_data_queue = OrderedDict(sorted(single_aug_data_queue.items()))
single_aug_sample = self.union2one(single_aug_data_queue)
for key, value in single_aug_sample.items():
ret[key].append(value)
return ret
def prepare_train_data(self, index):
"""
Training data preparation.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Training data dict of the corresponding index.
"""
data_queue = OrderedDict()
input_dict = self.get_data_info(index)
if input_dict is None:
return None
cur_scene_token = input_dict['scene_token']
# cur_frame_idx = input_dict['frame_idx']
ann_info = copy.deepcopy(input_dict['ann_info'])
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
if self.filter_empty_gt and \
(example is None or ~(example['gt_labels_3d']._data != -1).any()):
return None
data_queue[0] = example
aug_param = copy.deepcopy(example['aug_param']) if 'aug_param' in example else {}
# frame_idx_to_idx = self.scene_to_frame_idx_to_idx[cur_scene_token]
for frame_idx in self.frames:
chosen_idx = index + frame_idx
if frame_idx ==0 or chosen_idx <0 or chosen_idx >= len(self.data_infos):
continue
info = self.data_infos[chosen_idx]
input_dict = self.prepare_input_dict(info)
if input_dict['scene_token'] == cur_scene_token:
input_dict['ann_info'] = copy.deepcopy(ann_info) # only for pipeline, should never be used
self.pre_pipeline(input_dict)
input_dict['aug_param'] = copy.deepcopy(aug_param)
example = self.pipeline(input_dict)
data_queue[frame_idx] = example
data_queue = OrderedDict(sorted(data_queue.items()))
return self.union2one(data_queue)
def union2one(self, queue: dict):
"""
convert sample queue into one single sample.
"""
imgs_list = [each['img'].data for each in queue.values()]
lidar2ego = np.eye(4, dtype=np.float32)
lidar2ego[:3, :3] = Quaternion(queue[0]['lidar2ego_rotation']).rotation_matrix
lidar2ego[:3, 3] = queue[0]['lidar2ego_translation']
egocurr2global = np.eye(4, dtype=np.float32)
egocurr2global[:3,:3] = Quaternion(queue[0]['ego2global_rotation']).rotation_matrix
egocurr2global[:3,3] = queue[0]['ego2global_translation']
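# lidar2ego and egocurr2global are used below to chain
# lidar_adj -> ego_adj -> global -> ego_curr -> lidar_curr, so that points
# (and lidar2img projections) of adjacent frames can be expressed in the
# current frame's lidar coordinate system.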
metas_map = {}
for i, each in queue.items():
metas_map[i] = each['img_metas'].data
metas_map[i]['timestamp'] = each['timestamp']
if 'aug_param' in each:
metas_map[i]['aug_param'] = each['aug_param']
if i == 0:
metas_map[i]['lidaradj2lidarcurr'] = None
else:
egoadj2global = np.eye(4, dtype=np.float32)
egoadj2global[:3,:3] = Quaternion(each['ego2global_rotation']).rotation_matrix
egoadj2global[:3,3] = each['ego2global_translation']
lidaradj2lidarcurr = np.linalg.inv(lidar2ego) @ np.linalg.inv(egocurr2global) @ egoadj2global @ lidar2ego
metas_map[i]['lidaradj2lidarcurr'] = lidaradj2lidarcurr
for i_cam in range(len(metas_map[i]['lidar2img'])):
metas_map[i]['lidar2img'][i_cam] = metas_map[i]['lidar2img'][i_cam] @ np.linalg.inv(lidaradj2lidarcurr)
queue[0]['img'] = DC(torch.stack(imgs_list),
cpu_only=False, stack=True)
queue[0]['img_metas'] = DC(metas_map, cpu_only=True)
queue = queue[0]
return queue
def prepare_input_dict(self, info):
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
ego2global_translation=info['ego2global_translation'],
ego2global_rotation=info['ego2global_rotation'],
lidar2ego_translation=info['lidar2ego_translation'],
lidar2ego_rotation=info['lidar2ego_rotation'],
prev=info['prev'],
next=info['next'],
scene_token=info['scene_token'],
frame_idx=info['frame_idx'],
timestamp=info['timestamp'] / 1e6,
)
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
lidar2cam_rts = []
cam_intrinsics = []
for cam_type, cam_info in info['cams'].items():
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
cam_intrinsics.append(viewpad)
lidar2cam_rts.append(lidar2cam_rt.T)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam2img=cam_intrinsics,
lidar2cam=lidar2cam_rts,
))
return input_dict
def filter_crowd_annotations(self, data_dict):
for ann in data_dict["annotations"]:
if ann.get("iscrowd", 0) == 0:
return True
return False
def get_data_info(self, index):
info = self.data_infos[index]
input_dict = self.prepare_input_dict(info)
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
if not self.test_mode and self.mono_cfg is not None:
if input_dict is None:
return None
info = self.data_infos[index]
img_ids = []
for cam_type, cam_info in info['cams'].items():
img_ids.append(cam_info['sample_data_token'])
mono_input_dict = []; mono_ann_index = []
for i, img_id in enumerate(img_ids):
tmp_dict = self.mono_dataset.getitem_by_datumtoken(img_id)
if tmp_dict is not None:
if self.filter_crowd_annotations(tmp_dict):
mono_input_dict.append(tmp_dict)
mono_ann_index.append(i)
# filter empty annotations
if len(mono_ann_index) == 0:
return None
mono_ann_index = DC(mono_ann_index, cpu_only=True)
input_dict['mono_input_dict'] = mono_input_dict
input_dict['mono_ann_idx'] = mono_ann_index
return input_dict
def __getitem__(self, idx):
"""Get item from infos according to the given index.
Returns:
dict: Data dictionary of the corresponding index.
"""
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
continue
return data
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='pts_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'pts_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
self.nusc = NuScenes(version=self.version, dataroot=self.data_root,
verbose=True)
output_dir = osp.join(*osp.split(result_path)[:-1])
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
self.nusc_eval = NuScenesEval_custom(
self.nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=True,
overlap_test=self.overlap_test,
data_infos=self.data_infos
)
self.nusc_eval.main(plot_examples=0, render_curves=False)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['tp_errors'].items():
val = float('{:.4f}'.format(v))
detail['{}/{}'.format(metric_prefix,
self.ErrNameMapping[k])] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import mmcv
import numpy as np
import pyquaternion
import tempfile
import torch
import warnings
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp
from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr
from mmdet.datasets import DATASETS, CocoDataset
from mmdet3d.core import show_multi_modality_result
from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type
from mmdet3d.datasets.pipelines import Compose
from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline
@DATASETS.register_module()
class CustomNuScenesMonoDataset(CocoDataset):
r"""Monocular 3D detection on NuScenes Dataset.
This class serves as the API for experiments on the NuScenes Dataset.
Please refer to `NuScenes Dataset <https://www.nuscenes.org/download>`_
for data downloading.
Args:
ann_file (str): Path of annotation file.
data_root (str): Path of dataset root.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
with_velocity (bool, optional): Whether include velocity prediction
into the experiments. Defaults to True.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
to its original format then converted them to `box_type_3d`.
Defaults to 'Camera' in this class. Available options includes.
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
eval_version (str, optional): Configuration version of evaluation.
Defaults to 'detection_cvpr_2019'.
use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
file as mask to filter gt_boxes and gt_names. Defaults to False.
version (str, optional): Dataset version. Defaults to 'v1.0-trainval'.
"""
CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
DefaultAttribute = {
'car': 'vehicle.parked',
'pedestrian': 'pedestrian.moving',
'trailer': 'vehicle.parked',
'truck': 'vehicle.parked',
'bus': 'vehicle.moving',
'motorcycle': 'cycle.without_rider',
'construction_vehicle': 'vehicle.parked',
'bicycle': 'cycle.without_rider',
'barrier': '',
'traffic_cone': '',
}
# https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa
ErrNameMapping = {
'trans_err': 'mATE',
'scale_err': 'mASE',
'orient_err': 'mAOE',
'vel_err': 'mAVE',
'attr_err': 'mAAE'
}
def __init__(self,
data_root,
load_interval=1,
with_velocity=True,
modality=None,
box_type_3d='Camera',
eval_version='detection_cvpr_2019',
use_valid_flag=False,
overlap_test=False,
version='v1.0-trainval',
**kwargs):
super().__init__(**kwargs)
# overlap_test = True
self.data_root = data_root
self.overlap_test = overlap_test
self.load_interval = load_interval
self.with_velocity = with_velocity
self.modality = modality
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.eval_version = eval_version
self.use_valid_flag = use_valid_flag
self.bbox_code_size = 9
self.version = version
if self.eval_version is not None:
from nuscenes.eval.detection.config import config_factory
self.eval_detection_configs = config_factory(self.eval_version)
if self.modality is None:
self.modality = dict(
use_camera=True,
use_lidar=False,
use_radar=False,
use_map=False,
use_external=False)
def pre_pipeline(self, results):
"""Initialization before data preparation.
Args:
results (dict): Dict before data preprocessing.
- img_fields (list): Image fields.
- bbox3d_fields (list): 3D bounding boxes fields.
- pts_mask_fields (list): Mask fields of points.
- pts_seg_fields (list): Mask fields of point segments.
- bbox_fields (list): Fields of bounding boxes.
- mask_fields (list): Fields of masks.
- seg_fields (list): Segment fields.
- box_type_3d (str): 3D box type.
- box_mode_3d (str): 3D box mode.
"""
results['img_prefix'] = '' # self.img_prefix
# print('img_prefix', self.img_prefix)
results['seg_prefix'] = self.seg_prefix
results['proposal_file'] = self.proposal_file
results['img_fields'] = []
results['bbox3d_fields'] = []
results['pts_mask_fields'] = []
results['pts_seg_fields'] = []
results['bbox_fields'] = []
results['mask_fields'] = []
results['seg_fields'] = []
results['box_type_3d'] = self.box_type_3d
results['box_mode_3d'] = self.box_mode_3d
def _parse_ann_info(self, img_info, ann_info):
"""Parse bbox annotation.
Args:
img_info (list[dict]): Image info.
ann_info (list[dict]): Annotation info of an image.
Returns:
dict: A dict containing the following keys: bboxes, labels, \
gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \
depths, bboxes_ignore, masks, seg_map
"""
gt_bboxes = []
gt_labels = []
attr_labels = []
gt_bboxes_ignore = []
gt_masks_ann = []
gt_bboxes_cam3d = []
centers2d = []
depths = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
if inter_w * inter_h == 0:
continue
if ann['area'] <= 0 or w < 1 or h < 1:
continue
if ann['category_id'] not in self.cat_ids:
continue
bbox = [x1, y1, x1 + w, y1 + h]
if ann.get('iscrowd', False):
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
attr_labels.append(ann['attribute_id'])
gt_masks_ann.append(ann.get('segmentation', None))
# 3D annotations in camera coordinates
bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1)
velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2)
nan_mask = np.isnan(velo_cam3d[:, 0])
velo_cam3d[nan_mask] = [0.0, 0.0]
bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1)
gt_bboxes_cam3d.append(bbox_cam3d.squeeze())
# 2.5D annotations in camera coordinates
center2d = ann['center2d'][:2]
depth = ann['center2d'][2]
centers2d.append(center2d)
depths.append(depth)
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
attr_labels = np.array(attr_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
attr_labels = np.array([], dtype=np.int64)
if gt_bboxes_cam3d:
gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32)
centers2d = np.array(centers2d, dtype=np.float32)
depths = np.array(depths, dtype=np.float32)
else:
gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size),
dtype=np.float32)
centers2d = np.zeros((0, 2), dtype=np.float32)
depths = np.zeros((0), dtype=np.float32)
gt_bboxes_cam3d = CameraInstance3DBoxes(
gt_bboxes_cam3d,
box_dim=gt_bboxes_cam3d.shape[-1],
origin=(0.5, 0.5, 0.5))
gt_labels_3d = copy.deepcopy(gt_labels)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
seg_map = img_info['filename'].replace('jpg', 'png')
ann = dict(
bboxes=gt_bboxes,
labels=gt_labels,
gt_bboxes_3d=gt_bboxes_cam3d,
gt_labels_3d=gt_labels_3d,
attr_labels=attr_labels,
centers2d=centers2d,
depths=depths,
bboxes_ignore=gt_bboxes_ignore,
masks=gt_masks_ann,
seg_map=seg_map)
return ann
def get_attr_name(self, attr_idx, label_name):
"""Get attribute from predicted index.
This is a workaround to predict attribute when the predicted velocity
is not reliable. We map the predicted attribute index to the one
in the attribute set. If it is consistent with the category, we will
keep it. Otherwise, we will use the default attribute.
Args:
attr_idx (int): Attribute index.
label_name (str): Predicted category name.
Returns:
str: Predicted attribute name.
"""
# TODO: Simplify the variable name
AttrMapping_rev2 = [
'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
'pedestrian.standing', 'pedestrian.sitting_lying_down',
'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
]
if label_name == 'car' or label_name == 'bus' \
or label_name == 'truck' or label_name == 'trailer' \
or label_name == 'construction_vehicle':
if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \
AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \
AttrMapping_rev2[attr_idx] == 'vehicle.stopped':
return AttrMapping_rev2[attr_idx]
else:
return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
elif label_name == 'pedestrian':
if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \
AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \
AttrMapping_rev2[attr_idx] == \
'pedestrian.sitting_lying_down':
return AttrMapping_rev2[attr_idx]
else:
return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
elif label_name == 'bicycle' or label_name == 'motorcycle':
if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \
AttrMapping_rev2[attr_idx] == 'cycle.without_rider':
return AttrMapping_rev2[attr_idx]
else:
return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
else:
return CustomNuScenesMonoDataset.DefaultAttribute[label_name]
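# A minimal usage sketch (not part of the original code), assuming the
# attribute head predicted index 5, which is 'vehicle.moving' in AttrMapping_rev2:
#   self.get_attr_name(5, 'car')     -> 'vehicle.moving'            (consistent with the class, kept)
#   self.get_attr_name(5, 'barrier') -> DefaultAttribute['barrier'] (inconsistent, fall back to default)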
def _format_bbox(self, results, jsonfile_prefix=None):
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
mapped_class_names = self.CLASSES
print('Start to convert detection format...')
CAM_NUM = 6
for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
if sample_id % CAM_NUM == 0:
boxes_per_frame = []
attrs_per_frame = []
# need to merge results from images of the same sample
annos = []
boxes, attrs = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id],
boxes, attrs,
mapped_class_names,
self.eval_detection_configs,
self.eval_version)
boxes_per_frame.extend(boxes)
attrs_per_frame.extend(attrs)
# Remove redundant predictions caused by overlap of images
if (sample_id + 1) % CAM_NUM != 0:
continue
boxes = global_nusc_box_to_cam(
self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes)
# box nms 3d over 6 images in a frame
# TODO: move this global setting into config
nms_cfg = dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.05,
score_thr=0.01,
min_bbox_size=0,
max_per_frame=500)
from mmcv import Config
nms_cfg = Config(nms_cfg)
cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev)
boxes3d = cam_boxes3d.tensor
# generate attr scores from attr labels
attrs = labels.new_tensor([attr for attr in attrs_per_frame])
boxes3d, scores, labels, attrs = box3d_multiclass_nms(
boxes3d,
cam_boxes3d_for_nms,
scores,
nms_cfg.score_thr,
nms_cfg.max_per_frame,
nms_cfg,
mlvl_attr_scores=attrs)
cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9)
det = bbox3d2result(cam_boxes3d, scores, labels, attrs)
boxes, attrs = output_to_nusc_box(det)
boxes, attrs = cam_nusc_box_to_global(
self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs,
mapped_class_names, self.eval_detection_configs,
self.eval_version)
for i, box in enumerate(boxes):
name = mapped_class_names[box.label]
attr = self.get_attr_name(attrs[i], name)
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
# other views results of the same frame should be concatenated
if sample_token in nusc_annos:
nusc_annos[sample_token].extend(annos)
else:
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
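# Shape of the file written above (a sketch of the standard nuScenes submission
# layout; values are placeholders):
#   {
#     "meta": {...self.modality flags...},
#     "results": {
#       "<sample_token>": [
#         {"sample_token": "<sample_token>",
#          "translation": [x, y, z], "size": [w, l, h],
#          "rotation": [qw, qx, qy, qz], "velocity": [vx, vy],
#          "detection_name": "car", "detection_score": 0.87,
#          "attribute_name": "vehicle.moving"},
#         ...
#       ]
#     }
#   }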
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='img_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'img_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
#from nuscenes.eval.detection.evaluate import NuScenesEval
from .nuscnes_eval import NuScenesEval_custom
output_dir = osp.join(*osp.split(result_path)[:-1])
self.nusc = NuScenes(
version=self.version, dataroot=self.data_root, verbose=False)
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
# nusc_eval = NuScenesEval(
# nusc,
# config=self.eval_detection_configs,
# result_path=result_path,
# eval_set=eval_set_map[self.version],
# output_dir=output_dir,
# verbose=False)
self.nusc_eval = NuScenesEval_custom(
self.nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=True,
overlap_test=self.overlap_test,
data_infos=self.data_infos
)
self.nusc_eval.main(render_curves=True)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['tp_errors'].items():
val = float('{:.4f}'.format(v))
detail['{}/{}'.format(metric_prefix,
self.ErrNameMapping[k])] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
def format_results(self, results, jsonfile_prefix=None, **kwargs):
"""Format the results to json (standard format for COCO evaluation).
Args:
results (list[tuple | numpy.ndarray]): Testing results of the
dataset.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
Returns:
tuple: (result_files, tmp_dir), result_files is a dict containing \
the json filepaths, tmp_dir is the temporary directory created \
for saving json files when jsonfile_prefix is not specified.
"""
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
# currently the output prediction results could be in two formats
# 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)
# 2. list of dict('pts_bbox' or 'img_bbox':
# dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...))
# this is a workaround to enable evaluation of both formats on nuScenes
# refer to https://github.com/open-mmlab/mmdetection3d/issues/449
if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
result_files = self._format_bbox(results, jsonfile_prefix)
else:
# should take the inner dict out of 'pts_bbox' or 'img_bbox' dict
result_files = dict()
for name in results[0]:
# not evaluate 2D predictions on nuScenes
if '2d' in name:
continue
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_files.update(
{name: self._format_bbox(results_, tmp_file_)})
return result_files, tmp_dir
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
result_names=['img_bbox'],
show=False,
out_dir=None,
pipeline=None):
"""Evaluation in nuScenes protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
pipeline (list[dict], optional): raw data loading for showing.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
if isinstance(result_files, dict):
results_dict = dict()
for name in result_names:
print('Evaluating bboxes of {}'.format(name))
ret_dict = self._evaluate_single(result_files[name])
results_dict.update(ret_dict)
elif isinstance(result_files, str):
results_dict = self._evaluate_single(result_files)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir, pipeline=pipeline)
return results_dict
def _extract_data(self, index, pipeline, key, load_annos=False):
"""Load data using input pipeline and extract data according to key.
Args:
index (int): Index for accessing the target data.
pipeline (:obj:`Compose`): Composed data loading pipeline.
key (str | list[str]): One single or a list of data key.
load_annos (bool): Whether to load data annotations.
If True, need to set self.test_mode as False before loading.
Returns:
np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]:
A single or a list of loaded data.
"""
assert pipeline is not None, 'data loading pipeline is not provided'
img_info = self.data_infos[index]
input_dict = dict(img_info=img_info)
if load_annos:
ann_info = self.get_ann_info(index)
input_dict.update(dict(ann_info=ann_info))
self.pre_pipeline(input_dict)
example = pipeline(input_dict)
# extract data items according to keys
if isinstance(key, str):
data = extract_result_dict(example, key)
else:
data = [extract_result_dict(example, k) for k in key]
return data
def _get_pipeline(self, pipeline):
"""Get data loading pipeline in self.show/evaluate function.
Args:
pipeline (list[dict] | None): Input pipeline. If None is given, \
get from self.pipeline.
"""
if pipeline is None:
if not hasattr(self, 'pipeline') or self.pipeline is None:
warnings.warn(
'Use default pipeline for data loading, this may cause '
'errors when data is on ceph')
return self._build_default_pipeline()
loading_pipeline = get_loading_pipeline(self.pipeline.transforms)
return Compose(loading_pipeline)
return Compose(pipeline)
def _build_default_pipeline(self):
"""Build the default pipeline for this dataset."""
pipeline = [
dict(type='LoadImageFromFileMono3D'),
dict(
type='DefaultFormatBundle3D',
class_names=self.CLASSES,
with_label=False),
dict(type='Collect3D', keys=['img'])
]
return Compose(pipeline)
def show(self, results, out_dir, show=True, pipeline=None):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
show (bool): Visualize the results online.
pipeline (list[dict], optional): raw data loading for showing.
Default: None.
"""
assert out_dir is not None, 'Expect out_dir, got none.'
pipeline = self._get_pipeline(pipeline)
for i, result in enumerate(results):
if 'img_bbox' in result.keys():
result = result['img_bbox']
data_info = self.data_infos[i]
img_path = data_info['file_name']
file_name = osp.split(img_path)[-1].split('.')[0]
img, img_metas = self._extract_data(i, pipeline,
['img', 'img_metas'])
# need to transpose channel to first dim
img = img.numpy().transpose(1, 2, 0)
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d']
pred_bboxes = result['boxes_3d']
show_multi_modality_result(
img,
gt_bboxes,
pred_bboxes,
img_metas['cam2img'],
out_dir,
file_name,
box_mode='camera',
show=show)
def output_to_nusc_box(detection):
"""Convert the output to the box class in the nuScenes.
Args:
detection (dict): Detection results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
- attrs_3d (torch.Tensor, optional): Predicted attributes.
Returns:
list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
"""
box3d = detection['boxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
attrs = None
if 'attrs_3d' in detection:
attrs = detection['attrs_3d'].numpy()
box_gravity_center = box3d.gravity_center.numpy()
box_dims = box3d.dims.numpy()
box_yaw = box3d.yaw.numpy()
# convert the dim/rot to nuscbox convention
box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]]
box_yaw = -box_yaw
box_list = []
for i in range(len(box3d)):
q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2)
quat = q2 * q1
velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8])
box = NuScenesBox(
box_gravity_center[i],
box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list, attrs
def cam_nusc_box_to_global(info,
boxes,
attrs,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from camera to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
attrs (list[int]): Predicted attribute indices aligned with the boxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
tuple: List of standard NuScenesBoxes in the global coordinate, and the
corresponding list of attribute indices.
"""
box_list = []
attr_list = []
for (box, attr) in zip(boxes, attrs):
# Move box to ego vehicle coord system
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']))
box.translate(np.array(info['cam2ego_translation']))
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
box.translate(np.array(info['ego2global_translation']))
box_list.append(box)
attr_list.append(attr)
return box_list, attr_list
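# Transform chain applied above (sketch):
#   camera -> ego:    box.rotate(cam2ego_rotation);    box.translate(cam2ego_translation)
#   range filter:     drop boxes whose ego-frame xy radius exceeds class_range[class]
#   ego -> global:    box.rotate(ego2global_rotation); box.translate(ego2global_translation)
# global_nusc_box_to_cam below applies the exact inverse: the same steps in
# reverse order, using inverse quaternions and negated translations.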
def global_nusc_box_to_cam(info,
boxes,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from global to camera coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
list: List of standard NuScenesBoxes in the camera coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
box.translate(-np.array(info['ego2global_translation']))
box.rotate(
pyquaternion.Quaternion(info['ego2global_rotation']).inverse)
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to camera coord system
box.translate(-np.array(info['cam2ego_translation']))
box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse)
box_list.append(box)
return box_list
def nusc_box_to_cam_box3d(boxes):
"""Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`.
Args:
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
Returns:
tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \
Converted 3D bounding boxes, scores and labels.
"""
locs = torch.Tensor([b.center for b in boxes]).view(-1, 3)
dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3)
rots = torch.Tensor([b.orientation.yaw_pitch_roll[0]
for b in boxes]).view(-1, 1)
velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2)
# convert nusbox to cambox convention
dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]]
rots = -rots
boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda()
cam_boxes3d = CameraInstance3DBoxes(
boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5))
scores = torch.Tensor([b.score for b in boxes]).cuda()
labels = torch.LongTensor([b.label for b in boxes]).cuda()
nms_scores = scores.new_zeros(scores.shape[0], 10 + 1)
indices = labels.new_tensor(list(range(scores.shape[0])))
nms_scores[indices, labels] = scores
return cam_boxes3d, nms_scores, labels
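# Sketch of the score layout produced above (assuming 3 boxes and the 10
# nuScenes detection classes): scores are scattered into an (N, 10 + 1) matrix
# whose last column is the background slot expected by box3d_multiclass_nms, e.g.
#   labels = [0, 3, 3], scores = [0.9, 0.5, 0.2]
#   nms_scores[0, 0] = 0.9;  nms_scores[1, 3] = 0.5;  nms_scores[2, 3] = 0.2
# and every other entry stays 0.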
import argparse
import copy
import json
import os
import random
import time
from typing import Tuple, Dict, Any

import cv2
import numpy as np
import pycocotools.mask as mask_util
import torch
import tqdm
from IPython import embed
from matplotlib import pyplot as plt
from pyquaternion import Quaternion
from torchvision.transforms.functional import rotate

from nuscenes import NuScenes
from nuscenes.eval.common.config import config_factory
from nuscenes.eval.common.data_classes import EvalBoxes
from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes
from nuscenes.eval.common.render import setup_axis
from nuscenes.eval.common.utils import quaternion_yaw, boxes_to_sensor
from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp
from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \
PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS
from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \
DetectionMetricData, DetectionMetricDataList
from nuscenes.eval.detection.evaluate import NuScenesEval
from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample
from nuscenes.eval.detection.utils import category_to_detection_name
from nuscenes.eval.tracking.data_classes import TrackingBox
from nuscenes.utils.data_classes import Box, LidarPointCloud
from nuscenes.utils.geometry_utils import points_in_box, view_points, box_in_image, BoxVisibility, transform_matrix
from nuscenes.utils.splits import create_splits_scenes
from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D
# from projects.mmdet3d_plugin.models.utils.visual import save_tensor
Axis = Any
def class_tp_curve(md_list: DetectionMetricDataList,
metrics: DetectionMetrics,
detection_name: str,
min_recall: float,
dist_th_tp: float,
savepath: str = None,
ax: Axis = None) -> None:
"""
Plot the true positive curve for the specified class.
:param md_list: DetectionMetricDataList instance.
:param metrics: DetectionMetrics instance.
:param detection_name: The detection class to plot.
:param min_recall: Minimum recall value.
:param dist_th_tp: The distance threshold used to determine matches.
:param savepath: If given, saves the rendering here instead of displaying.
:param ax: Axes onto which to render.
"""
# Get metric data for given detection class with tp distance threshold.
md = md_list[(detection_name, dist_th_tp)]
min_recall_ind = round(100 * min_recall)
if min_recall_ind <= md.max_recall_ind:
# For traffic_cone and barrier only a subset of the metrics are plotted.
rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))]
ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1
else:
ylimit = 1.0
# Prepare axis.
if ax is None:
ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1,
min_recall=min_recall)
ax.set_ylim(0, ylimit)
# Plot the recall vs. error curve for each tp metric.
for metric in TP_METRICS:
tp = metrics.get_label_tp(detection_name, metric)
# Plot only if we have valid data.
if tp is not np.nan and min_recall_ind <= md.max_recall_ind:
recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1]
else:
recall, error = [], []
# Change legend based on tp value
if tp is np.nan:
label = '{}: n/a'.format(PRETTY_TP_METRICS[metric])
elif min_recall_ind > md.max_recall_ind:
label = '{}: nan'.format(PRETTY_TP_METRICS[metric])
else:
label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric])
if metric == 'trans_err':
label += f' ({md.max_recall_ind})' # add recall
print(f'Recall: {detection_name}: {md.max_recall_ind/100}')
ax.plot(recall, error, label=label)
ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3))
ax.legend(loc='best')
if savepath is not None:
plt.savefig(savepath)
plt.close()
class DetectionBox_modified(DetectionBox):
def __init__(self, *args, token=None, visibility=None, index=None, **kwargs):
'''
add annotation token
'''
super().__init__(*args, **kwargs)
self.token = token
self.visibility = visibility
self.index = index
def serialize(self) -> dict:
""" Serialize instance into json-friendly format. """
return {
'token': self.token,
'sample_token': self.sample_token,
'translation': self.translation,
'size': self.size,
'rotation': self.rotation,
'velocity': self.velocity,
'ego_translation': self.ego_translation,
'num_pts': self.num_pts,
'detection_name': self.detection_name,
'detection_score': self.detection_score,
'attribute_name': self.attribute_name,
'visibility': self.visibility,
'index': self.index
}
@classmethod
def deserialize(cls, content: dict):
""" Initialize from serialized content. """
return cls(
token=content['token'],
sample_token=content['sample_token'],
translation=tuple(content['translation']),
size=tuple(content['size']),
rotation=tuple(content['rotation']),
velocity=tuple(content['velocity']),
ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content
else tuple(content['ego_translation']),
num_pts=-1 if 'num_pts' not in content else int(content['num_pts']),
detection_name=content['detection_name'],
detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']),
attribute_name=content['attribute_name'],
visibility=content['visibility'],
index=content['index'],
)
def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool:
"""
Check if a box is visible inside an image without accounting for occlusions.
:param box: The box to be checked.
:param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
:param imsize: (width, height).
:param vis_level: One of the enumerations of <BoxVisibility>.
:return True if visibility condition is satisfied.
"""
center_3d = box.center.reshape(3, 1)
center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :]
visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0])
visible = np.logical_and(visible, center_img[1, :] < imsize[1])
visible = np.logical_and(visible, center_img[1, :] > 0)
visible = np.logical_and(visible, center_3d[2, :] > 1)
in_front = center_3d[2, :] > 0.1  # True if the box center is at least 0.1 meter in front of the camera.
if vis_level == BoxVisibility.ALL:
return all(visible) and all(in_front)
elif vis_level == BoxVisibility.ANY:
return any(visible) and all(in_front)
elif vis_level == BoxVisibility.NONE:
return True
else:
raise ValueError("vis_level: {} not valid".format(vis_level))
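# Projection used above (sketch): with camera intrinsics K (3x3) and the
# camera-frame center p = (x, y, z)^T,
#   [u, v, 1]^T ~ K @ p / z        (view_points with normalize=True)
# The center counts as visible when (u, v) falls inside the image, z > 1 m,
# and the in_front guard z > 0.1 m holds.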
def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int],
vis_level: int = BoxVisibility.ANY) -> bool:
"""
Check if a box is visible in the image but not all of its corners are inside the image.
:param box: The box to be checked.
:param intrinsic: <float: 3, 3>. Intrinsic camera matrix.
:param imsize: (width, height).
:param vis_level: One of the enumerations of <BoxVisibility>.
:return True if visibility condition is satisfied.
"""
corners_3d = box.corners()
corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :]
visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0])
visible = np.logical_and(visible, corners_img[1, :] < imsize[1])
visible = np.logical_and(visible, corners_img[1, :] > 0)
visible = np.logical_and(visible, corners_3d[2, :] > 1)
in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera.
if any(visible) and not all(visible) and all(in_front):
return True
else:
return False
def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False):
"""
Loads ground truth boxes from DB.
:param nusc: A NuScenes instance.
:param eval_split: The evaluation split for which we load GT boxes.
:param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox.
:param verbose: Whether to print messages to stdout.
:return: The GT boxes.
"""
# Init.
if box_cls == DetectionBox_modified:
attribute_map = {a['token']: a['name'] for a in nusc.attribute}
if verbose:
print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version))
# Read out all sample_tokens in DB.
sample_tokens_all = [s['token'] for s in nusc.sample]
assert len(sample_tokens_all) > 0, "Error: Database has no samples!"
# Only keep samples from this split.
splits = create_splits_scenes()
# Check compatibility of split with nusc_version.
version = nusc.version
if eval_split in {'train', 'val', 'train_detect', 'train_track'}:
assert version.endswith('trainval'), \
'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
elif eval_split in {'mini_train', 'mini_val'}:
assert version.endswith('mini'), \
'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
elif eval_split == 'test':
assert version.endswith('test'), \
'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version)
else:
raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.'
.format(eval_split))
if eval_split == 'test':
# Check that you aren't trying to cheat :).
assert len(nusc.sample_annotation) > 0, \
'Error: You are trying to evaluate on the test set but you do not have the annotations!'
index_map = {}
for scene in nusc.scene:
first_sample_token = scene['first_sample_token']
sample = nusc.get('sample', first_sample_token)
index_map[first_sample_token] = 1
index = 2
while sample['next'] != '':
sample = nusc.get('sample', sample['next'])
index_map[sample['token']] = index
index += 1
sample_tokens = []
for sample_token in sample_tokens_all:
scene_token = nusc.get('sample', sample_token)['scene_token']
scene_record = nusc.get('scene', scene_token)
if scene_record['name'] in splits[eval_split]:
sample_tokens.append(sample_token)
all_annotations = EvalBoxes()
# Load annotations and filter predictions and annotations.
tracking_id_set = set()
for sample_token in tqdm.tqdm(sample_tokens, leave=verbose):
sample = nusc.get('sample', sample_token)
sample_annotation_tokens = sample['anns']
sample_boxes = []
for sample_annotation_token in sample_annotation_tokens:
sample_annotation = nusc.get('sample_annotation', sample_annotation_token)
if box_cls == DetectionBox_modified:
# Get label name in detection task and filter unused labels.
detection_name = category_to_detection_name(sample_annotation['category_name'])
if detection_name is None:
continue
# Get attribute_name.
attr_tokens = sample_annotation['attribute_tokens']
attr_count = len(attr_tokens)
if attr_count == 0:
attribute_name = ''
elif attr_count == 1:
attribute_name = attribute_map[attr_tokens[0]]
else:
raise Exception('Error: GT annotations must not have more than one attribute!')
sample_boxes.append(
box_cls(
token=sample_annotation_token,
sample_token=sample_token,
translation=sample_annotation['translation'],
size=sample_annotation['size'],
rotation=sample_annotation['rotation'],
velocity=nusc.box_velocity(sample_annotation['token'])[:2],
num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'],
detection_name=detection_name,
detection_score=-1.0, # GT samples do not have a score.
attribute_name=attribute_name,
visibility=sample_annotation['visibility_token'],
index=index_map[sample_token]
)
)
elif box_cls == TrackingBox:
assert False
else:
raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls)
all_annotations.add_boxes(sample_token, sample_boxes)
if verbose:
print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens)))
return all_annotations
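# Typical call (sketch, mirroring how NuScenesEval_custom below uses it):
#   gt_boxes = load_gt(nusc, 'val', DetectionBox_modified, verbose=True)
# Each returned box carries the extra token / visibility / index fields; index
# is the 1-based position of the sample within its scene, taken from the
# index_map built above.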
def filter_eval_boxes_by_id(nusc: NuScenes,
eval_boxes: EvalBoxes,
id=None,
verbose: bool = False) -> EvalBoxes:
"""
Filters boxes, keeping only those whose annotation token is contained in the given id set.
:param nusc: An instance of the NuScenes class.
:param eval_boxes: An instance of the EvalBoxes class.
:param id: The annotation token set used to keep boxes.
:param verbose: Whether to print to stdout.
"""
# Accumulators for number of filtered boxes.
total, anns_filter = 0, 0
for ind, sample_token in enumerate(eval_boxes.sample_tokens):
# Filter on anns
total += len(eval_boxes[sample_token])
filtered_boxes = []
for box in eval_boxes[sample_token]:
if box.token in id:
filtered_boxes.append(box)
anns_filter += len(filtered_boxes)
eval_boxes.boxes[sample_token] = filtered_boxes
if verbose:
print("=> Original number of boxes: %d" % total)
print("=> After anns based filtering: %d" % anns_filter)
return eval_boxes
def filter_eval_boxes_by_visibility(
ori_eval_boxes: EvalBoxes,
visibility=None,
verbose: bool = False) -> EvalBoxes:
"""
Filters boxes, keeping only those whose visibility token matches the given value.
:param ori_eval_boxes: An instance of the EvalBoxes class (a deep copy is filtered, the input is left unmodified).
:param visibility: The visibility token ('1'-'4') used to keep boxes.
:param verbose: Whether to print to stdout.
"""
# Accumulators for number of filtered boxes.
eval_boxes = copy.deepcopy(ori_eval_boxes)
total, anns_filter = 0, 0
for ind, sample_token in enumerate(eval_boxes.sample_tokens):
# Filter on anns
total += len(eval_boxes[sample_token])
filtered_boxes = []
for box in eval_boxes[sample_token]:
if box.visibility == visibility:
filtered_boxes.append(box)
anns_filter += len(filtered_boxes)
eval_boxes.boxes[sample_token] = filtered_boxes
if verbose:
print("=> Original number of boxes: %d" % total)
print("=> After visibility based filtering: %d" % anns_filter)
return eval_boxes
def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False):
eval_boxes = copy.deepcopy(ori_eval_boxes)
for sample_token in eval_boxes.sample_tokens:
if sample_token not in valid_sample_tokens:
eval_boxes.boxes.pop(sample_token)
return eval_boxes
def filter_eval_boxes_by_overlap(nusc: NuScenes,
eval_boxes: EvalBoxes,
verbose: bool = False) -> EvalBoxes:
"""
Applies filtering to boxes based on camera-view overlap.
:param nusc: An instance of the NuScenes class.
:param eval_boxes: An instance of the EvalBoxes class.
:param verbose: Whether to print to stdout.
"""
# Accumulators for number of filtered boxes.
cams = ['CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_BACK_RIGHT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_FRONT_LEFT']
total, anns_filter = 0, 0
for ind, sample_token in enumerate(eval_boxes.sample_tokens):
# Filter on anns
total += len(eval_boxes[sample_token])
sample_record = nusc.get('sample', sample_token)
filtered_boxes = []
for box in eval_boxes[sample_token]:
count = 0
for cam in cams:
'''
copy-pasted from the nuScenes devkit
'''
sample_data_token = sample_record['data'][cam]
sd_record = nusc.get('sample_data', sample_data_token)
cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
sensor_record = nusc.get('sensor', cs_record['sensor_token'])
pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
cam_intrinsic = np.array(cs_record['camera_intrinsic'])
imsize = (sd_record['width'], sd_record['height'])
new_box = Box(box.translation, box.size, Quaternion(box.rotation),
name=box.detection_name, token='')
# Move box to ego vehicle coord system.
new_box.translate(-np.array(pose_record['translation']))
new_box.rotate(Quaternion(pose_record['rotation']).inverse)
# Move box to sensor coord system.
new_box.translate(-np.array(cs_record['translation']))
new_box.rotate(Quaternion(cs_record['rotation']).inverse)
if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
count += 1
# if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
# count += 1
if count > 1:
with open('center_overlap.txt', 'a') as f:
try:
f.write(box.token + '\n')
except:
pass
filtered_boxes.append(box)
anns_filter += len(filtered_boxes)
eval_boxes.boxes[sample_token] = filtered_boxes
verbose = True
if verbose:
print("=> Original number of boxes: %d" % total)
print("=> After anns based filtering: %d" % anns_filter)
return eval_boxes
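# Criterion used above (sketch): a box is kept only when its center projects
# inside more than one of the six camera images (count > 1), i.e. it lies in
# the overlap region between adjacent views; its token is also appended to
# center_overlap.txt for inspection.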
class NuScenesEval_custom(NuScenesEval):
"""
Customized nuScenes detection evaluation. Extends the official NuScenesEval
with overlap-based box filtering and GT subsets selected by visibility or by
frame index within a scene.
"""
def __init__(self,
nusc: NuScenes,
config: DetectionConfig,
result_path: str,
eval_set: str,
output_dir: str = None,
verbose: bool = True,
overlap_test=False,
eval_mask=False,
data_infos=None
):
"""
Initialize a DetectionEval object.
:param nusc: A NuScenes object.
:param config: A DetectionConfig object.
:param result_path: Path of the nuScenes JSON result file.
:param eval_set: The dataset split to evaluate on, e.g. train, val or test.
:param output_dir: Folder to save plots and results to.
:param verbose: Whether to print to stdout.
:param overlap_test: If True, keep only boxes whose centers project into the
overlap region of adjacent cameras (see filter_eval_boxes_by_overlap).
:param eval_mask: Optional flag stored on the evaluator.
:param data_infos: Dataset infos passed in by the calling dataset.
"""
self.nusc = nusc
self.result_path = result_path
self.eval_set = eval_set
self.output_dir = output_dir
self.verbose = verbose
self.cfg = config
self.overlap_test = overlap_test
self.eval_mask = eval_mask
self.data_infos = data_infos
# Check result file exists.
assert os.path.exists(result_path), 'Error: The result file does not exist!'
# Make dirs.
self.plot_dir = os.path.join(self.output_dir, 'plots')
if not os.path.isdir(self.output_dir):
os.makedirs(self.output_dir)
if not os.path.isdir(self.plot_dir):
os.makedirs(self.plot_dir)
# Load data.
if verbose:
print('Initializing nuScenes detection evaluation')
self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
verbose=verbose)
self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
"Samples in split doesn't match samples in predictions."
# Add center distances.
self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
# Filter boxes (distance, points per box, etc.).
if verbose:
print('Filtering predictions')
self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose)
if verbose:
print('Filtering ground truth annotations')
self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose)
if self.overlap_test:
self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes)
self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True)
self.all_gt = copy.deepcopy(self.gt_boxes)
self.all_preds = copy.deepcopy(self.pred_boxes)
self.sample_tokens = self.gt_boxes.sample_tokens
self.index_map = {}
for scene in nusc.scene:
first_sample_token = scene['first_sample_token']
sample = nusc.get('sample', first_sample_token)
self.index_map[first_sample_token] = 1
index = 2
while sample['next'] != '':
sample = nusc.get('sample', sample['next'])
self.index_map[sample['token']] = index
index += 1
def update_gt(self, type_='vis', visibility='1', index=1):
if type_ == 'vis':
self.visibility_test = True
if self.visibility_test:
'''[{'description': 'visibility of whole object is between 0 and 40%',
'token': '1',
'level': 'v0-40'},
{'description': 'visibility of whole object is between 40 and 60%',
'token': '2',
'level': 'v40-60'},
{'description': 'visibility of whole object is between 60 and 80%',
'token': '3',
'level': 'v60-80'},
{'description': 'visibility of whole object is between 80 and 100%',
'token': '4',
'level': 'v80-100'}]'''
self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True)
elif type_ == 'ord':
valid_tokens = [key for (key, value) in self.index_map.items() if value == index]
# from IPython import embed
# embed()
self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens)
self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens)
self.sample_tokens = self.gt_boxes.sample_tokens
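# Example (sketch): restrict evaluation to boxes with visibility token '4'
# (80-100% visible), or to the k-th frame of every scene:
#   nusc_eval.update_gt(type_='vis', visibility='4')
#   nusc_eval.update_gt(type_='ord', index=k)
# followed by nusc_eval.main(...) as in the __main__ block below.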
def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]:
"""
Performs the actual evaluation.
:return: A tuple of high-level and the raw metric data.
"""
start_time = time.time()
# -----------------------------------
# Step 1: Accumulate metric data for all classes and distance thresholds.
# -----------------------------------
if self.verbose:
print('Accumulating metric data...')
metric_data_list = DetectionMetricDataList()
# print(self.cfg.dist_fcn_callable, self.cfg.dist_ths)
# self.cfg.dist_ths = [0.3]
# self.cfg.dist_fcn_callable
for class_name in self.cfg.class_names:
for dist_th in self.cfg.dist_ths:
md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th)
metric_data_list.set(class_name, dist_th, md)
# -----------------------------------
# Step 2: Calculate metrics from the data.
# -----------------------------------
if self.verbose:
print('Calculating metrics...')
metrics = DetectionMetrics(self.cfg)
for class_name in self.cfg.class_names:
# Compute APs.
for dist_th in self.cfg.dist_ths:
metric_data = metric_data_list[(class_name, dist_th)]
ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision)
metrics.add_label_ap(class_name, dist_th, ap)
# Compute TP metrics.
for metric_name in TP_METRICS:
metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)]
if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']:
tp = np.nan
elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']:
tp = np.nan
else:
tp = calc_tp(metric_data, self.cfg.min_recall, metric_name)
metrics.add_label_tp(class_name, metric_name, tp)
# Compute evaluation time.
metrics.add_runtime(time.time() - start_time)
return metrics, metric_data_list
def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None:
"""
Renders various PR and TP curves.
:param metrics: DetectionMetrics instance.
:param md_list: DetectionMetricDataList instance.
"""
if self.verbose:
print('Rendering PR and TP curves')
def savepath(name):
return os.path.join(self.plot_dir, name + '.pdf')
summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall,
dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary'))
for detection_name in self.cfg.class_names:
class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall,
savepath=savepath(detection_name + '_pr'))
class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp,
savepath=savepath(detection_name + '_tp'))
for dist_th in self.cfg.dist_ths:
dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall,
savepath=savepath('dist_pr_' + str(dist_th)))
if __name__ == "__main__":
# Settings.
parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('result_path', type=str, help='The submission as a JSON file.')
parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics',
help='Folder to store result metrics, graphs and example visualizations.')
parser.add_argument('--eval_set', type=str, default='val',
help='Which dataset split to evaluate on, train, val or test.')
parser.add_argument('--dataroot', type=str, default='data/nuscenes',
help='Default nuScenes data directory.')
parser.add_argument('--version', type=str, default='v1.0-trainval',
help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.')
parser.add_argument('--config_path', type=str, default='',
help='Path to the configuration file.'
'If no path given, the CVPR 2019 configuration will be used.')
parser.add_argument('--plot_examples', type=int, default=0,
help='How many example visualizations to write to disk.')
parser.add_argument('--render_curves', type=int, default=1,
help='Whether to render PR and TP curves to disk.')
parser.add_argument('--verbose', type=int, default=1,
help='Whether to print to stdout.')
args = parser.parse_args()
result_path_ = os.path.expanduser(args.result_path)
output_dir_ = os.path.expanduser(args.output_dir)
eval_set_ = args.eval_set
dataroot_ = args.dataroot
version_ = args.version
config_path = args.config_path
plot_examples_ = args.plot_examples
render_curves_ = bool(args.render_curves)
verbose_ = bool(args.verbose)
if config_path == '':
cfg_ = config_factory('detection_cvpr_2019')
else:
with open(config_path, 'r') as _f:
cfg_ = DetectionConfig.deserialize(json.load(_f))
nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_,
output_dir=output_dir_, verbose=verbose_)
for vis in ['1', '2', '3', '4']:
nusc_eval.update_gt(type_='vis', visibility=vis)
print(f'================ {vis} ===============')
nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_)
#for index in range(1, 41):
# nusc_eval.update_gt(type_='ord', index=index)
#
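# Example invocation (sketch, paths are placeholders):
#   python nuscnes_eval.py work_dirs/results_nusc.json \
#       --output_dir work_dirs/metrics --eval_set val \
#       --dataroot data/nuscenes --version v1.0-trainval --render_curves 1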
from .transform_3d import (
PadMultiViewImage, NormalizeMultiviewImage,
PhotoMetricDistortionMultiViewImage, CustomCollect3D, RandomScaleImageMultiViewImage)
from .formating import CustomDefaultFormatBundle3D
from .augmentation import (CropResizeFlipImage, GlobalRotScaleTransImage)
from .dd3d_mapper import DD3DMapper
__all__ = [
'PadMultiViewImage', 'NormalizeMultiviewImage',
'PhotoMetricDistortionMultiViewImage', 'CustomDefaultFormatBundle3D', 'CustomCollect3D',
'RandomScaleImageMultiViewImage',
'CropResizeFlipImage', 'GlobalRotScaleTransImage',
'DD3DMapper',
]
import numpy as np
import torch
import mmcv
from mmdet.datasets.builder import PIPELINES
from PIL import Image
import random
@PIPELINES.register_module()
class CropResizeFlipImage(object):
"""Fixed Crop and then randim resize and flip the image. Note the flip requires to flip the feature in the network
ida_aug_conf = {
"reisze": [576, 608, 640, 672, 704] # stride of 32 based on 640 (0.9, 1.1)
"reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768] # (0.8, 1.2)
"reisze": [448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832] # (0.7, 1.3)
"crop": (0, 260, 1600, 900),
"H": 900,
"W": 1600,
"rand_flip": True,
}
Args:
data_aug_conf (dict): Augmentation config, as illustrated above ('reisze' is the literal key name the code expects).
training (bool): Whether in training mode (random resize and random flip).
debug (bool): If True, save debug images with projected lidar points.
"""
def __init__(self, data_aug_conf=None, training=True, debug=False):
self.data_aug_conf = data_aug_conf
self.training = training
self.debug = debug
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
if 'aug_param' not in results:
results['aug_param'] = {}
imgs = results["img"]
N = len(imgs)
new_imgs = []
resize, resize_dims, crop, flip = self._sample_augmentation(results)
if self.debug:
# unique id per img
from uuid import uuid4
uid = uuid4()
# lidar is RFU in nuscenes
lidar_pts = np.array([
[10, 30, -2, 1],
[-10, 30, -2, 1],
[5, 15, -2, 1],
[-5, 15, -2, 1],
[30, 0, -2, 1],
[-30, 0, -2, 1],
[10, -30, -2, 1],
[-10, -30, -2, 1]
], dtype=np.float32).T
for i in range(N):
img = Image.fromarray(np.uint8(imgs[i]))
if self.debug:
pts_to_img_pre_aug = results['lidar2img'][i] @ lidar_pts
pts_to_img_pre_aug = pts_to_img_pre_aug / pts_to_img_pre_aug[2:3,
:] # div by the depth component in homogenous vector
img_copy = Image.fromarray(np.uint8(imgs[i]))
for j in range(pts_to_img_pre_aug.shape[1]):
x, y = int(pts_to_img_pre_aug[0, j]), int(pts_to_img_pre_aug[1, j])
if (0 < x < img_copy.width) and (0 < y < img_copy.height):
img_copy.putpixel((x - 1, y - 1), (255, 0, 0))
img_copy.putpixel((x - 1, y), (255, 0, 0))
img_copy.putpixel((x - 1, y + 1), (255, 0, 0))
img_copy.putpixel((x, y - 1), (0, 255, 0))
img_copy.putpixel((x, y), (0, 255, 0))
img_copy.putpixel((x, y + 1), (0, 255, 0))
img_copy.putpixel((x + 1, y - 1), (0, 0, 255))
img_copy.putpixel((x + 1, y), (0, 0, 255))
img_copy.putpixel((x + 1, y + 1), (0, 0, 255))
img_copy.save(f'pre_aug_{uid}_{i}.png')
# augmentation (resize, crop, horizontal flip, rotate)
# resize, resize_dims, crop, flip, rotate = self._sample_augmentation() ###different view use different aug (BEV Det)
img, ida_mat = self._img_transform(
img,
resize=resize,
resize_dims=resize_dims,
crop=crop,
flip=flip,
)
new_imgs.append(np.array(img).astype(np.float32))
results['cam2img'][i][:3, :3] = np.matmul(ida_mat, results['cam2img'][i][:3, :3])
if self.debug:
pts_to_img_post_aug = np.matmul(results['cam2img'][i], results['lidar2cam'][i]) @ lidar_pts
pts_to_img_post_aug = pts_to_img_post_aug / pts_to_img_post_aug[2:3,
:] # div by the depth component in homogenous vector
for j in range(pts_to_img_post_aug.shape[1]):
x, y = int(pts_to_img_post_aug[0, j]), int(pts_to_img_post_aug[1, j])
if (0 < x < img.width) and (0 < y < img.height):
img.putpixel((x - 1, y - 1), (255, 0, 0))
img.putpixel((x - 1, y), (255, 0, 0))
img.putpixel((x - 1, y + 1), (255, 0, 0))
img.putpixel((x, y - 1), (0, 255, 0))
img.putpixel((x, y), (0, 255, 0))
img.putpixel((x, y + 1), (0, 255, 0))
img.putpixel((x + 1, y - 1), (0, 0, 255))
img.putpixel((x + 1, y), (0, 0, 255))
img.putpixel((x + 1, y + 1), (0, 0, 255))
img.save(f'post_aug_{uid}_{i}.png')
if 'mono_ann_idx' in results.keys():
# apply transform to dd3d intrinsics
if i in results['mono_ann_idx'].data:
mono_index = results['mono_ann_idx'].data.index(i)
intrinsics = results['mono_input_dict'][mono_index]['intrinsics']
if torch.is_tensor(intrinsics):
intrinsics = intrinsics.numpy().reshape(3, 3).astype(np.float32)
elif isinstance(intrinsics, np.ndarray):
intrinsics = intrinsics.reshape(3, 3).astype(np.float32)
else:
intrinsics = np.array(intrinsics, dtype=np.float32).reshape(3, 3)
results['mono_input_dict'][mono_index]['intrinsics'] = np.matmul(ida_mat, intrinsics)
results['mono_input_dict'][mono_index]['height'] = img.size[1]
results['mono_input_dict'][mono_index]['width'] = img.size[0]
# apply transform to dd3d box
for ann in results['mono_input_dict'][mono_index]['annotations']:
# bbox_mode = BoxMode.XYXY_ABS
box = self._box_transform(ann['bbox'], resize, crop, flip, img.size[0])[0]
box = box.clip(min=0)
box = np.minimum(box, list(img.size + img.size))
ann["bbox"] = box
results["img"] = new_imgs
results['lidar2img'] = [np.matmul(results['cam2img'][i], results['lidar2cam'][i]) for i in
range(len(results['lidar2cam']))]
return results
def _box_transform(self, box, resize, crop, flip, img_width):
box = np.array([box])
idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
coords = np.asarray(box).reshape(-1, 4)[:, idxs].reshape(-1, 2)
# crop
coords[:, 0] -= crop[0]
coords[:, 1] -= crop[1]
# resize
coords[:, 0] = coords[:, 0] * resize
coords[:, 1] = coords[:, 1] * resize
coords = coords.reshape((-1, 4, 2))
minxy = coords.min(axis=1)
maxxy = coords.max(axis=1)
trans_box = np.concatenate((minxy, maxxy), axis=1)
return trans_box
def _img_transform(self, img, resize, resize_dims, crop, flip):
ida_rot = np.eye(2)
ida_tran = np.zeros(2)
# adjust image
img = img.crop(crop)
img = img.resize(resize_dims)
if flip:
img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
# post-homography transformation
ida_rot *= resize
ida_tran -= np.array(crop[:2]) * resize
ida_mat = np.eye(3)
ida_mat[:2, :2] = ida_rot
ida_mat[:2, 2] = ida_tran
return img, ida_mat
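# The 3x3 matrix returned above encodes (sketch), with r = resize and
# (cx, cy) = crop[:2]:
#   [u', v', 1]^T = ida_mat @ [u, v, 1]^T
#   ida_mat = [[r, 0, -r*cx],
#              [0, r, -r*cy],
#              [0, 0,     1]]
# i.e. crop first, then scale. The horizontal flip is not folded into ida_mat
# here (the class docstring notes the flip must be compensated in the network
# features).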
def _sample_augmentation(self, results):
if 'CropResizeFlipImage_param' in results['aug_param'].keys():
return results['aug_param']['CropResizeFlipImage_param']
crop = self.data_aug_conf["crop"]
if self.training:
resized_h = random.choice(self.data_aug_conf["reisze"])
resized_w = resized_h / (crop[3] - crop[1]) * (crop[2] - crop[0])
resize = resized_h / (crop[3] - crop[1])
resize_dims = (int(resized_w), int(resized_h))
flip = False
if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]):
flip = True
else:
resized_h = random.choice(self.data_aug_conf["reisze"])
assert len(self.data_aug_conf["reisze"]) == 1
resized_w = resized_h / (crop[3] - crop[1]) * (crop[2] - crop[0])
resize = resized_h / (crop[3] - crop[1])
resize_dims = (int(resized_w), int(resized_h))
flip = False
results['aug_param']['CropResizeFlipImage_param'] = (resize, resize_dims, crop, flip)
return resize, resize_dims, crop, flip
@PIPELINES.register_module()
class GlobalRotScaleTransImage(object):
"""Random resize, Crop and flip the image
Args:
size (tuple, optional): Fixed padding size.
"""
def __init__(
self,
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
reverse_angle=False,
training=True,
flip_dx_ratio=0.5,
flip_dy_ratio=0.5,
only_gt=False,
):
self.rot_range = rot_range
self.scale_ratio_range = scale_ratio_range
self.translation_std = translation_std
self.reverse_angle = reverse_angle
self.training = training
self.flip_dx_ratio = flip_dx_ratio
self.flip_dy_ratio = flip_dy_ratio
self.only_gt = only_gt
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
if 'aug_param' not in results:
results['aug_param'] = {}
rot_angle, scale_ratio, flip_dx, flip_dy, _, _ = self._sample_augmentation(results)
# random rotate
if not self.only_gt:
self.rotate_bev_along_z(results, rot_angle)
if self.reverse_angle:
rot_angle *= -1
results["gt_bboxes_3d"].rotate(
np.array(rot_angle)
)
# random scale
if not self.only_gt:
self.scale_xyz(results, scale_ratio)
results["gt_bboxes_3d"].scale(scale_ratio)
# random flip
if flip_dx:
if not self.only_gt:
self.flip_along_x(results)
results["gt_bboxes_3d"].flip(bev_direction='vertical')
if flip_dy:
if not self.only_gt:
self.flip_along_y(results)
results["gt_bboxes_3d"].flip(bev_direction='horizontal')
# TODO: support translation
return results
def _sample_augmentation(self, results):
if 'GlobalRotScaleTransImage_param' in results['aug_param'].keys():
return results['aug_param']['GlobalRotScaleTransImage_param']
else:
rot_angle = np.random.uniform(*self.rot_range) / 180 * np.pi
scale_ratio = np.random.uniform(*self.scale_ratio_range)
flip_dx = np.random.uniform() < self.flip_dx_ratio
flip_dy = np.random.uniform() < self.flip_dy_ratio
# generate bda_mat
rot_sin = torch.sin(torch.tensor(rot_angle))
rot_cos = torch.cos(torch.tensor(rot_angle))
rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0],
[0, 0, 1]])
scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0],
[0, 0, scale_ratio]])
flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
if flip_dx:
flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0],
[0, 0, 1]])
if flip_dy:
flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0],
[0, 0, 1]])
bda_mat = flip_mat @ (scale_mat @ rot_mat)
bda_mat = torch.inverse(bda_mat)
results['aug_param']['GlobalRotScaleTransImage_param'] = (
rot_angle, scale_ratio, flip_dx, flip_dy, bda_mat, self.only_gt)
return rot_angle, scale_ratio, flip_dx, flip_dy, bda_mat, self.only_gt
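# Sketch of what is sampled above: the BEV augmentation matrix
#   bda_mat = (flip_mat @ scale_mat @ rot_mat)^-1
# is cached in results['aug_param'] together with the scalar parameters, so a
# second call on the same results dict reuses the identical augmentation.
# The lidar2img / lidar2cam updates below instead right-multiply by the
# inverse of each individual transform.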
def rotate_bev_along_z(self, results, angle):
rot_cos = np.cos(angle)
rot_sin = np.sin(angle)
rot_mat = np.array([[rot_cos, -rot_sin, 0, 0], [rot_sin, rot_cos, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])
rot_mat_inv = np.linalg.inv(rot_mat)
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = np.matmul(results["lidar2img"][view], rot_mat_inv)
results['lidar2cam'][view] = np.matmul(results['lidar2cam'][view], rot_mat_inv)
return
def scale_xyz(self, results, scale_ratio):
scale_mat = np.array(
[
[scale_ratio, 0, 0, 0],
[0, scale_ratio, 0, 0],
[0, 0, scale_ratio, 0],
[0, 0, 0, 1],
]
)
scale_mat_inv = np.linalg.inv(scale_mat)
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = np.matmul(results["lidar2img"][view], scale_mat_inv)
results['lidar2cam'][view] = np.matmul(results['lidar2cam'][view], scale_mat_inv)
return
def flip_along_x(self, results):
flip_mat = np.array(
[
[-1, 0, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
]
).astype(np.float32)
flip_mat_inv = np.linalg.inv(flip_mat)
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = np.matmul(results["lidar2img"][view], flip_mat_inv)
results['lidar2cam'][view] = np.matmul(results['lidar2cam'][view], flip_mat_inv)
return
def flip_along_y(self, results):
flip_mat = np.array(
[
[1, 0, 0, 0],
[0, -1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1],
]
).astype(np.float32)
flip_mat_inv = np.linalg.inv(flip_mat)
num_view = len(results["lidar2img"])
for view in range(num_view):
results["lidar2img"][view] = np.matmul(results["lidar2img"][view], flip_mat_inv)
results['lidar2cam'][view] = np.matmul(results['lidar2cam'][view], flip_mat_inv)
return
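# Example pipeline entries (a sketch; the dict keys follow the constructors
# above, and 'reisze' is the literal key name the code expects):
#   dict(type='CropResizeFlipImage',
#        data_aug_conf=dict(reisze=[576, 608, 640, 672, 704],
#                           crop=(0, 260, 1600, 900),
#                           H=900, W=1600, rand_flip=True),
#        training=True),
#   dict(type='GlobalRotScaleTransImage',
#        rot_range=[-0.3925, 0.3925],
#        scale_ratio_range=[0.95, 1.05],
#        reverse_angle=False,
#        training=True),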
import copy
import numpy as np
import torch
from mmcv.parallel.data_container import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
from projects.mmdet3d_plugin.dd3d.datasets.transform_utils import annotations_to_instances
from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
from projects.mmdet3d_plugin.dd3d.utils.tasks import TaskManager
@PIPELINES.register_module()
class DD3DMapper:
def __init__(self,
is_train: bool = True,
tasks=dict(box2d_on=True, box3d_on=True),
):
self.is_train = is_train
self.task_manager = TaskManager(**tasks)
def __call__(self, results):
if results['mono_input_dict'] is None:
return results
mono_input_dict = []
for dataset_dict in results['mono_input_dict']:
dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
image_shape = results['img'].data.shape[-2:]
intrinsics = None
if "intrinsics" in dataset_dict:
intrinsics = dataset_dict['intrinsics']
if not torch.is_tensor(intrinsics):
intrinsics = np.reshape(
intrinsics,
(3, 3),
).astype(np.float32)
intrinsics = torch.as_tensor(intrinsics)
# NOTE: intrinsics = transforms.apply_intrinsics(intrinsics)
dataset_dict["intrinsics"] = intrinsics
dataset_dict["inv_intrinsics"] = torch.linalg.inv(dataset_dict['intrinsics'])
if "pose" in dataset_dict:
pose = Pose(wxyz=np.float32(dataset_dict["pose"]["wxyz"]),
tvec=np.float32(dataset_dict["pose"]["tvec"]))
dataset_dict["pose"] = pose
# NOTE: no transforms affect global pose.
if "extrinsics" in dataset_dict:
extrinsics = Pose(
wxyz=np.float32(dataset_dict["extrinsics"]["wxyz"]),
tvec=np.float32(dataset_dict["extrinsics"]["tvec"])
)
dataset_dict["extrinsics"] = extrinsics
if not self.task_manager.has_detection_task:
dataset_dict.pop("annotations", None)
if "annotations" in dataset_dict:
for anno in dataset_dict["annotations"]:
if not self.task_manager.has_detection_task:
anno.pop("bbox", None)
anno.pop("bbox_mode", None)
if not self.task_manager.box3d_on:
anno.pop("bbox3d", None)
annos = [anno for anno in dataset_dict["annotations"] if anno.get("iscrowd", 0) == 0]
if annos and 'bbox3d' in annos[0]:
# Remove boxes with negative z-value for center.
annos = [anno for anno in annos if anno['bbox3d'][6] > 0]
instances = annotations_to_instances(
annos,
image_shape, # TODO: the effect of the shape?
intrinsics=intrinsics.numpy(),
)
if self.is_train:
# instances = d2_utils.filter_empty_instances(instances)
m = instances.gt_boxes.nonempty(threshold=1e-5)
instances = instances[m]
annos = [anno for tmp_m, anno in zip(m, annos) if tmp_m]
dataset_dict["instances"] = instances
dataset_dict['annotations'] = annos
mono_input_dict.append(dataset_dict)
# TODO: drop batch that has no annotations?
box_num = 0
for dataset_dict in mono_input_dict:
box_num += dataset_dict["instances"].gt_boxes.tensor.shape[0]
if box_num == 0:
return None
mono_input_dict = DC(mono_input_dict, cpu_only=True)
results['mono_input_dict'] = mono_input_dict
return results
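# Usage note (sketch): DD3DMapper runs as a pipeline step after the image
# augmentations. It turns the per-image DD3D annotation dicts into
# detectron2-style Instances (via annotations_to_instances) and returns None
# when no valid boxes remain, which mmdet-style datasets typically treat as a
# signal to resample another index during training.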