Commit 7c0c60e3 authored by change3n8's avatar change3n8
Browse files

init

parents
*.pyc
*.npy
*.pth
\ No newline at end of file
MIT License
Copyright (c) 2024 Horizon Robotics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# build
cd projects/mmdet3d_plugin/ops/
python3 -m pip install -e . --no-build-isolation
# run the demo
python3 forward.py
\ No newline at end of file
import torch
import deformable_aggregation_ext.deformable_aggregation_ext as da
# Problem sizes for checking the deformable-aggregation CUDA operator
# against its reference implementation.
bs = 6          # batch size
cam = 6         # number of cameras
num_feat = 89760  # total flattened feature length across all cameras/scales
c = 256         # feature channels
scale = 4       # number of feature-map scales
group = 8       # attention groups
anchor = 1220   # number of anchors
pts = 13        # sampling points per anchor
H, W = 44, 85  # H*W = 3740, 3740 * 6 * 4 = 89760
spatial_shape = torch.tensor([[[H, W]] * scale] * cam, dtype=torch.int32)  # [cam, scale, 2]
scale_start_index = torch.zeros((cam, scale), dtype=torch.int32)
feat_area_per_map = H * W  # 3740
# Flattened start offset of each (camera, scale) feature map.
for i in range(cam):
    for s in range(scale):
        idx = i * scale + s
        scale_start_index[i, s] = idx * feat_area_per_map
# Verify that the last offset plus one map's area equals num_feat.
last_offset = scale_start_index[cam - 1, scale - 1]
assert last_offset + feat_area_per_map == num_feat, "Total feature size mismatch!"
# print("scale_start_index[0]:", scale_start_index[0])  # [0, 3740, 7480, 11220]
# print("Total area:", last_offset + feat_area_per_map)
# Random inputs of the shapes the operator expects.
feature_maps = torch.rand(bs, num_feat, c).float().cuda()
sampling_location = torch.rand(bs, anchor, pts, cam, 2).float().cuda()
# print(f"size of feature_maps is {feature_maps.size()}")
weights = torch.rand(bs, anchor, pts, cam, scale, group).float().cuda()
# print(feature_maps.size())
# print(spatial_shape.size())
# print(scale_start_index.size())
# print(sampling_location.size())
# print(weights.size())
# print("#######################")
# Invoke the fused CUDA operator and the reference implementation.
out = da.deformable_aggregation_forward(
    feature_maps, spatial_shape.cuda(), scale_start_index.cuda(),
    sampling_location, weights
)
new_out = da.deformable_aggregation_forward_ref(
    feature_maps, spatial_shape.cuda(), scale_start_index.cuda(),
    sampling_location, weights
)
# -------- 计算整体误差 --------
def compare_tensors(t1, t2, name="", rtol=1e-4, atol=1e-6):
    """Print and return the max absolute difference and allclose verdict.

    Args:
        t1, t2: tensors of identical shape to compare.
        name: label printed with the report.
        rtol, atol: tolerances forwarded to ``torch.allclose``.

    Returns:
        tuple: (max_abs_diff, is_close).
    """
    abs_diff = torch.abs(t1 - t2)
    worst = abs_diff.max().item()
    verdict = torch.allclose(t1, t2, rtol=rtol, atol=atol)
    print(f"{name}: max_abs_diff={worst}, allclose={verdict}")
    return worst, verdict
# Compare the fused CUDA output against the reference implementation.
compare_tensors(out, new_out, "OUTPUT")
print("finish")
"""
mAP: 0.4647
mATE: 0.5403
mASE: 0.2623
mAOE: 0.4590
mAVE: 0.2198
mAAE: 0.2059
NDS: 0.5636
Eval time: 176.9s
Per-class results:
Object Class AP ATE ASE AOE AVE AAE
car 0.668 0.357 0.142 0.054 0.184 0.195
truck 0.394 0.528 0.187 0.052 0.163 0.210
bus 0.451 0.681 0.196 0.070 0.383 0.243
trailer 0.185 0.971 0.247 0.634 0.175 0.202
construction_vehicle 0.122 0.879 0.496 1.200 0.136 0.406
pedestrian 0.559 0.517 0.287 0.513 0.282 0.151
motorcycle 0.497 0.462 0.238 0.536 0.293 0.236
bicycle 0.426 0.441 0.257 0.951 0.142 0.004
traffic_cone 0.697 0.275 0.299 nan nan nan
barrier 0.648 0.292 0.275 0.122 nan nan
"""
"""
Per-class results:
AMOTA AMOTP RECALL MOTAR GT MOTA MOTP MT ML FAF TP FP FN IDS FRAG TID LGD
bicycle 0.444 1.169 0.533 0.733 1993 0.389 0.566 53 57 19.3 1059 283 931 3 8 1.60 1.75
bus 0.559 1.175 0.626 0.824 2112 0.515 0.751 42 35 14.8 1321 233 790 1 20 1.13 1.95
car 0.678 0.755 0.733 0.819 58317 0.599 0.470 2053 1073 134.2 42626 7706 15565 126 295 0.76 1.03
motorcy 0.522 1.060 0.609 0.823 1977 0.497 0.564 50 38 15.7 1194 211 773 10 17 1.97 2.17
pedestr 0.548 1.059 0.652 0.791 25423 0.506 0.678 677 467 77.6 16274 3404 8854 295 225 1.33 1.85
trailer 0.136 1.603 0.383 0.403 2425 0.154 0.981 30 79 52.6 926 553 1496 3 13 1.49 2.64
truck 0.454 1.132 0.577 0.691 9650 0.399 0.594 210 214 45.7 5569 1723 4078 3 50 1.35 1.85
Aggregated results:
AMOTA 0.477
AMOTP 1.136
RECALL 0.588
MOTAR 0.726
GT 14556
MOTA 0.437
MOTP 0.658
MT 3115
ML 1963
FAF 51.4
TP 68969
FP 14113
FN 32487
IDS 441
FRAG 628
TID 1.37
LGD 1.89
"""
# ================ base config ===================
plugin = True
plugin_dir = "projects/mmdet3d_plugin/"
dist_params = dict(backend="nccl")
log_level = "INFO"
work_dir = None

# Global batch size is split evenly across GPUs.
total_batch_size = 48
num_gpus = 8
batch_size = total_batch_size // num_gpus
# 28130: samples per epoch — presumably the nuScenes train split size;
# NOTE(review): confirm against the annotation pkl.
num_iters_per_epoch = int(28130 // (num_gpus * batch_size))
num_epochs = 100
checkpoint_epoch_interval = 20
checkpoint_config = dict(
    interval=num_iters_per_epoch * checkpoint_epoch_interval
)
log_config = dict(
    interval=51,
    hooks=[
        dict(type="TextLoggerHook", by_epoch=False),
        dict(type="TensorboardLoggerHook"),
    ],
)
load_from = None
resume_from = None
workflow = [("train", 1)]
fp16 = dict(loss_scale=32.0)  # static loss scaling for mixed precision
# Image size; reversed into (H, W) via data_aug_conf["final_dim"] below.
input_shape = (704, 256)
tracking_test = True
tracking_threshold = 0.2

# ================== model ========================
class_names = [
    "car",
    "truck",
    "construction_vehicle",
    "bus",
    "trailer",
    "barrier",
    "motorcycle",
    "bicycle",
    "pedestrian",
    "traffic_cone",
]
num_classes = len(class_names)
embed_dims = 256
num_groups = 8
num_decoder = 6
# Number of decoder layers that run without temporal attention.
num_single_frame_decoder = 1
use_deformable_func = True  # mmdet3d_plugin/ops/setup.py needs to be executed
strides = [4, 8, 16, 32]
num_levels = len(strides)
num_depth_layers = 3
drop_out = 0.1
temporal = True
decouple_attn = True
with_quality_estimation = True
# Full Sparse4D model definition (backbone + FPN neck + sparse head).
model = dict(
    type="Sparse4D",
    use_grid_mask=True,
    use_deformable_func=use_deformable_func,
    img_backbone=dict(
        type="ResNet",
        depth=50,
        num_stages=4,
        frozen_stages=-1,
        norm_eval=False,
        style="pytorch",
        with_cp=True,  # gradient checkpointing to save memory
        out_indices=(0, 1, 2, 3),
        norm_cfg=dict(type="BN", requires_grad=True),
        pretrained="ckpt/resnet50-19c8e357.pth",
    ),
    img_neck=dict(
        type="FPN",
        num_outs=num_levels,
        start_level=0,
        out_channels=embed_dims,
        add_extra_convs="on_output",
        relu_before_extra_convs=True,
        in_channels=[256, 512, 1024, 2048],
    ),
    depth_branch=dict(  # for auxiliary supervision only
        type="DenseDepthNet",
        embed_dims=embed_dims,
        num_depth_layers=num_depth_layers,
        loss_weight=0.2,
    ),
    head=dict(
        type="Sparse4DHead",
        cls_threshold_to_reg=0.05,
        decouple_attn=decouple_attn,
        instance_bank=dict(
            type="InstanceBank",
            num_anchor=900,
            embed_dims=embed_dims,
            anchor="nuscenes_kmeans900.npy",
            anchor_handler=dict(type="SparseBox3DKeyPointsGenerator"),
            # 600 instances are propagated across frames when temporal.
            num_temp_instances=600 if temporal else -1,
            confidence_decay=0.6,
            feat_grad=False,
        ),
        anchor_encoder=dict(
            type="SparseBox3DEncoder",
            vel_dims=3,
            embed_dims=[128, 32, 32, 64] if decouple_attn else 256,
            mode="cat" if decouple_attn else "add",
            output_fc=not decouple_attn,
            in_loops=1,
            out_loops=4 if decouple_attn else 2,
        ),
        num_single_frame_decoder=num_single_frame_decoder,
        # Per-layer op sequence; the trailing [2:] drops the leading
        # "gnn","norm" of the first (single-frame) decoder layer.
        operation_order=(
            [
                "gnn",
                "norm",
                "deformable",
                "ffn",
                "norm",
                "refine",
            ]
            * num_single_frame_decoder
            + [
                "temp_gnn",
                "gnn",
                "norm",
                "deformable",
                "ffn",
                "norm",
                "refine",
            ]
            * (num_decoder - num_single_frame_decoder)
        )[2:],
        # Cross-frame attention; only instantiated when temporal=True.
        temp_graph_model=dict(
            type="MultiheadAttention",
            embed_dims=embed_dims if not decouple_attn else embed_dims * 2,
            num_heads=num_groups,
            batch_first=True,
            dropout=drop_out,
        )
        if temporal
        else None,
        graph_model=dict(
            type="MultiheadAttention",
            embed_dims=embed_dims if not decouple_attn else embed_dims * 2,
            num_heads=num_groups,
            batch_first=True,
            dropout=drop_out,
        ),
        norm_layer=dict(type="LN", normalized_shape=embed_dims),
        ffn=dict(
            type="AsymmetricFFN",
            in_channels=embed_dims * 2,
            pre_norm=dict(type="LN"),
            embed_dims=embed_dims,
            feedforward_channels=embed_dims * 4,
            num_fcs=2,
            ffn_drop=drop_out,
            act_cfg=dict(type="ReLU", inplace=True),
        ),
        deformable_model=dict(
            type="DeformableFeatureAggregation",
            embed_dims=embed_dims,
            num_groups=num_groups,
            num_levels=num_levels,
            num_cams=6,
            attn_drop=0.15,
            use_deformable_func=use_deformable_func,
            use_camera_embed=True,
            residual_mode="cat",
            kps_generator=dict(
                type="SparseBox3DKeyPointsGenerator",
                num_learnable_pts=6,
                # Fixed key points in normalized box coordinates.
                fix_scale=[
                    [0, 0, 0],
                    [0.45, 0, 0],
                    [-0.45, 0, 0],
                    [0, 0.45, 0],
                    [0, -0.45, 0],
                    [0, 0, 0.45],
                    [0, 0, -0.45],
                ],
            ),
        ),
        refine_layer=dict(
            type="SparseBox3DRefinementModule",
            embed_dims=embed_dims,
            num_cls=num_classes,
            refine_yaw=True,
            with_quality_estimation=with_quality_estimation,
        ),
        sampler=dict(
            type="SparseBox3DTarget",
            num_dn_groups=5,
            num_temp_dn_groups=3,
            dn_noise_scale=[2.0] * 3 + [0.5] * 7,
            max_dn_gt=32,
            add_neg_dn=True,
            cls_weight=2.0,
            box_weight=0.25,
            reg_weights=[2.0] * 3 + [0.5] * 3 + [0.0] * 4,
            # Traffic cones get custom per-dimension regression weights.
            cls_wise_reg_weights={
                class_names.index("traffic_cone"): [
                    2.0,
                    2.0,
                    2.0,
                    1.0,
                    1.0,
                    1.0,
                    0.0,
                    0.0,
                    1.0,
                    1.0,
                ],
            },
        ),
        loss_cls=dict(
            type="FocalLoss",
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0,
        ),
        loss_reg=dict(
            type="SparseBox3DLoss",
            loss_box=dict(type="L1Loss", loss_weight=0.25),
            loss_centerness=dict(type="CrossEntropyLoss", use_sigmoid=True),
            loss_yawness=dict(type="GaussianFocalLoss"),
            # Barriers are symmetric, so a reversed yaw is not penalized.
            cls_allow_reverse=[class_names.index("barrier")],
        ),
        decoder=dict(type="SparseBox3DDecoder"),
        reg_weights=[2.0] * 3 + [1.0] * 7,
    ),
)
# ================== data ========================
dataset_type = "NuScenes3DDetTrackDataset"
data_root = "data/nuscenes/"
# Directory holding the annotation pkls. (A dead first assignment of
# "data/nuscenes_cam/" that was immediately overwritten has been removed.)
anno_root = "data/nuscenes_anno_pkls/"
file_client_args = dict(backend="disk")
# ImageNet mean/std used to normalize the multi-view camera images.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True
)
# Training-time data pipeline: load images/points, augment, filter, collect.
train_pipeline = [
    dict(type="LoadMultiViewImageFromFiles", to_float32=True),
    dict(
        type="LoadPointsFromFile",
        coord_type="LIDAR",
        load_dim=5,
        use_dim=5,
        file_client_args=file_client_args,
    ),
    dict(type="ResizeCropFlipImage"),
    # Depth maps for auxiliary depth supervision, one per supervised stride.
    dict(
        type="MultiScaleDepthMapGenerator",
        downsample=strides[:num_depth_layers],
    ),
    dict(type="BBoxRotation"),
    dict(type="PhotoMetricDistortionMultiViewImage"),
    dict(type="NormalizeMultiviewImage", **img_norm_cfg),
    # Keep objects within 55 m of the ego vehicle for every class.
    dict(
        type="CircleObjectRangeFilter",
        class_dist_thred=[55] * len(class_names),
    ),
    dict(type="InstanceNameFilter", classes=class_names),
    dict(type="NuScenesSparse4DAdaptor"),
    dict(
        type="Collect",
        keys=[
            "img",
            "timestamp",
            "projection_mat",
            "image_wh",
            "gt_depth",
            "focal",
            "gt_bboxes_3d",
            "gt_labels_3d",
        ],
        meta_keys=["T_global", "T_global_inv", "timestamp", "instance_id"],
    ),
]
# Inference pipeline: no point loading, depth supervision, or GT handling.
test_pipeline = [
    dict(type="LoadMultiViewImageFromFiles", to_float32=True),
    dict(type="ResizeCropFlipImage"),
    dict(type="NormalizeMultiviewImage", **img_norm_cfg),
    dict(type="NuScenesSparse4DAdaptor"),
    dict(
        type="Collect",
        keys=[
            "img",
            "timestamp",
            "projection_mat",
            "image_wh",
        ],
        meta_keys=["T_global", "T_global_inv", "timestamp"],
    ),
]
# Camera-only input modality.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=False,
)
# Shared kwargs for the train/val/test dataset dicts below.
data_basic_config = dict(
    type=dataset_type,
    data_root=data_root,
    classes=class_names,
    modality=input_modality,
    version="v1.0-trainval",
)
# Resize/crop/flip/rotation augmentation ranges for the source 1600x900
# nuScenes images.
data_aug_conf = {
    "resize_lim": (0.40, 0.47),
    "final_dim": input_shape[::-1],
    "bot_pct_lim": (0.0, 0.0),
    "rot_lim": (-5.4, 5.4),
    "H": 900,
    "W": 1600,
    "rand_flip": True,
    "rot3d_range": [-0.3925, 0.3925],
}
# Dataloader settings for each split; val/test share the val annotations.
data = dict(
    samples_per_gpu=batch_size,
    workers_per_gpu=batch_size,
    train=dict(
        **data_basic_config,
        ann_file=anno_root + "nuscenes_infos_train.pkl",
        pipeline=train_pipeline,
        test_mode=False,
        data_aug_conf=data_aug_conf,
        with_seq_flag=True,
        sequences_split_num=2,
        keep_consistent_seq_aug=True,
    ),
    val=dict(
        **data_basic_config,
        ann_file=anno_root + "nuscenes_infos_val.pkl",
        pipeline=test_pipeline,
        data_aug_conf=data_aug_conf,
        test_mode=True,
        tracking=tracking_test,
        tracking_threshold=tracking_threshold,
    ),
    test=dict(
        **data_basic_config,
        ann_file=anno_root + "nuscenes_infos_val.pkl",
        pipeline=test_pipeline,
        data_aug_conf=data_aug_conf,
        test_mode=True,
        tracking=tracking_test,
        tracking_threshold=tracking_threshold,
    ),
)

# ================== training ========================
# AdamW with a halved learning rate for the pretrained backbone.
optimizer = dict(
    type="AdamW",
    lr=6e-4,
    weight_decay=0.001,
    paramwise_cfg=dict(
        custom_keys={
            "img_backbone": dict(lr_mult=0.5),
        }
    ),
)
optimizer_config = dict(grad_clip=dict(max_norm=25, norm_type=2))
# Cosine decay after a short linear warmup.
lr_config = dict(
    policy="CosineAnnealing",
    warmup="linear",
    warmup_iters=500,
    warmup_ratio=1.0 / 3,
    min_lr_ratio=1e-3,
)
runner = dict(
    type="IterBasedRunner",
    max_iters=num_iters_per_epoch * num_epochs,
)

# ================== eval ========================
# Minimal pipeline used only when visualizing predictions.
vis_pipeline = [
    dict(type="LoadMultiViewImageFromFiles", to_float32=True),
    dict(
        type="Collect",
        keys=["img"],
        meta_keys=["timestamp", "lidar2img"],
    ),
]
# Evaluate at the same cadence as checkpointing.
evaluation = dict(
    interval=num_iters_per_epoch * checkpoint_epoch_interval,
    pipeline=vis_pipeline,
    # out_dir="./vis", # for visualization
)
from .datasets import *
from .models import *
from .apis import *
from .core.evaluation import *
from .train import custom_train_model
from .mmdet_train import custom_train_detector
# from .test import custom_multi_gpu_test
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import random
import warnings
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (
HOOKS,
DistSamplerSeedHook,
EpochBasedRunner,
Fp16OptimizerHook,
OptimizerHook,
build_optimizer,
build_runner,
get_dist_info,
)
from mmcv.utils import build_from_cfg
from mmdet.core import EvalHook
from mmdet.datasets import build_dataset, replace_ImageToTensor
from mmdet.utils import get_root_logger
import time
import os.path as osp
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.core.evaluation.eval_hooks import (
CustomDistEvalHook,
)
from projects.mmdet3d_plugin.datasets import custom_build_dataset
def custom_train_detector(
    model,
    dataset,
    cfg,
    distributed=False,
    validate=False,
    timestamp=None,
    meta=None,
):
    """Build dataloaders, runner, and hooks, then launch detector training.

    Customized version of mmdet's ``train_detector`` that uses the plugin's
    ``build_dataloader`` / ``custom_build_dataset`` and ``CustomDistEvalHook``.

    Args:
        model (nn.Module): the detector to train.
        dataset (Dataset | list[Dataset]): training dataset(s).
        cfg (mmcv.Config): full training config.
        distributed (bool): whether to wrap the model in DDP.
        validate (bool): whether to register an evaluation hook.
        timestamp (str | None): run timestamp used for log file naming.
        meta (dict | None): metadata passed through to the runner.
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    # assert len(dataset)==1s
    if "imgs_per_gpu" in cfg.data:
        logger.warning(
            '"imgs_per_gpu" is deprecated in MMDet V2.0. '
            'Please use "samples_per_gpu" instead'
        )
        if "samples_per_gpu" in cfg.data:
            # NOTE(review): this branch only warns — despite the message,
            # samples_per_gpu is NOT actually overwritten by imgs_per_gpu
            # here; confirm this matches the intended precedence.
            logger.warning(
                f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and '
                f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"'
                f"={cfg.data.imgs_per_gpu} is used in this experiments"
            )
        else:
            logger.warning(
                'Automatically set "samples_per_gpu"="imgs_per_gpu"='
                f"{cfg.data.imgs_per_gpu} in this experiments"
            )
            cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu

    if "runner" in cfg:
        runner_type = cfg.runner["type"]
    else:
        runner_type = "EpochBasedRunner"
    data_loaders = [
        build_dataloader(
            ds,
            cfg.data.samples_per_gpu,
            cfg.data.workers_per_gpu,
            # cfg.gpus will be ignored if distributed
            len(cfg.gpu_ids),
            dist=distributed,
            seed=cfg.seed,
            nonshuffler_sampler=dict(
                type="DistributedSampler"
            ),  # dict(type='DistributedSampler'),
            runner_type=runner_type,
        )
        for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get("find_unused_parameters", False)
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters,
        )
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids
        )

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    if "runner" not in cfg:
        cfg.runner = {
            "type": "EpochBasedRunner",
            "max_epochs": cfg.total_epochs,
        }
        warnings.warn(
            "config is now expected to have a `runner` section, "
            "please set `runner` in your config.",
            UserWarning,
        )
    else:
        if "total_epochs" in cfg:
            assert cfg.total_epochs == cfg.runner.max_epochs
    runner = build_runner(
        cfg.runner,
        default_args=dict(
            model=model,
            optimizer=optimizer,
            work_dir=cfg.work_dir,
            logger=logger,
            meta=meta,
        ),
    )

    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get("fp16", None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed
        )
    elif distributed and "type" not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(
        cfg.lr_config,
        optimizer_config,
        cfg.checkpoint_config,
        cfg.log_config,
        cfg.get("momentum_config", None),
    )
    # register profiler hook
    # trace_config = dict(type='tb_trace', dir_name='work_dir')
    # profiler_config = dict(on_trace_ready=trace_config)
    # runner.register_profiler_hook(profiler_config)
    if distributed:
        if isinstance(runner, EpochBasedRunner):
            runner.register_hook(DistSamplerSeedHook())

    # register eval hooks
    if validate:
        # Support batch_size > 1 in validation
        val_samples_per_gpu = cfg.data.val.pop("samples_per_gpu", 1)
        if val_samples_per_gpu > 1:
            assert False
            # Replace 'ImageToTensor' to 'DefaultFormatBundle'
            cfg.data.val.pipeline = replace_ImageToTensor(
                cfg.data.val.pipeline
            )
        val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True))
        val_dataloader = build_dataloader(
            val_dataset,
            samples_per_gpu=val_samples_per_gpu,
            workers_per_gpu=cfg.data.workers_per_gpu,
            dist=distributed,
            shuffle=False,
            nonshuffler_sampler=dict(type="DistributedSampler"),
        )
        eval_cfg = cfg.get("evaluation", {})
        eval_cfg["by_epoch"] = cfg.runner["type"] != "IterBasedRunner"
        # NOTE(review): "val" is the FIRST component here, so if work_dir is
        # an absolute path os.path.join discards it — verify the intended
        # prefix order (cfg.work_dir/"val"/... seems more likely).
        eval_cfg["jsonfile_prefix"] = osp.join(
            "val",
            cfg.work_dir,
            time.ctime().replace(" ", "_").replace(":", "_"),
        )
        eval_hook = CustomDistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # user-defined hooks
    if cfg.get("custom_hooks", None):
        custom_hooks = cfg.custom_hooks
        assert isinstance(
            custom_hooks, list
        ), f"custom_hooks expect list type, but got {type(custom_hooks)}"
        for hook_cfg in cfg.custom_hooks:
            assert isinstance(hook_cfg, dict), (
                "Each item in custom_hooks expects dict type, but got "
                f"{type(hook_cfg)}"
            )
            hook_cfg = hook_cfg.copy()
            priority = hook_cfg.pop("priority", "NORMAL")
            hook = build_from_cfg(hook_cfg, HOOKS)
            runner.register_hook(hook, priority=priority)

    # resume takes precedence over load
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow)
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import os.path as osp
import pickle
import shutil
import tempfile
import time
import mmcv
import torch
import torch.distributed as dist
from mmcv.image import tensor2imgs
from mmcv.runner import get_dist_info
from mmdet.core import encode_mask_results
import mmcv
import numpy as np
import pycocotools.mask as mask_util
def custom_encode_mask_results(mask_results):
    """Encode bitmap mask to RLE code. Semantic Masks only

    Args:
        mask_results (list | tuple[list]): bitmap mask results.
            In mask scoring rcnn, mask_results is a tuple of (segm_results,
            segm_cls_score).

    Returns:
        list | tuple: RLE encoded mask.
    """
    # Iterate the masks directly instead of indexing, and drop the dead
    # `num_classes` local the original computed but never used.
    encoded_mask_results = [
        mask_util.encode(
            # pycocotools expects a Fortran-ordered uint8 HxWx1 array.
            np.array(segm[:, :, np.newaxis], order="F", dtype="uint8")
        )[0]  # encoded with RLE
        for segm in mask_results
    ]
    return [encoded_mask_results]
def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
    """Test model with multiple gpus.
    This method tests model with multiple gpus and collects the results
    under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
    it encodes results to gpu tensors and use gpu communication for results
    collection. On cpu mode it saves the results on different gpus to 'tmpdir'
    and collects them by the rank 0 worker.
    Args:
        model (nn.Module): Model to be tested.
        data_loader (nn.Dataloader): Pytorch data loader.
        tmpdir (str): Path of directory to save the temporary results from
            different gpus under cpu mode.
        gpu_collect (bool): Option to use either gpu or cpu to collect results.
    Returns:
        list: The prediction results.
    """
    model.eval()
    bbox_results = []
    mask_results = []
    dataset = data_loader.dataset
    rank, world_size = get_dist_info()
    # Only rank 0 renders the progress bar; all ranks update their counts.
    if rank == 0:
        prog_bar = mmcv.ProgressBar(len(dataset))
    time.sleep(2)  # This line can prevent deadlock problem in some cases.
    have_mask = False
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            result = model(return_loss=False, rescale=True, **data)
            # encode mask results
            if isinstance(result, dict):
                if "bbox_results" in result.keys():
                    bbox_result = result["bbox_results"]
                    batch_size = len(result["bbox_results"])
                    bbox_results.extend(bbox_result)
                if (
                    "mask_results" in result.keys()
                    and result["mask_results"] is not None
                ):
                    mask_result = custom_encode_mask_results(
                        result["mask_results"]
                    )
                    mask_results.extend(mask_result)
                    have_mask = True
            else:
                batch_size = len(result)
                bbox_results.extend(result)

        if rank == 0:
            # Advance by the global batch (all ranks process in lockstep).
            for _ in range(batch_size * world_size):
                prog_bar.update()

    # collect results from all ranks
    if gpu_collect:
        bbox_results = collect_results_gpu(bbox_results, len(dataset))
        if have_mask:
            mask_results = collect_results_gpu(mask_results, len(dataset))
        else:
            mask_results = None
    else:
        bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir)
        # Masks are gathered into a sibling "<tmpdir>_mask" directory.
        tmpdir = tmpdir + "_mask" if tmpdir is not None else None
        if have_mask:
            mask_results = collect_results_cpu(
                mask_results, len(dataset), tmpdir
            )
        else:
            mask_results = None

    if mask_results is None:
        return bbox_results
    return {"bbox_results": bbox_results, "mask_results": mask_results}
def collect_results_cpu(result_part, size, tmpdir=None):
    """Gather per-rank result lists on rank 0 via a shared temp directory.

    Each rank dumps its partial results to ``tmpdir/part_<rank>.pkl``; after
    a barrier, rank 0 loads and concatenates all parts.

    Args:
        result_part (list): this rank's partial results.
        size (int): total number of samples (used to trim padding).
        tmpdir (str | None): shared directory; if None, rank 0 creates one
            and broadcasts its path to all ranks.

    Returns:
        list | None: ordered results on rank 0, ``None`` on other ranks.
    """
    rank, world_size = get_dist_info()
    # create a tmp dir if it is not specified
    if tmpdir is None:
        MAX_LEN = 512
        # 32 is whitespace
        # The path is broadcast as a fixed-size uint8 tensor padded with
        # spaces, then stripped back to a string on every rank.
        dir_tensor = torch.full(
            (MAX_LEN,), 32, dtype=torch.uint8, device="cuda"
        )
        if rank == 0:
            mmcv.mkdir_or_exist(".dist_test")
            tmpdir = tempfile.mkdtemp(dir=".dist_test")
            tmpdir = torch.tensor(
                bytearray(tmpdir.encode()), dtype=torch.uint8, device="cuda"
            )
            dir_tensor[: len(tmpdir)] = tmpdir
        dist.broadcast(dir_tensor, 0)
        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
    else:
        mmcv.mkdir_or_exist(tmpdir)
    # dump the part result to the dir
    mmcv.dump(result_part, osp.join(tmpdir, f"part_{rank}.pkl"))
    dist.barrier()
    # collect all parts
    if rank != 0:
        return None
    else:
        # load results of all parts from tmp dir
        part_list = []
        for i in range(world_size):
            part_file = osp.join(tmpdir, f"part_{i}.pkl")
            part_list.append(mmcv.load(part_file))
        # sort the results
        ordered_results = []
        """
        bacause we change the sample of the evaluation stage to make sure that
        each gpu will handle continuous sample,
        """
        # Parts are concatenated in rank order (not interleaved) because the
        # eval sampler assigns each GPU a contiguous span of samples.
        # for res in zip(*part_list):
        for res in part_list:
            ordered_results.extend(list(res))
        # the dataloader may pad some samples
        ordered_results = ordered_results[:size]
        # remove tmp dir
        shutil.rmtree(tmpdir)
        return ordered_results
def collect_results_gpu(result_part, size):
    """Collect results across ranks when ``gpu_collect=True`` is requested.

    GPU-side gathering is not implemented here; this delegates to the
    tmpdir-based CPU gather.

    Args:
        result_part (list): this rank's partial results.
        size (int): total number of samples.

    Returns:
        list | None: ordered results on rank 0, ``None`` on other ranks.
    """
    # Bug fix: the gathered results were previously discarded (no return),
    # so gpu_collect=True always yielded None on every rank.
    return collect_results_cpu(result_part, size)
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
from .mmdet_train import custom_train_detector
# from mmseg.apis import train_segmentor
from mmdet.apis import train_detector
def custom_train_model(
    model,
    dataset,
    cfg,
    distributed=False,
    validate=False,
    timestamp=None,
    meta=None,
):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.

    Raises:
        NotImplementedError: for 3D segmentation models ("EncoderDecoder3D"),
            which this plugin does not support.
    """
    if cfg.model.type in ["EncoderDecoder3D"]:
        # `assert False` is stripped under `python -O`; raise explicitly so
        # unsupported configs always fail loudly.
        raise NotImplementedError(
            f"training model type {cfg.model.type} is not supported"
        )
    custom_train_detector(
        model,
        dataset,
        cfg,
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        meta=meta,
    )
def train_model(
    model,
    dataset,
    cfg,
    distributed=False,
    validate=False,
    timestamp=None,
    meta=None,
):
    """A function wrapper for launching model training according to cfg.

    Because we need different eval_hook in runner. Should be deprecated in the
    future.
    """
    # Thin pass-through to the stock mmdet trainer, kept for API symmetry
    # with `custom_train_model`.
    options = dict(
        distributed=distributed,
        validate=validate,
        timestamp=timestamp,
        meta=meta,
    )
    train_detector(model, dataset, cfg, **options)
# Indices into the 11-D undecoded box state: position (X, Y, Z),
# size (W, L, H), yaw as sin/cos, and velocity (VX, VY, VZ).
X, Y, Z, W, L, H, SIN_YAW, COS_YAW, VX, VY, VZ = list(range(11))  # undecoded
CNS, YNS = 0, 1  # centerness and yawness indices in quality
YAW = 6  # index of the decoded yaw angle
from .eval_hooks import CustomDistEvalHook
\ No newline at end of file
# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16,
# in order to avoid strong version dependency, we did not directly
# inherit EvalHook but BaseDistEvalHook.
import bisect
import os.path as osp
import mmcv
import torch.distributed as dist
from mmcv.runner import DistEvalHook as BaseDistEvalHook
from mmcv.runner import EvalHook as BaseEvalHook
from torch.nn.modules.batchnorm import _BatchNorm
from mmdet.core.evaluation.eval_hooks import DistEvalHook
def _calc_dynamic_intervals(start_interval, dynamic_interval_list):
    """Split (milestone, interval) pairs into two aligned lists.

    Args:
        start_interval (int): interval used before the first milestone.
        dynamic_interval_list (list[tuple]): (milestone, interval) pairs.

    Returns:
        tuple[list, list]: milestones (prefixed with 0) and the interval
            active from each corresponding milestone onward.
    """
    assert mmcv.is_list_of(dynamic_interval_list, tuple)
    milestones = [0] + [pair[0] for pair in dynamic_interval_list]
    intervals = [start_interval] + [pair[1] for pair in dynamic_interval_list]
    return milestones, intervals
class CustomDistEvalHook(BaseDistEvalHook):
    """Distributed eval hook supporting dynamically changing eval intervals.

    ``dynamic_intervals`` is a list of (milestone, interval) pairs: once
    training progress passes a milestone, the corresponding interval takes
    effect. Evaluation itself is delegated to ``custom_multi_gpu_test``.
    """

    def __init__(self, *args, dynamic_intervals=None, **kwargs):
        super(CustomDistEvalHook, self).__init__(*args, **kwargs)
        self.use_dynamic_intervals = dynamic_intervals is not None
        if self.use_dynamic_intervals:
            (
                self.dynamic_milestones,
                self.dynamic_intervals,
            ) = _calc_dynamic_intervals(self.interval, dynamic_intervals)

    def _decide_interval(self, runner):
        # Pick the interval matching the current progress (epoch or iter).
        if self.use_dynamic_intervals:
            progress = runner.epoch if self.by_epoch else runner.iter
            step = bisect.bisect(self.dynamic_milestones, (progress + 1))
            # Dynamically modify the evaluation interval
            self.interval = self.dynamic_intervals[step - 1]

    def before_train_epoch(self, runner):
        """Evaluate the model only at the start of training by epoch."""
        self._decide_interval(runner)
        super().before_train_epoch(runner)

    def before_train_iter(self, runner):
        self._decide_interval(runner)
        super().before_train_iter(runner)

    def _do_evaluate(self, runner):
        """perform evaluation and save ckpt."""
        # Synchronization of BatchNorm's buffer (running_mean
        # and running_var) is not supported in the DDP of pytorch,
        # which may cause the inconsistent performance of models in
        # different ranks, so we broadcast BatchNorm's buffers
        # of rank 0 to other ranks to avoid this.
        if self.broadcast_bn_buffer:
            model = runner.model
            for name, module in model.named_modules():
                if (
                    isinstance(module, _BatchNorm)
                    and module.track_running_stats
                ):
                    dist.broadcast(module.running_var, 0)
                    dist.broadcast(module.running_mean, 0)

        if not self._should_evaluate(runner):
            return

        tmpdir = self.tmpdir
        if tmpdir is None:
            tmpdir = osp.join(runner.work_dir, ".eval_hook")

        from projects.mmdet3d_plugin.apis.test import (
            custom_multi_gpu_test,
        )  # to solve circlur import

        results = custom_multi_gpu_test(
            runner.model,
            self.dataloader,
            tmpdir=tmpdir,
            gpu_collect=self.gpu_collect,
        )
        # Only rank 0 aggregates metrics and (optionally) saves the best ckpt.
        if runner.rank == 0:
            print("\n")
            runner.log_buffer.output["eval_iter_num"] = len(self.dataloader)
            key_score = self.evaluate(runner, results)

            if self.save_best:
                self._save_ckpt(runner, key_score)
from .nuscenes_3d_det_track_dataset import NuScenes3DDetTrackDataset
from .builder import *
from .pipelines import *
from .samplers import *
# Public API of the datasets package; `custom_build_dataset` is re-exported
# from `.builder` via the wildcard import above.
__all__ = [
    'NuScenes3DDetTrackDataset',
    "custom_build_dataset",
]
import copy
import platform
import random
from functools import partial
import numpy as np
from mmcv.parallel import collate
from mmcv.runner import get_dist_info
from mmcv.utils import Registry, build_from_cfg
from torch.utils.data import DataLoader
from mmdet.datasets.samplers import GroupSampler
from projects.mmdet3d_plugin.datasets.samplers import (
GroupInBatchSampler,
DistributedGroupSampler,
DistributedSampler,
build_sampler
)
def build_dataloader(
    dataset,
    samples_per_gpu,
    workers_per_gpu,
    num_gpus=1,
    dist=True,
    shuffle=True,
    seed=None,
    shuffler_sampler=None,
    nonshuffler_sampler=None,
    runner_type="EpochBasedRunner",
    **kwargs
):
    """Build PyTorch DataLoader.
    In distributed training, each GPU/process has a dataloader.
    In non-distributed training, there is only one dataloader for all GPUs.
    Args:
        dataset (Dataset): A PyTorch dataset.
        samples_per_gpu (int): Number of training samples on each GPU, i.e.,
            batch size of each GPU.
        workers_per_gpu (int): How many subprocesses to use for data loading
            for each GPU.
        num_gpus (int): Number of GPUs. Only used in non-distributed training.
        dist (bool): Distributed training/test or not. Default: True.
        shuffle (bool): Whether to shuffle the data at every epoch.
            Default: True.
        seed (int | None): Base seed for sampler and worker seeding.
        shuffler_sampler (dict | None): Sampler config used when shuffling;
            defaults to DistributedGroupSampler.
        nonshuffler_sampler (dict | None): Sampler config used when not
            shuffling; defaults to DistributedSampler.
        runner_type (str): "IterBasedRunner" selects GroupInBatchSampler.
        kwargs: any keyword argument to be used to initialize DataLoader
    Returns:
        DataLoader: A PyTorch dataloader.
    """
    rank, world_size = get_dist_info()
    batch_sampler = None
    if runner_type == 'IterBasedRunner':
        # Iteration-based training batches whole sequence groups; the
        # DataLoader then gets batch_size=1 with no plain sampler, because
        # sampler and batch_sampler are mutually exclusive in PyTorch.
        print("Use GroupInBatchSampler !!!")
        batch_sampler = GroupInBatchSampler(
            dataset,
            samples_per_gpu,
            world_size,
            rank,
            seed=seed,
        )
        batch_size = 1
        sampler = None
        num_workers = workers_per_gpu
    elif dist:
        # DistributedGroupSampler will definitely shuffle the data to satisfy
        # that images on each GPU are in the same group
        if shuffle:
            print("Use DistributedGroupSampler !!!")
            sampler = build_sampler(
                shuffler_sampler
                if shuffler_sampler is not None
                else dict(type="DistributedGroupSampler"),
                dict(
                    dataset=dataset,
                    samples_per_gpu=samples_per_gpu,
                    num_replicas=world_size,
                    rank=rank,
                    seed=seed,
                ),
            )
        else:
            sampler = build_sampler(
                nonshuffler_sampler
                if nonshuffler_sampler is not None
                else dict(type="DistributedSampler"),
                dict(
                    dataset=dataset,
                    num_replicas=world_size,
                    rank=rank,
                    shuffle=shuffle,
                    seed=seed,
                ),
            )
        batch_size = samples_per_gpu
        num_workers = workers_per_gpu
    else:
        # assert False, 'not support in bevformer'
        print("WARNING!!!!, Only can be used for obtain inference speed!!!!")
        sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None
        batch_size = num_gpus * samples_per_gpu
        num_workers = num_gpus * workers_per_gpu

    # Deterministic per-worker seeding when a base seed is provided.
    init_fn = (
        partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed)
        if seed is not None
        else None
    )

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        batch_sampler=batch_sampler,
        num_workers=num_workers,
        collate_fn=partial(collate, samples_per_gpu=samples_per_gpu),
        pin_memory=False,
        worker_init_fn=init_fn,
        **kwargs
    )
    return data_loader
def worker_init_fn(worker_id, num_workers, rank, seed):
    """Seed `random` and `numpy` deterministically in a dataloader worker.

    The per-worker seed is ``num_workers * rank + worker_id + seed`` so every
    (rank, worker) pair draws a distinct, reproducible stream.
    """
    worker_seed = seed + rank * num_workers + worker_id
    for seeder in (np.random.seed, random.seed):
        seeder(worker_seed)
# Copyright (c) OpenMMLab. All rights reserved.
import platform
from mmcv.utils import Registry, build_from_cfg
from mmdet.datasets import DATASETS
from mmdet.datasets.builder import _concat_dataset
if platform.system() != "Windows":
    # https://github.com/pytorch/pytorch/issues/973
    # Dataloader workers can exhaust the default open-file limit; raise the
    # soft RLIMIT_NOFILE to at least 4096 (capped by the hard limit).
    import resource

    rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
    base_soft_limit = rlimit[0]
    hard_limit = rlimit[1]
    soft_limit = min(max(4096, base_soft_limit), hard_limit)
    resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))

# Registry for ground-truth object samplers.
# NOTE(review): nothing visible here registers into it — confirm it is used
# elsewhere before removing.
OBJECTSAMPLERS = Registry("Object sampler")
def custom_build_dataset(cfg, default_args=None):
    """Recursively build a (possibly wrapped) dataset from config.

    Mirrors mmdet's ``build_dataset`` while additionally supporting the
    optional ``CBGSDataset`` wrapper from mmdet3d.

    Args:
        cfg (dict | list | tuple): Dataset config, or a sequence of configs
            to concatenate.
        default_args (dict | None): Default kwargs forwarded to the dataset
            constructor.

    Returns:
        Dataset: the constructed dataset instance.
    """
    # mmdet3d is optional; catch only the missing-module case instead of a
    # bare `except:`, which would also hide real errors inside mmdet3d (and
    # even swallow KeyboardInterrupt/SystemExit).
    try:
        from mmdet3d.datasets.dataset_wrappers import CBGSDataset
    except ImportError:
        CBGSDataset = None
    from mmdet.datasets.dataset_wrappers import (
        ClassBalancedDataset,
        ConcatDataset,
        RepeatDataset,
    )

    if isinstance(cfg, (list, tuple)):
        dataset = ConcatDataset(
            [custom_build_dataset(c, default_args) for c in cfg]
        )
    elif cfg["type"] == "ConcatDataset":
        dataset = ConcatDataset(
            [custom_build_dataset(c, default_args) for c in cfg["datasets"]],
            cfg.get("separate_eval", True),
        )
    elif cfg["type"] == "RepeatDataset":
        dataset = RepeatDataset(
            custom_build_dataset(cfg["dataset"], default_args), cfg["times"]
        )
    elif cfg["type"] == "ClassBalancedDataset":
        dataset = ClassBalancedDataset(
            custom_build_dataset(cfg["dataset"], default_args),
            cfg["oversample_thr"],
        )
    elif cfg["type"] == "CBGSDataset":
        # Will raise TypeError (None is not callable) if mmdet3d is absent.
        dataset = CBGSDataset(
            custom_build_dataset(cfg["dataset"], default_args)
        )
    elif isinstance(cfg.get("ann_file"), (list, tuple)):
        dataset = _concat_dataset(cfg, default_args)
    else:
        dataset = build_from_cfg(cfg, DATASETS, default_args)

    return dataset
import random
import math
import os
from os import path as osp
import cv2
import tempfile
import copy
import numpy as np
import torch
from torch.utils.data import Dataset
import pyquaternion
from nuscenes.utils.data_classes import Box as NuScenesBox
from nuscenes.eval.detection.config import config_factory as det_configs
from nuscenes.eval.common.config import config_factory as track_configs
import mmcv
from mmcv.utils import print_log
from mmdet.datasets import DATASETS
from mmdet.datasets.pipelines import Compose
from .utils import (
draw_lidar_bbox3d_on_img,
draw_lidar_bbox3d_on_bev,
)
@DATASETS.register_module()
class NuScenes3DDetTrackDataset(Dataset):
    """nuScenes dataset for joint 3D detection and tracking.

    Loads the pickled info file produced by the data converter, builds the
    per-frame input dicts (camera projections, annotations), and implements
    nuScenes detection/tracking evaluation plus video visualization.
    """

    # Fallback attribute per class when velocity gives no better cue.
    DefaultAttribute = {
        "car": "vehicle.parked",
        "pedestrian": "pedestrian.moving",
        "trailer": "vehicle.parked",
        "truck": "vehicle.parked",
        "bus": "vehicle.moving",
        "motorcycle": "cycle.without_rider",
        "construction_vehicle": "vehicle.parked",
        "bicycle": "cycle.without_rider",
        "barrier": "",
        "traffic_cone": "",
    }
    # Mapping from nuScenes TP-error keys to the conventional metric names.
    ErrNameMapping = {
        "trans_err": "mATE",
        "scale_err": "mASE",
        "orient_err": "mAOE",
        "vel_err": "mAVE",
        "attr_err": "mAAE",
    }
    CLASSES = (
        "car",
        "truck",
        "trailer",
        "bus",
        "construction_vehicle",
        "bicycle",
        "motorcycle",
        "pedestrian",
        "traffic_cone",
        "barrier",
    )
    # BGR colors used by `show` for per-instance / per-class box drawing.
    ID_COLOR_MAP = [
        (59, 59, 238),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (0, 255, 255),
        (255, 0, 255),
        (255, 255, 255),
        (0, 127, 255),
        (71, 130, 255),
        (127, 127, 0),
    ]

    def __init__(
        self,
        ann_file,
        pipeline=None,
        data_root=None,
        classes=None,
        load_interval=1,
        with_velocity=True,
        modality=None,
        test_mode=False,
        det3d_eval_version="detection_cvpr_2019",
        track3d_eval_version="tracking_nips_2019",
        version="v1.0-trainval",
        use_valid_flag=False,
        vis_score_threshold=0.25,
        data_aug_conf=None,
        sequences_split_num=1,
        with_seq_flag=False,
        keep_consistent_seq_aug=True,
        tracking=False,
        tracking_threshold=0.2,
    ):
        """
        Args:
            ann_file (str): Path to the pickled annotation info file.
            pipeline (list, optional): Processing pipeline configs.
            data_root (str, optional): Dataset root (needed for evaluation).
            classes (tuple, optional): Override for the default CLASSES.
            load_interval (int): Keep every ``load_interval``-th frame.
            with_velocity (bool): Append (vx, vy) to gt boxes.
            modality (dict, optional): Sensor usage flags; defaults to lidar.
            test_mode (bool): Skip annotation loading in ``get_data_info``.
            use_valid_flag (bool): Filter gt by the precomputed valid flag
                instead of ``num_lidar_pts > 0``.
            vis_score_threshold (float): Score cutoff for visualization.
            data_aug_conf (dict, optional): Image augmentation config.
            sequences_split_num (int | "all"): Split each scene into this
                many sub-sequences for group sampling.
            with_seq_flag (bool): Compute the per-sequence group flag.
            keep_consistent_seq_aug (bool): Reuse one augmentation per seq.
            tracking (bool): Also run tracking evaluation.
            tracking_threshold (float): Score cutoff for tracking output.
        """
        self.version = version
        self.load_interval = load_interval
        self.use_valid_flag = use_valid_flag
        super().__init__()
        self.data_root = data_root
        self.ann_file = ann_file
        self.test_mode = test_mode
        self.modality = modality
        self.box_mode_3d = 0
        if classes is not None:
            self.CLASSES = classes
        self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
        self.data_infos = self.load_annotations(self.ann_file)
        if pipeline is not None:
            self.pipeline = Compose(pipeline)
        self.with_velocity = with_velocity
        self.det3d_eval_version = det3d_eval_version
        self.det3d_eval_configs = det_configs(self.det3d_eval_version)
        self.track3d_eval_version = track3d_eval_version
        self.track3d_eval_configs = track_configs(self.track3d_eval_version)
        if self.modality is None:
            self.modality = dict(
                use_camera=False,
                use_lidar=True,
                use_radar=False,
                use_map=False,
                use_external=False,
            )
        self.vis_score_threshold = vis_score_threshold
        self.data_aug_conf = data_aug_conf
        self.tracking = tracking
        self.tracking_threshold = tracking_threshold
        self.sequences_split_num = sequences_split_num
        self.keep_consistent_seq_aug = keep_consistent_seq_aug
        self.current_aug = None
        self.last_id = None
        if with_seq_flag:
            self._set_sequence_group_flag()

    def __len__(self):
        """Number of (interval-subsampled) frames."""
        return len(self.data_infos)

    def _set_sequence_group_flag(self):
        """
        Set each sequence to be a different group
        """
        res = []
        curr_sequence = 0
        for idx in range(len(self.data_infos)):
            if idx != 0 and len(self.data_infos[idx]["sweeps"]) == 0:
                # Not first frame and # of sweeps is 0 -> new sequence
                curr_sequence += 1
            res.append(curr_sequence)
        self.flag = np.array(res, dtype=np.int64)
        if self.sequences_split_num != 1:
            if self.sequences_split_num == "all":
                self.flag = np.array(
                    range(len(self.data_infos)), dtype=np.int64
                )
            else:
                # Split every sequence into `sequences_split_num` roughly
                # equal chunks, each becoming its own group flag.
                bin_counts = np.bincount(self.flag)
                new_flags = []
                curr_new_flag = 0
                for curr_flag in range(len(bin_counts)):
                    curr_sequence_length = np.array(
                        list(
                            range(
                                0,
                                bin_counts[curr_flag],
                                math.ceil(
                                    bin_counts[curr_flag]
                                    / self.sequences_split_num
                                ),
                            )
                        )
                        + [bin_counts[curr_flag]]
                    )
                    for sub_seq_idx in (
                        curr_sequence_length[1:] - curr_sequence_length[:-1]
                    ):
                        for _ in range(sub_seq_idx):
                            new_flags.append(curr_new_flag)
                        curr_new_flag += 1
                assert len(new_flags) == len(self.flag)
                assert (
                    len(np.bincount(new_flags))
                    == len(np.bincount(self.flag)) * self.sequences_split_num
                )
                self.flag = np.array(new_flags, dtype=np.int64)

    def get_augmentation(self):
        """Sample (train) or fix (test) an image augmentation config.

        Returns:
            dict | None: resize / crop / flip / rotate parameters, or None
            when ``data_aug_conf`` is unset.
        """
        if self.data_aug_conf is None:
            return None
        H, W = self.data_aug_conf["H"], self.data_aug_conf["W"]
        fH, fW = self.data_aug_conf["final_dim"]
        if not self.test_mode:
            resize = np.random.uniform(*self.data_aug_conf["resize_lim"])
            resize_dims = (int(W * resize), int(H * resize))
            newW, newH = resize_dims
            crop_h = (
                int(
                    (1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"]))
                    * newH
                )
                - fH
            )
            crop_w = int(np.random.uniform(0, max(0, newW - fW)))
            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
            flip = False
            if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]):
                flip = True
            rotate = np.random.uniform(*self.data_aug_conf["rot_lim"])
            rotate_3d = np.random.uniform(*self.data_aug_conf["rot3d_range"])
        else:
            # Deterministic center-ish crop at test time.
            resize = max(fH / H, fW / W)
            resize_dims = (int(W * resize), int(H * resize))
            newW, newH = resize_dims
            crop_h = (
                int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH)
                - fH
            )
            crop_w = int(max(0, newW - fW) / 2)
            crop = (crop_w, crop_h, crop_w + fW, crop_h + fH)
            flip = False
            rotate = 0
            rotate_3d = 0
        aug_config = {
            "resize": resize,
            "resize_dims": resize_dims,
            "crop": crop,
            "flip": flip,
            "rotate": rotate,
            "rotate_3d": rotate_3d,
        }
        return aug_config

    def __getitem__(self, idx):
        """Return the pipeline output for frame ``idx``.

        ``idx`` may also be a dict carrying a precomputed ``aug_config``
        (used to keep augmentation consistent within a sequence).
        """
        if isinstance(idx, dict):
            aug_config = idx["aug_config"]
            idx = idx["idx"]
        else:
            aug_config = self.get_augmentation()
        data = self.get_data_info(idx)
        data["aug_config"] = aug_config
        data = self.pipeline(data)
        return data

    def get_cat_ids(self, idx):
        """Return the set of category ids present in frame ``idx``."""
        info = self.data_infos[idx]
        if self.use_valid_flag:
            mask = info["valid_flag"]
            gt_names = set(info["gt_names"][mask])
        else:
            gt_names = set(info["gt_names"])
        cat_ids = []
        for name in gt_names:
            if name in self.CLASSES:
                cat_ids.append(self.cat2id[name])
        return cat_ids

    def load_annotations(self, ann_file):
        """Load and timestamp-sort the pickled info list."""
        data = mmcv.load(ann_file, file_format="pkl")
        data_infos = list(sorted(data["infos"], key=lambda e: e["timestamp"]))
        data_infos = data_infos[:: self.load_interval]
        self.metadata = data["metadata"]
        self.version = self.metadata["version"]
        print(self.metadata)
        return data_infos

    def get_data_info(self, index):
        """Build the raw input dict (paths, transforms, annos) for a frame."""
        info = self.data_infos[index]
        # standard protocol modified from SECOND.Pytorch
        input_dict = dict(
            sample_idx=info["token"],
            pts_filename=info["lidar_path"],
            sweeps=info["sweeps"],
            timestamp=info["timestamp"] / 1e6,
            lidar2ego_translation=info["lidar2ego_translation"],
            lidar2ego_rotation=info["lidar2ego_rotation"],
            ego2global_translation=info["ego2global_translation"],
            ego2global_rotation=info["ego2global_rotation"],
        )
        lidar2ego = np.eye(4)
        lidar2ego[:3, :3] = pyquaternion.Quaternion(
            info["lidar2ego_rotation"]
        ).rotation_matrix
        lidar2ego[:3, 3] = np.array(info["lidar2ego_translation"])
        ego2global = np.eye(4)
        ego2global[:3, :3] = pyquaternion.Quaternion(
            info["ego2global_rotation"]
        ).rotation_matrix
        ego2global[:3, 3] = np.array(info["ego2global_translation"])
        input_dict["lidar2global"] = ego2global @ lidar2ego
        if self.modality["use_camera"]:
            image_paths = []
            lidar2img_rts = []
            cam_intrinsic = []
            for cam_type, cam_info in info["cams"].items():
                image_paths.append(cam_info["data_path"])
                # obtain lidar to image transformation matrix
                lidar2cam_r = np.linalg.inv(cam_info["sensor2lidar_rotation"])
                lidar2cam_t = (
                    cam_info["sensor2lidar_translation"] @ lidar2cam_r.T
                )
                lidar2cam_rt = np.eye(4)
                lidar2cam_rt[:3, :3] = lidar2cam_r.T
                lidar2cam_rt[3, :3] = -lidar2cam_t
                intrinsic = copy.deepcopy(cam_info["cam_intrinsic"])
                cam_intrinsic.append(intrinsic)
                viewpad = np.eye(4)
                viewpad[: intrinsic.shape[0], : intrinsic.shape[1]] = intrinsic
                lidar2img_rt = viewpad @ lidar2cam_rt.T
                lidar2img_rts.append(lidar2img_rt)
            input_dict.update(
                dict(
                    img_filename=image_paths,
                    lidar2img=lidar2img_rts,
                    cam_intrinsic=cam_intrinsic,
                )
            )
        if not self.test_mode:
            annos = self.get_ann_info(index)
            input_dict.update(annos)
        return input_dict

    def get_ann_info(self, index):
        """Return gt boxes/labels/names (and instance ids) for a frame."""
        info = self.data_infos[index]
        if self.use_valid_flag:
            mask = info["valid_flag"]
        else:
            mask = info["num_lidar_pts"] > 0
        gt_bboxes_3d = info["gt_boxes"][mask]
        gt_names_3d = info["gt_names"][mask]
        gt_labels_3d = []
        for cat in gt_names_3d:
            if cat in self.CLASSES:
                gt_labels_3d.append(self.CLASSES.index(cat))
            else:
                gt_labels_3d.append(-1)
        gt_labels_3d = np.array(gt_labels_3d)
        if self.with_velocity:
            gt_velocity = info["gt_velocity"][mask]
            nan_mask = np.isnan(gt_velocity[:, 0])
            gt_velocity[nan_mask] = [0.0, 0.0]
            gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
        anns_results = dict(
            gt_bboxes_3d=gt_bboxes_3d,
            gt_labels_3d=gt_labels_3d,
            gt_names=gt_names_3d,
        )
        if "instance_inds" in info:
            # Fix: `np.int` was removed in NumPy 1.24; use a concrete dtype
            # consistent with the flags elsewhere in this class.
            instance_inds = np.array(info["instance_inds"], dtype=np.int64)[mask]
            anns_results["instance_inds"] = instance_inds
        return anns_results

    def _format_bbox(self, results, jsonfile_prefix=None, tracking=False):
        """Dump per-sample boxes to the nuScenes submission JSON format."""
        nusc_annos = {}
        mapped_class_names = self.CLASSES
        print("Start to convert detection format...")
        for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
            annos = []
            boxes = output_to_nusc_box(
                det, threshold=self.tracking_threshold if tracking else None
            )
            sample_token = self.data_infos[sample_id]["token"]
            boxes = lidar_nusc_box_to_global(
                self.data_infos[sample_id],
                boxes,
                mapped_class_names,
                self.det3d_eval_configs,
                self.det3d_eval_version,
            )
            for i, box in enumerate(boxes):
                name = mapped_class_names[box.label]
                # Static classes are not tracked by the nuScenes benchmark.
                if tracking and name in [
                    "barrier",
                    "traffic_cone",
                    "construction_vehicle",
                ]:
                    continue
                # Pick an attribute from the speed; fall back to the default.
                if np.sqrt(box.velocity[0] ** 2 + box.velocity[1] ** 2) > 0.2:
                    if name in [
                        "car",
                        "construction_vehicle",
                        "bus",
                        "truck",
                        "trailer",
                    ]:
                        attr = "vehicle.moving"
                    elif name in ["bicycle", "motorcycle"]:
                        attr = "cycle.with_rider"
                    else:
                        attr = NuScenes3DDetTrackDataset.DefaultAttribute[name]
                else:
                    if name in ["pedestrian"]:
                        attr = "pedestrian.standing"
                    elif name in ["bus"]:
                        attr = "vehicle.stopped"
                    else:
                        attr = NuScenes3DDetTrackDataset.DefaultAttribute[name]
                nusc_anno = dict(
                    sample_token=sample_token,
                    translation=box.center.tolist(),
                    size=box.wlh.tolist(),
                    rotation=box.orientation.elements.tolist(),
                    velocity=box.velocity[:2].tolist(),
                )
                if not tracking:
                    nusc_anno.update(
                        dict(
                            detection_name=name,
                            detection_score=box.score,
                            attribute_name=attr,
                        )
                    )
                else:
                    nusc_anno.update(
                        dict(
                            tracking_name=name,
                            tracking_score=box.score,
                            tracking_id=str(box.token),
                        )
                    )
                annos.append(nusc_anno)
            nusc_annos[sample_token] = annos
        nusc_submissions = {
            "meta": self.modality,
            "results": nusc_annos,
        }
        mmcv.mkdir_or_exist(jsonfile_prefix)
        res_path = osp.join(jsonfile_prefix, "results_nusc.json")
        print("Results writes to", res_path)
        mmcv.dump(nusc_submissions, res_path)
        return res_path

    def _evaluate_single(
        self, result_path, logger=None, result_name="img_bbox", tracking=False
    ):
        """Run the official nuScenes detection or tracking evaluator."""
        from nuscenes import NuScenes

        output_dir = osp.join(*osp.split(result_path)[:-1])
        nusc = NuScenes(
            version=self.version, dataroot=self.data_root, verbose=False
        )
        eval_set_map = {
            "v1.0-mini": "mini_val",
            "v1.0-trainval": "val",
        }
        if not tracking:
            from nuscenes.eval.detection.evaluate import NuScenesEval

            nusc_eval = NuScenesEval(
                nusc,
                config=self.det3d_eval_configs,
                result_path=result_path,
                eval_set=eval_set_map[self.version],
                output_dir=output_dir,
                verbose=True,
            )
            nusc_eval.main(render_curves=False)
            # record metrics
            metrics = mmcv.load(osp.join(output_dir, "metrics_summary.json"))
            detail = dict()
            metric_prefix = f"{result_name}_NuScenes"
            for name in self.CLASSES:
                for k, v in metrics["label_aps"][name].items():
                    val = float("{:.4f}".format(v))
                    detail[
                        "{}/{}_AP_dist_{}".format(metric_prefix, name, k)
                    ] = val
                for k, v in metrics["label_tp_errors"][name].items():
                    val = float("{:.4f}".format(v))
                    detail["{}/{}_{}".format(metric_prefix, name, k)] = val
                for k, v in metrics["tp_errors"].items():
                    val = float("{:.4f}".format(v))
                    detail[
                        "{}/{}".format(metric_prefix, self.ErrNameMapping[k])
                    ] = val
            detail["{}/NDS".format(metric_prefix)] = metrics["nd_score"]
            detail["{}/mAP".format(metric_prefix)] = metrics["mean_ap"]
        else:
            from nuscenes.eval.tracking.evaluate import TrackingEval

            nusc_eval = TrackingEval(
                config=self.track3d_eval_configs,
                result_path=result_path,
                eval_set=eval_set_map[self.version],
                output_dir=output_dir,
                verbose=True,
                nusc_version=self.version,
                nusc_dataroot=self.data_root,
            )
            metrics = nusc_eval.main()
            # record metrics
            metrics = mmcv.load(osp.join(output_dir, "metrics_summary.json"))
            print(metrics)
            detail = dict()
            metric_prefix = f"{result_name}_NuScenes"
            keys = [
                "amota",
                "amotp",
                "recall",
                "motar",
                "gt",
                "mota",
                "motp",
                "mt",
                "ml",
                "faf",
                "tp",
                "fp",
                "fn",
                "ids",
                "frag",
                "tid",
                "lgd",
            ]
            for key in keys:
                detail["{}/{}".format(metric_prefix, key)] = metrics[key]
        return detail

    def format_results(self, results, jsonfile_prefix=None, tracking=False):
        """Write results to submission JSON(s); returns (paths, tmp_dir)."""
        assert isinstance(results, list), "results must be a list"
        if jsonfile_prefix is None:
            tmp_dir = tempfile.TemporaryDirectory()
            jsonfile_prefix = osp.join(tmp_dir.name, "results")
        else:
            tmp_dir = None
        if not ("pts_bbox" in results[0] or "img_bbox" in results[0]):
            result_files = self._format_bbox(
                results, jsonfile_prefix, tracking=tracking
            )
        else:
            result_files = dict()
            for name in results[0]:
                print(f"\nFormating bboxes of {name}")
                results_ = [out[name] for out in results]
                tmp_file_ = osp.join(jsonfile_prefix, name)
                result_files.update(
                    {
                        name: self._format_bbox(
                            results_, tmp_file_, tracking=tracking
                        )
                    }
                )
        return result_files, tmp_dir

    def evaluate(
        self,
        results,
        metric=None,
        logger=None,
        jsonfile_prefix=None,
        result_names=["img_bbox"],
        show=False,
        out_dir=None,
        pipeline=None,
    ):
        """Evaluate detection (always) and tracking (when enabled).

        Note: the loop variable deliberately shadows the unused ``metric``
        parameter, which is kept for interface compatibility.
        """
        for metric in ["detection", "tracking"]:
            tracking = metric == "tracking"
            if tracking and not self.tracking:
                continue
            result_files, tmp_dir = self.format_results(
                results, jsonfile_prefix, tracking=tracking
            )
            if isinstance(result_files, dict):
                results_dict = dict()
                for name in result_names:
                    ret_dict = self._evaluate_single(
                        result_files[name], tracking=tracking
                    )
                results_dict.update(ret_dict)
            elif isinstance(result_files, str):
                results_dict = self._evaluate_single(
                    result_files, tracking=tracking
                )
            if tmp_dir is not None:
                tmp_dir.cleanup()
        if show or out_dir:
            self.show(results, save_dir=out_dir, show=show, pipeline=pipeline)
        return results_dict

    def show(self, results, save_dir=None, show=False, pipeline=None):
        """Render predictions on the 6 camera views + BEV and save a video."""
        save_dir = "./" if save_dir is None else save_dir
        save_dir = os.path.join(save_dir, "visual")
        print_log(os.path.abspath(save_dir))
        pipeline = Compose(pipeline)
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        videoWriter = None
        for i, result in enumerate(results):
            if "img_bbox" in result.keys():
                result = result["img_bbox"]
            data_info = pipeline(self.get_data_info(i))
            imgs = []
            raw_imgs = data_info["img"]
            lidar2img = data_info["img_metas"].data["lidar2img"]
            pred_bboxes_3d = result["boxes_3d"][
                result["scores_3d"] > self.vis_score_threshold
            ]
            if "instance_ids" in result and self.tracking:
                color = []
                for id in result["instance_ids"].cpu().numpy().tolist():
                    color.append(
                        self.ID_COLOR_MAP[int(id % len(self.ID_COLOR_MAP))]
                    )
            elif "labels_3d" in result:
                color = []
                for id in result["labels_3d"].cpu().numpy().tolist():
                    color.append(self.ID_COLOR_MAP[id])
            else:
                color = (255, 0, 0)
            # ===== draw boxes_3d to images =====
            for j, img_origin in enumerate(raw_imgs):
                img = img_origin.copy()
                if len(pred_bboxes_3d) != 0:
                    img = draw_lidar_bbox3d_on_img(
                        pred_bboxes_3d,
                        img,
                        lidar2img[j],
                        img_metas=None,
                        color=color,
                        thickness=3,
                    )
                imgs.append(img)
            # ===== draw boxes_3d to BEV =====
            bev = draw_lidar_bbox3d_on_bev(
                pred_bboxes_3d,
                bev_size=img.shape[0] * 2,
                color=color,
            )
            # ===== put text and concat =====
            for j, name in enumerate(
                [
                    "front",
                    "front right",
                    "front left",
                    "rear",
                    "rear left",
                    "rear right",
                ]
            ):
                imgs[j] = cv2.rectangle(
                    imgs[j],
                    (0, 0),
                    (440, 80),
                    color=(255, 255, 255),
                    thickness=-1,
                )
                w, h = cv2.getTextSize(name, cv2.FONT_HERSHEY_SIMPLEX, 2, 2)[0]
                text_x = int(220 - w / 2)
                text_y = int(40 + h / 2)
                imgs[j] = cv2.putText(
                    imgs[j],
                    name,
                    (text_x, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    2,
                    (0, 0, 0),
                    2,
                    cv2.LINE_AA,
                )
            image = np.concatenate(
                [
                    np.concatenate([imgs[2], imgs[0], imgs[1]], axis=1),
                    np.concatenate([imgs[5], imgs[3], imgs[4]], axis=1),
                ],
                axis=0,
            )
            image = np.concatenate([image, bev], axis=1)
            # ===== save video =====
            if videoWriter is None:
                videoWriter = cv2.VideoWriter(
                    os.path.join(save_dir, "video.avi"),
                    fourcc,
                    7,
                    image.shape[:2][::-1],
                )
            cv2.imwrite(os.path.join(save_dir, f"{i}.jpg"), image)
            videoWriter.write(image)
        # Fix: avoid AttributeError when `results` is empty and no writer
        # was ever created.
        if videoWriter is not None:
            videoWriter.release()
def output_to_nusc_box(detection, threshold=None):
    """Convert a detection dict into a list of ``NuScenesBox``.

    Args:
        detection (dict): Contains "boxes_3d", "scores_3d", "labels_3d" and
            optionally "instance_ids" / "cls_scores".
        threshold (float, optional): When given, drop boxes whose score
            ("cls_scores" if present, else "scores_3d") is below it.

    Returns:
        list[NuScenesBox]: One box per kept detection. ``box.token`` carries
        the instance id when tracking ids are available.
    """
    box3d = detection["boxes_3d"]
    scores = detection["scores_3d"].numpy()
    labels = detection["labels_3d"].numpy()
    ids = None
    if "instance_ids" in detection:
        ids = detection["instance_ids"]  # .numpy()
    if threshold is not None:
        if "cls_scores" in detection:
            mask = detection["cls_scores"].numpy() >= threshold
        else:
            mask = scores >= threshold
        box3d = box3d[mask]
        scores = scores[mask]
        labels = labels[mask]
        # Bug fix: the original indexed `ids` unconditionally here, raising
        # a NameError when thresholding without instance ids.
        if ids is not None:
            ids = ids[mask]
    if hasattr(box3d, "gravity_center"):
        box_gravity_center = box3d.gravity_center.numpy()
        box_dims = box3d.dims.numpy()
        nus_box_dims = box_dims[:, [1, 0, 2]]
        box_yaw = box3d.yaw.numpy()
    else:
        box3d = box3d.numpy()
        box_gravity_center = box3d[..., :3].copy()
        box_dims = box3d[..., 3:6].copy()
        # Swap the first two size dims for the nuScenes box convention.
        nus_box_dims = box_dims[..., [1, 0, 2]]
        box_yaw = box3d[..., 6].copy()
    # TODO: check whether this is necessary
    # with dir_offset & dir_limit in the head
    # box_yaw = -box_yaw - np.pi / 2
    box_list = []
    for i in range(len(box3d)):
        quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
        if hasattr(box3d, "gravity_center"):
            velocity = (*box3d.tensor[i, 7:9], 0.0)
        else:
            velocity = (*box3d[i, 7:9], 0.0)
        box = NuScenesBox(
            box_gravity_center[i],
            nus_box_dims[i],
            quat,
            label=labels[i],
            score=scores[i],
            velocity=velocity,
        )
        if ids is not None:
            box.token = ids[i]
        box_list.append(box)
    return box_list
def lidar_nusc_box_to_global(
    info,
    boxes,
    classes,
    eval_configs,
    eval_version="detection_cvpr_2019",
):
    """Map boxes from the lidar frame to the global frame.

    Each box is first moved into the ego frame, filtered against the
    per-class evaluation range (XY distance from the ego), and the
    survivors are then moved into the global frame.
    """
    cls_range_map = eval_configs.class_range
    kept_boxes = []
    for box in boxes:
        # lidar -> ego
        box.rotate(pyquaternion.Quaternion(info["lidar2ego_rotation"]))
        box.translate(np.array(info["lidar2ego_translation"]))
        # Drop boxes outside the class-specific evaluation radius.
        radius = np.linalg.norm(box.center[:2], 2)
        if radius > cls_range_map[classes[box.label]]:
            continue
        # ego -> global
        box.rotate(pyquaternion.Quaternion(info["ego2global_rotation"]))
        box.translate(np.array(info["ego2global_translation"]))
        kept_boxes.append(box)
    return kept_boxes
from .transform import (
InstanceNameFilter,
CircleObjectRangeFilter,
NormalizeMultiviewImage,
NuScenesSparse4DAdaptor,
MultiScaleDepthMapGenerator,
)
from .augment import (
ResizeCropFlipImage,
BBoxRotation,
PhotoMetricDistortionMultiViewImage,
)
from .loading import LoadMultiViewImageFromFiles, LoadPointsFromFile
__all__ = [
"InstanceNameFilter",
"ResizeCropFlipImage",
"BBoxRotation",
"CircleObjectRangeFilter",
"MultiScaleDepthMapGenerator",
"NormalizeMultiviewImage",
"PhotoMetricDistortionMultiViewImage",
"NuScenesSparse4DAdaptor",
"LoadMultiViewImageFromFiles",
"LoadPointsFromFile",
]
import torch
import numpy as np
from numpy import random
import mmcv
from mmdet.datasets.builder import PIPELINES
from PIL import Image
@PIPELINES.register_module()
class ResizeCropFlipImage(object):
    """Apply the sampled resize/crop/flip/rotate augmentation to every view.

    Reads ``results['aug_config']`` (as produced by
    ``NuScenes3DDetTrackDataset.get_augmentation``) and updates each image
    together with its ``lidar2img`` projection so 3D points still project to
    the augmented pixels.
    """

    def __call__(self, results):
        """Transform all images in ``results['img']`` in place.

        Returns ``results`` unchanged when no ``aug_config`` is present.
        """
        aug_config = results.get("aug_config")
        if aug_config is None:
            return results
        imgs = results["img"]
        N = len(imgs)
        new_imgs = []
        for i in range(N):
            # Images arrive as float arrays; cast to uint8 for PIL.
            img, mat = self._img_transform(
                np.uint8(imgs[i]), aug_config,
            )
            new_imgs.append(np.array(img).astype(np.float32))
            # Compose the 2D augmentation into the camera projection.
            results["lidar2img"][i] = mat @ results["lidar2img"][i]
            if "cam_intrinsic" in results:
                # NOTE(review): only the resize is folded into the intrinsics
                # here (crop/flip/rotate are not) — the commented alternative
                # below would apply the full 2D transform; confirm intended.
                results["cam_intrinsic"][i][:3, :3] *= aug_config["resize"]
                # results["cam_intrinsic"][i][:3, :3] = (
                #     mat[:3, :3] @ results["cam_intrinsic"][i][:3, :3]
                # )
        results["img"] = new_imgs
        results["img_shape"] = [x.shape[:2] for x in new_imgs]
        return results

    def _img_transform(self, img, aug_configs):
        """Apply resize -> crop -> flip -> rotate to one image.

        Returns the transformed PIL-derived array and the matching 4x4
        homogeneous 2D transform (top-left 3x3 is the pixel mapping).
        """
        H, W = img.shape[:2]
        resize = aug_configs.get("resize", 1)
        resize_dims = (int(W * resize), int(H * resize))
        crop = aug_configs.get("crop", [0, 0, *resize_dims])
        flip = aug_configs.get("flip", False)
        rotate = aug_configs.get("rotate", 0)
        origin_dtype = img.dtype
        if origin_dtype != np.uint8:
            # Rescale arbitrary-range input into uint8 for PIL processing;
            # NOTE(review): divides by (max - min), so a constant image
            # would divide by zero — confirm inputs always have spread.
            min_value = img.min()
            max_vaule = img.max()
            scale = 255 / (max_vaule - min_value)
            img = (img - min_value) * scale
            img = np.uint8(img)
        img = Image.fromarray(img)
        img = img.resize(resize_dims).crop(crop)
        if flip:
            img = img.transpose(method=Image.FLIP_LEFT_RIGHT)
        img = img.rotate(rotate)
        img = np.array(img).astype(np.float32)
        if origin_dtype != np.uint8:
            # Undo the uint8 rescaling to restore the original value range.
            img = img.astype(np.float32)
            img = img / scale + min_value
        # Build the matching pixel transform in the same order as above:
        # resize, then crop translation, then flip, then rotation about the
        # crop center.
        transform_matrix = np.eye(3)
        transform_matrix[:2, :2] *= resize
        transform_matrix[:2, 2] -= np.array(crop[:2])
        if flip:
            flip_matrix = np.array(
                [[-1, 0, crop[2] - crop[0]], [0, 1, 0], [0, 0, 1]]
            )
            transform_matrix = flip_matrix @ transform_matrix
        rotate = rotate / 180 * np.pi
        rot_matrix = np.array(
            [
                [np.cos(rotate), np.sin(rotate), 0],
                [-np.sin(rotate), np.cos(rotate), 0],
                [0, 0, 1],
            ]
        )
        rot_center = np.array([crop[2] - crop[0], crop[3] - crop[1]]) / 2
        rot_matrix[:2, 2] = -rot_matrix[:2, :2] @ rot_center + rot_center
        transform_matrix = rot_matrix @ transform_matrix
        extend_matrix = np.eye(4)
        extend_matrix[:3, :3] = transform_matrix
        return img, extend_matrix
@PIPELINES.register_module()
class BBoxRotation(object):
    """Rotate the scene about the z-axis by ``aug_config['rotate_3d']``.

    Projection matrices (and ``lidar2global`` when present) are
    post-multiplied by the inverse rotation, and ground-truth boxes are
    rotated in place.
    """

    def __call__(self, results):
        """Apply the 3D rotation augmentation to ``results`` in place."""
        angle = results["aug_config"]["rotate_3d"]
        cos_a = np.cos(angle)
        sin_a = np.sin(angle)
        rot_mat = np.array(
            [
                [cos_a, -sin_a, 0, 0],
                [sin_a, cos_a, 0, 0],
                [0, 0, 1, 0],
                [0, 0, 0, 1],
            ]
        )
        rot_mat_inv = np.linalg.inv(rot_mat)

        for view, proj in enumerate(results["lidar2img"]):
            results["lidar2img"][view] = proj @ rot_mat_inv
        if "lidar2global" in results:
            results["lidar2global"] = results["lidar2global"] @ rot_mat_inv
        if "gt_bboxes_3d" in results:
            results["gt_bboxes_3d"] = self.box_rotate(
                results["gt_bboxes_3d"], angle
            )
        return results

    @staticmethod
    def box_rotate(bbox_3d, angle):
        """Rotate boxes (x, y, z, ..., yaw[, vel...]) about z by ``angle``."""
        cos_a = np.cos(angle)
        sin_a = np.sin(angle)
        rot_mat_T = np.array(
            [[cos_a, sin_a, 0], [-sin_a, cos_a, 0], [0, 0, 1]]
        )
        bbox_3d[:, :3] = bbox_3d[:, :3] @ rot_mat_T
        bbox_3d[:, 6] += angle
        if bbox_3d.shape[-1] > 7:
            vel_dims = bbox_3d[:, 7:].shape[-1]
            bbox_3d[:, 7:] = bbox_3d[:, 7:] @ rot_mat_T[:vel_dims, :vel_dims]
        return bbox_3d
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
    """Apply photometric distortion to image sequentially, every transformation
    is applied with a probability of 0.5. The position of random contrast is in
    second or second to last.
    1. random brightness
    2. random contrast (mode 0)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (mode 1)
    8. randomly swap channels
    Args:
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(
        self,
        brightness_delta=32,
        contrast_range=(0.5, 1.5),
        saturation_range=(0.5, 1.5),
        hue_delta=18,
    ):
        # Store ranges unpacked so the hot loop reads plain attributes.
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def __call__(self, results):
        """Call function to perform photometric distortion on images.
        Args:
            results (dict): Result dict from loading pipeline.
        Returns:
            dict: Result dict with images distorted.
        """
        # NOTE: `random` here is `numpy.random` (module-level import), so
        # `random.randint(2)` draws 0 or 1 — i.e. a fair coin flip.
        imgs = results["img"]
        new_imgs = []
        for img in imgs:
            assert img.dtype == np.float32, (
                "PhotoMetricDistortion needs the input image of dtype np.float32,"
                ' please set "to_float32=True" in "LoadImageFromFile" pipeline'
            )
            # random brightness
            if random.randint(2):
                delta = random.uniform(
                    -self.brightness_delta, self.brightness_delta
                )
                img += delta
            # mode == 0 --> do random contrast first
            # mode == 1 --> do random contrast last
            mode = random.randint(2)
            if mode == 1:
                if random.randint(2):
                    alpha = random.uniform(
                        self.contrast_lower, self.contrast_upper
                    )
                    img *= alpha
            # convert color from BGR to HSV
            img = mmcv.bgr2hsv(img)
            # random saturation
            if random.randint(2):
                img[..., 1] *= random.uniform(
                    self.saturation_lower, self.saturation_upper
                )
            # random hue (channel 0 is hue in degrees; wrap into [0, 360))
            if random.randint(2):
                img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
                img[..., 0][img[..., 0] > 360] -= 360
                img[..., 0][img[..., 0] < 0] += 360
            # convert color from HSV to BGR
            img = mmcv.hsv2bgr(img)
            # random contrast
            if mode == 0:
                if random.randint(2):
                    alpha = random.uniform(
                        self.contrast_lower, self.contrast_upper
                    )
                    img *= alpha
            # randomly swap channels
            if random.randint(2):
                img = img[..., random.permutation(3)]
            new_imgs.append(img)
        results["img"] = new_imgs
        return results

    def __repr__(self):
        """str: Human-readable summary of the configured ranges."""
        repr_str = self.__class__.__name__
        repr_str += f"(\nbrightness_delta={self.brightness_delta},\n"
        repr_str += "contrast_range="
        repr_str += f"{(self.contrast_lower, self.contrast_upper)},\n"
        repr_str += "saturation_range="
        repr_str += f"{(self.saturation_lower, self.saturation_upper)},\n"
        repr_str += f"hue_delta={self.hue_delta})"
        return repr_str
import numpy as np
import mmcv
from mmdet.datasets.builder import PIPELINES
@PIPELINES.register_module()
class LoadMultiViewImageFromFiles(object):
    """Load one image per camera view from ``results['img_filename']``.

    Args:
        to_float32 (bool, optional): Whether to convert the images to
            float32. Defaults to False.
        color_type (str, optional): Color type passed to ``mmcv.imread``.
            Defaults to 'unchanged'.
    """

    def __init__(self, to_float32=False, color_type="unchanged"):
        self.to_float32 = to_float32
        self.color_type = color_type

    def __call__(self, results):
        """Read every view and populate the image keys in ``results``.

        Adds: ``filename``, ``img`` (list of per-view arrays),
        ``img_shape``, ``ori_shape``, ``pad_shape``, ``scale_factor`` and
        ``img_norm_cfg`` (identity normalization).
        """
        filenames = results["img_filename"]
        # Stack into (h, w, c, num_views) so the shape keys cover all views.
        stacked = np.stack(
            [mmcv.imread(fname, self.color_type) for fname in filenames],
            axis=-1,
        )
        if self.to_float32:
            stacked = stacked.astype(np.float32)
        results["filename"] = filenames
        # Unravel into a per-view list: `DefaultFormatBundle` transposes
        # each image separately before stacking them back into one array.
        results["img"] = [stacked[..., i] for i in range(stacked.shape[-1])]
        results["img_shape"] = stacked.shape
        results["ori_shape"] = stacked.shape
        # Initial values for the default meta keys.
        results["pad_shape"] = stacked.shape
        results["scale_factor"] = 1.0
        channels = 1 if len(stacked.shape) < 3 else stacked.shape[2]
        results["img_norm_cfg"] = dict(
            mean=np.zeros(channels, dtype=np.float32),
            std=np.ones(channels, dtype=np.float32),
            to_rgb=False,
        )
        return results

    def __repr__(self):
        """str: Return a string that describes the module."""
        return (
            f"{self.__class__.__name__}"
            f"(to_float32={self.to_float32}, "
            f"color_type='{self.color_type}')"
        )
@PIPELINES.register_module()
class LoadPointsFromFile(object):
    """Load Points From File.
    Load points from file.
    Args:
        coord_type (str): The type of coordinates of points cloud.
            Available options includes:
            - 'LIDAR': Points in LiDAR coordinates.
            - 'DEPTH': Points in depth coordinates, usually for indoor dataset.
            - 'CAMERA': Points in camera coordinates.
        load_dim (int, optional): The dimension of the loaded points.
            Defaults to 6.
        use_dim (list[int], optional): Which dimensions of the points to use.
            Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
            or use_dim=[0, 1, 2, 3] to use the intensity dimension.
        shift_height (bool, optional): Whether to use shifted height.
            Defaults to False.
        use_color (bool, optional): Whether to use color features.
            Defaults to False.
        file_client_args (dict, optional): Config dict of file clients,
            refer to
            https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
            for more details. Defaults to dict(backend='disk').
    """

    def __init__(
        self,
        coord_type,
        load_dim=6,
        use_dim=[0, 1, 2],
        shift_height=False,
        use_color=False,
        file_client_args=dict(backend="disk"),
    ):
        self.shift_height = shift_height
        self.use_color = use_color
        # An int `use_dim` is shorthand for the first N dimensions.
        if isinstance(use_dim, int):
            use_dim = list(range(use_dim))
        assert (
            max(use_dim) < load_dim
        ), f"Expect all used dimensions < {load_dim}, got {use_dim}"
        assert coord_type in ["CAMERA", "LIDAR", "DEPTH"]
        self.coord_type = coord_type
        self.load_dim = load_dim
        self.use_dim = use_dim
        self.file_client_args = file_client_args.copy()
        # Lazily constructed on first use (workers get their own client).
        self.file_client = None

    def _load_points(self, pts_filename):
        """Private function to load point clouds data.
        Args:
            pts_filename (str): Filename of point clouds data.
        Returns:
            np.ndarray: An array containing point clouds data.
        """
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)
        try:
            raw = self.file_client.get(pts_filename)
            return np.frombuffer(raw, dtype=np.float32)
        except ConnectionError:
            # Fall back to the local filesystem when the backend is down.
            mmcv.check_file_exist(pts_filename)
            if pts_filename.endswith(".npy"):
                return np.load(pts_filename)
            return np.fromfile(pts_filename, dtype=np.float32)

    def __call__(self, results):
        """Call function to load points data from file.
        Args:
            results (dict): Result dict containing point clouds data.
        Returns:
            dict: The result dict containing the point clouds data.
                Added key and value are described below.
                - points (:obj:`BasePoints`): Point clouds data.
        """
        pts_filename = results["pts_filename"]
        points = self._load_points(pts_filename)
        points = points.reshape(-1, self.load_dim)[:, self.use_dim]
        # NOTE(review): `attribute_dims` is computed below but never written
        # into `results` — confirm whether downstream expects it.
        attribute_dims = None
        if self.shift_height:
            # NOTE(review): 0.99 is a *percentile* argument (≈ minimum
            # height), not the 99th percentile; this matches upstream
            # mmdet3d, so the established behavior is preserved here.
            floor_height = np.percentile(points[:, 2], 0.99)
            height = points[:, 2] - floor_height
            points = np.concatenate(
                [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1
            )
            attribute_dims = dict(height=3)
        if self.use_color:
            assert len(self.use_dim) >= 6
            if attribute_dims is None:
                attribute_dims = dict()
            # Colors occupy the last three loaded columns.
            attribute_dims.update(
                dict(
                    color=[
                        points.shape[1] - 3,
                        points.shape[1] - 2,
                        points.shape[1] - 1,
                    ]
                )
            )
        results["points"] = points
        return results
import numpy as np
import mmcv
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
@PIPELINES.register_module()
class MultiScaleDepthMapGenerator(object):
    """Project lidar points through every camera and rasterize sparse depth
    maps at one or more downsample factors (empty pixels hold -1)."""

    def __init__(self, downsample=1, max_depth=60):
        # Accept a scalar factor as shorthand for a one-element list.
        if isinstance(downsample, (list, tuple)):
            self.downsample = downsample
        else:
            self.downsample = [downsample]
        self.max_depth = max_depth

    def __call__(self, input_dict):
        """Write ``input_dict['gt_depth']`` from points and ``lidar2img``."""
        points = input_dict["points"][..., :3, None]
        per_scale_maps = [[] for _ in self.downsample]
        for view_idx, lidar2img in enumerate(input_dict["lidar2img"]):
            H, W = input_dict["img_shape"][view_idx][:2]
            pts_2d = (
                np.squeeze(lidar2img[:3, :3] @ points, axis=-1)
                + lidar2img[:3, 3]
            )
            # Perspective divide to pixel coordinates.
            pts_2d[:, :2] /= pts_2d[:, 2:3]
            cols = np.round(pts_2d[:, 0]).astype(np.int32)
            rows = np.round(pts_2d[:, 1]).astype(np.int32)
            depths = pts_2d[:, 2]
            # Keep points inside the image and in front of the camera.
            valid = np.logical_and.reduce(
                [
                    rows >= 0,
                    rows < H,
                    cols >= 0,
                    cols < W,
                    depths >= 0.1,
                    # depths <= self.max_depth,
                ]
            )
            rows, cols, depths = rows[valid], cols[valid], depths[valid]
            # Rasterize far points first so nearer ones overwrite them.
            order = np.argsort(depths)[::-1]
            rows, cols, depths = rows[order], cols[order], depths[order]
            depths = np.clip(depths, 0.1, self.max_depth)
            for scale_idx, factor in enumerate(self.downsample):
                h, w = (int(H / factor), int(W / factor))
                u = np.floor(cols / factor).astype(np.int32)
                v = np.floor(rows / factor).astype(np.int32)
                depth_map = np.ones([h, w], dtype=np.float32) * -1
                depth_map[v, u] = depths
                per_scale_maps[scale_idx].append(depth_map)
        input_dict["gt_depth"] = [np.stack(maps) for maps in per_scale_maps]
        return input_dict
@PIPELINES.register_module()
class NuScenesSparse4DAdaptor(object):
    """Adapt a loaded nuScenes sample into Sparse4D model inputs.

    Stacks per-camera projection matrices, derives image sizes and focal
    lengths, normalizes box yaw, and wraps ground-truth arrays and images
    into ``DataContainer`` tensors.
    """

    def __init__(self):
        # BUG FIX: this was spelled ``__init`` (missing trailing
        # underscores), so it was a plain method and never ran as the
        # constructor. Harmless here (empty body) but misleading.
        pass

    def __call__(self, input_dict):
        """Transform ``input_dict`` in place and return it.

        Args:
            input_dict (dict): Result dict with at least ``lidar2img``,
                ``img_shape``, ``lidar2global`` and ``img``; optionally
                ``cam_intrinsic``, ``instance_inds``, ``gt_bboxes_3d``
                and ``gt_labels_3d``.

        Returns:
            dict: The same dict with model-ready keys added/updated.
        """
        # Per-camera lidar->image matrices stacked into one float32 array.
        input_dict["projection_mat"] = np.float32(
            np.stack(input_dict["lidar2img"])
        )
        # img_shape rows start with (H, W); reverse to (W, H) per camera.
        input_dict["image_wh"] = np.ascontiguousarray(
            np.array(input_dict["img_shape"], dtype=np.float32)[:, :2][:, ::-1]
        )
        input_dict["T_global_inv"] = np.linalg.inv(input_dict["lidar2global"])
        input_dict["T_global"] = input_dict["lidar2global"]
        if "cam_intrinsic" in input_dict:
            input_dict["cam_intrinsic"] = np.float32(
                np.stack(input_dict["cam_intrinsic"])
            )
            # fx entry of each intrinsic matrix.
            input_dict["focal"] = input_dict["cam_intrinsic"][..., 0, 0]
        if "instance_inds" in input_dict:
            input_dict["instance_id"] = input_dict["instance_inds"]
        if "gt_bboxes_3d" in input_dict:
            # Wrap yaw (column 6) into [-pi, pi).
            input_dict["gt_bboxes_3d"][:, 6] = self.limit_period(
                input_dict["gt_bboxes_3d"][:, 6], offset=0.5, period=2 * np.pi
            )
            input_dict["gt_bboxes_3d"] = DC(
                to_tensor(input_dict["gt_bboxes_3d"]).float()
            )
        if "gt_labels_3d" in input_dict:
            input_dict["gt_labels_3d"] = DC(
                to_tensor(input_dict["gt_labels_3d"]).long()
            )
        # HWC -> CHW per camera, then stack to (num_cams, C, H, W).
        imgs = [img.transpose(2, 0, 1) for img in input_dict["img"]]
        imgs = np.ascontiguousarray(np.stack(imgs, axis=0))
        input_dict["img"] = DC(to_tensor(imgs), stack=True)
        return input_dict

    def limit_period(
        self, val: np.ndarray, offset: float = 0.5, period: float = np.pi
    ) -> np.ndarray:
        """Wrap ``val`` into ``[-offset * period, (1 - offset) * period)``."""
        limited_val = val - np.floor(val / period + offset) * period
        return limited_val
@PIPELINES.register_module()
class InstanceNameFilter(object):
    """Filter GT objects by their names.

    Args:
        classes (list[str]): List of class names to be kept for training.
    """

    def __init__(self, classes):
        self.classes = classes
        # Valid label indices are simply 0..len(classes)-1.
        self.labels = [idx for idx, _ in enumerate(classes)]

    def __call__(self, input_dict):
        """Filter objects whose label is outside the configured classes.

        Args:
            input_dict (dict): Result dict from loading pipeline.

        Returns:
            dict: Results after filtering; 'gt_bboxes_3d' and
                'gt_labels_3d' (and 'instance_inds' when present) are
                updated in the result dict.
        """
        labels = input_dict["gt_labels_3d"]
        keep = np.array(
            [lbl in self.labels for lbl in labels], dtype=np.bool_
        )
        input_dict["gt_bboxes_3d"] = input_dict["gt_bboxes_3d"][keep]
        input_dict["gt_labels_3d"] = labels[keep]
        if "instance_inds" in input_dict:
            input_dict["instance_inds"] = input_dict["instance_inds"][keep]
        return input_dict

    def __repr__(self):
        """str: Return a string that describes the module."""
        return f"{self.__class__.__name__}(classes={self.classes})"
@PIPELINES.register_module()
class CircleObjectRangeFilter(object):
    """Drop GT boxes whose BEV (xy) distance from the origin exceeds a
    per-class radius threshold.

    Args:
        class_dist_thred (list[float]): Distance threshold per class
            label; boxes with labels outside this list are dropped.
    """

    def __init__(
        self, class_dist_thred=[52.5] * 5 + [31.5] + [42] * 3 + [31.5]
    ):
        self.class_dist_thred = class_dist_thred

    def __call__(self, input_dict):
        boxes = input_dict["gt_bboxes_3d"]
        labels = input_dict["gt_labels_3d"]
        # Euclidean distance in the BEV (xy) plane.
        dist = np.sqrt(np.sum(boxes[:, :2] ** 2, axis=-1))
        keep = np.zeros(len(dist), dtype=bool)
        for cls_idx, radius in enumerate(self.class_dist_thred):
            keep = keep | ((labels == cls_idx) & (dist <= radius))
        input_dict["gt_bboxes_3d"] = boxes[keep]
        input_dict["gt_labels_3d"] = labels[keep]
        if "instance_inds" in input_dict:
            input_dict["instance_inds"] = input_dict["instance_inds"][keep]
        return input_dict

    def __repr__(self):
        """str: Return a string that describes the module."""
        return (
            f"{self.__class__.__name__}"
            f"(class_dist_thred={self.class_dist_thred})"
        )
@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
    """Normalize every view of a multi-view image sample.

    Added key is "img_norm_cfg".

    Args:
        mean (sequence): Mean values of 3 channels.
        std (sequence): Std values of 3 channels.
        to_rgb (bool): Whether to convert the image from BGR to RGB,
            default is true.
    """

    def __init__(self, mean, std, to_rgb=True):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb

    def __call__(self, results):
        """Normalize each image in ``results['img']``.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Normalized results; an 'img_norm_cfg' key is added to
                the result dict.
        """
        normalized = []
        for img in results["img"]:
            normalized.append(
                mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
            )
        results["img"] = normalized
        results["img_norm_cfg"] = dict(
            mean=self.mean, std=self.std, to_rgb=self.to_rgb
        )
        return results

    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            f"(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})"
        )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment