# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
import os
from data_converter.s3dis_data_utils import S3DISData, S3DISSegData
from data_converter.scannet_data_utils import ScanNetData, ScanNetSegData
from data_converter.sunrgbd_data_utils import SUNRGBDData
def create_indoor_info_file(data_path,
pkl_prefix='sunrgbd',
save_path=None,
use_v1=False,
workers=4):
"""Create indoor information file.
Get information of the raw data and save it to the pkl file.
Args:
data_path (str): Path of the data.
pkl_prefix (str): Prefix of the pkl to be saved. Default: 'sunrgbd'.
save_path (str): Path of the pkl to be saved. Default: None.
use_v1 (bool): Whether to use v1. Default: False.
workers (int): Number of threads to be used. Default: 4.
"""
assert os.path.exists(data_path)
assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \
f'unsupported indoor dataset {pkl_prefix}'
save_path = data_path if save_path is None else save_path
assert os.path.exists(save_path)
# generate infos for both detection and segmentation task
if pkl_prefix in ['sunrgbd', 'scannet']:
train_filename = os.path.join(save_path,
f'{pkl_prefix}_infos_train.pkl')
val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl')
if pkl_prefix == 'sunrgbd':
# SUN RGB-D has a train-val split
train_dataset = SUNRGBDData(
root_path=data_path, split='train', use_v1=use_v1)
val_dataset = SUNRGBDData(
root_path=data_path, split='val', use_v1=use_v1)
else:
# ScanNet has a train-val-test split
train_dataset = ScanNetData(root_path=data_path, split='train')
val_dataset = ScanNetData(root_path=data_path, split='val')
test_dataset = ScanNetData(root_path=data_path, split='test')
test_filename = os.path.join(save_path,
f'{pkl_prefix}_infos_test.pkl')
infos_train = train_dataset.get_infos(
num_workers=workers, has_label=True)
mmcv.dump(infos_train, train_filename, 'pkl')
print(f'{pkl_prefix} info train file is saved to {train_filename}')
infos_val = val_dataset.get_infos(num_workers=workers, has_label=True)
mmcv.dump(infos_val, val_filename, 'pkl')
print(f'{pkl_prefix} info val file is saved to {val_filename}')
if pkl_prefix == 'scannet':
infos_test = test_dataset.get_infos(
num_workers=workers, has_label=False)
mmcv.dump(infos_test, test_filename, 'pkl')
print(f'{pkl_prefix} info test file is saved to {test_filename}')
# generate infos for the semantic segmentation task
# e.g. re-sampled scene indexes and label weights
# scene indexes are used to re-sample rooms with different number of points
# label weights are used to balance classes with different number of points
if pkl_prefix == 'scannet':
# label weight computation function is adopted from
# https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
train_dataset = ScanNetSegData(
data_root=data_path,
ann_file=train_filename,
split='train',
num_points=8192,
label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
# TODO: do we need to generate on val set?
val_dataset = ScanNetSegData(
data_root=data_path,
ann_file=val_filename,
split='val',
num_points=8192,
label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
# no need to generate for test set
train_dataset.get_seg_infos()
val_dataset.get_seg_infos()
elif pkl_prefix == 's3dis':
# S3DIS doesn't have a fixed train-val split
# it has 6 areas instead, so we generate info file for each of them
# in training, we will use dataset to wrap different areas
splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]]
for split in splits:
dataset = S3DISData(root_path=data_path, split=split)
info = dataset.get_infos(num_workers=workers, has_label=True)
filename = os.path.join(save_path,
f'{pkl_prefix}_infos_{split}.pkl')
mmcv.dump(info, filename, 'pkl')
print(f'{pkl_prefix} info {split} file is saved to {filename}')
seg_dataset = S3DISSegData(
data_root=data_path,
ann_file=filename,
split=split,
num_points=4096,
label_weight_func=lambda x: 1.0 / np.log(1.2 + x))
seg_dataset.get_seg_infos()
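# Illustrative usage (not part of the original file): assuming ScanNet has
# already been preprocessed into `./data/scannet`, the detection and
# segmentation info files could be generated with something like:
#
#     create_indoor_info_file('./data/scannet', pkl_prefix='scannet',
#                             save_path='./data/scannet', workers=4)
#
# For SUN RGB-D, pass `pkl_prefix='sunrgbd'` (and `use_v1` to select the v1
# label set); for S3DIS, pass `pkl_prefix='s3dis'`.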
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
from collections import OrderedDict
from nuscenes.utils.geometry_utils import view_points
from pathlib import Path
from mmdet3d.core.bbox import box_np_ops
from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info
from .nuscenes_converter import post_process_coords
kitti_categories = ('Pedestrian', 'Cyclist', 'Car')
def convert_to_kitti_info_version2(info):
"""convert kitti info v1 to v2 if possible.
Args:
info (dict): Info of the input kitti data.
- image (dict): image info
- calib (dict): calibration info
- point_cloud (dict): point cloud info
"""
if 'image' not in info or 'calib' not in info or 'point_cloud' not in info:
info['image'] = {
'image_shape': info['img_shape'],
'image_idx': info['image_idx'],
'image_path': info['img_path'],
}
info['calib'] = {
'R0_rect': info['calib/R0_rect'],
'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'],
'P2': info['calib/P2'],
}
info['point_cloud'] = {
'velodyne_path': info['velodyne_path'],
}
def _read_imageset_file(path):
with open(path, 'r') as f:
lines = f.readlines()
return [int(line) for line in lines]
def _calculate_num_points_in_gt(data_path,
infos,
relative_path,
remove_outside=True,
num_features=4):
for info in mmcv.track_iter_progress(infos):
pc_info = info['point_cloud']
image_info = info['image']
calib = info['calib']
if relative_path:
v_path = str(Path(data_path) / pc_info['velodyne_path'])
else:
v_path = pc_info['velodyne_path']
points_v = np.fromfile(
v_path, dtype=np.float32, count=-1).reshape([-1, num_features])
rect = calib['R0_rect']
Trv2c = calib['Tr_velo_to_cam']
P2 = calib['P2']
if remove_outside:
points_v = box_np_ops.remove_outside_points(
points_v, rect, Trv2c, P2, image_info['image_shape'])
# points_v = points_v[points_v[:, 0] > 0]
annos = info['annos']
num_obj = len([n for n in annos['name'] if n != 'DontCare'])
# annos = kitti.filter_kitti_anno(annos, ['DontCare'])
dims = annos['dimensions'][:num_obj]
loc = annos['location'][:num_obj]
rots = annos['rotation_y'][:num_obj]
gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
gt_boxes_lidar = box_np_ops.box_camera_to_lidar(
gt_boxes_camera, rect, Trv2c)
indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar)
num_points_in_gt = indices.sum(0)
num_ignored = len(annos['dimensions']) - num_obj
num_points_in_gt = np.concatenate(
[num_points_in_gt, -np.ones([num_ignored])])
annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32)
def create_kitti_info_file(data_path,
pkl_prefix='kitti',
save_path=None,
relative_path=True):
"""Create info file of KITTI dataset.
Given the raw data, generate its related info file in pkl format.
Args:
data_path (str): Path of the data root.
pkl_prefix (str): Prefix of the info file to be generated.
save_path (str): Path to save the info file.
relative_path (bool): Whether to use relative path.
"""
imageset_folder = Path(data_path) / 'ImageSets'
train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
    print('Generating info. This may take several minutes.')
if save_path is None:
save_path = Path(data_path)
else:
save_path = Path(save_path)
kitti_infos_train = get_kitti_image_info(
data_path,
training=True,
velodyne=True,
calib=True,
image_ids=train_img_ids,
relative_path=relative_path)
_calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path)
filename = save_path / f'{pkl_prefix}_infos_train.pkl'
print(f'Kitti info train file is saved to {filename}')
mmcv.dump(kitti_infos_train, filename)
kitti_infos_val = get_kitti_image_info(
data_path,
training=True,
velodyne=True,
calib=True,
image_ids=val_img_ids,
relative_path=relative_path)
_calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path)
filename = save_path / f'{pkl_prefix}_infos_val.pkl'
print(f'Kitti info val file is saved to {filename}')
mmcv.dump(kitti_infos_val, filename)
filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
print(f'Kitti info trainval file is saved to {filename}')
mmcv.dump(kitti_infos_train + kitti_infos_val, filename)
kitti_infos_test = get_kitti_image_info(
data_path,
training=False,
label_info=False,
velodyne=True,
calib=True,
image_ids=test_img_ids,
relative_path=relative_path)
filename = save_path / f'{pkl_prefix}_infos_test.pkl'
print(f'Kitti info test file is saved to {filename}')
mmcv.dump(kitti_infos_test, filename)
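# Illustrative usage (assumed paths, not from the original file): with the
# KITTI raw data and ImageSets split files under `./data/kitti`, the
# train/val/trainval/test info pkls could be created via:
#
#     create_kitti_info_file('./data/kitti', pkl_prefix='kitti',
#                            relative_path=True)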
def create_waymo_info_file(data_path,
pkl_prefix='waymo',
save_path=None,
relative_path=True,
max_sweeps=5):
"""Create info file of waymo dataset.
Given the raw data, generate its related info file in pkl format.
Args:
data_path (str): Path of the data root.
pkl_prefix (str): Prefix of the info file to be generated.
save_path (str | None): Path to save the info file.
relative_path (bool): Whether to use relative path.
max_sweeps (int): Max sweeps before the detection frame to be used.
"""
imageset_folder = Path(data_path) / 'ImageSets'
train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt'))
# val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt'))
# test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt'))
train_img_ids = [each for each in train_img_ids if each % 5 == 0]
    print('Generating info. This may take several minutes.')
if save_path is None:
save_path = Path(data_path)
else:
save_path = Path(save_path)
waymo_infos_train = get_waymo_image_info(
data_path,
training=True,
velodyne=True,
calib=True,
pose=True,
image_ids=train_img_ids,
relative_path=relative_path,
max_sweeps=max_sweeps)
_calculate_num_points_in_gt(
data_path,
waymo_infos_train,
relative_path,
num_features=6,
remove_outside=False)
filename = save_path / f'{pkl_prefix}_infos_train.pkl'
print(f'Waymo info train file is saved to {filename}')
mmcv.dump(waymo_infos_train, filename)
#
# waymo_infos_val = get_waymo_image_info(
# data_path,
# training=True,
# velodyne=True,
# calib=True,
# pose=True,
# image_ids=val_img_ids,
# relative_path=relative_path,
# max_sweeps=max_sweeps)
# _calculate_num_points_in_gt(
# data_path,
# waymo_infos_val,
# relative_path,
# num_features=6,
# remove_outside=False)
# filename = save_path / f'{pkl_prefix}_infos_val.pkl'
# print(f'Waymo info val file is saved to {filename}')
# mmcv.dump(waymo_infos_val, filename)
# filename = save_path / f'{pkl_prefix}_infos_trainval.pkl'
# print(f'Waymo info trainval file is saved to {filename}')
# mmcv.dump(waymo_infos_train + waymo_infos_val, filename)
# waymo_infos_test = get_waymo_image_info(
# data_path,
# training=False,
# label_info=False,
# velodyne=True,
# calib=True,
# pose=True,
# image_ids=test_img_ids,
# relative_path=relative_path,
# max_sweeps=max_sweeps)
# filename = save_path / f'{pkl_prefix}_infos_test.pkl'
# print(f'Waymo info test file is saved to {filename}')
# mmcv.dump(waymo_infos_test, filename)
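# Illustrative usage (assumed path): with Waymo data converted into a
# KITTI-style layout, e.g. under `./data/waymo/kitti_format`, one could run:
#
#     create_waymo_info_file('./data/waymo/kitti_format', pkl_prefix='waymo',
#                            max_sweeps=5)
#
# Note that this revision keeps only every 5th training frame
# (`each % 5 == 0`) and leaves the val/test branches above commented out.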
def _create_reduced_point_cloud(data_path,
info_path,
save_path=None,
back=False,
num_features=4,
front_camera_id=2):
"""Create reduced point clouds for given info.
Args:
data_path (str): Path of original data.
info_path (str): Path of data info.
save_path (str | None): Path to save reduced point cloud data.
Default: None.
back (bool): Whether to flip the points to back.
num_features (int): Number of point features. Default: 4.
front_camera_id (int): The referenced/front camera ID. Default: 2.
"""
kitti_infos = mmcv.load(info_path)
for info in mmcv.track_iter_progress(kitti_infos):
pc_info = info['point_cloud']
image_info = info['image']
calib = info['calib']
v_path = pc_info['velodyne_path']
v_path = Path(data_path) / v_path
points_v = np.fromfile(
str(v_path), dtype=np.float32,
count=-1).reshape([-1, num_features])
rect = calib['R0_rect']
if front_camera_id == 2:
P2 = calib['P2']
else:
P2 = calib[f'P{str(front_camera_id)}']
Trv2c = calib['Tr_velo_to_cam']
# first remove z < 0 points
# keep = points_v[:, -1] > 0
# points_v = points_v[keep]
# then remove outside.
if back:
points_v[:, 0] = -points_v[:, 0]
points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2,
image_info['image_shape'])
if save_path is None:
save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced')
if not save_dir.exists():
save_dir.mkdir()
save_filename = save_dir / v_path.name
# save_filename = str(v_path) + '_reduced'
            if back:
                save_filename = str(save_filename) + '_back'
else:
save_filename = str(Path(save_path) / v_path.name)
if back:
save_filename += '_back'
        with open(save_filename, 'wb') as f:
points_v.tofile(f)
def create_reduced_point_cloud(data_path,
pkl_prefix,
train_info_path=None,
val_info_path=None,
test_info_path=None,
save_path=None,
with_back=False):
"""Create reduced point clouds for training/validation/testing.
Args:
data_path (str): Path of original data.
pkl_prefix (str): Prefix of info files.
train_info_path (str | None): Path of training set info.
Default: None.
val_info_path (str | None): Path of validation set info.
Default: None.
test_info_path (str | None): Path of test set info.
Default: None.
save_path (str | None): Path to save reduced point cloud data.
with_back (bool): Whether to flip the points to back.
"""
if train_info_path is None:
train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl'
if val_info_path is None:
val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl'
if test_info_path is None:
test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl'
print('create reduced point cloud for training set')
_create_reduced_point_cloud(data_path, train_info_path, save_path)
print('create reduced point cloud for validation set')
_create_reduced_point_cloud(data_path, val_info_path, save_path)
print('create reduced point cloud for testing set')
_create_reduced_point_cloud(data_path, test_info_path, save_path)
if with_back:
_create_reduced_point_cloud(
data_path, train_info_path, save_path, back=True)
_create_reduced_point_cloud(
data_path, val_info_path, save_path, back=True)
_create_reduced_point_cloud(
data_path, test_info_path, save_path, back=True)
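# Illustrative usage (assumed paths): once the info pkls exist, point clouds
# reduced to the front-camera frustum can be generated with:
#
#     create_reduced_point_cloud('./data/kitti', 'kitti')
#
# When `save_path` is None the reduced files are written to a sibling
# `velodyne_reduced` directory next to the original `velodyne` folder.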
def export_2d_annotation(root_path, info_path, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
kitti_infos = mmcv.load(info_path)
cat2Ids = [
dict(id=kitti_categories.index(cat_name), name=cat_name)
for cat_name in kitti_categories
]
coco_ann_id = 0
coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
from os import path as osp
for info in mmcv.track_iter_progress(kitti_infos):
coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d)
(height, width,
_) = mmcv.imread(osp.join(root_path,
info['image']['image_path'])).shape
coco_2d_dict['images'].append(
dict(
file_name=info['image']['image_path'],
id=info['image']['image_idx'],
Tri2v=info['calib']['Tr_imu_to_velo'],
Trv2c=info['calib']['Tr_velo_to_cam'],
rect=info['calib']['R0_rect'],
cam_intrinsic=info['calib']['P2'],
width=width,
height=height))
for coco_info in coco_infos:
if coco_info is None:
continue
# add an empty key for coco format
coco_info['segmentation'] = []
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
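# Illustrative usage (assumed paths): exporting COCO-style 2D (and optionally
# mono3D) boxes from an existing info file:
#
#     export_2d_annotation('./data/kitti', './data/kitti/kitti_infos_train.pkl',
#                          mono3d=True)
#
# The result is dumped to `<info_path minus .pkl>_mono3d.coco.json`
# (or `.coco.json` when mono3d=False).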
def get_2d_boxes(info, occluded, mono3d=True):
"""Get the 2D annotation records for a given info.
Args:
        info (dict): Information of the given sample data.
        occluded (list[int]): Occlusion states to keep: \
            0 = fully visible, 1 = partly occluded, 2 = largely occluded, \
            3 = unknown, -1 = DontCare.
        mono3d (bool): Whether to get boxes with mono3d annotation.
    Returns:
        list[dict]: List of 2D annotation records that belong to the input
            `sample_data_token`.
"""
# Get calibration information
P2 = info['calib']['P2']
repro_recs = []
# if no annotations in info (test dataset), then return
if 'annos' not in info:
return repro_recs
# Get all the annotation with the specified visibilties.
ann_dicts = info['annos']
mask = [(ocld in occluded) for ocld in ann_dicts['occluded']]
for k in ann_dicts.keys():
ann_dicts[k] = ann_dicts[k][mask]
# convert dict of list to list of dict
ann_recs = []
for i in range(len(ann_dicts['occluded'])):
ann_rec = {}
for k in ann_dicts.keys():
ann_rec[k] = ann_dicts[k][i]
ann_recs.append(ann_rec)
for ann_idx, ann_rec in enumerate(ann_recs):
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = \
f"{info['image']['image_idx']}.{ann_idx}"
ann_rec['sample_data_token'] = info['image']['image_idx']
sample_data_token = info['image']['image_idx']
loc = ann_rec['location'][np.newaxis, :]
dim = ann_rec['dimensions'][np.newaxis, :]
rot = ann_rec['rotation_y'][np.newaxis, np.newaxis]
# transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5]
dst = np.array([0.5, 0.5, 0.5])
src = np.array([0.5, 1.0, 0.5])
loc = loc + dim * (dst - src)
offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \
/ info['calib']['P2'][0, 0]
loc_3d = np.copy(loc)
loc_3d[0, 0] += offset
gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box_np_ops.center_to_corner_box3d(
gt_bbox_3d[:, :3],
gt_bbox_3d[:, 3:6],
gt_bbox_3d[:, 6], [0.5, 0.5, 0.5],
axis=1)
corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8)
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
camera_intrinsic = P2
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token,
info['image']['image_path'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
repro_rec['bbox_cam3d'] = np.concatenate(
[loc_3d, dim, rot],
axis=1).astype(np.float32).squeeze().tolist()
repro_rec['velo_cam3d'] = -1 # no velocity in KITTI
center3d = np.array(loc).reshape([1, 3])
center2d = box_np_ops.points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
# normalized center2D + depth
# samples with depth < 0 will be removed
if repro_rec['center2d'][2] <= 0:
continue
repro_rec['attribute_name'] = -1 # no attribute in KITTI
repro_rec['attribute_id'] = -1
repro_recs.append(repro_rec)
return repro_recs
def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename):
"""Generate one 2D annotation record given various informations on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str):The corresponding image file where the annotation
is present.
Returns:
dict: A sample 2D annotation record.
            - file_name (str): file name
- image_id (str): sample data token
- area (float): 2d box area
- category_name (str): category name
- category_id (int): category id
- bbox (list[float]): left x, top y, dx, dy of 2d box
- iscrowd (int): whether the area is crowd
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
key_mapping = {
'name': 'category_name',
'num_points_in_gt': 'num_lidar_pts',
'sample_annotation_token': 'sample_annotation_token',
'sample_data_token': 'sample_data_token',
}
for key, value in ann_rec.items():
if key in key_mapping.keys():
repro_rec[key_mapping[key]] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
coco_rec['file_name'] = filename
coco_rec['image_id'] = sample_data_token
coco_rec['area'] = (y2 - y1) * (x2 - x1)
if repro_rec['category_name'] not in kitti_categories:
return None
cat_name = repro_rec['category_name']
coco_rec['category_name'] = cat_name
coco_rec['category_id'] = kitti_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
coco_rec['iscrowd'] = 0
return coco_rec
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from collections import OrderedDict
from concurrent import futures as futures
from os import path as osp
from pathlib import Path
from skimage import io
def get_image_index_str(img_idx, use_prefix_id=False):
if use_prefix_id:
return '{:07d}'.format(img_idx)
else:
return '{:06d}'.format(img_idx)
def get_kitti_info_path(idx,
prefix,
info_type='image_2',
file_tail='.png',
training=True,
relative_path=True,
exist_check=True,
use_prefix_id=False):
img_idx_str = get_image_index_str(idx, use_prefix_id)
img_idx_str += file_tail
prefix = Path(prefix)
if training:
file_path = Path('training') / info_type / img_idx_str
else:
file_path = Path('testing') / info_type / img_idx_str
if exist_check and not (prefix / file_path).exists():
raise ValueError('file not exist: {}'.format(file_path))
if relative_path:
return str(file_path)
else:
return str(prefix / file_path)
def get_image_path(idx,
prefix,
training=True,
relative_path=True,
exist_check=True,
info_type='image_2',
use_prefix_id=False):
return get_kitti_info_path(idx, prefix, info_type, '.png', training,
relative_path, exist_check, use_prefix_id)
def get_label_path(idx,
prefix,
training=True,
relative_path=True,
exist_check=True,
info_type='label_2',
use_prefix_id=False):
return get_kitti_info_path(idx, prefix, info_type, '.txt', training,
relative_path, exist_check, use_prefix_id)
def get_velodyne_path(idx,
prefix,
training=True,
relative_path=True,
exist_check=True,
use_prefix_id=False):
return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training,
relative_path, exist_check, use_prefix_id)
def get_calib_path(idx,
prefix,
training=True,
relative_path=True,
exist_check=True,
use_prefix_id=False):
return get_kitti_info_path(idx, prefix, 'calib', '.txt', training,
relative_path, exist_check, use_prefix_id)
def get_pose_path(idx,
prefix,
training=True,
relative_path=True,
exist_check=True,
use_prefix_id=False):
return get_kitti_info_path(idx, prefix, 'pose', '.txt', training,
relative_path, exist_check, use_prefix_id)
def get_label_anno(label_path):
annotations = {}
annotations.update({
'name': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': []
})
with open(label_path, 'r') as f:
lines = f.readlines()
# if len(lines) == 0 or len(lines[0]) < 15:
# content = []
# else:
content = [line.strip().split(' ') for line in lines]
num_objects = len([x[0] for x in content if x[0] != 'DontCare'])
annotations['name'] = np.array([x[0] for x in content])
num_gt = len(annotations['name'])
annotations['truncated'] = np.array([float(x[1]) for x in content])
annotations['occluded'] = np.array([int(x[2]) for x in content])
annotations['alpha'] = np.array([float(x[3]) for x in content])
annotations['bbox'] = np.array([[float(info) for info in x[4:8]]
for x in content]).reshape(-1, 4)
# dimensions will convert hwl format to standard lhw(camera) format.
annotations['dimensions'] = np.array([[float(info) for info in x[8:11]]
for x in content
]).reshape(-1, 3)[:, [2, 0, 1]]
annotations['location'] = np.array([[float(info) for info in x[11:14]]
for x in content]).reshape(-1, 3)
annotations['rotation_y'] = np.array([float(x[14])
for x in content]).reshape(-1)
if len(content) != 0 and len(content[0]) == 16: # have score
annotations['score'] = np.array([float(x[15]) for x in content])
else:
annotations['score'] = np.zeros((annotations['bbox'].shape[0], ))
index = list(range(num_objects)) + [-1] * (num_gt - num_objects)
annotations['index'] = np.array(index, dtype=np.int32)
annotations['group_ids'] = np.arange(num_gt, dtype=np.int32)
return annotations
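# For reference, each label line parsed above contains 15 space-separated
# fields (16 when a score is present), e.g. (illustrative values only):
#
#     Car 0.00 0 -1.58 587.01 173.33 614.12 200.12 1.65 1.67 3.64 -0.65 1.71 46.70 -1.59
#
# i.e. name, truncated, occluded, alpha, bbox (x1 y1 x2 y2), dimensions
# (h w l, re-ordered to l h w above), location (x y z), rotation_y[, score].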
def _extend_matrix(mat):
mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0)
return mat
def get_kitti_image_info(path,
training=True,
label_info=True,
velodyne=False,
calib=False,
image_ids=7481,
extend_matrix=True,
num_worker=8,
relative_path=True,
with_imageshape=True):
"""
KITTI annotation format version 2:
{
[optional]points: [N, 3+] point cloud
[optional, for kitti]image: {
image_idx: ...
image_path: ...
image_shape: ...
}
point_cloud: {
num_features: 4
velodyne_path: ...
}
[optional, for kitti]calib: {
R0_rect: ...
Tr_velo_to_cam: ...
P2: ...
}
annos: {
location: [num_gt, 3] array
dimensions: [num_gt, 3] array
rotation_y: [num_gt] angle array
name: [num_gt] ground truth name array
[optional]difficulty: kitti difficulty
[optional]group_ids: used for multi-part object
}
}
"""
root_path = Path(path)
if not isinstance(image_ids, list):
image_ids = list(range(image_ids))
def map_func(idx):
info = {}
pc_info = {'num_features': 4}
calib_info = {}
image_info = {'image_idx': idx}
annotations = None
if velodyne:
pc_info['velodyne_path'] = get_velodyne_path(
idx, path, training, relative_path)
image_info['image_path'] = get_image_path(idx, path, training,
relative_path)
if with_imageshape:
img_path = image_info['image_path']
if relative_path:
img_path = str(root_path / img_path)
image_info['image_shape'] = np.array(
io.imread(img_path).shape[:2], dtype=np.int32)
if label_info:
label_path = get_label_path(idx, path, training, relative_path)
if relative_path:
label_path = str(root_path / label_path)
annotations = get_label_anno(label_path)
info['image'] = image_info
info['point_cloud'] = pc_info
if calib:
calib_path = get_calib_path(
idx, path, training, relative_path=False)
with open(calib_path, 'r') as f:
lines = f.readlines()
P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
]).reshape([3, 4])
P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
]).reshape([3, 4])
P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
]).reshape([3, 4])
P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
]).reshape([3, 4])
if extend_matrix:
P0 = _extend_matrix(P0)
P1 = _extend_matrix(P1)
P2 = _extend_matrix(P2)
P3 = _extend_matrix(P3)
R0_rect = np.array([
float(info) for info in lines[4].split(' ')[1:10]
]).reshape([3, 3])
if extend_matrix:
rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
rect_4x4[3, 3] = 1.
rect_4x4[:3, :3] = R0_rect
else:
rect_4x4 = R0_rect
Tr_velo_to_cam = np.array([
float(info) for info in lines[5].split(' ')[1:13]
]).reshape([3, 4])
Tr_imu_to_velo = np.array([
float(info) for info in lines[6].split(' ')[1:13]
]).reshape([3, 4])
if extend_matrix:
Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
calib_info['P0'] = P0
calib_info['P1'] = P1
calib_info['P2'] = P2
calib_info['P3'] = P3
calib_info['R0_rect'] = rect_4x4
calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo
info['calib'] = calib_info
if annotations is not None:
info['annos'] = annotations
add_difficulty_to_annos(info)
return info
with futures.ThreadPoolExecutor(num_worker) as executor:
image_infos = executor.map(map_func, image_ids)
return list(image_infos)
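# Illustrative usage (assumed path): building infos for a handful of frames,
# mirroring how create_kitti_info_file calls this helper:
#
#     infos = get_kitti_image_info('./data/kitti', training=True,
#                                  velodyne=True, calib=True,
#                                  image_ids=[0, 1, 2])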
def get_waymo_image_info(path,
training=True,
label_info=True,
velodyne=False,
calib=False,
pose=False,
image_ids=7481,
extend_matrix=True,
num_worker=8,
relative_path=True,
with_imageshape=True,
max_sweeps=5):
"""
Waymo annotation format version like KITTI:
{
[optional]points: [N, 3+] point cloud
[optional, for kitti]image: {
image_idx: ...
image_path: ...
image_shape: ...
}
point_cloud: {
num_features: 6
velodyne_path: ...
}
[optional, for kitti]calib: {
R0_rect: ...
Tr_velo_to_cam0: ...
P0: ...
}
annos: {
location: [num_gt, 3] array
dimensions: [num_gt, 3] array
rotation_y: [num_gt] angle array
name: [num_gt] ground truth name array
[optional]difficulty: kitti difficulty
[optional]group_ids: used for multi-part object
}
}
"""
root_path = Path(path)
if not isinstance(image_ids, list):
image_ids = list(range(image_ids))
def map_func(idx):
info = {}
pc_info = {'num_features': 6}
calib_info = {}
image_info = {'image_idx': idx}
annotations = None
if velodyne:
pc_info['velodyne_path'] = get_velodyne_path(
idx, path, training, relative_path, use_prefix_id=True)
points = np.fromfile(
Path(path) / pc_info['velodyne_path'], dtype=np.float32)
points = np.copy(points).reshape(-1, pc_info['num_features'])
info['timestamp'] = np.int64(points[0, -1])
# values of the last dim are all the timestamp
image_info['image_path'] = get_image_path(
idx,
path,
training,
relative_path,
info_type='image_0',
use_prefix_id=True)
if with_imageshape:
img_path = image_info['image_path']
if relative_path:
img_path = str(root_path / img_path)
image_info['image_shape'] = np.array(
io.imread(img_path).shape[:2], dtype=np.int32)
if label_info:
label_path = get_label_path(
idx,
path,
training,
relative_path,
info_type='label_all',
use_prefix_id=True)
if relative_path:
label_path = str(root_path / label_path)
annotations = get_label_anno(label_path)
info['image'] = image_info
info['point_cloud'] = pc_info
if calib:
calib_path = get_calib_path(
idx, path, training, relative_path=False, use_prefix_id=True)
with open(calib_path, 'r') as f:
lines = f.readlines()
P0 = np.array([float(info) for info in lines[0].split(' ')[1:13]
]).reshape([3, 4])
P1 = np.array([float(info) for info in lines[1].split(' ')[1:13]
]).reshape([3, 4])
P2 = np.array([float(info) for info in lines[2].split(' ')[1:13]
]).reshape([3, 4])
P3 = np.array([float(info) for info in lines[3].split(' ')[1:13]
]).reshape([3, 4])
P4 = np.array([float(info) for info in lines[4].split(' ')[1:13]
]).reshape([3, 4])
if extend_matrix:
P0 = _extend_matrix(P0)
P1 = _extend_matrix(P1)
P2 = _extend_matrix(P2)
P3 = _extend_matrix(P3)
P4 = _extend_matrix(P4)
R0_rect = np.array([
float(info) for info in lines[5].split(' ')[1:10]
]).reshape([3, 3])
if extend_matrix:
rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
rect_4x4[3, 3] = 1.
rect_4x4[:3, :3] = R0_rect
else:
rect_4x4 = R0_rect
Tr_velo_to_cam = np.array([
float(info) for info in lines[6].split(' ')[1:13]
]).reshape([3, 4])
if extend_matrix:
Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
calib_info['P0'] = P0
calib_info['P1'] = P1
calib_info['P2'] = P2
calib_info['P3'] = P3
calib_info['P4'] = P4
calib_info['R0_rect'] = rect_4x4
calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam
info['calib'] = calib_info
if pose:
pose_path = get_pose_path(
idx, path, training, relative_path=False, use_prefix_id=True)
info['pose'] = np.loadtxt(pose_path)
if annotations is not None:
info['annos'] = annotations
info['annos']['camera_id'] = info['annos'].pop('score')
add_difficulty_to_annos(info)
sweeps = []
prev_idx = idx
while len(sweeps) < max_sweeps:
prev_info = {}
prev_idx -= 1
prev_info['velodyne_path'] = get_velodyne_path(
prev_idx,
path,
training,
relative_path,
exist_check=False,
use_prefix_id=True)
if_prev_exists = osp.exists(
Path(path) / prev_info['velodyne_path'])
if if_prev_exists:
prev_points = np.fromfile(
Path(path) / prev_info['velodyne_path'], dtype=np.float32)
prev_points = np.copy(prev_points).reshape(
-1, pc_info['num_features'])
prev_info['timestamp'] = np.int64(prev_points[0, -1])
prev_pose_path = get_pose_path(
prev_idx,
path,
training,
relative_path=False,
use_prefix_id=True)
prev_info['pose'] = np.loadtxt(prev_pose_path)
sweeps.append(prev_info)
else:
break
info['sweeps'] = sweeps
return info
with futures.ThreadPoolExecutor(num_worker) as executor:
image_infos = executor.map(map_func, image_ids)
return list(image_infos)
def kitti_anno_to_label_file(annos, folder):
folder = Path(folder)
for anno in annos:
image_idx = anno['metadata']['image_idx']
label_lines = []
for j in range(anno['bbox'].shape[0]):
label_dict = {
'name': anno['name'][j],
'alpha': anno['alpha'][j],
'bbox': anno['bbox'][j],
'location': anno['location'][j],
'dimensions': anno['dimensions'][j],
'rotation_y': anno['rotation_y'][j],
'score': anno['score'][j],
}
label_line = kitti_result_line(label_dict)
label_lines.append(label_line)
label_file = folder / f'{get_image_index_str(image_idx)}.txt'
label_str = '\n'.join(label_lines)
with open(label_file, 'w') as f:
f.write(label_str)
def add_difficulty_to_annos(info):
min_height = [40, 25,
25] # minimum height for evaluated groundtruth/detections
max_occlusion = [
0, 1, 2
] # maximum occlusion level of the groundtruth used for evaluation
max_trunc = [
0.15, 0.3, 0.5
] # maximum truncation level of the groundtruth used for evaluation
annos = info['annos']
dims = annos['dimensions'] # lhw format
bbox = annos['bbox']
height = bbox[:, 3] - bbox[:, 1]
occlusion = annos['occluded']
truncation = annos['truncated']
diff = []
    easy_mask = np.ones((len(dims), ), dtype=bool)
    moderate_mask = np.ones((len(dims), ), dtype=bool)
    hard_mask = np.ones((len(dims), ), dtype=bool)
i = 0
for h, o, t in zip(height, occlusion, truncation):
if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
easy_mask[i] = False
if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]:
moderate_mask[i] = False
if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]:
hard_mask[i] = False
i += 1
is_easy = easy_mask
is_moderate = np.logical_xor(easy_mask, moderate_mask)
is_hard = np.logical_xor(hard_mask, moderate_mask)
for i in range(len(dims)):
if is_easy[i]:
diff.append(0)
elif is_moderate[i]:
diff.append(1)
elif is_hard[i]:
diff.append(2)
else:
diff.append(-1)
annos['difficulty'] = np.array(diff, np.int32)
return diff
def kitti_result_line(result_dict, precision=4):
prec_float = '{' + ':.{}f'.format(precision) + '}'
res_line = []
all_field_default = OrderedDict([
('name', None),
('truncated', -1),
('occluded', -1),
('alpha', -10),
('bbox', None),
('dimensions', [-1, -1, -1]),
('location', [-1000, -1000, -1000]),
('rotation_y', -10),
('score', 0.0),
])
res_dict = [(key, None) for key, val in all_field_default.items()]
res_dict = OrderedDict(res_dict)
for key, val in result_dict.items():
if all_field_default[key] is None and val is None:
raise ValueError('you must specify a value for {}'.format(key))
res_dict[key] = val
for key, val in res_dict.items():
if key == 'name':
res_line.append(val)
elif key in ['truncated', 'alpha', 'rotation_y', 'score']:
if val is None:
res_line.append(str(all_field_default[key]))
else:
res_line.append(prec_float.format(val))
elif key == 'occluded':
if val is None:
res_line.append(str(all_field_default[key]))
else:
res_line.append('{}'.format(val))
elif key in ['bbox', 'dimensions', 'location']:
if val is None:
res_line += [str(v) for v in all_field_default[key]]
else:
res_line += [prec_float.format(v) for v in val]
else:
raise ValueError('unknown key. supported key:{}'.format(
res_dict.keys()))
return ' '.join(res_line)
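# For reference, the emitted line follows the field order of
# `all_field_default`:
#
#     name truncated occluded alpha bbox(4) dimensions(3) location(3) rotation_y score
#
# with unspecified optional fields falling back to their defaults
# (e.g. -1 for truncated/occluded, -10 for alpha, 0.0 for score).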
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
import os
import warnings
from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
from os import path as osp
from pyquaternion import Quaternion
from mmdet3d.datasets import LyftDataset
from .nuscenes_converter import (get_2d_boxes, get_available_scenes,
obtain_sensor2top)
lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
'motorcycle', 'bicycle', 'pedestrian', 'animal')
def create_lyft_infos(root_path,
info_prefix,
version='v1.01-train',
max_sweeps=10):
"""Create info file of lyft dataset.
Given the raw data, generate its related info file in pkl format.
Args:
root_path (str): Path of the data root.
info_prefix (str): Prefix of the info file to be generated.
version (str): Version of the data.
Default: 'v1.01-train'
max_sweeps (int): Max number of sweeps.
Default: 10
"""
lyft = Lyft(
data_path=osp.join(root_path, version),
json_path=osp.join(root_path, version, version),
verbose=True)
available_vers = ['v1.01-train', 'v1.01-test']
assert version in available_vers
if version == 'v1.01-train':
train_scenes = mmcv.list_from_file('data/lyft/train.txt')
val_scenes = mmcv.list_from_file('data/lyft/val.txt')
elif version == 'v1.01-test':
train_scenes = mmcv.list_from_file('data/lyft/test.txt')
val_scenes = []
else:
raise ValueError('unknown')
# filter existing scenes.
available_scenes = get_available_scenes(lyft)
available_scene_names = [s['name'] for s in available_scenes]
train_scenes = list(
filter(lambda x: x in available_scene_names, train_scenes))
val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
train_scenes = set([
available_scenes[available_scene_names.index(s)]['token']
for s in train_scenes
])
val_scenes = set([
available_scenes[available_scene_names.index(s)]['token']
for s in val_scenes
])
test = 'test' in version
if test:
print(f'test scene: {len(train_scenes)}')
else:
print(f'train scene: {len(train_scenes)}, \
val scene: {len(val_scenes)}')
train_lyft_infos, val_lyft_infos = _fill_trainval_infos(
lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
metadata = dict(version=version)
if test:
print(f'test sample: {len(train_lyft_infos)}')
data = dict(infos=train_lyft_infos, metadata=metadata)
info_name = f'{info_prefix}_infos_test'
info_path = osp.join(root_path, f'{info_name}.pkl')
mmcv.dump(data, info_path)
else:
print(f'train sample: {len(train_lyft_infos)}, \
val sample: {len(val_lyft_infos)}')
data = dict(infos=train_lyft_infos, metadata=metadata)
train_info_name = f'{info_prefix}_infos_train'
info_path = osp.join(root_path, f'{train_info_name}.pkl')
mmcv.dump(data, info_path)
data['infos'] = val_lyft_infos
val_info_name = f'{info_prefix}_infos_val'
info_val_path = osp.join(root_path, f'{val_info_name}.pkl')
mmcv.dump(data, info_val_path)
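# Illustrative usage (assumed paths): with the Lyft data under `./data/lyft`
# and the train/val split txt files in place, the infos could be created via:
#
#     create_lyft_infos('./data/lyft', info_prefix='lyft',
#                       version='v1.01-train', max_sweeps=10)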
def _fill_trainval_infos(lyft,
train_scenes,
val_scenes,
test=False,
max_sweeps=10):
"""Generate the train/val infos from the raw data.
Args:
lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset.
train_scenes (list[str]): Basic information of training scenes.
val_scenes (list[str]): Basic information of validation scenes.
        test (bool): Whether to use the test mode. In the test mode, no
annotations can be accessed. Default: False.
max_sweeps (int): Max number of sweeps. Default: 10.
Returns:
tuple[list[dict]]: Information of training set and
validation set that will be saved to the info file.
"""
train_lyft_infos = []
val_lyft_infos = []
for sample in mmcv.track_iter_progress(lyft.sample):
lidar_token = sample['data']['LIDAR_TOP']
sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
cs_record = lyft.get('calibrated_sensor',
sd_rec['calibrated_sensor_token'])
pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token'])
abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token)
# nuScenes devkit returns more convenient relative paths while
# lyft devkit returns absolute paths
abs_lidar_path = str(abs_lidar_path) # absolute path
lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1]
# relative path
mmcv.check_file_exist(lidar_path)
info = {
'lidar_path': lidar_path,
'token': sample['token'],
'sweeps': [],
'cams': dict(),
'lidar2ego_translation': cs_record['translation'],
'lidar2ego_rotation': cs_record['rotation'],
'ego2global_translation': pose_record['translation'],
'ego2global_rotation': pose_record['rotation'],
'timestamp': sample['timestamp'],
}
l2e_r = info['lidar2ego_rotation']
l2e_t = info['lidar2ego_translation']
e2g_r = info['ego2global_rotation']
e2g_t = info['ego2global_translation']
l2e_r_mat = Quaternion(l2e_r).rotation_matrix
e2g_r_mat = Quaternion(e2g_r).rotation_matrix
# obtain 6 image's information per frame
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
for cam in camera_types:
cam_token = sample['data'][cam]
cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token)
cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat,
e2g_t, e2g_r_mat, cam)
cam_info.update(cam_intrinsic=cam_intrinsic)
info['cams'].update({cam: cam_info})
# obtain sweeps for a single key-frame
sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP'])
sweeps = []
while len(sweeps) < max_sweeps:
if not sd_rec['prev'] == '':
sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t,
l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
sweeps.append(sweep)
sd_rec = lyft.get('sample_data', sd_rec['prev'])
else:
break
info['sweeps'] = sweeps
# obtain annotation
if not test:
annotations = [
lyft.get('sample_annotation', token)
for token in sample['anns']
]
locs = np.array([b.center for b in boxes]).reshape(-1, 3)
dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
rots = np.array([b.orientation.yaw_pitch_roll[0]
for b in boxes]).reshape(-1, 1)
names = [b.name for b in boxes]
for i in range(len(names)):
if names[i] in LyftDataset.NameMapping:
names[i] = LyftDataset.NameMapping[names[i]]
names = np.array(names)
# we need to convert rot to SECOND format.
gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
assert len(gt_boxes) == len(
annotations), f'{len(gt_boxes)}, {len(annotations)}'
info['gt_boxes'] = gt_boxes
info['gt_names'] = names
info['num_lidar_pts'] = np.array(
[a['num_lidar_pts'] for a in annotations])
info['num_radar_pts'] = np.array(
[a['num_radar_pts'] for a in annotations])
if sample['scene_token'] in train_scenes:
train_lyft_infos.append(info)
else:
val_lyft_infos.append(info)
return train_lyft_infos, val_lyft_infos
def export_2d_annotation(root_path, info_path, version):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
"""
    warnings.warn('DeprecationWarning: 2D annotations are not used on the '
                  'Lyft dataset. The function export_2d_annotation will be '
                  'deprecated.')
# get bbox annotations for camera
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
lyft_infos = mmcv.load(info_path)['infos']
lyft = Lyft(
data_path=osp.join(root_path, version),
json_path=osp.join(root_path, version, version),
verbose=True)
# info_2d_list = []
cat2Ids = [
dict(id=lyft_categories.index(cat_name), name=cat_name)
for cat_name in lyft_categories
]
coco_ann_id = 0
coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
for info in mmcv.track_iter_progress(lyft_infos):
for cam in camera_types:
cam_info = info['cams'][cam]
coco_infos = get_2d_boxes(
lyft,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'])
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'],
id=cam_info['sample_data_token'],
width=width,
height=height))
for coco_info in coco_infos:
if coco_info is None:
continue
# add an empty key for coco format
coco_info['segmentation'] = []
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json')
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import numpy as np
import os
def fix_lyft(root_folder='./data/lyft', version='v1.01'):
# refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa
lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin'
root_folder = os.path.join(root_folder, f'{version}-train')
lidar_path = os.path.join(root_folder, lidar_path)
assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \
f'dataset and make sure {lidar_path} is present.'
points = np.fromfile(lidar_path, dtype=np.float32, count=-1)
try:
points.reshape([-1, 5])
print(f'This fix is not required for version {version}.')
except ValueError:
new_points = np.array(list(points) + [100.0, 1.0], dtype='float32')
new_points.tofile(lidar_path)
print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.')
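# Illustrative usage (assumption): the known broken lidar file in the v1.01
# training split can be padded to a full multiple of 5 floats either through
# the CLI flags defined below or by calling the function directly:
#
#     fix_lyft(root_folder='./data/lyft', version='v1.01')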
parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser')
parser.add_argument(
'--root-folder',
type=str,
default='./data/lyft',
help='specify the root path of Lyft dataset')
parser.add_argument(
'--version',
type=str,
default='v1.01',
help='specify Lyft dataset version')
args = parser.parse_args()
if __name__ == '__main__':
fix_lyft(root_folder=args.root_folder, version=args.version)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import base64
import mmcv
import numpy as np
from nuimages import NuImages
from nuimages.utils.utils import mask_decode, name_to_index_mapping
from os import path as osp
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
NAME_MAPPING = {
'movable_object.barrier': 'barrier',
'vehicle.bicycle': 'bicycle',
'vehicle.bus.bendy': 'bus',
'vehicle.bus.rigid': 'bus',
'vehicle.car': 'car',
'vehicle.construction': 'construction_vehicle',
'vehicle.motorcycle': 'motorcycle',
'human.pedestrian.adult': 'pedestrian',
'human.pedestrian.child': 'pedestrian',
'human.pedestrian.construction_worker': 'pedestrian',
'human.pedestrian.police_officer': 'pedestrian',
'movable_object.trafficcone': 'traffic_cone',
'vehicle.trailer': 'trailer',
'vehicle.truck': 'truck',
}
def parse_args():
parser = argparse.ArgumentParser(description='Data converter arg parser')
parser.add_argument(
'--data-root',
type=str,
default='./data/nuimages',
help='specify the root path of dataset')
parser.add_argument(
'--version',
type=str,
nargs='+',
default=['v1.0-mini'],
required=False,
help='specify the dataset version')
parser.add_argument(
'--out-dir',
type=str,
default='./data/nuimages/annotations/',
required=False,
help='path to save the exported json')
parser.add_argument(
'--nproc',
type=int,
default=4,
required=False,
help='workers to process semantic masks')
parser.add_argument('--extra-tag', type=str, default='nuimages')
args = parser.parse_args()
return args
def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root):
"""Get semantic segmentation map for an image.
Args:
        nuim (obj:`NuImages`): NuImages dataset object.
        img_info (dict): Meta information of the image.
        cat2id (dict): Mapping from category name to category id.
        out_dir (str): Output directory of the annotations.
        data_root (str): Root path of the raw data.
        seg_root (str): Directory to save semantic segmentation masks.
    Returns:
        tuple[list[dict], int]: COCO-style object annotations of the image and
            the maximum class index appearing in its semantic mask.
"""
sd_token = img_info['token']
image_id = img_info['id']
name_to_index = name_to_index_mapping(nuim.category)
# Get image data.
width, height = img_info['width'], img_info['height']
semseg_mask = np.zeros((height, width)).astype('uint8')
# Load stuff / surface regions.
surface_anns = [
o for o in nuim.surface_ann if o['sample_data_token'] == sd_token
]
# Draw stuff / surface regions.
for ann in surface_anns:
# Get color and mask.
category_token = ann['category_token']
category_name = nuim.get('category', category_token)['name']
if ann['mask'] is None:
continue
mask = mask_decode(ann['mask'])
# Draw mask for semantic segmentation.
semseg_mask[mask == 1] = name_to_index[category_name]
# Load object instances.
object_anns = [
o for o in nuim.object_ann if o['sample_data_token'] == sd_token
]
# Sort by token to ensure that objects always appear in the
# instance mask in the same order.
object_anns = sorted(object_anns, key=lambda k: k['token'])
# Draw object instances.
# The 0 index is reserved for background; thus, the instances
# should start from index 1.
annotations = []
for i, ann in enumerate(object_anns, start=1):
# Get color, box, mask and name.
category_token = ann['category_token']
category_name = nuim.get('category', category_token)['name']
if ann['mask'] is None:
continue
mask = mask_decode(ann['mask'])
# Draw masks for semantic segmentation and instance segmentation.
semseg_mask[mask == 1] = name_to_index[category_name]
if category_name in NAME_MAPPING:
cat_name = NAME_MAPPING[category_name]
cat_id = cat2id[cat_name]
x_min, y_min, x_max, y_max = ann['bbox']
# encode calibrated instance mask
mask_anno = dict()
mask_anno['counts'] = base64.b64decode(
ann['mask']['counts']).decode()
mask_anno['size'] = ann['mask']['size']
data_anno = dict(
image_id=image_id,
category_id=cat_id,
bbox=[x_min, y_min, x_max - x_min, y_max - y_min],
area=(x_max - x_min) * (y_max - y_min),
segmentation=mask_anno,
iscrowd=0)
annotations.append(data_anno)
# after process, save semantic masks
img_filename = img_info['file_name']
seg_filename = img_filename.replace('jpg', 'png')
seg_filename = osp.join(seg_root, seg_filename)
mmcv.imwrite(semseg_mask, seg_filename)
return annotations, np.max(semseg_mask)
def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc):
print('Process category information')
categories = [
dict(id=nus_categories.index(cat_name), name=cat_name)
for cat_name in nus_categories
]
cat2id = {k_v['name']: k_v['id'] for k_v in categories}
images = []
print('Process image meta information...')
for sample_info in mmcv.track_iter_progress(nuim.sample_data):
if sample_info['is_key_frame']:
img_idx = len(images)
images.append(
dict(
id=img_idx,
token=sample_info['token'],
file_name=sample_info['filename'],
width=sample_info['width'],
height=sample_info['height']))
seg_root = f'{out_dir}semantic_masks'
mmcv.mkdir_or_exist(seg_root)
mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated'))
global process_img_anno
def process_img_anno(img_info):
single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id,
out_dir, data_root,
seg_root)
return single_img_annos, max_cls_id
print('Process img annotations...')
if nproc > 1:
outputs = mmcv.track_parallel_progress(
process_img_anno, images, nproc=nproc)
else:
outputs = []
for img_info in mmcv.track_iter_progress(images):
outputs.append(process_img_anno(img_info))
# Determine the index of object annotation
print('Process annotation information...')
annotations = []
max_cls_ids = []
for single_img_annos, max_cls_id in outputs:
max_cls_ids.append(max_cls_id)
for img_anno in single_img_annos:
img_anno.update(id=len(annotations))
annotations.append(img_anno)
max_cls_id = max(max_cls_ids)
print(f'Max ID of class in the semantic map: {max_cls_id}')
coco_format_json = dict(
images=images, annotations=annotations, categories=categories)
mmcv.mkdir_or_exist(out_dir)
out_file = osp.join(out_dir, f'{extra_tag}_{version}.json')
print(f'Annotation dumped to {out_file}')
mmcv.dump(coco_format_json, out_file)
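# Illustrative usage (assumed paths): converting a single nuImages split
# directly, mirroring what main() does with the CLI flags:
#
#     nuim = NuImages(dataroot='./data/nuimages', version='v1.0-mini',
#                     verbose=True, lazy=True)
#     export_nuim_to_coco(nuim, './data/nuimages/',
#                         './data/nuimages/annotations/',
#                         extra_tag='nuimages', version='v1.0-mini', nproc=4)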
def main():
args = parse_args()
for version in args.version:
nuim = NuImages(
dataroot=args.data_root, version=version, verbose=True, lazy=True)
export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag,
version, args.nproc)
if __name__ == '__main__':
main()
# ---------------------------------------------
# Copyright (c) OpenMMLab. All rights reserved.
# ---------------------------------------------
# Modified by Zhiqi Li
# ---------------------------------------------
import mmcv
import numpy as np
import os
from collections import OrderedDict
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.geometry_utils import view_points
from os import path as osp
from pyquaternion import Quaternion
from shapely.geometry import MultiPoint, box
from typing import List, Tuple, Union
from mmdet3d.core.bbox.box_np_ops import points_cam2img
from mmdet3d.datasets import NuScenesDataset
nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
nus_attributes = ('cycle.with_rider', 'cycle.without_rider',
'pedestrian.moving', 'pedestrian.standing',
'pedestrian.sitting_lying_down', 'vehicle.moving',
'vehicle.parked', 'vehicle.stopped', 'None')
def create_nuscenes_infos(root_path,
out_path,
can_bus_root_path,
info_prefix,
version='v1.0-trainval',
max_sweeps=10):
"""Create info file of nuscene dataset.
Given the raw data, generate its related info file in pkl format.
Args:
root_path (str): Path of the data root.
info_prefix (str): Prefix of the info file to be generated.
version (str): Version of the data.
Default: 'v1.0-trainval'
max_sweeps (int): Max number of sweeps.
Default: 10
"""
from nuscenes.nuscenes import NuScenes
from nuscenes.can_bus.can_bus_api import NuScenesCanBus
print(version, root_path)
nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path)
from nuscenes.utils import splits
available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini']
assert version in available_vers
if version == 'v1.0-trainval':
train_scenes = splits.train
val_scenes = splits.val
elif version == 'v1.0-test':
train_scenes = splits.test
val_scenes = []
elif version == 'v1.0-mini':
train_scenes = splits.mini_train
val_scenes = splits.mini_val
else:
raise ValueError('unknown')
# filter existing scenes.
available_scenes = get_available_scenes(nusc)
available_scene_names = [s['name'] for s in available_scenes]
train_scenes = list(
filter(lambda x: x in available_scene_names, train_scenes))
val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
train_scenes = set([
available_scenes[available_scene_names.index(s)]['token']
for s in train_scenes
])
val_scenes = set([
available_scenes[available_scene_names.index(s)]['token']
for s in val_scenes
])
test = 'test' in version
if test:
print('test scene: {}'.format(len(train_scenes)))
else:
print('train scene: {}, val scene: {}'.format(
len(train_scenes), len(val_scenes)))
train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps)
metadata = dict(version=version)
if test:
print('test sample: {}'.format(len(train_nusc_infos)))
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = osp.join(out_path,
'{}_infos_temporal_test.pkl'.format(info_prefix))
mmcv.dump(data, info_path)
else:
print('train sample: {}, val sample: {}'.format(
len(train_nusc_infos), len(val_nusc_infos)))
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = osp.join(out_path,
'{}_infos_temporal_train.pkl'.format(info_prefix))
mmcv.dump(data, info_path)
data['infos'] = val_nusc_infos
info_val_path = osp.join(out_path,
'{}_infos_temporal_val.pkl'.format(info_prefix))
mmcv.dump(data, info_val_path)
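# Illustrative usage (assumed paths): with nuScenes and its CAN bus expansion
# downloaded, the temporal train/val infos used here could be generated via:
#
#     create_nuscenes_infos('./data/nuscenes', './data/nuscenes', './data',
#                           info_prefix='nuscenes', version='v1.0-trainval',
#                           max_sweeps=10)
#
# where the third argument is the directory that contains the `can_bus`
# folder expected by NuScenesCanBus.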
def get_available_scenes(nusc):
"""Get available scenes from the input nuscenes class.
Given the raw data, get the information of available scenes for
further info generation.
Args:
nusc (class): Dataset class in the nuScenes dataset.
Returns:
available_scenes (list[dict]): List of basic information for the
available scenes.
"""
available_scenes = []
print('total scene num: {}'.format(len(nusc.scene)))
for scene in nusc.scene:
scene_token = scene['token']
scene_rec = nusc.get('scene', scene_token)
sample_rec = nusc.get('sample', scene_rec['first_sample_token'])
sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
has_more_frames = True
scene_not_exist = False
while has_more_frames:
lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token'])
lidar_path = str(lidar_path)
if os.getcwd() in lidar_path:
# path from lyftdataset is absolute path
lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1]
# relative path
if not mmcv.is_filepath(lidar_path):
scene_not_exist = True
break
else:
break
if scene_not_exist:
continue
available_scenes.append(scene)
print('exist scene num: {}'.format(len(available_scenes)))
return available_scenes
def _get_can_bus_info(nusc, nusc_can_bus, sample):
scene_name = nusc.get('scene', sample['scene_token'])['name']
sample_timestamp = sample['timestamp']
try:
pose_list = nusc_can_bus.get_messages(scene_name, 'pose')
    except Exception:
return np.zeros(18) # server scenes do not have can bus information.
can_bus = []
    # during each scene, the first timestamp of can_bus may be larger than the first sample's timestamp
last_pose = pose_list[0]
for i, pose in enumerate(pose_list):
if pose['utime'] > sample_timestamp:
break
last_pose = pose
_ = last_pose.pop('utime') # useless
pos = last_pose.pop('pos')
rotation = last_pose.pop('orientation')
can_bus.extend(pos)
can_bus.extend(rotation)
    for key in last_pose.keys():
        can_bus.extend(last_pose[key])  # 16 elements
can_bus.extend([0., 0.])
return np.array(can_bus)
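# For reference (inferred from the code above): the returned 18-dim vector
# packs the ego pose translation (3) and orientation quaternion (4), followed
# by the remaining CAN bus pose fields (presumably acceleration, rotation
# rate and velocity, 3 values each), plus two trailing zeros used as
# placeholders.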
def _fill_trainval_infos(nusc,
nusc_can_bus,
train_scenes,
val_scenes,
test=False,
max_sweeps=10):
"""Generate the train/val infos from the raw data.
Args:
nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
train_scenes (list[str]): Basic information of training scenes.
val_scenes (list[str]): Basic information of validation scenes.
        test (bool): Whether to use the test mode. In the test mode, no
annotations can be accessed. Default: False.
max_sweeps (int): Max number of sweeps. Default: 10.
Returns:
tuple[list[dict]]: Information of training set and validation set
that will be saved to the info file.
"""
train_nusc_infos = []
val_nusc_infos = []
frame_idx = 0
for sample in mmcv.track_iter_progress(nusc.sample):
lidar_token = sample['data']['LIDAR_TOP']
sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
cs_record = nusc.get('calibrated_sensor',
sd_rec['calibrated_sensor_token'])
pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
mmcv.check_file_exist(lidar_path)
can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample)
##
info = {
'lidar_path': lidar_path,
'token': sample['token'],
'prev': sample['prev'],
'next': sample['next'],
'can_bus': can_bus,
'frame_idx': frame_idx, # temporal related info
'sweeps': [],
'cams': dict(),
'scene_token': sample['scene_token'], # temporal related info
'lidar2ego_translation': cs_record['translation'],
'lidar2ego_rotation': cs_record['rotation'],
'ego2global_translation': pose_record['translation'],
'ego2global_rotation': pose_record['rotation'],
'timestamp': sample['timestamp'],
}
if sample['next'] == '':
frame_idx = 0
else:
frame_idx += 1
l2e_r = info['lidar2ego_rotation']
l2e_t = info['lidar2ego_translation']
e2g_r = info['ego2global_rotation']
e2g_t = info['ego2global_translation']
l2e_r_mat = Quaternion(l2e_r).rotation_matrix
e2g_r_mat = Quaternion(e2g_r).rotation_matrix
# obtain 6 image's information per frame
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
for cam in camera_types:
cam_token = sample['data'][cam]
cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat,
e2g_t, e2g_r_mat, cam)
cam_info.update(cam_intrinsic=cam_intrinsic)
info['cams'].update({cam: cam_info})
# obtain sweeps for a single key-frame
sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP'])
sweeps = []
while len(sweeps) < max_sweeps:
if not sd_rec['prev'] == '':
sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t,
l2e_r_mat, e2g_t, e2g_r_mat, 'lidar')
sweeps.append(sweep)
sd_rec = nusc.get('sample_data', sd_rec['prev'])
else:
break
info['sweeps'] = sweeps
# obtain annotation
if not test:
annotations = [
nusc.get('sample_annotation', token)
for token in sample['anns']
]
locs = np.array([b.center for b in boxes]).reshape(-1, 3)
dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
rots = np.array([b.orientation.yaw_pitch_roll[0]
for b in boxes]).reshape(-1, 1)
velocity = np.array(
[nusc.box_velocity(token)[:2] for token in sample['anns']])
valid_flag = np.array(
[(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0
for anno in annotations],
dtype=bool).reshape(-1)
# convert velo from global to lidar
for i in range(len(boxes)):
velo = np.array([*velocity[i], 0.0])
velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(
l2e_r_mat).T
velocity[i] = velo[:2]
names = [b.name for b in boxes]
for i in range(len(names)):
if names[i] in NuScenesDataset.NameMapping:
names[i] = NuScenesDataset.NameMapping[names[i]]
names = np.array(names)
# we need to convert rot to SECOND format.
gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1)
assert len(gt_boxes) == len(
annotations), f'{len(gt_boxes)}, {len(annotations)}'
info['gt_boxes'] = gt_boxes
info['gt_names'] = names
info['gt_velocity'] = velocity.reshape(-1, 2)
info['num_lidar_pts'] = np.array(
[a['num_lidar_pts'] for a in annotations])
info['num_radar_pts'] = np.array(
[a['num_radar_pts'] for a in annotations])
info['valid_flag'] = valid_flag
if sample['scene_token'] in train_scenes:
train_nusc_infos.append(info)
else:
val_nusc_infos.append(info)
return train_nusc_infos, val_nusc_infos
def obtain_sensor2top(nusc,
sensor_token,
l2e_t,
l2e_r_mat,
e2g_t,
e2g_r_mat,
sensor_type='lidar'):
"""Obtain the info with RT matric from general sensor to Top LiDAR.
Args:
nusc (class): Dataset class in the nuScenes dataset.
sensor_token (str): Sample data token corresponding to the
specific sensor type.
l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
in shape (3, 3).
e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
e2g_r_mat (np.ndarray): Rotation matrix from ego to global
in shape (3, 3).
sensor_type (str): Sensor to calibrate. Default: 'lidar'.
Returns:
sweep (dict): Sweep information after transformation.
"""
sd_rec = nusc.get('sample_data', sensor_token)
cs_record = nusc.get('calibrated_sensor',
sd_rec['calibrated_sensor_token'])
pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token'])
data_path = str(nusc.get_sample_data_path(sd_rec['token']))
if os.getcwd() in data_path: # path from lyftdataset is absolute path
data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path
sweep = {
'data_path': data_path,
'type': sensor_type,
'sample_data_token': sd_rec['token'],
'sensor2ego_translation': cs_record['translation'],
'sensor2ego_rotation': cs_record['rotation'],
'ego2global_translation': pose_record['translation'],
'ego2global_rotation': pose_record['rotation'],
'timestamp': sd_rec['timestamp']
}
l2e_r_s = sweep['sensor2ego_rotation']
l2e_t_s = sweep['sensor2ego_translation']
e2g_r_s = sweep['ego2global_rotation']
e2g_t_s = sweep['ego2global_translation']
# obtain the RT from sensor to Top LiDAR
# sweep->ego->global->ego'->lidar
l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
) + l2e_t @ np.linalg.inv(l2e_r_mat).T
sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T
sweep['sensor2lidar_translation'] = T
return sweep
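# A minimal illustrative sketch (not part of the original converter and not
# called anywhere): it shows how the 'sensor2lidar_rotation' /
# 'sensor2lidar_translation' pair returned above is assumed to be consumed
# downstream, i.e. sensor-frame points are mapped into the key-frame
# top-LiDAR frame by multiplying with the transpose of the stored rotation
# and adding the translation. The function name is hypothetical.
def _example_apply_sensor2lidar(points, sweep):
    """Sketch: map (N, 3) sensor-frame points into the top-LiDAR frame."""
    return points @ sweep['sensor2lidar_rotation'].T + \
        sweep['sensor2lidar_translation']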
def export_2d_annotation(root_path, info_path, version, mono3d=True):
"""Export 2d annotation from the info file and raw data.
Args:
root_path (str): Root path of the raw data.
info_path (str): Path of the info file.
version (str): Dataset version.
mono3d (bool): Whether to export mono3d annotation. Default: True.
"""
# get bbox annotations for camera
camera_types = [
'CAM_FRONT',
'CAM_FRONT_RIGHT',
'CAM_FRONT_LEFT',
'CAM_BACK',
'CAM_BACK_LEFT',
'CAM_BACK_RIGHT',
]
nusc_infos = mmcv.load(info_path)['infos']
nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
# info_2d_list = []
cat2Ids = [
dict(id=nus_categories.index(cat_name), name=cat_name)
for cat_name in nus_categories
]
coco_ann_id = 0
coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids)
for info in mmcv.track_iter_progress(nusc_infos):
for cam in camera_types:
cam_info = info['cams'][cam]
coco_infos = get_2d_boxes(
nusc,
cam_info['sample_data_token'],
visibilities=['', '1', '2', '3', '4'],
mono3d=mono3d)
(height, width, _) = mmcv.imread(cam_info['data_path']).shape
coco_2d_dict['images'].append(
dict(
file_name=cam_info['data_path'].split('data/nuscenes/')
[-1],
id=cam_info['sample_data_token'],
token=info['token'],
cam2ego_rotation=cam_info['sensor2ego_rotation'],
cam2ego_translation=cam_info['sensor2ego_translation'],
ego2global_rotation=info['ego2global_rotation'],
ego2global_translation=info['ego2global_translation'],
cam_intrinsic=cam_info['cam_intrinsic'],
width=width,
height=height))
for coco_info in coco_infos:
if coco_info is None:
continue
# add an empty key for coco format
coco_info['segmentation'] = []
coco_info['id'] = coco_ann_id
coco_2d_dict['annotations'].append(coco_info)
coco_ann_id += 1
if mono3d:
json_prefix = f'{info_path[:-4]}_mono3d'
else:
json_prefix = f'{info_path[:-4]}'
mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json')
def get_2d_boxes(nusc,
sample_data_token: str,
visibilities: List[str],
mono3d=True):
"""Get the 2D annotation records for a given `sample_data_token`.
    Args:
        nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
        sample_data_token (str): Sample data token belonging to a camera
            keyframe.
visibilities (list[str]): Visibility filter.
mono3d (bool): Whether to get boxes with mono3d annotation.
Return:
list[dict]: List of 2D annotation record that belongs to the input
`sample_data_token`.
"""
# Get the sample data and the sample corresponding to that sample data.
sd_rec = nusc.get('sample_data', sample_data_token)
assert sd_rec[
'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \
' for camera sample_data!'
if not sd_rec['is_key_frame']:
raise ValueError(
'The 2D re-projections are available only for keyframes.')
s_rec = nusc.get('sample', sd_rec['sample_token'])
# Get the calibrated sensor and ego pose
# record to get the transformation matrices.
cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token'])
pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token'])
camera_intrinsic = np.array(cs_rec['camera_intrinsic'])
# Get all the annotation with the specified visibilties.
ann_recs = [
nusc.get('sample_annotation', token) for token in s_rec['anns']
]
ann_recs = [
ann_rec for ann_rec in ann_recs
if (ann_rec['visibility_token'] in visibilities)
]
repro_recs = []
for ann_rec in ann_recs:
# Augment sample_annotation with token information.
ann_rec['sample_annotation_token'] = ann_rec['token']
ann_rec['sample_data_token'] = sample_data_token
# Get the box in global coordinates.
box = nusc.get_box(ann_rec['token'])
# Move them to the ego-pose frame.
box.translate(-np.array(pose_rec['translation']))
box.rotate(Quaternion(pose_rec['rotation']).inverse)
# Move them to the calibrated sensor frame.
box.translate(-np.array(cs_rec['translation']))
box.rotate(Quaternion(cs_rec['rotation']).inverse)
# Filter out the corners that are not in front of the calibrated
# sensor.
corners_3d = box.corners()
in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
corners_3d = corners_3d[:, in_front]
# Project 3d box to 2d.
corner_coords = view_points(corners_3d, camera_intrinsic,
True).T[:, :2].tolist()
# Keep only corners that fall within the image.
final_coords = post_process_coords(corner_coords)
# Skip if the convex hull of the re-projected corners
# does not intersect the image canvas.
if final_coords is None:
continue
else:
min_x, min_y, max_x, max_y = final_coords
# Generate dictionary record to be included in the .json file.
repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y,
sample_data_token, sd_rec['filename'])
# If mono3d=True, add 3D annotations in camera coordinates
if mono3d and (repro_rec is not None):
loc = box.center.tolist()
dim = box.wlh
dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw
dim = dim.tolist()
rot = box.orientation.yaw_pitch_roll[0]
rot = [-rot] # convert the rot to our cam coordinate
global_velo2d = nusc.box_velocity(box.token)[:2]
global_velo3d = np.array([*global_velo2d, 0.0])
e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix
c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix
cam_velo3d = global_velo3d @ np.linalg.inv(
e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T
velo = cam_velo3d[0::2].tolist()
repro_rec['bbox_cam3d'] = loc + dim + rot
repro_rec['velo_cam3d'] = velo
center3d = np.array(loc).reshape([1, 3])
center2d = points_cam2img(
center3d, camera_intrinsic, with_depth=True)
repro_rec['center2d'] = center2d.squeeze().tolist()
            # projected 2D center (in image coordinates) + depth
            # samples with depth <= 0 are removed
if repro_rec['center2d'][2] <= 0:
continue
ann_token = nusc.get('sample_annotation',
box.token)['attribute_tokens']
if len(ann_token) == 0:
attr_name = 'None'
else:
attr_name = nusc.get('attribute', ann_token[0])['name']
attr_id = nus_attributes.index(attr_name)
repro_rec['attribute_name'] = attr_name
repro_rec['attribute_id'] = attr_id
repro_recs.append(repro_rec)
return repro_recs
def post_process_coords(
corner_coords: List, imsize: Tuple[int, int] = (1600, 900)
) -> Union[Tuple[float, float, float, float], None]:
"""Get the intersection of the convex hull of the reprojected bbox corners
and the image canvas, return None if no intersection.
Args:
corner_coords (list[int]): Corner coordinates of reprojected
bounding box.
imsize (tuple[int]): Size of the image canvas.
Return:
tuple [float]: Intersection of the convex hull of the 2D box
corners and the image canvas.
"""
polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
img_canvas = box(0, 0, imsize[0], imsize[1])
if polygon_from_2d_box.intersects(img_canvas):
img_intersection = polygon_from_2d_box.intersection(img_canvas)
intersection_coords = np.array(
[coord for coord in img_intersection.exterior.coords])
min_x = min(intersection_coords[:, 0])
min_y = min(intersection_coords[:, 1])
max_x = max(intersection_coords[:, 0])
max_y = max(intersection_coords[:, 1])
return min_x, min_y, max_x, max_y
else:
return None
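# Illustrative example (added for clarity, not executed by the converter):
# corners whose convex hull only partially overlaps the default 1600x900
# canvas are clipped to it, e.g.
# post_process_coords([[-50., 100.], [200., 100.], [200., 400.], [-50., 400.]])
# returns (0.0, 100.0, 200.0, 400.0).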
def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float,
sample_data_token: str, filename: str) -> OrderedDict:
"""Generate one 2D annotation record given various informations on top of
the 2D bounding box coordinates.
Args:
ann_rec (dict): Original 3d annotation record.
x1 (float): Minimum value of the x coordinate.
y1 (float): Minimum value of the y coordinate.
x2 (float): Maximum value of the x coordinate.
y2 (float): Maximum value of the y coordinate.
sample_data_token (str): Sample data token.
filename (str):The corresponding image file where the annotation
is present.
Returns:
dict: A sample 2D annotation record.
        - file_name (str): file name
- image_id (str): sample data token
- area (float): 2d box area
- category_name (str): category name
- category_id (int): category id
- bbox (list[float]): left x, top y, dx, dy of 2d box
- iscrowd (int): whether the area is crowd
"""
repro_rec = OrderedDict()
repro_rec['sample_data_token'] = sample_data_token
coco_rec = dict()
relevant_keys = [
'attribute_tokens',
'category_name',
'instance_token',
'next',
'num_lidar_pts',
'num_radar_pts',
'prev',
'sample_annotation_token',
'sample_data_token',
'visibility_token',
]
for key, value in ann_rec.items():
if key in relevant_keys:
repro_rec[key] = value
repro_rec['bbox_corners'] = [x1, y1, x2, y2]
repro_rec['filename'] = filename
coco_rec['file_name'] = filename
coco_rec['image_id'] = sample_data_token
coco_rec['area'] = (y2 - y1) * (x2 - x1)
if repro_rec['category_name'] not in NuScenesDataset.NameMapping:
return None
cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']]
coco_rec['category_name'] = cat_name
coco_rec['category_id'] = nus_categories.index(cat_name)
coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1]
coco_rec['iscrowd'] = 0
return coco_rec
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
import os
from concurrent import futures as futures
from os import path as osp
class S3DISData(object):
"""S3DIS data.
Generate s3dis infos for s3dis_converter.
Args:
root_path (str): Root path of the raw data.
split (str): Set split type of the data. Default: 'Area_1'.
"""
def __init__(self, root_path, split='Area_1'):
self.root_dir = root_path
self.split = split
self.data_dir = osp.join(root_path,
'Stanford3dDataset_v1.2_Aligned_Version')
# Following `GSDN <https://arxiv.org/abs/2006.12356>`_, use 5 furniture
# classes for detection: table, chair, sofa, bookcase, board.
self.cat_ids = np.array([7, 8, 9, 10, 11])
self.cat_ids2class = {
cat_id: i
for i, cat_id in enumerate(list(self.cat_ids))
}
assert split in [
'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6'
]
        # keep only room directories (e.g. conferenceRoom_1) and drop plain
        # files; building a new list avoids mutating the list while iterating
        self.sample_id_list = [
            sample_id
            for sample_id in os.listdir(osp.join(self.data_dir, split))
            if not os.path.isfile(osp.join(self.data_dir, split, sample_id))
        ]
def __len__(self):
return len(self.sample_id_list)
def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
"""Get data infos.
This method gets information from the raw data.
Args:
num_workers (int): Number of threads to be used. Default: 4.
has_label (bool): Whether the data has label. Default: True.
sample_id_list (list[int]): Index list of the sample.
Default: None.
Returns:
infos (list[dict]): Information of the raw data.
"""
def process_single_scene(sample_idx):
print(f'{self.split} sample_idx: {sample_idx}')
info = dict()
pc_info = {
'num_features': 6,
'lidar_idx': f'{self.split}_{sample_idx}'
}
info['point_cloud'] = pc_info
pts_filename = osp.join(self.root_dir, 's3dis_data',
f'{self.split}_{sample_idx}_point.npy')
pts_instance_mask_path = osp.join(
self.root_dir, 's3dis_data',
f'{self.split}_{sample_idx}_ins_label.npy')
pts_semantic_mask_path = osp.join(
self.root_dir, 's3dis_data',
f'{self.split}_{sample_idx}_sem_label.npy')
points = np.load(pts_filename).astype(np.float32)
pts_instance_mask = np.load(pts_instance_mask_path).astype(np.int)
pts_semantic_mask = np.load(pts_semantic_mask_path).astype(np.int)
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
points.tofile(
osp.join(self.root_dir, 'points',
f'{self.split}_{sample_idx}.bin'))
pts_instance_mask.tofile(
osp.join(self.root_dir, 'instance_mask',
f'{self.split}_{sample_idx}.bin'))
pts_semantic_mask.tofile(
osp.join(self.root_dir, 'semantic_mask',
f'{self.split}_{sample_idx}.bin'))
info['pts_path'] = osp.join('points',
f'{self.split}_{sample_idx}.bin')
info['pts_instance_mask_path'] = osp.join(
'instance_mask', f'{self.split}_{sample_idx}.bin')
info['pts_semantic_mask_path'] = osp.join(
'semantic_mask', f'{self.split}_{sample_idx}.bin')
info['annos'] = self.get_bboxes(points, pts_instance_mask,
pts_semantic_mask)
return info
sample_id_list = sample_id_list if sample_id_list is not None \
else self.sample_id_list
with futures.ThreadPoolExecutor(num_workers) as executor:
infos = executor.map(process_single_scene, sample_id_list)
return list(infos)
def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask):
"""Convert instance masks to axis-aligned bounding boxes.
Args:
points (np.array): Scene points of shape (n, 6).
pts_instance_mask (np.ndarray): Instance labels of shape (n,).
pts_semantic_mask (np.ndarray): Semantic labels of shape (n,).
Returns:
dict: A dict containing detection infos with following keys:
- gt_boxes_upright_depth (np.ndarray): Bounding boxes
of shape (n, 6)
- class (np.ndarray): Box labels of shape (n,)
- gt_num (int): Number of boxes.
"""
bboxes, labels = [], []
for i in range(1, pts_instance_mask.max()):
ids = pts_instance_mask == i
# check if all instance points have same semantic label
assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max()
label = pts_semantic_mask[ids][0]
# keep only furniture objects
if label in self.cat_ids2class:
labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]])
pts = points[:, :3][ids]
min_pts = pts.min(axis=0)
max_pts = pts.max(axis=0)
locations = (min_pts + max_pts) / 2
dimensions = max_pts - min_pts
bboxes.append(np.concatenate((locations, dimensions)))
annotation = dict()
# follow ScanNet and SUN RGB-D keys
annotation['gt_boxes_upright_depth'] = np.array(bboxes)
annotation['class'] = np.array(labels)
annotation['gt_num'] = len(labels)
return annotation
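# Worked toy example (added for clarity, not executed): an instance whose
# points span [0, 0, 0] to [2, 4, 1] yields location (1, 2, 0.5), the box
# centre, and dimensions (2, 4, 1), the axis-aligned extents, so the stored
# box is [1, 2, 0.5, 2, 4, 1].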
class S3DISSegData(object):
"""S3DIS dataset used to generate infos for semantic segmentation task.
Args:
data_root (str): Root path of the raw data.
        ann_file (str): The generated s3dis infos.
        split (str): Set split type of the data. Default: 'Area_1'.
        num_points (int): Number of points in each data input. Default: 4096.
label_weight_func (function): Function to compute the label weight.
Default: None.
"""
def __init__(self,
data_root,
ann_file,
split='Area_1',
num_points=4096,
label_weight_func=None):
self.data_root = data_root
self.data_infos = mmcv.load(ann_file)
self.split = split
self.num_points = num_points
self.all_ids = np.arange(13) # all possible ids
self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12]) # used for seg task
self.ignore_index = len(self.cat_ids)
self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \
self.ignore_index
for i, cat_id in enumerate(self.cat_ids):
self.cat_id2class[cat_id] = i
# label weighting function is taken from
# https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
label_weight_func is None else label_weight_func
def get_seg_infos(self):
scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
save_folder = osp.join(self.data_root, 'seg_info')
mmcv.mkdir_or_exist(save_folder)
np.save(
osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
scene_idxs)
np.save(
osp.join(save_folder, f'{self.split}_label_weight.npy'),
label_weight)
print(f'{self.split} resampled scene index and label weight saved')
def _convert_to_label(self, mask):
"""Convert class_id in loaded segmentation mask to label."""
if isinstance(mask, str):
if mask.endswith('npy'):
mask = np.load(mask)
else:
mask = np.fromfile(mask, dtype=np.long)
label = self.cat_id2class[mask]
return label
def get_scene_idxs_and_label_weight(self):
"""Compute scene_idxs for data sampling and label weight for loss \
calculation.
We sample more times for scenes with more points. Label_weight is
inversely proportional to number of class points.
"""
num_classes = len(self.cat_ids)
num_point_all = []
label_weight = np.zeros((num_classes + 1, )) # ignore_index
for data_info in self.data_infos:
label = self._convert_to_label(
osp.join(self.data_root, data_info['pts_semantic_mask_path']))
num_point_all.append(label.shape[0])
class_count, _ = np.histogram(label, range(num_classes + 2))
label_weight += class_count
# repeat scene_idx for num_scene_point // num_sample_point times
sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
num_iter = int(np.sum(num_point_all) / float(self.num_points))
scene_idxs = []
for idx in range(len(self.data_infos)):
scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
scene_idxs = np.array(scene_idxs).astype(np.int32)
# calculate label weight, adopted from PointNet++
label_weight = label_weight[:-1].astype(np.float32)
label_weight = label_weight / label_weight.sum()
label_weight = self.label_weight_func(label_weight).astype(np.float32)
return scene_idxs, label_weight
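# Worked toy example (added for clarity, not executed): with num_points=4096
# and two rooms containing 4096 and 12288 points, num_iter = 16384 / 4096 = 4
# and sample_prob = [0.25, 0.75], so scene_idxs becomes [0, 1, 1, 1]; the
# larger room is visited three times as often. The per-class weights are the
# normalised point frequencies passed through 1 / log(1.2 + x), so rare
# classes receive larger weights.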
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
import os
from concurrent import futures as futures
from os import path as osp
class ScanNetData(object):
"""ScanNet data.
Generate scannet infos for scannet_converter.
Args:
root_path (str): Root path of the raw data.
split (str): Set split type of the data. Default: 'train'.
"""
def __init__(self, root_path, split='train'):
self.root_dir = root_path
self.split = split
self.split_dir = osp.join(root_path)
self.classes = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin'
]
self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
self.label2cat = {self.cat2label[t]: t for t in self.cat2label}
self.cat_ids = np.array(
[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39])
self.cat_ids2class = {
nyu40id: i
for i, nyu40id in enumerate(list(self.cat_ids))
}
assert split in ['train', 'val', 'test']
split_file = osp.join(self.root_dir, 'meta_data',
f'scannetv2_{split}.txt')
mmcv.check_file_exist(split_file)
self.sample_id_list = mmcv.list_from_file(split_file)
self.test_mode = (split == 'test')
def __len__(self):
return len(self.sample_id_list)
def get_aligned_box_label(self, idx):
box_file = osp.join(self.root_dir, 'scannet_instance_data',
f'{idx}_aligned_bbox.npy')
mmcv.check_file_exist(box_file)
return np.load(box_file)
def get_unaligned_box_label(self, idx):
box_file = osp.join(self.root_dir, 'scannet_instance_data',
f'{idx}_unaligned_bbox.npy')
mmcv.check_file_exist(box_file)
return np.load(box_file)
def get_axis_align_matrix(self, idx):
matrix_file = osp.join(self.root_dir, 'scannet_instance_data',
f'{idx}_axis_align_matrix.npy')
mmcv.check_file_exist(matrix_file)
return np.load(matrix_file)
def get_images(self, idx):
paths = []
path = osp.join(self.root_dir, 'posed_images', idx)
for file in sorted(os.listdir(path)):
if file.endswith('.jpg'):
paths.append(osp.join('posed_images', idx, file))
return paths
def get_extrinsics(self, idx):
extrinsics = []
path = osp.join(self.root_dir, 'posed_images', idx)
for file in sorted(os.listdir(path)):
if file.endswith('.txt') and not file == 'intrinsic.txt':
extrinsics.append(np.loadtxt(osp.join(path, file)))
return extrinsics
def get_intrinsics(self, idx):
matrix_file = osp.join(self.root_dir, 'posed_images', idx,
'intrinsic.txt')
mmcv.check_file_exist(matrix_file)
return np.loadtxt(matrix_file)
def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
"""Get data infos.
This method gets information from the raw data.
Args:
num_workers (int): Number of threads to be used. Default: 4.
has_label (bool): Whether the data has label. Default: True.
sample_id_list (list[int]): Index list of the sample.
Default: None.
Returns:
infos (list[dict]): Information of the raw data.
"""
def process_single_scene(sample_idx):
print(f'{self.split} sample_idx: {sample_idx}')
info = dict()
pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
info['point_cloud'] = pc_info
pts_filename = osp.join(self.root_dir, 'scannet_instance_data',
f'{sample_idx}_vert.npy')
points = np.load(pts_filename)
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
points.tofile(
osp.join(self.root_dir, 'points', f'{sample_idx}.bin'))
info['pts_path'] = osp.join('points', f'{sample_idx}.bin')
# update with RGB image paths if exist
if os.path.exists(osp.join(self.root_dir, 'posed_images')):
info['intrinsics'] = self.get_intrinsics(sample_idx)
all_extrinsics = self.get_extrinsics(sample_idx)
all_img_paths = self.get_images(sample_idx)
# some poses in ScanNet are invalid
extrinsics, img_paths = [], []
for extrinsic, img_path in zip(all_extrinsics, all_img_paths):
if np.all(np.isfinite(extrinsic)):
img_paths.append(img_path)
extrinsics.append(extrinsic)
info['extrinsics'] = extrinsics
info['img_paths'] = img_paths
if not self.test_mode:
pts_instance_mask_path = osp.join(
self.root_dir, 'scannet_instance_data',
f'{sample_idx}_ins_label.npy')
pts_semantic_mask_path = osp.join(
self.root_dir, 'scannet_instance_data',
f'{sample_idx}_sem_label.npy')
pts_instance_mask = np.load(pts_instance_mask_path).astype(
np.long)
pts_semantic_mask = np.load(pts_semantic_mask_path).astype(
np.long)
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask'))
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask'))
pts_instance_mask.tofile(
osp.join(self.root_dir, 'instance_mask',
f'{sample_idx}.bin'))
pts_semantic_mask.tofile(
osp.join(self.root_dir, 'semantic_mask',
f'{sample_idx}.bin'))
info['pts_instance_mask_path'] = osp.join(
'instance_mask', f'{sample_idx}.bin')
info['pts_semantic_mask_path'] = osp.join(
'semantic_mask', f'{sample_idx}.bin')
if has_label:
annotations = {}
                # box arrays have shape (k, 7); the last column is the class id
aligned_box_label = self.get_aligned_box_label(sample_idx)
unaligned_box_label = self.get_unaligned_box_label(sample_idx)
annotations['gt_num'] = aligned_box_label.shape[0]
if annotations['gt_num'] != 0:
aligned_box = aligned_box_label[:, :-1] # k, 6
unaligned_box = unaligned_box_label[:, :-1]
classes = aligned_box_label[:, -1] # k
annotations['name'] = np.array([
self.label2cat[self.cat_ids2class[classes[i]]]
for i in range(annotations['gt_num'])
])
# default names are given to aligned bbox for compatibility
# we also save unaligned bbox info with marked names
annotations['location'] = aligned_box[:, :3]
annotations['dimensions'] = aligned_box[:, 3:6]
annotations['gt_boxes_upright_depth'] = aligned_box
annotations['unaligned_location'] = unaligned_box[:, :3]
annotations['unaligned_dimensions'] = unaligned_box[:, 3:6]
annotations[
'unaligned_gt_boxes_upright_depth'] = unaligned_box
annotations['index'] = np.arange(
annotations['gt_num'], dtype=np.int32)
annotations['class'] = np.array([
self.cat_ids2class[classes[i]]
for i in range(annotations['gt_num'])
])
axis_align_matrix = self.get_axis_align_matrix(sample_idx)
annotations['axis_align_matrix'] = axis_align_matrix # 4x4
info['annos'] = annotations
return info
sample_id_list = sample_id_list if sample_id_list is not None \
else self.sample_id_list
with futures.ThreadPoolExecutor(num_workers) as executor:
infos = executor.map(process_single_scene, sample_id_list)
return list(infos)
class ScanNetSegData(object):
"""ScanNet dataset used to generate infos for semantic segmentation task.
Args:
data_root (str): Root path of the raw data.
ann_file (str): The generated scannet infos.
split (str): Set split type of the data. Default: 'train'.
num_points (int): Number of points in each data input. Default: 8192.
label_weight_func (function): Function to compute the label weight.
Default: None.
"""
def __init__(self,
data_root,
ann_file,
split='train',
num_points=8192,
label_weight_func=None):
self.data_root = data_root
self.data_infos = mmcv.load(ann_file)
self.split = split
assert split in ['train', 'val', 'test']
self.num_points = num_points
self.all_ids = np.arange(41) # all possible ids
self.cat_ids = np.array([
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36,
39
]) # used for seg task
self.ignore_index = len(self.cat_ids)
self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \
self.ignore_index
for i, cat_id in enumerate(self.cat_ids):
self.cat_id2class[cat_id] = i
# label weighting function is taken from
# https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24
self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \
label_weight_func is None else label_weight_func
def get_seg_infos(self):
if self.split == 'test':
return
scene_idxs, label_weight = self.get_scene_idxs_and_label_weight()
save_folder = osp.join(self.data_root, 'seg_info')
mmcv.mkdir_or_exist(save_folder)
np.save(
osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'),
scene_idxs)
np.save(
osp.join(save_folder, f'{self.split}_label_weight.npy'),
label_weight)
print(f'{self.split} resampled scene index and label weight saved')
def _convert_to_label(self, mask):
"""Convert class_id in loaded segmentation mask to label."""
if isinstance(mask, str):
if mask.endswith('npy'):
mask = np.load(mask)
else:
mask = np.fromfile(mask, dtype=np.long)
label = self.cat_id2class[mask]
return label
def get_scene_idxs_and_label_weight(self):
"""Compute scene_idxs for data sampling and label weight for loss \
calculation.
We sample more times for scenes with more points. Label_weight is
inversely proportional to number of class points.
"""
num_classes = len(self.cat_ids)
num_point_all = []
label_weight = np.zeros((num_classes + 1, )) # ignore_index
for data_info in self.data_infos:
label = self._convert_to_label(
osp.join(self.data_root, data_info['pts_semantic_mask_path']))
num_point_all.append(label.shape[0])
class_count, _ = np.histogram(label, range(num_classes + 2))
label_weight += class_count
# repeat scene_idx for num_scene_point // num_sample_point times
sample_prob = np.array(num_point_all) / float(np.sum(num_point_all))
num_iter = int(np.sum(num_point_all) / float(self.num_points))
scene_idxs = []
for idx in range(len(self.data_infos)):
scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter)))
scene_idxs = np.array(scene_idxs).astype(np.int32)
# calculate label weight, adopted from PointNet++
label_weight = label_weight[:-1].astype(np.float32)
label_weight = label_weight / label_weight.sum()
label_weight = self.label_weight_func(label_weight).astype(np.float32)
return scene_idxs, label_weight
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
from concurrent import futures as futures
from os import path as osp
from scipy import io as sio
def random_sampling(points, num_points, replace=None, return_choices=False):
"""Random sampling.
Sampling point cloud to a certain number of points.
Args:
points (ndarray): Point cloud.
num_points (int): The number of samples.
        replace (bool): Whether to sample with replacement. Default: None,
            which enables replacement only when the cloud has fewer points
            than ``num_points``.
return_choices (bool): Whether to return choices.
Returns:
points (ndarray): Point cloud after sampling.
"""
if replace is None:
replace = (points.shape[0] < num_points)
choices = np.random.choice(points.shape[0], num_points, replace=replace)
if return_choices:
return points[choices], choices
else:
return points[choices]
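# Illustrative note (added for clarity): random_sampling draws exactly
# num_points indices, e.g. a (120000, 6) depth cloud subsampled to 50000
# points keeps shape (50000, 6); when the input has fewer points than
# requested, `replace` defaults to True so points are repeated rather than
# raising an error.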
class SUNRGBDInstance(object):
def __init__(self, line):
data = line.split(' ')
data[1:] = [float(x) for x in data[1:]]
self.classname = data[0]
self.xmin = data[1]
self.ymin = data[2]
self.xmax = data[1] + data[3]
self.ymax = data[2] + data[4]
self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax])
self.centroid = np.array([data[5], data[6], data[7]])
self.w = data[8]
self.l = data[9] # noqa: E741
self.h = data[10]
self.orientation = np.zeros((3, ))
self.orientation[0] = data[11]
self.orientation[1] = data[12]
self.heading_angle = -1 * np.arctan2(self.orientation[1],
self.orientation[0])
self.box3d = np.concatenate([
self.centroid,
np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle])
])
class SUNRGBDData(object):
"""SUNRGBD data.
    Generate sunrgbd infos for sunrgbd_converter.
Args:
root_path (str): Root path of the raw data.
split (str): Set split type of the data. Default: 'train'.
use_v1 (bool): Whether to use v1. Default: False.
"""
def __init__(self, root_path, split='train', use_v1=False):
self.root_dir = root_path
self.split = split
self.split_dir = osp.join(root_path, 'sunrgbd_trainval')
self.classes = [
'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub'
]
self.cat2label = {cat: self.classes.index(cat) for cat in self.classes}
self.label2cat = {
label: self.classes[label]
for label in range(len(self.classes))
}
assert split in ['train', 'val', 'test']
split_file = osp.join(self.split_dir, f'{split}_data_idx.txt')
mmcv.check_file_exist(split_file)
        # use a list so that __len__ works and the ids can be iterated repeatedly
        self.sample_id_list = list(map(int, mmcv.list_from_file(split_file)))
self.image_dir = osp.join(self.split_dir, 'image')
self.calib_dir = osp.join(self.split_dir, 'calib')
self.depth_dir = osp.join(self.split_dir, 'depth')
if use_v1:
self.label_dir = osp.join(self.split_dir, 'label_v1')
else:
self.label_dir = osp.join(self.split_dir, 'label')
def __len__(self):
return len(self.sample_id_list)
def get_image(self, idx):
img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg')
return mmcv.imread(img_filename)
def get_image_shape(self, idx):
image = self.get_image(idx)
return np.array(image.shape[:2], dtype=np.int32)
def get_depth(self, idx):
depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat')
depth = sio.loadmat(depth_filename)['instance']
return depth
def get_calibration(self, idx):
calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt')
lines = [line.rstrip() for line in open(calib_filepath)]
Rt = np.array([float(x) for x in lines[0].split(' ')])
Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32)
K = np.array([float(x) for x in lines[1].split(' ')])
K = np.reshape(K, (3, 3), order='F').astype(np.float32)
return K, Rt
def get_label_objects(self, idx):
label_filename = osp.join(self.label_dir, f'{idx:06d}.txt')
lines = [line.rstrip() for line in open(label_filename)]
objects = [SUNRGBDInstance(line) for line in lines]
return objects
def get_infos(self, num_workers=4, has_label=True, sample_id_list=None):
"""Get data infos.
This method gets information from the raw data.
Args:
num_workers (int): Number of threads to be used. Default: 4.
has_label (bool): Whether the data has label. Default: True.
sample_id_list (list[int]): Index list of the sample.
Default: None.
Returns:
infos (list[dict]): Information of the raw data.
"""
def process_single_scene(sample_idx):
print(f'{self.split} sample_idx: {sample_idx}')
# convert depth to points
SAMPLE_NUM = 50000
# TODO: Check whether can move the point
# sampling process during training.
pc_upright_depth = self.get_depth(sample_idx)
pc_upright_depth_subsampled = random_sampling(
pc_upright_depth, SAMPLE_NUM)
info = dict()
pc_info = {'num_features': 6, 'lidar_idx': sample_idx}
info['point_cloud'] = pc_info
mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points'))
pc_upright_depth_subsampled.tofile(
osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin'))
info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin')
img_path = osp.join('image', f'{sample_idx:06d}.jpg')
image_info = {
'image_idx': sample_idx,
'image_shape': self.get_image_shape(sample_idx),
'image_path': img_path
}
info['image'] = image_info
K, Rt = self.get_calibration(sample_idx)
calib_info = {'K': K, 'Rt': Rt}
info['calib'] = calib_info
if has_label:
obj_list = self.get_label_objects(sample_idx)
annotations = {}
annotations['gt_num'] = len([
obj.classname for obj in obj_list
if obj.classname in self.cat2label.keys()
])
if annotations['gt_num'] != 0:
annotations['name'] = np.array([
obj.classname for obj in obj_list
if obj.classname in self.cat2label.keys()
])
annotations['bbox'] = np.concatenate([
obj.box2d.reshape(1, 4) for obj in obj_list
if obj.classname in self.cat2label.keys()
],
axis=0)
annotations['location'] = np.concatenate([
obj.centroid.reshape(1, 3) for obj in obj_list
if obj.classname in self.cat2label.keys()
],
axis=0)
annotations['dimensions'] = 2 * np.array([
[obj.l, obj.w, obj.h] for obj in obj_list
if obj.classname in self.cat2label.keys()
]) # lwh (depth) format
annotations['rotation_y'] = np.array([
obj.heading_angle for obj in obj_list
if obj.classname in self.cat2label.keys()
])
annotations['index'] = np.arange(
len(obj_list), dtype=np.int32)
annotations['class'] = np.array([
self.cat2label[obj.classname] for obj in obj_list
if obj.classname in self.cat2label.keys()
])
annotations['gt_boxes_upright_depth'] = np.stack(
[
obj.box3d for obj in obj_list
if obj.classname in self.cat2label.keys()
],
axis=0) # (K,8)
info['annos'] = annotations
return info
sample_id_list = sample_id_list if \
sample_id_list is not None else self.sample_id_list
with futures.ThreadPoolExecutor(num_workers) as executor:
infos = executor.map(process_single_scene, sample_id_list)
return list(infos)
# Copyright (c) OpenMMLab. All rights reserved.
r"""Adapted from `Waymo to KITTI converter
<https://github.com/caizhongang/waymo_kitti_converter>`_.
"""
try:
from waymo_open_dataset import dataset_pb2
except ImportError:
raise ImportError(
'Please run "pip install waymo-open-dataset-tf-2-2-0==1.2.0" '
'to install the official devkit first.')
import mmcv
import numpy as np
import tensorflow as tf
from glob import glob
from os.path import join
from waymo_open_dataset.utils import range_image_utils, transform_utils
from waymo_open_dataset.utils.frame_utils import \
parse_range_image_and_camera_projection
class Waymo2KITTI(object):
"""Waymo to KITTI converter.
This class serves as the converter to change the waymo raw data to KITTI
format.
Args:
load_dir (str): Directory to load waymo raw data.
save_dir (str): Directory to save data in KITTI format.
prefix (str): Prefix of filename. In general, 0 for training, 1 for
validation and 2 for testing.
        workers (int): Number of workers for the parallel process.
test_mode (bool): Whether in the test_mode. Default: False.
"""
def __init__(self,
load_dir,
save_dir,
prefix,
workers=64,
test_mode=False):
self.filter_empty_3dboxes = True
self.filter_no_label_zone_points = True
self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST']
# Only data collected in specific locations will be converted
# If set None, this filter is disabled
# Available options: location_sf (main dataset)
self.selected_waymo_locations = None
self.save_track_id = False
# turn on eager execution for older tensorflow versions
if int(tf.__version__.split('.')[0]) < 2:
tf.enable_eager_execution()
self.lidar_list = [
'_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT',
'_SIDE_LEFT'
]
self.type_list = [
'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST'
]
self.waymo_to_kitti_class_map = {
'UNKNOWN': 'DontCare',
'PEDESTRIAN': 'Pedestrian',
'VEHICLE': 'Car',
'CYCLIST': 'Cyclist',
'SIGN': 'Sign' # not in kitti
}
self.load_dir = load_dir
self.save_dir = save_dir
self.prefix = prefix
self.workers = int(workers)
self.test_mode = test_mode
self.tfrecord_pathnames = sorted(
glob(join(self.load_dir, '*.tfrecord')))
self.label_save_dir = f'{self.save_dir}/label_'
self.label_all_save_dir = f'{self.save_dir}/label_all'
self.image_save_dir = f'{self.save_dir}/image_'
self.calib_save_dir = f'{self.save_dir}/calib'
self.point_cloud_save_dir = f'{self.save_dir}/velodyne'
self.pose_save_dir = f'{self.save_dir}/pose'
self.create_folder()
def convert(self):
"""Convert action."""
print('Start converting ...')
mmcv.track_parallel_progress(self.convert_one, range(len(self)),
self.workers)
print('\nFinished ...')
def convert_one(self, file_idx):
"""Convert action for single file.
Args:
file_idx (int): Index of the file to be converted.
"""
pathname = self.tfrecord_pathnames[file_idx]
dataset = tf.data.TFRecordDataset(pathname, compression_type='')
for frame_idx, data in enumerate(dataset):
if frame_idx % 5 != 0:
continue
# print(frame_idx)
frame = dataset_pb2.Frame()
frame.ParseFromString(bytearray(data.numpy()))
if (self.selected_waymo_locations is not None
and frame.context.stats.location
not in self.selected_waymo_locations):
continue
self.save_image(frame, file_idx, frame_idx)
self.save_calib(frame, file_idx, frame_idx)
self.save_lidar(frame, file_idx, frame_idx)
self.save_pose(frame, file_idx, frame_idx)
if not self.test_mode:
self.save_label(frame, file_idx, frame_idx)
def __len__(self):
"""Length of the filename list."""
return len(self.tfrecord_pathnames)
def save_image(self, frame, file_idx, frame_idx):
"""Parse and save the images in png format.
Args:
frame (:obj:`Frame`): Open dataset frame proto.
file_idx (int): Current file index.
frame_idx (int): Current frame index.
"""
for img in frame.images:
img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \
f'{self.prefix}{str(file_idx).zfill(3)}' + \
f'{str(frame_idx).zfill(3)}.png'
img = mmcv.imfrombytes(img.image)
mmcv.imwrite(img, img_path)
def save_calib(self, frame, file_idx, frame_idx):
"""Parse and save the calibration data.
Args:
frame (:obj:`Frame`): Open dataset frame proto.
file_idx (int): Current file index.
frame_idx (int): Current frame index.
"""
# waymo front camera to kitti reference camera
T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0],
[1.0, 0.0, 0.0]])
camera_calibs = []
R0_rect = [f'{i:e}' for i in np.eye(3).flatten()]
Tr_velo_to_cams = []
calib_context = ''
for camera in frame.context.camera_calibrations:
# extrinsic parameters
T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape(
4, 4)
T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle)
Tr_velo_to_cam = \
self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam
if camera.name == 1: # FRONT = 1, see dataset.proto for details
self.T_velo_to_front_cam = Tr_velo_to_cam.copy()
Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, ))
Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam])
# intrinsic parameters
camera_calib = np.zeros((3, 4))
camera_calib[0, 0] = camera.intrinsic[0]
camera_calib[1, 1] = camera.intrinsic[1]
camera_calib[0, 2] = camera.intrinsic[2]
camera_calib[1, 2] = camera.intrinsic[3]
camera_calib[2, 2] = 1
camera_calib = list(camera_calib.reshape(12))
camera_calib = [f'{i:e}' for i in camera_calib]
camera_calibs.append(camera_calib)
# all camera ids are saved as id-1 in the result because
# camera 0 is unknown in the proto
for i in range(5):
calib_context += 'P' + str(i) + ': ' + \
' '.join(camera_calibs[i]) + '\n'
calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n'
for i in range(5):
calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \
' '.join(Tr_velo_to_cams[i]) + '\n'
with open(
f'{self.calib_save_dir}/{self.prefix}' +
f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt',
'w+') as fp_calib:
fp_calib.write(calib_context)
fp_calib.close()
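    # The calib txt written above follows a KITTI-style layout: five
    # 'P0'..'P4' rows (flattened 3x4 intrinsic matrices), one 'R0_rect' row
    # (identity), and five 'Tr_velo_to_cam_0'..'_4' rows holding the
    # corresponding flattened 3x4 extrinsic transforms, one line per camera.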
def save_lidar(self, frame, file_idx, frame_idx):
"""Parse and save the lidar data in psd format.
Args:
frame (:obj:`Frame`): Open dataset frame proto.
file_idx (int): Current file index.
frame_idx (int): Current frame index.
"""
range_images, camera_projections, range_image_top_pose = \
parse_range_image_and_camera_projection(frame)
# First return
points_0, cp_points_0, intensity_0, elongation_0 = \
self.convert_range_image_to_point_cloud(
frame,
range_images,
camera_projections,
range_image_top_pose,
ri_index=0
)
points_0 = np.concatenate(points_0, axis=0)
intensity_0 = np.concatenate(intensity_0, axis=0)
elongation_0 = np.concatenate(elongation_0, axis=0)
# Second return
points_1, cp_points_1, intensity_1, elongation_1 = \
self.convert_range_image_to_point_cloud(
frame,
range_images,
camera_projections,
range_image_top_pose,
ri_index=1
)
points_1 = np.concatenate(points_1, axis=0)
intensity_1 = np.concatenate(intensity_1, axis=0)
elongation_1 = np.concatenate(elongation_1, axis=0)
points = np.concatenate([points_0, points_1], axis=0)
intensity = np.concatenate([intensity_0, intensity_1], axis=0)
elongation = np.concatenate([elongation_0, elongation_1], axis=0)
timestamp = frame.timestamp_micros * np.ones_like(intensity)
# concatenate x,y,z, intensity, elongation, timestamp (6-dim)
point_cloud = np.column_stack(
(points, intensity, elongation, timestamp))
pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \
f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin'
point_cloud.astype(np.float32).tofile(pc_path)
def save_label(self, frame, file_idx, frame_idx):
"""Parse and save the label data in txt format.
The relation between waymo and kitti coordinates is noteworthy:
1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti)
2. x-y-z: front-left-up (waymo) -> right-down-front(kitti)
3. bbox origin at volumetric center (waymo) -> bottom center (kitti)
4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo)
Args:
frame (:obj:`Frame`): Open dataset frame proto.
file_idx (int): Current file index.
frame_idx (int): Current frame index.
"""
fp_label_all = open(
f'{self.label_all_save_dir}/{self.prefix}' +
f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+')
id_to_bbox = dict()
id_to_name = dict()
for labels in frame.projected_lidar_labels:
name = labels.name
for label in labels.labels:
# TODO: need a workaround as bbox may not belong to front cam
bbox = [
label.box.center_x - label.box.length / 2,
label.box.center_y - label.box.width / 2,
label.box.center_x + label.box.length / 2,
label.box.center_y + label.box.width / 2
]
id_to_bbox[label.id] = bbox
id_to_name[label.id] = name - 1
for obj in frame.laser_labels:
bounding_box = None
name = None
id = obj.id
for lidar in self.lidar_list:
if id + lidar in id_to_bbox:
bounding_box = id_to_bbox.get(id + lidar)
name = str(id_to_name.get(id + lidar))
break
if bounding_box is None or name is None:
name = '0'
bounding_box = (0, 0, 0, 0)
my_type = self.type_list[obj.type]
if my_type not in self.selected_waymo_classes:
continue
if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1:
continue
my_type = self.waymo_to_kitti_class_map[my_type]
height = obj.box.height
width = obj.box.width
length = obj.box.length
x = obj.box.center_x
y = obj.box.center_y
z = obj.box.center_z - height / 2
# project bounding box to the virtual reference frame
pt_ref = self.T_velo_to_front_cam @ \
np.array([x, y, z, 1]).reshape((4, 1))
x, y, z, _ = pt_ref.flatten().tolist()
rotation_y = -obj.box.heading - np.pi / 2
track_id = obj.id
# not available
truncated = 0
occluded = 0
alpha = -10
line = my_type + \
' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format(
round(truncated, 2), occluded, round(alpha, 2),
round(bounding_box[0], 2), round(bounding_box[1], 2),
round(bounding_box[2], 2), round(bounding_box[3], 2),
round(height, 2), round(width, 2), round(length, 2),
round(x, 2), round(y, 2), round(z, 2),
round(rotation_y, 2))
if self.save_track_id:
line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n'
else:
line_all = line[:-1] + ' ' + name + '\n'
fp_label = open(
f'{self.label_save_dir}{name}/{self.prefix}' +
f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'a')
fp_label.write(line)
fp_label.close()
fp_label_all.write(line_all)
fp_label_all.close()
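    # Coordinate note (added for clarity): for a Waymo box heading of 0
    # (facing +x, i.e. forward), the exported KITTI rotation_y above is
    # -0 - pi/2 = -pi/2, and the stored z is shifted from the volumetric
    # centre down to the bottom centre by subtracting height / 2, matching
    # the relations listed in the docstring.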
def save_pose(self, frame, file_idx, frame_idx):
"""Parse and save the pose data.
        Note that SDC's own pose is not included in the regular training
        of the KITTI dataset. The KITTI raw dataset contains ego motion
        files, but they are not often used. Pose is important for algorithms
        that take advantage of temporal information.
Args:
frame (:obj:`Frame`): Open dataset frame proto.
file_idx (int): Current file index.
frame_idx (int): Current frame index.
"""
pose = np.array(frame.pose.transform).reshape(4, 4)
np.savetxt(
join(f'{self.pose_save_dir}/{self.prefix}' +
f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'),
pose)
def create_folder(self):
"""Create folder for data preprocessing."""
if not self.test_mode:
dir_list1 = [
self.label_all_save_dir, self.calib_save_dir,
self.point_cloud_save_dir, self.pose_save_dir
]
dir_list2 = [self.label_save_dir, self.image_save_dir]
else:
dir_list1 = [
self.calib_save_dir, self.point_cloud_save_dir,
self.pose_save_dir
]
dir_list2 = [self.image_save_dir]
for d in dir_list1:
mmcv.mkdir_or_exist(d)
for d in dir_list2:
for i in range(5):
mmcv.mkdir_or_exist(f'{d}{str(i)}')
def convert_range_image_to_point_cloud(self,
frame,
range_images,
camera_projections,
range_image_top_pose,
ri_index=0):
"""Convert range images to point cloud.
Args:
frame (:obj:`Frame`): Open dataset frame.
range_images (dict): Mapping from laser_name to list of two
range images corresponding with two returns.
camera_projections (dict): Mapping from laser_name to list of two
camera projections corresponding with two returns.
range_image_top_pose (:obj:`Transform`): Range image pixel pose for
top lidar.
ri_index (int): 0 for the first return, 1 for the second return.
Default: 0.
Returns:
tuple[list[np.ndarray]]: (List of points with shape [N, 3],
camera projections of points with shape [N, 6], intensity
with shape [N, 1], elongation with shape [N, 1]). All the
lists have the length of lidar numbers (5).
"""
calibrations = sorted(
frame.context.laser_calibrations, key=lambda c: c.name)
points = []
cp_points = []
intensity = []
elongation = []
frame_pose = tf.convert_to_tensor(
value=np.reshape(np.array(frame.pose.transform), [4, 4]))
# [H, W, 6]
range_image_top_pose_tensor = tf.reshape(
tf.convert_to_tensor(value=range_image_top_pose.data),
range_image_top_pose.shape.dims)
# [H, W, 3, 3]
range_image_top_pose_tensor_rotation = \
transform_utils.get_rotation_matrix(
range_image_top_pose_tensor[..., 0],
range_image_top_pose_tensor[..., 1],
range_image_top_pose_tensor[..., 2])
range_image_top_pose_tensor_translation = \
range_image_top_pose_tensor[..., 3:]
range_image_top_pose_tensor = transform_utils.get_transform(
range_image_top_pose_tensor_rotation,
range_image_top_pose_tensor_translation)
for c in calibrations:
range_image = range_images[c.name][ri_index]
if len(c.beam_inclinations) == 0:
beam_inclinations = range_image_utils.compute_inclination(
tf.constant(
[c.beam_inclination_min, c.beam_inclination_max]),
height=range_image.shape.dims[0])
else:
beam_inclinations = tf.constant(c.beam_inclinations)
beam_inclinations = tf.reverse(beam_inclinations, axis=[-1])
extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4])
range_image_tensor = tf.reshape(
tf.convert_to_tensor(value=range_image.data),
range_image.shape.dims)
pixel_pose_local = None
frame_pose_local = None
if c.name == dataset_pb2.LaserName.TOP:
pixel_pose_local = range_image_top_pose_tensor
pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0)
frame_pose_local = tf.expand_dims(frame_pose, axis=0)
range_image_mask = range_image_tensor[..., 0] > 0
if self.filter_no_label_zone_points:
nlz_mask = range_image_tensor[..., 3] != 1.0 # 1.0: in NLZ
range_image_mask = range_image_mask & nlz_mask
range_image_cartesian = \
range_image_utils.extract_point_cloud_from_range_image(
tf.expand_dims(range_image_tensor[..., 0], axis=0),
tf.expand_dims(extrinsic, axis=0),
tf.expand_dims(tf.convert_to_tensor(
value=beam_inclinations), axis=0),
pixel_pose=pixel_pose_local,
frame_pose=frame_pose_local)
range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0)
points_tensor = tf.gather_nd(range_image_cartesian,
tf.compat.v1.where(range_image_mask))
cp = camera_projections[c.name][ri_index]
cp_tensor = tf.reshape(
tf.convert_to_tensor(value=cp.data), cp.shape.dims)
cp_points_tensor = tf.gather_nd(
cp_tensor, tf.compat.v1.where(range_image_mask))
points.append(points_tensor.numpy())
cp_points.append(cp_points_tensor.numpy())
intensity_tensor = tf.gather_nd(range_image_tensor[..., 1],
tf.where(range_image_mask))
intensity.append(intensity_tensor.numpy())
elongation_tensor = tf.gather_nd(range_image_tensor[..., 2],
tf.where(range_image_mask))
elongation.append(elongation_tensor.numpy())
return points, cp_points, intensity, elongation
def cart_to_homo(self, mat):
"""Convert transformation matrix in Cartesian coordinates to
homogeneous format.
Args:
mat (np.ndarray): Transformation matrix in Cartesian.
The input matrix shape is 3x3 or 3x4.
Returns:
np.ndarray: Transformation matrix in homogeneous format.
The matrix shape is 4x4.
"""
ret = np.eye(4)
if mat.shape == (3, 3):
ret[:3, :3] = mat
elif mat.shape == (3, 4):
ret[:3, :] = mat
else:
raise ValueError(mat.shape)
return ret
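# Illustrative usage note (added for clarity, not executed): cart_to_homo
# embeds a 3x3 rotation or a 3x4 [R|t] matrix into a 4x4 homogeneous matrix,
# e.g. cart_to_homo(np.eye(3)) returns np.eye(4), and any other input shape
# raises a ValueError.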
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29503}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28509}
# set the master node address (can be omitted on a single machine)
export MASTER_ADDR="localhost"
# export GPU_FLUSH_ON_EXECUTION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1 #.to(memory_format=torch.channels_last)
export MIOPEN_FIND_MODE=1
# export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic \
# --enable-profiler \
#--to_channels_last \
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28509}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
#!/usr/bin/env bash
script get_miopen_conv.log # record the console output to a log file
export MIOPEN_ENABLE_LOGGING=1 # enable MIOpen logging, default=0
export MIOPEN_ENABLE_LOGGING_CMD=1 # log the MIOpen driver command lines, default=0
export MIOPEN_LOG_LEVEL=6 # set the logging verbosity level, default=0
CONFIG=$1
GPUS=$2
PORT=${PORT:-28509}
# set the master node address (can be omitted on a single machine)
export MASTER_ADDR="localhost"
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
exit # end the `script` logging session
# deduplicate the captured MIOpenDriver convolution calls into a conv log
cat get_miopen_conv_{model_name}.log | grep "./bin/MIOpenDriver" | sort -n | uniq -c > miopen_conv_{model_name}.log
# extract the batch-norm calls
cat get_miopen_conv_{model_name}.log | grep "./bin/MIOpenDriver bnorm" | sort -n | uniq -c > miopen_bn_{model_name}.log
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28508}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# Copyright (c) OpenMMLab. All rights reserved.
from __future__ import division
import argparse
import copy
import mmcv
import os
import time
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model
from os import path as osp
from mmdet import __version__ as mmdet_version
from mmdet3d import __version__ as mmdet3d_version
#from mmdet3d.apis import train_model
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model
from mmdet3d.utils import collect_env, get_root_logger
from mmdet.apis import set_random_seed
from mmseg import __version__ as mmseg_version
from mmcv.utils import TORCH_VERSION, digit_version
def parse_args():
parser = argparse.ArgumentParser(description='Train a detector')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument(
'--resume-from', help='the checkpoint file to resume from')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='ids of gpus to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=0, help='random seed')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into the config file (deprecated, '
'use --cfg-options instead).')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--autoscale-lr',
action='store_true',
help='automatically scale lr with the number of gpus')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
if args.options and args.cfg_options:
raise ValueError(
'--options and --cfg-options cannot be both specified, '
'--options is deprecated in favor of --cfg-options')
if args.options:
warnings.warn('--options is deprecated in favor of --cfg-options')
args.cfg_options = args.options
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get('custom_imports', None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg['custom_imports'])
# import modules from plugin/xx, the registry will be updated
if hasattr(cfg, 'plugin'):
if cfg.plugin:
import importlib
if hasattr(cfg, 'plugin_dir'):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split('/')
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + '.' + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
from projects.mmdet3d_plugin.bevformer.apis import custom_train_model
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
#if args.resume_from is not None:
if args.resume_from is not None and osp.isfile(args.resume_from):
cfg.resume_from = args.resume_from
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if digit_version(TORCH_VERSION) != digit_version('1.8.1'):
cfg.optimizer['type'] = 'AdamW'
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
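# illustrative numbers: a base lr of 2e-4 tuned for 8 GPUs becomes
# 2e-4 * 4 / 8 = 1e-4 when training with len(cfg.gpu_ids) == 4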
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
assert False, 'non-distributed training is not supported'
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
# specify logger name, if we still use 'mmdet', the output info will be
# filtered and won't be saved in the log_file
# TODO: ugly workaround to judge whether we are training det or seg model
if cfg.model.type in ['EncoderDecoder3D']:
logger_name = 'mmseg'
else:
logger_name = 'mmdet'
logger = get_root_logger(
log_file=log_file, log_level=cfg.log_level, name=logger_name)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line)
meta['env_info'] = env_info
meta['config'] = cfg.pretty_text
# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config:\n{cfg.pretty_text}')
# set random seeds
if args.seed is not None:
logger.info(f'Set random seed to {args.seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed
meta['seed'] = args.seed
meta['exp_name'] = osp.basename(args.config)
model = build_model(
cfg.model,
train_cfg=cfg.get('train_cfg'),
test_cfg=cfg.get('test_cfg'))
model.init_weights()
eval_model_config = copy.deepcopy(cfg.model)
eval_model = build_model(
eval_model_config,
train_cfg=cfg.get('train_cfg'),
test_cfg=cfg.get('test_cfg'))
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(eval_model)
#eval_model.init_weights()
eval_model.load_state_dict(model.state_dict())
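# eval_model is a second instance built from the same config; it is wrapped for fp16 above
# (when configured) and initialized with the training model's weights, presumably so that
# custom_train_model can run evaluation on a separate copy of the network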
logger.info(f'Model:\n{model}')
from projects.mmdet3d_plugin.datasets import custom_build_dataset
datasets = [custom_build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val)
# in case we use a dataset wrapper
if 'dataset' in cfg.data.train:
val_dataset.pipeline = cfg.data.train.dataset.pipeline
else:
val_dataset.pipeline = cfg.data.train.pipeline
# set test_mode=False here in deep copied config
# which does not affect AP/AR calculation later
# refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
val_dataset.test_mode = False
datasets.append(custom_build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmdet version, config file content and class names in
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmdet_version=mmdet_version,
mmseg_version=mmseg_version,
mmdet3d_version=mmdet3d_version,
config=cfg.pretty_text,
CLASSES=datasets[0].CLASSES,
PALETTE=datasets[0].PALETTE # for segmentors
if hasattr(datasets[0], 'PALETTE') else None)
# add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES
custom_train_model(
model,
datasets,
cfg,
eval_model=eval_model,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()
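# Illustrative launch, mirroring the torchrun wrappers above (config path is a placeholder):
#   torchrun --nproc_per_node=8 --master_port=28509 train.py path/to/config.py \
#       --launcher pytorch --deterministic
# Note: a distributed launcher is required; --launcher none hits the assert in main().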
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import numpy as np
import warnings
from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress
from os import path as osp
from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
DepthInstance3DBoxes, LiDARInstance3DBoxes)
from mmdet3d.core.visualizer import (show_multi_modality_result, show_result,
show_seg_result)
from mmdet3d.datasets import build_dataset
def parse_args():
parser = argparse.ArgumentParser(description='Browse a dataset')
parser.add_argument('config', help='train config file path')
parser.add_argument(
'--skip-type',
type=str,
nargs='+',
default=['Normalize'],
help='skip some pipeline steps that are not needed for visualization')
parser.add_argument(
'--output-dir',
default=None,
type=str,
help='directory to save results when no display interface is available')
parser.add_argument(
'--task',
type=str,
choices=['det', 'seg', 'multi_modality-det', 'mono-det'],
help='Determine the visualization method depending on the task.')
parser.add_argument(
'--online',
action='store_true',
help='Whether to perform online visualization. Note that you often '
'need a monitor to do so.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. If the value to '
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
'Note that the quotation marks are necessary and that no white space '
'is allowed.')
args = parser.parse_args()
return args
def build_data_cfg(config_path, skip_type, cfg_options):
"""Build data config for loading visualization data."""
cfg = Config.fromfile(config_path)
if cfg_options is not None:
cfg.merge_from_dict(cfg_options)
# import modules from string list.
if cfg.get('custom_imports', None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg['custom_imports'])
# extract inner dataset of `RepeatDataset` as `cfg.data.train`
# so we don't need to worry about it later
if cfg.data.train['type'] == 'RepeatDataset':
cfg.data.train = cfg.data.train.dataset
# use only first dataset for `ConcatDataset`
if cfg.data.train['type'] == 'ConcatDataset':
cfg.data.train = cfg.data.train.datasets[0]
train_data_cfg = cfg.data.train
# eval_pipeline purely consists of loading functions
# use eval_pipeline for data loading
train_data_cfg['pipeline'] = [
x for x in cfg.eval_pipeline if x['type'] not in skip_type
]
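# e.g. with the default --skip-type of ['Normalize'], the Normalize step is removed from
# eval_pipeline, so the data that gets visualized keeps its raw (un-normalized) values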
return cfg
def to_depth_mode(points, bboxes):
"""Convert points and bboxes to Depth Coord and Depth Box mode."""
if points is not None:
points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
if bboxes is not None:
bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR,
Box3DMode.DEPTH)
return points, bboxes
def show_det_data(idx, dataset, out_dir, filename, show=False):
"""Visualize 3D point cloud and 3D bboxes."""
example = dataset.prepare_train_data(idx)
points = example['points']._data.numpy()
gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor
if dataset.box_mode_3d != Box3DMode.DEPTH:
points, gt_bboxes = to_depth_mode(points, gt_bboxes)
show_result(
points,
gt_bboxes.clone(),
None,
out_dir,
filename,
show=show,
snapshot=True)
def show_seg_data(idx, dataset, out_dir, filename, show=False):
"""Visualize 3D point cloud and segmentation mask."""
example = dataset.prepare_train_data(idx)
points = example['points']._data.numpy()
gt_seg = example['pts_semantic_mask']._data.numpy()
show_seg_result(
points,
gt_seg.copy(),
None,
out_dir,
filename,
np.array(dataset.PALETTE),
dataset.ignore_index,
show=show,
snapshot=True)
def show_proj_bbox_img(idx,
dataset,
out_dir,
filename,
show=False,
is_nus_mono=False):
"""Visualize 3D bboxes on 2D image by projection."""
try:
example = dataset.prepare_train_data(idx)
except AttributeError: # for Mono-3D datasets
example = dataset.prepare_train_img(idx)
gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d']
img_metas = example['img_metas']._data
img = example['img']._data.numpy()
# need to transpose channel to first dim
img = img.transpose(1, 2, 0)
# no 3D gt bboxes, just show img
if gt_bboxes.tensor.shape[0] == 0:
gt_bboxes = None
if isinstance(gt_bboxes, DepthInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
None,
out_dir,
filename,
box_mode='depth',
img_metas=img_metas,
show=show)
elif isinstance(gt_bboxes, LiDARInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
img_metas['lidar2img'],
out_dir,
filename,
box_mode='lidar',
img_metas=img_metas,
show=show)
elif isinstance(gt_bboxes, CameraInstance3DBoxes):
show_multi_modality_result(
img,
gt_bboxes,
None,
img_metas['cam2img'],
out_dir,
filename,
box_mode='camera',
img_metas=img_metas,
show=show)
else:
# can't project, just show img
warnings.warn(
f'unrecognized gt box type {type(gt_bboxes)}, only show image')
show_multi_modality_result(
img, None, None, None, out_dir, filename, show=show)
def main():
args = parse_args()
if args.output_dir is not None:
mkdir_or_exist(args.output_dir)
cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options)
try:
dataset = build_dataset(
cfg.data.train, default_args=dict(filter_empty_gt=False))
except TypeError: # seg dataset doesn't have `filter_empty_gt` key
dataset = build_dataset(cfg.data.train)
data_infos = dataset.data_infos
dataset_type = cfg.dataset_type
# configure visualization mode
vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det'
for idx, data_info in enumerate(track_iter_progress(data_infos)):
if dataset_type in ['KittiDataset', 'WaymoDataset']:
data_path = data_info['point_cloud']['velodyne_path']
elif dataset_type in [
'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset',
'S3DISSegDataset', 'S3DISDataset'
]:
data_path = data_info['pts_path']
elif dataset_type in ['NuScenesDataset', 'LyftDataset']:
data_path = data_info['lidar_path']
elif dataset_type in ['NuScenesMonoDataset']:
data_path = data_info['file_name']
else:
raise NotImplementedError(
f'unsupported dataset type {dataset_type}')
file_name = osp.splitext(osp.basename(data_path))[0]
if vis_task in ['det', 'multi_modality-det']:
# show 3D bboxes on 3D point clouds
show_det_data(
idx, dataset, args.output_dir, file_name, show=args.online)
if vis_task in ['multi_modality-det', 'mono-det']:
# project 3D bboxes to 2D image
show_proj_bbox_img(
idx,
dataset,
args.output_dir,
file_name,
show=args.online,
is_nus_mono=(dataset_type == 'NuScenesMonoDataset'))
elif vis_task in ['seg']:
# show 3D segmentation mask on 3D point clouds
show_seg_data(
idx, dataset, args.output_dir, file_name, show=args.online)
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from mmcv.runner import save_checkpoint
from torch import nn as nn
from mmdet.apis import init_model
def fuse_conv_bn(conv, bn):
"""During inference, the functionary of batch norm layers is turned off but
only the mean and var alone channels are used, which exposes the chance to
fuse it with the preceding conv layers to save computations and simplify
network structures."""
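# BN(conv(x)) = bn.weight * (W x + b - running_mean) / sqrt(running_var + eps) + bn.bias,
# so the fused conv below uses
#   W' = W * bn.weight / sqrt(running_var + eps)
#   b' = (b - running_mean) * bn.weight / sqrt(running_var + eps) + bn.bias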
conv_w = conv.weight
conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
bn.running_mean)
factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
conv.weight = nn.Parameter(conv_w *
factor.reshape([conv.out_channels, 1, 1, 1]))
conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
return conv
def fuse_module(m):
last_conv = None
last_conv_name = None
for name, child in m.named_children():
if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
if last_conv is None: # only fuse BN that is after Conv
continue
fused_conv = fuse_conv_bn(last_conv, child)
m._modules[last_conv_name] = fused_conv
# To reduce changes, set BN as Identity instead of deleting it.
m._modules[name] = nn.Identity()
last_conv = None
elif isinstance(child, nn.Conv2d):
last_conv = child
last_conv_name = name
else:
fuse_module(child)
return m
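# Minimal usage sketch (the torchvision model is purely illustrative, not used in this repo):
#   import torchvision
#   m = torchvision.models.resnet18().eval()
#   m = fuse_module(m)  # each Conv2d absorbs the BatchNorm2d that directly follows it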
def parse_args():
parser = argparse.ArgumentParser(
description='fuse Conv and BN layers in a model')
parser.add_argument('config', help='config file path')
parser.add_argument('checkpoint', help='checkpoint file path')
parser.add_argument('out', help='output path of the converted model')
args = parser.parse_args()
return args
def main():
args = parse_args()
# build the model from a config file and a checkpoint file
model = init_model(args.config, args.checkpoint)
# fuse conv and bn layers of the model
fused_model = fuse_module(model)
save_checkpoint(fused_model, args.out)
if __name__ == '__main__':
main()
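# Illustrative CLI usage (script name and paths are placeholders):
#   python fuse_conv_bn.py path/to/config.py path/to/checkpoint.pth path/to/fused.pth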
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
from mmcv import Config, DictAction
def parse_args():
parser = argparse.ArgumentParser(description='Print the whole config')
parser.add_argument('config', help='config file path')
parser.add_argument(
'--options', nargs='+', action=DictAction, help='arguments in dict')
args = parser.parse_args()
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.options is not None:
cfg.merge_from_dict(args.options)
print(f'Config:\n{cfg.pretty_text}')
if __name__ == '__main__':
main()
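# Illustrative usage (config path and override key are placeholders):
#   python print_config.py path/to/config.py --options data.samples_per_gpu=2
# --options entries are merged into the config before it is pretty-printed.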