Commit d1aac35d authored by zhangwenwei

Initial commit
import numpy as np
def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
"""Calculate the ious between each bbox of bboxes1 and bboxes2.
Args:
bboxes1(ndarray): shape (n, 4)
bboxes2(ndarray): shape (k, 4)
mode(str): iou (intersection over union) or iof (intersection
over foreground)
Returns:
ious(ndarray): shape (n, k)
"""
assert mode in ['iou', 'iof']
bboxes1 = bboxes1.astype(np.float32)
bboxes2 = bboxes2.astype(np.float32)
rows = bboxes1.shape[0]
cols = bboxes2.shape[0]
ious = np.zeros((rows, cols), dtype=np.float32)
if rows * cols == 0:
return ious
exchange = False
if bboxes1.shape[0] > bboxes2.shape[0]:
bboxes1, bboxes2 = bboxes2, bboxes1
ious = np.zeros((cols, rows), dtype=np.float32)
exchange = True
area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
for i in range(bboxes1.shape[0]):
x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
overlap = np.maximum(x_end - x_start, 0) * np.maximum(
y_end - y_start, 0)
if mode == 'iou':
union = area1[i] + area2 - overlap
else:
union = area1[i] if not exchange else area2
ious[i, :] = overlap / union
if exchange:
ious = ious.T
return ious
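# Usage sketch (illustrative values, not from the original file): a 10x10 box
# against a copy shifted by 5; intersection 25, union 175, IoU = 25 / 175.
#
# >>> bbox_overlaps(np.array([[0., 0., 10., 10.]]),
# ...               np.array([[5., 5., 15., 15.]]))
# array([[0.14285715]], dtype=float32)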
import mmcv
def wider_face_classes():
return ['face']
def voc_classes():
return [
'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
def imagenet_det_classes():
return [
'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo',
'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam',
'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap',
'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder',
'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito',
'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle',
'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker',
'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew',
'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper',
'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly',
'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig',
'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog',
'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart',
'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger',
'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim',
'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse',
'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle',
'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard',
'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can',
'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace',
'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume',
'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza',
'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine',
'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse',
'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
'whale', 'wine_bottle', 'zebra'
]
def imagenet_vid_classes():
return [
'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
'watercraft', 'whale', 'zebra'
]
def coco_classes():
return [
'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign',
'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard',
'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork',
'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair',
'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv',
'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave',
'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
'scissors', 'teddy_bear', 'hair_drier', 'toothbrush'
]
def cityscapes_classes():
return [
'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
'bicycle'
]
def kitti_classes():
return [
'Car',
'Pedestrian',
'Cyclist',
'Van',
'Person_sitting',
]
dataset_aliases = {
'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
'coco': ['coco', 'mscoco', 'ms_coco'],
    'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'],
'cityscapes': ['cityscapes'],
'kitti': ['KITTI', 'kitti']
}
def get_classes(dataset):
"""Get class names of a dataset."""
alias2name = {}
for name, aliases in dataset_aliases.items():
for alias in aliases:
alias2name[alias] = name
if mmcv.is_str(dataset):
if dataset in alias2name:
labels = eval(alias2name[dataset] + '_classes()')
else:
raise ValueError('Unrecognized dataset: {}'.format(dataset))
else:
        raise TypeError(
            'dataset must be a str, but got {}'.format(type(dataset)))
return labels
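# Usage sketch: any alias listed in dataset_aliases resolves to the same
# class list, e.g.
#
# >>> get_classes('pascal_voc')[:3]
# ['aeroplane', 'bicycle', 'bird']
# >>> len(get_classes('coco'))
# 80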
import itertools
import mmcv
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from terminaltables import AsciiTable
from .recall import eval_recalls
def coco_eval(result_files,
result_types,
coco,
max_dets=(100, 300, 1000),
cat_ids=[],
classwise=False):
for res_type in result_types:
assert res_type in [
'proposal', 'proposal_fast', 'bbox', 'segm', 'keypoints'
]
if mmcv.is_str(coco):
coco = COCO(coco)
assert isinstance(coco, COCO)
if result_types == ['proposal_fast']:
ar = fast_eval_recall(result_files, coco, np.array(max_dets))
for i, num in enumerate(max_dets):
print('AR@{}\t= {:.4f}'.format(num, ar[i]))
return
for res_type in result_types:
if isinstance(result_files, str):
result_file = result_files
elif isinstance(result_files, dict):
result_file = result_files[res_type]
else:
            raise TypeError('result_files must be a str or dict')
assert result_file.endswith('.json')
coco_dets = coco.loadRes(result_file)
        # getImgIds returns all image ids when cat_ids is empty
        if len(cat_ids) < 80:  # 80 is the number of COCO classes
img_ids = getImgIds(coco, catIds=cat_ids)
else:
img_ids = coco.getImgIds()
iou_type = 'bbox' if res_type == 'proposal' else res_type
cocoEval = COCOeval(coco, coco_dets, iou_type)
if cat_ids:
            # a non-empty cat_ids means specific categories were requested
cocoEval.params.catIds = cat_ids
cocoEval.params.imgIds = img_ids
if res_type == 'proposal':
cocoEval.params.useCats = 0
cocoEval.params.maxDets = list(max_dets)
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
if classwise:
# Compute per-category AP
# from https://github.com/facebookresearch/detectron2/blob/03064eb5bafe4a3e5750cc7a16672daf5afe8435/detectron2/evaluation/coco_evaluation.py#L259-L283 # noqa
precisions = cocoEval.eval['precision']
catIds = cat_ids if cat_ids else coco.getCatIds()
# precision has dims (iou, recall, cls, area range, max dets)
assert len(catIds) == precisions.shape[2]
results_per_category = []
for idx, catId in enumerate(catIds):
# area range index 0: all area ranges
# max dets index -1: typically 100 per image
nm = coco.loadCats(catId)[0]
precision = precisions[:, :, idx, 0, -1]
precision = precision[precision > -1]
ap = np.mean(precision) if precision.size else float('nan')
results_per_category.append(
('{}'.format(nm['name']),
'{:0.3f}'.format(float(ap * 100))))
N_COLS = min(6, len(results_per_category) * 2)
results_flatten = list(itertools.chain(*results_per_category))
headers = ['category', 'AP'] * (N_COLS // 2)
results_2d = itertools.zip_longest(
*[results_flatten[i::N_COLS] for i in range(N_COLS)])
table_data = [headers]
table_data += [result for result in results_2d]
table = AsciiTable(table_data)
print(table.table)
def fast_eval_recall(results,
coco,
max_dets,
iou_thrs=np.arange(0.5, 0.96, 0.05)):
if mmcv.is_str(results):
assert results.endswith('.pkl')
results = mmcv.load(results)
elif not isinstance(results, list):
raise TypeError(
'results must be a list of numpy arrays or a filename, not {}'.
format(type(results)))
gt_bboxes = []
img_ids = coco.getImgIds()
for i in range(len(img_ids)):
ann_ids = coco.getAnnIds(imgIds=img_ids[i])
ann_info = coco.loadAnns(ann_ids)
if len(ann_info) == 0:
gt_bboxes.append(np.zeros((0, 4)))
continue
bboxes = []
for ann in ann_info:
if ann.get('ignore', False) or ann['iscrowd']:
continue
x1, y1, w, h = ann['bbox']
bboxes.append([x1, y1, x1 + w, y1 + h])
bboxes = np.array(bboxes, dtype=np.float32)
if bboxes.shape[0] == 0:
bboxes = np.zeros((0, 4))
gt_bboxes.append(bboxes)
recalls = eval_recalls(
gt_bboxes, results, max_dets, iou_thrs, print_summary=False)
ar = recalls.mean(axis=1)
return ar
def xyxy2xywh(bbox):
_bbox = bbox.tolist()
return [
_bbox[0],
_bbox[1],
_bbox[2] - _bbox[0],
_bbox[3] - _bbox[1],
]
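# Usage sketch: convert a corner-format box (x1, y1, x2, y2) to the COCO
# (x, y, w, h) format; any trailing columns (e.g. a score) are ignored.
#
# >>> xyxy2xywh(np.array([10., 20., 30., 60.]))
# [10.0, 20.0, 20.0, 40.0]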
def proposal2json(dataset, results):
json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
bboxes = results[idx]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = 1
json_results.append(data)
return json_results
def det2json(dataset, results):
json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
result = results[idx]
for label in range(len(result)):
bboxes = result[label]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = dataset.cat_ids[label]
json_results.append(data)
return json_results
def segm2json(dataset, results):
bbox_json_results = []
segm_json_results = []
for idx in range(len(dataset)):
img_id = dataset.img_ids[idx]
det, seg = results[idx]
for label in range(len(det)):
# bbox results
bboxes = det[label]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(bboxes[i][4])
data['category_id'] = dataset.cat_ids[label]
bbox_json_results.append(data)
# segm results
# some detectors use different score for det and segm
if isinstance(seg, tuple):
segms = seg[0][label]
mask_score = seg[1][label]
else:
segms = seg[label]
mask_score = [bbox[4] for bbox in bboxes]
for i in range(bboxes.shape[0]):
data = dict()
data['image_id'] = img_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(mask_score[i])
data['category_id'] = dataset.cat_ids[label]
if isinstance(segms[i]['counts'], bytes):
segms[i]['counts'] = segms[i]['counts'].decode()
data['segmentation'] = segms[i]
segm_json_results.append(data)
return bbox_json_results, segm_json_results
def results2json(dataset, results, out_file):
result_files = dict()
if isinstance(results[0], list):
json_results = det2json(dataset, results)
result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox')
result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox')
mmcv.dump(json_results, result_files['bbox'])
elif isinstance(results[0], tuple):
json_results = segm2json(dataset, results)
result_files['bbox'] = '{}.{}.json'.format(out_file, 'bbox')
result_files['proposal'] = '{}.{}.json'.format(out_file, 'bbox')
result_files['segm'] = '{}.{}.json'.format(out_file, 'segm')
mmcv.dump(json_results[0], result_files['bbox'])
mmcv.dump(json_results[1], result_files['segm'])
elif isinstance(results[0], np.ndarray):
json_results = proposal2json(dataset, results)
result_files['proposal'] = '{}.{}.json'.format(out_file, 'proposal')
mmcv.dump(json_results, result_files['proposal'])
else:
raise TypeError('invalid type of results')
return result_files
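# Usage sketch: for detection results (a list of per-class bbox arrays) and
# out_file='results', this writes 'results.bbox.json' and returns
# {'bbox': 'results.bbox.json', 'proposal': 'results.bbox.json'}; tuple
# results (det + segm) additionally produce 'results.segm.json'.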
def getImgIds(coco, imgIds=[], catIds=[]):
    '''Get img ids that satisfy given filter conditions.

    Different from coco.getImgIds, this function returns an image id as long
    as the image contains one of the given categories, rather than all of
    them.

    :param imgIds (int array): get imgs for given ids
    :param catIds (int array): get imgs containing any of the given cats
    :return: ids (int array): integer array of img ids
    '''
if len(imgIds) == len(catIds) == 0:
ids = coco.imgs.keys()
else:
ids = set(imgIds)
for i, catId in enumerate(catIds):
if i == 0 and len(ids) == 0:
ids = set(coco.catToImgs[catId])
else:
ids |= set(coco.catToImgs[catId])
return list(ids)
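# Usage sketch: with coco.catToImgs == {1: [1, 2], 2: [2, 3]}, pycocotools'
# coco.getImgIds(catIds=[1, 2]) would keep only image 2 (intersection),
# whereas getImgIds(coco, catIds=[1, 2]) returns images {1, 2, 3} (union).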
import os
import os.path as osp
import mmcv
import numpy as np
import torch
import torch.distributed as dist
from mmcv.parallel import collate, scatter
from mmcv.runner import Hook
from pycocotools.cocoeval import COCOeval
from torch.utils.data import Dataset
from mmdet3d import datasets
from .coco_utils import fast_eval_recall, results2json
from .mean_ap import eval_map
class DistEvalHook(Hook):
def __init__(self, dataset, interval=1):
if isinstance(dataset, Dataset):
self.dataset = dataset
elif isinstance(dataset, dict):
self.dataset = datasets.build_dataset(dataset, {'test_mode': True})
else:
raise TypeError(
'dataset must be a Dataset object or a dict, not {}'.format(
type(dataset)))
self.interval = interval
def after_train_epoch(self, runner):
if not self.every_n_epochs(runner, self.interval):
return
runner.model.eval()
results = [None for _ in range(len(self.dataset))]
if runner.rank == 0:
prog_bar = mmcv.ProgressBar(len(self.dataset))
for idx in range(runner.rank, len(self.dataset), runner.world_size):
data = self.dataset[idx]
data_gpu = scatter(
collate([data], samples_per_gpu=1),
[torch.cuda.current_device()])[0]
# compute output
with torch.no_grad():
result = runner.model(
return_loss=False, rescale=True, **data_gpu)
results[idx] = result
batch_size = runner.world_size
if runner.rank == 0:
for _ in range(batch_size):
prog_bar.update()
if runner.rank == 0:
print('\n')
dist.barrier()
for i in range(1, runner.world_size):
tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i))
tmp_results = mmcv.load(tmp_file)
for idx in range(i, len(results), runner.world_size):
results[idx] = tmp_results[idx]
os.remove(tmp_file)
self.evaluate(runner, results)
else:
tmp_file = osp.join(runner.work_dir,
'temp_{}.pkl'.format(runner.rank))
mmcv.dump(results, tmp_file)
dist.barrier()
dist.barrier()
    def evaluate(self, runner, results):
        raise NotImplementedError
class DistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
gt_bboxes = []
gt_labels = []
gt_ignore = []
for i in range(len(self.dataset)):
ann = self.dataset.get_ann_info(i)
bboxes = ann['bboxes']
labels = ann['labels']
if 'bboxes_ignore' in ann:
                ignore = np.concatenate([
                    np.zeros(bboxes.shape[0], dtype=bool),
                    np.ones(ann['bboxes_ignore'].shape[0], dtype=bool)
                ])
gt_ignore.append(ignore)
bboxes = np.vstack([bboxes, ann['bboxes_ignore']])
labels = np.concatenate([labels, ann['labels_ignore']])
gt_bboxes.append(bboxes)
gt_labels.append(labels)
if not gt_ignore:
gt_ignore = None
# If the dataset is VOC2007, then use 11 points mAP evaluation.
if hasattr(self.dataset, 'year') and self.dataset.year == 2007:
ds_name = 'voc07'
else:
ds_name = self.dataset.CLASSES
mean_ap, eval_results = eval_map(
results,
gt_bboxes,
gt_labels,
gt_ignore=gt_ignore,
scale_ranges=None,
iou_thr=0.5,
dataset=ds_name,
print_summary=True)
runner.log_buffer.output['mAP'] = mean_ap
runner.log_buffer.ready = True
class KittiDistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
tmp_file = osp.join(runner.work_dir, 'temp_0')
if not isinstance(results[0], dict):
result_files = self.dataset.reformat_bbox(results, tmp_file)
paste_result, ret_dict = self.dataset.evaluate(result_files)
for ap_cls, ap_result in ret_dict.items():
for ap_type, ap in ap_result.items():
key = f'{ap_cls}_{ap_type}'
val = float('{:.4f}'.format(ap))
runner.log_buffer.output[key] = val
else:
for name in results[0]:
print('\nEvaluating {}'.format(name))
results_ = [out[name] for out in results]
tmp_file_ = osp.join(tmp_file, name)
result_files = self.dataset.reformat_bbox(results_, tmp_file_)
paste_result, ret_dict = self.dataset.evaluate(
result_files, name)
for ap_cls, ap_result in ret_dict.items():
for ap_type, ap in ap_result.items():
key = f'{name}/{ap_cls}_{ap_type}'
val = float('{:.4f}'.format(ap))
runner.log_buffer.output[key] = val
runner.log_buffer.ready = True
class CocoDistEvalRecallHook(DistEvalHook):
def __init__(self,
dataset,
interval=1,
proposal_nums=(100, 300, 1000),
iou_thrs=np.arange(0.5, 0.96, 0.05)):
super(CocoDistEvalRecallHook, self).__init__(
dataset, interval=interval)
self.proposal_nums = np.array(proposal_nums, dtype=np.int32)
self.iou_thrs = np.array(iou_thrs, dtype=np.float32)
def evaluate(self, runner, results):
        # The official COCO evaluation is too slow, so we use our own
        # implementation instead, which may yield slightly different results.
ar = fast_eval_recall(results, self.dataset.coco, self.proposal_nums,
self.iou_thrs)
for i, num in enumerate(self.proposal_nums):
runner.log_buffer.output['AR@{}'.format(num)] = ar[i]
runner.log_buffer.ready = True
class CocoDistEvalmAPHook(DistEvalHook):
def evaluate(self, runner, results):
tmp_file = osp.join(runner.work_dir, 'temp_0')
result_files = results2json(self.dataset, results, tmp_file)
        res_types = (['bbox', 'segm']
                     if runner.model.module.with_mask else ['bbox'])
cocoGt = self.dataset.coco
# load image based on cat_ids
if len(self.dataset.cat_ids) < len(self.dataset.CLASSES):
from .coco_utils import getImgIds
imgIds = getImgIds(cocoGt, catIds=self.dataset.cat_ids)
else:
imgIds = cocoGt.getImgIds()
for res_type in res_types:
try:
cocoDt = cocoGt.loadRes(result_files[res_type])
except IndexError:
print('No prediction found.')
break
iou_type = res_type
cocoEval = COCOeval(cocoGt, cocoDt, iou_type)
cocoEval.params.catIds = self.dataset.cat_ids
cocoEval.params.imgIds = imgIds
cocoEval.evaluate()
cocoEval.accumulate()
cocoEval.summarize()
metrics = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']
for i in range(len(metrics)):
key = '{}_{}'.format(res_type, metrics[i])
val = float('{:.3f}'.format(cocoEval.stats[i]))
runner.log_buffer.output[key] = val
runner.log_buffer.output['{}_mAP_copypaste'.format(res_type)] = (
'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
'{ap[4]:.3f} {ap[5]:.3f}').format(ap=cocoEval.stats[:6])
runner.log_buffer.ready = True
for res_type in res_types:
os.remove(result_files[res_type])
from .eval import kitti_eval, kitti_eval_coco_style
__all__ = ['kitti_eval', 'kitti_eval_coco_style']
import gc
import io as sysio
import numba
import numpy as np
@numba.jit
def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41):
scores.sort()
scores = scores[::-1]
current_recall = 0
thresholds = []
for i, score in enumerate(scores):
l_recall = (i + 1) / num_gt
if i < (len(scores) - 1):
r_recall = (i + 2) / num_gt
else:
r_recall = l_recall
if (((r_recall - current_recall) < (current_recall - l_recall))
and (i < (len(scores) - 1))):
continue
# recall = l_recall
thresholds.append(score)
current_recall += 1 / (num_sample_pts - 1.0)
return thresholds
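# Usage sketch: thresholds are the detection scores at which recall advances
# by roughly 1 / (num_sample_pts - 1). E.g. five detections that all match
# ground truths, with recall sampled at {0, 0.5, 1.0}:
#
#   get_thresholds(np.array([0.9, 0.8, 0.7, 0.6, 0.5]), num_gt=5,
#                  num_sample_pts=3)  # -> [0.9, 0.8, 0.5]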
def clean_data(gt_anno, dt_anno, current_class, difficulty):
CLASS_NAMES = ['car', 'pedestrian', 'cyclist']
MIN_HEIGHT = [40, 25, 25]
MAX_OCCLUSION = [0, 1, 2]
MAX_TRUNCATION = [0.15, 0.3, 0.5]
dc_bboxes, ignored_gt, ignored_dt = [], [], []
current_cls_name = CLASS_NAMES[current_class].lower()
num_gt = len(gt_anno['name'])
num_dt = len(dt_anno['name'])
num_valid_gt = 0
for i in range(num_gt):
bbox = gt_anno['bbox'][i]
gt_name = gt_anno['name'][i].lower()
height = bbox[3] - bbox[1]
valid_class = -1
if (gt_name == current_cls_name):
valid_class = 1
elif (current_cls_name == 'Pedestrian'.lower()
and 'Person_sitting'.lower() == gt_name):
valid_class = 0
elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name):
valid_class = 0
else:
valid_class = -1
ignore = False
if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty])
or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty])
or (height <= MIN_HEIGHT[difficulty])):
ignore = True
if valid_class == 1 and not ignore:
ignored_gt.append(0)
num_valid_gt += 1
elif (valid_class == 0 or (ignore and (valid_class == 1))):
ignored_gt.append(1)
else:
ignored_gt.append(-1)
# for i in range(num_gt):
if gt_anno['name'][i] == 'DontCare':
dc_bboxes.append(gt_anno['bbox'][i])
for i in range(num_dt):
if (dt_anno['name'][i].lower() == current_cls_name):
valid_class = 1
else:
valid_class = -1
height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1])
if height < MIN_HEIGHT[difficulty]:
ignored_dt.append(1)
elif valid_class == 1:
ignored_dt.append(0)
else:
ignored_dt.append(-1)
return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
@numba.jit(nopython=True)
def image_box_overlap(boxes, query_boxes, criterion=-1):
N = boxes.shape[0]
K = query_boxes.shape[0]
overlaps = np.zeros((N, K), dtype=boxes.dtype)
for k in range(K):
qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) *
(query_boxes[k, 3] - query_boxes[k, 1]))
for n in range(N):
iw = (
min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]))
if iw > 0:
ih = (
min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]))
if ih > 0:
if criterion == -1:
ua = ((boxes[n, 2] - boxes[n, 0]) *
(boxes[n, 3] - boxes[n, 1]) + qbox_area -
iw * ih)
elif criterion == 0:
ua = ((boxes[n, 2] - boxes[n, 0]) *
(boxes[n, 3] - boxes[n, 1]))
elif criterion == 1:
ua = qbox_area
else:
ua = 1.0
overlaps[n, k] = iw * ih / ua
return overlaps
def bev_box_overlap(boxes, qboxes, criterion=-1):
from .rotate_iou import rotate_iou_gpu_eval
riou = rotate_iou_gpu_eval(boxes, qboxes, criterion)
return riou
@numba.jit(nopython=True, parallel=True)
def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
    # ONLY supports overlap in CAMERA coordinates, not LiDAR.
    # TODO: change to use prange for parallel mode, should check the difference
N, K = boxes.shape[0], qboxes.shape[0]
for i in numba.prange(N):
for j in numba.prange(K):
if rinc[i, j] > 0:
# iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
# qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
iw = (
min(boxes[i, 1], qboxes[j, 1]) -
max(boxes[i, 1] - boxes[i, 4],
qboxes[j, 1] - qboxes[j, 4]))
if iw > 0:
area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
inc = iw * rinc[i, j]
if criterion == -1:
ua = (area1 + area2 - inc)
elif criterion == 0:
ua = area1
elif criterion == 1:
ua = area2
else:
ua = inc
rinc[i, j] = inc / ua
else:
rinc[i, j] = 0.0
def d3_box_overlap(boxes, qboxes, criterion=-1):
from .rotate_iou import rotate_iou_gpu_eval
rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]],
qboxes[:, [0, 2, 3, 5, 6]], 2)
d3_box_overlap_kernel(boxes, qboxes, rinc, criterion)
return rinc
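# Note: the 7-dof boxes here are camera-frame (x, y, z, l, h, w, ry). Columns
# [0, 2, 3, 5, 6] give the BEV footprint whose rotated intersection area is
# computed on GPU; d3_box_overlap_kernel then folds in the vertical (y/h)
# overlap to produce the 3D IoU.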
@numba.jit(nopython=True)
def compute_statistics_jit(overlaps,
gt_datas,
dt_datas,
ignored_gt,
ignored_det,
dc_bboxes,
metric,
min_overlap,
thresh=0,
compute_fp=False,
compute_aos=False):
det_size = dt_datas.shape[0]
gt_size = gt_datas.shape[0]
dt_scores = dt_datas[:, -1]
dt_alphas = dt_datas[:, 4]
gt_alphas = gt_datas[:, 4]
dt_bboxes = dt_datas[:, :4]
# gt_bboxes = gt_datas[:, :4]
assigned_detection = [False] * det_size
ignored_threshold = [False] * det_size
if compute_fp:
for i in range(det_size):
if (dt_scores[i] < thresh):
ignored_threshold[i] = True
NO_DETECTION = -10000000
tp, fp, fn, similarity = 0, 0, 0, 0
# thresholds = [0.0]
# delta = [0.0]
thresholds = np.zeros((gt_size, ))
thresh_idx = 0
delta = np.zeros((gt_size, ))
delta_idx = 0
for i in range(gt_size):
if ignored_gt[i] == -1:
continue
det_idx = -1
valid_detection = NO_DETECTION
max_overlap = 0
assigned_ignored_det = False
for j in range(det_size):
if (ignored_det[j] == -1):
continue
if (assigned_detection[j]):
continue
if (ignored_threshold[j]):
continue
overlap = overlaps[j, i]
dt_score = dt_scores[j]
if (not compute_fp and (overlap > min_overlap)
and dt_score > valid_detection):
det_idx = j
valid_detection = dt_score
elif (compute_fp and (overlap > min_overlap)
and (overlap > max_overlap or assigned_ignored_det)
and ignored_det[j] == 0):
max_overlap = overlap
det_idx = j
valid_detection = 1
assigned_ignored_det = False
elif (compute_fp and (overlap > min_overlap)
and (valid_detection == NO_DETECTION)
and ignored_det[j] == 1):
det_idx = j
valid_detection = 1
assigned_ignored_det = True
if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0:
fn += 1
elif ((valid_detection != NO_DETECTION)
and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)):
assigned_detection[det_idx] = True
elif valid_detection != NO_DETECTION:
tp += 1
# thresholds.append(dt_scores[det_idx])
thresholds[thresh_idx] = dt_scores[det_idx]
thresh_idx += 1
if compute_aos:
# delta.append(gt_alphas[i] - dt_alphas[det_idx])
delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx]
delta_idx += 1
assigned_detection[det_idx] = True
if compute_fp:
for i in range(det_size):
if (not (assigned_detection[i] or ignored_det[i] == -1
or ignored_det[i] == 1 or ignored_threshold[i])):
fp += 1
nstuff = 0
if metric == 0:
overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
for i in range(dc_bboxes.shape[0]):
for j in range(det_size):
if (assigned_detection[j]):
continue
if (ignored_det[j] == -1 or ignored_det[j] == 1):
continue
if (ignored_threshold[j]):
continue
if overlaps_dt_dc[j, i] > min_overlap:
assigned_detection[j] = True
nstuff += 1
fp -= nstuff
if compute_aos:
tmp = np.zeros((fp + delta_idx, ))
# tmp = [0] * fp
for i in range(delta_idx):
tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0
# tmp.append((1.0 + np.cos(delta[i])) / 2.0)
# assert len(tmp) == fp + tp
# assert len(delta) == tp
if tp > 0 or fp > 0:
similarity = np.sum(tmp)
else:
similarity = -1
return tp, fp, fn, similarity, thresholds[:thresh_idx]
def get_split_parts(num, num_part):
same_part = num // num_part
remain_num = num % num_part
if remain_num == 0:
return [same_part] * num_part
else:
return [same_part] * num_part + [remain_num]
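# Usage sketch: split `num` examples into chunks for the partitioned IoU
# computation below; a non-zero remainder becomes a trailing part.
#
#   get_split_parts(10, 4)  # -> [2, 2, 2, 2, 2]
#   get_split_parts(9, 4)   # -> [2, 2, 2, 2, 1]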
@numba.jit(nopython=True)
def fused_compute_statistics(overlaps,
pr,
gt_nums,
dt_nums,
dc_nums,
gt_datas,
dt_datas,
dontcares,
ignored_gts,
ignored_dets,
metric,
min_overlap,
thresholds,
compute_aos=False):
gt_num = 0
dt_num = 0
dc_num = 0
for i in range(gt_nums.shape[0]):
for t, thresh in enumerate(thresholds):
overlap = overlaps[dt_num:dt_num + dt_nums[i],
gt_num:gt_num + gt_nums[i]]
gt_data = gt_datas[gt_num:gt_num + gt_nums[i]]
dt_data = dt_datas[dt_num:dt_num + dt_nums[i]]
ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]]
ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]]
dontcare = dontcares[dc_num:dc_num + dc_nums[i]]
tp, fp, fn, similarity, _ = compute_statistics_jit(
overlap,
gt_data,
dt_data,
ignored_gt,
ignored_det,
dontcare,
metric,
min_overlap=min_overlap,
thresh=thresh,
compute_fp=True,
compute_aos=compute_aos)
pr[t, 0] += tp
pr[t, 1] += fp
pr[t, 2] += fn
if similarity != -1:
pr[t, 3] += similarity
gt_num += gt_nums[i]
dt_num += dt_nums[i]
dc_num += dc_nums[i]
def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50):
"""fast iou algorithm. this function can be used independently to
do result analysis. Must be used in CAMERA coordinate system.
Args:
gt_annos: dict, must from get_label_annos() in kitti_common.py
dt_annos: dict, must from get_label_annos() in kitti_common.py
metric: eval type. 0: bbox, 1: bev, 2: 3d
num_parts: int. a parameter for fast calculate algorithm
"""
assert len(gt_annos) == len(dt_annos)
total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0)
total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0)
num_examples = len(gt_annos)
split_parts = get_split_parts(num_examples, num_parts)
parted_overlaps = []
example_idx = 0
for num_part in split_parts:
gt_annos_part = gt_annos[example_idx:example_idx + num_part]
dt_annos_part = dt_annos[example_idx:example_idx + num_part]
if metric == 0:
gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0)
dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0)
overlap_part = image_box_overlap(gt_boxes, dt_boxes)
elif metric == 1:
loc = np.concatenate(
[a['location'][:, [0, 2]] for a in gt_annos_part], 0)
dims = np.concatenate(
[a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
loc = np.concatenate(
[a['location'][:, [0, 2]] for a in dt_annos_part], 0)
dims = np.concatenate(
[a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
overlap_part = bev_box_overlap(gt_boxes,
dt_boxes).astype(np.float64)
elif metric == 2:
loc = np.concatenate([a['location'] for a in gt_annos_part], 0)
dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
loc = np.concatenate([a['location'] for a in dt_annos_part], 0)
dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
overlap_part = d3_box_overlap(gt_boxes,
dt_boxes).astype(np.float64)
else:
raise ValueError('unknown metric')
parted_overlaps.append(overlap_part)
example_idx += num_part
overlaps = []
example_idx = 0
for j, num_part in enumerate(split_parts):
gt_annos_part = gt_annos[example_idx:example_idx + num_part]
dt_annos_part = dt_annos[example_idx:example_idx + num_part]
gt_num_idx, dt_num_idx = 0, 0
for i in range(num_part):
gt_box_num = total_gt_num[example_idx + i]
dt_box_num = total_dt_num[example_idx + i]
overlaps.append(
parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num,
dt_num_idx:dt_num_idx + dt_box_num])
gt_num_idx += gt_box_num
dt_num_idx += dt_box_num
example_idx += num_part
return overlaps, parted_overlaps, total_gt_num, total_dt_num
def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
gt_datas_list = []
dt_datas_list = []
total_dc_num = []
ignored_gts, ignored_dets, dontcares = [], [], []
total_num_valid_gt = 0
for i in range(len(gt_annos)):
rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty)
num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets
ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
ignored_dets.append(np.array(ignored_det, dtype=np.int64))
if len(dc_bboxes) == 0:
dc_bboxes = np.zeros((0, 4)).astype(np.float64)
else:
dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64)
total_dc_num.append(dc_bboxes.shape[0])
dontcares.append(dc_bboxes)
total_num_valid_gt += num_valid_gt
gt_datas = np.concatenate(
[gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1)
dt_datas = np.concatenate([
dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis],
dt_annos[i]['score'][..., np.newaxis]
], 1)
gt_datas_list.append(gt_datas)
dt_datas_list.append(dt_datas)
total_dc_num = np.stack(total_dc_num, axis=0)
return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares,
total_dc_num, total_num_valid_gt)
def eval_class(gt_annos,
dt_annos,
current_classes,
difficultys,
metric,
min_overlaps,
compute_aos=False,
num_parts=200):
"""Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP.
Args:
gt_annos: dict, must from get_label_annos() in kitti_common.py
dt_annos: dict, must from get_label_annos() in kitti_common.py
current_classes: list of int, 0: car, 1: pedestrian, 2: cyclist
difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard
metric: eval type. 0: bbox, 1: bev, 2: 3d
min_overlaps: float, min overlap. format: [num_overlap, metric, class].
num_parts: int. a parameter for fast calculate algorithm
Returns:
dict of recall, precision and aos
"""
assert len(gt_annos) == len(dt_annos)
num_examples = len(gt_annos)
split_parts = get_split_parts(num_examples, num_parts)
rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts)
overlaps, parted_overlaps, total_dt_num, total_gt_num = rets
N_SAMPLE_PTS = 41
num_minoverlap = len(min_overlaps)
num_class = len(current_classes)
num_difficulty = len(difficultys)
precision = np.zeros(
[num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
recall = np.zeros(
[num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
for m, current_class in enumerate(current_classes):
for l, difficulty in enumerate(difficultys):
rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty)
(gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
dontcares, total_dc_num, total_num_valid_gt) = rets
for k, min_overlap in enumerate(min_overlaps[:, metric, m]):
thresholdss = []
for i in range(len(gt_annos)):
rets = compute_statistics_jit(
overlaps[i],
gt_datas_list[i],
dt_datas_list[i],
ignored_gts[i],
ignored_dets[i],
dontcares[i],
metric,
min_overlap=min_overlap,
thresh=0.0,
compute_fp=False)
tp, fp, fn, similarity, thresholds = rets
thresholdss += thresholds.tolist()
thresholdss = np.array(thresholdss)
thresholds = get_thresholds(thresholdss, total_num_valid_gt)
thresholds = np.array(thresholds)
pr = np.zeros([len(thresholds), 4])
idx = 0
for j, num_part in enumerate(split_parts):
gt_datas_part = np.concatenate(
gt_datas_list[idx:idx + num_part], 0)
dt_datas_part = np.concatenate(
dt_datas_list[idx:idx + num_part], 0)
dc_datas_part = np.concatenate(
dontcares[idx:idx + num_part], 0)
ignored_dets_part = np.concatenate(
ignored_dets[idx:idx + num_part], 0)
ignored_gts_part = np.concatenate(
ignored_gts[idx:idx + num_part], 0)
fused_compute_statistics(
parted_overlaps[j],
pr,
total_gt_num[idx:idx + num_part],
total_dt_num[idx:idx + num_part],
total_dc_num[idx:idx + num_part],
gt_datas_part,
dt_datas_part,
dc_datas_part,
ignored_gts_part,
ignored_dets_part,
metric,
min_overlap=min_overlap,
thresholds=thresholds,
compute_aos=compute_aos)
idx += num_part
for i in range(len(thresholds)):
recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1])
if compute_aos:
aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1])
for i in range(len(thresholds)):
precision[m, l, k, i] = np.max(
precision[m, l, k, i:], axis=-1)
recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1)
if compute_aos:
aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1)
ret_dict = {
'recall': recall,
'precision': precision,
'orientation': aos,
}
# clean temp variables
del overlaps
del parted_overlaps
gc.collect()
return ret_dict
def get_mAP(prec):
sums = 0
for i in range(0, prec.shape[-1], 4):
sums = sums + prec[..., i]
return sums / 11 * 100
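# Note: `prec` holds precision at 41 recall sample points, so taking every
# 4th point yields the 11-point interpolated AP (recalls 0, 0.1, ..., 1.0)
# as a percentage. A sketch with constant precision:
#
#   get_mAP(np.full((41, ), 0.5))  # -> 50.0 (eleven 0.5 samples, / 11 * 100)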
def print_str(value, *arg, sstream=None):
if sstream is None:
sstream = sysio.StringIO()
sstream.truncate(0)
sstream.seek(0)
print(value, *arg, file=sstream)
return sstream.getvalue()
def do_eval(gt_annos,
dt_annos,
current_classes,
min_overlaps,
eval_types=['bbox', 'bev', '3d']):
# min_overlaps: [num_minoverlap, metric, num_class]
difficultys = [0, 1, 2]
ret = eval_class(
gt_annos,
dt_annos,
current_classes,
difficultys,
0,
min_overlaps,
compute_aos=('aos' in eval_types))
# ret: [num_class, num_diff, num_minoverlap, num_sample_points]
mAP_bbox = get_mAP(ret['precision'])
mAP_aos = None
if 'aos' in eval_types:
mAP_aos = get_mAP(ret['orientation'])
mAP_bev = None
if 'bev' in eval_types:
ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1,
min_overlaps)
mAP_bev = get_mAP(ret['precision'])
mAP_3d = None
if '3d' in eval_types:
ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2,
min_overlaps)
mAP_3d = get_mAP(ret['precision'])
return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges,
compute_aos):
# overlap_ranges: [range, metric, num_class]
min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]])
for i in range(overlap_ranges.shape[1]):
for j in range(overlap_ranges.shape[2]):
min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j])
mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos,
current_classes, min_overlaps,
compute_aos)
# ret: [num_class, num_diff, num_minoverlap]
mAP_bbox = mAP_bbox.mean(-1)
mAP_bev = mAP_bev.mean(-1)
mAP_3d = mAP_3d.mean(-1)
if mAP_aos is not None:
mAP_aos = mAP_aos.mean(-1)
return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
def kitti_eval(gt_annos,
dt_annos,
current_classes,
eval_types=['bbox', 'bev', '3d']):
assert 'bbox' in eval_types, 'must evaluate bbox at least'
    overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.7, 0.5, 0.5, 0.7, 0.5]])
    overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
                            [0.5, 0.25, 0.25, 0.5, 0.25],
                            [0.5, 0.25, 0.25, 0.5, 0.25]])
min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5]
class_to_name = {
0: 'Car',
1: 'Pedestrian',
2: 'Cyclist',
3: 'Van',
4: 'Person_sitting',
}
name_to_class = {v: n for n, v in class_to_name.items()}
if not isinstance(current_classes, (list, tuple)):
current_classes = [current_classes]
current_classes_int = []
for curcls in current_classes:
if isinstance(curcls, str):
current_classes_int.append(name_to_class[curcls])
else:
current_classes_int.append(curcls)
current_classes = current_classes_int
min_overlaps = min_overlaps[:, :, current_classes]
result = ''
# check whether alpha is valid
compute_aos = False
    for anno in dt_annos:
        if anno['alpha'].shape[0] != 0:
            if anno['alpha'][0] != -10:
                compute_aos = True
                # copy instead of append to avoid mutating the default list
                eval_types = eval_types + ['aos']
                break
mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos,
current_classes, min_overlaps,
eval_types)
ret_dict = {}
difficulty = ['easy', 'moderate', 'hard']
for j, curcls in enumerate(current_classes):
# mAP threshold array: [num_minoverlap, metric, class]
# mAP result: [num_class, num_diff, num_minoverlap]
curcls_name = class_to_name[curcls]
ret_dict[curcls_name] = {}
for i in range(min_overlaps.shape[0]):
# prepare results for print
            result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format(
                curcls_name, *min_overlaps[i, :, j]))
            if mAPbbox is not None:
                result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAPbbox[j, :, i])
            if mAPbev is not None:
                result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAPbev[j, :, i])
            if mAP3d is not None:
                result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                    *mAP3d[j, :, i])
            if compute_aos:
                result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(
                    *mAPaos[j, :, i])
            # prepare results for logger
            for idx in range(3):
                postfix = '{}_{}'.format(difficulty[idx],
                                         min_overlaps[i, idx, j])
                if mAP3d is not None:
                    ret_dict[curcls_name][
                        '3D_{}'.format(postfix)] = mAP3d[j, idx, i]
                if mAPbev is not None:
                    ret_dict[curcls_name][
                        'BEV_{}'.format(postfix)] = mAPbev[j, idx, i]
                if mAPbbox is not None:
                    ret_dict[curcls_name][
                        '2D_{}'.format(postfix)] = mAPbbox[j, idx, i]
# calculate mAP over all classes if there are multiple classes
if len(current_classes) > 1:
# prepare results for print
        result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty))
        if mAPbbox is not None:
            mAPbbox = mAPbbox.mean(axis=0)
            result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
                *mAPbbox[:, 0])
        if mAPbev is not None:
            mAPbev = mAPbev.mean(axis=0)
            result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0])
        if mAP3d is not None:
            mAP3d = mAP3d.mean(axis=0)
            result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])
        if compute_aos:
            mAPaos = mAPaos.mean(axis=0)
            result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0])
# prepare results for logger
ret_dict['Overall'] = dict()
for idx in range(3):
postfix = '{}'.format(difficulty[idx])
if mAP3d is not None:
ret_dict['Overall']['3D_{}'.format(postfix)] = mAP3d[idx, 0]
if mAPbev is not None:
ret_dict['Overall']['BEV_{}'.format(postfix)] = mAPbev[idx, 0]
if mAPbbox is not None:
ret_dict['Overall']['2D_{}'.format(postfix)] = mAPbbox[idx, 0]
print(result)
return result, ret_dict
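# Usage sketch (hypothetical annos): gt_annos/dt_annos come from
# get_label_annos() in kitti_common.py; classes may be given by name or id.
#
#   result_str, ret_dict = kitti_eval(gt_annos, dt_annos, ['Car'])
#   # e.g. ret_dict['Car']['3D_moderate_0.7'] holds the moderate 3D AP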
def kitti_eval_coco_style(gt_annos, dt_annos, current_classes):
class_to_name = {
0: 'Car',
1: 'Pedestrian',
2: 'Cyclist',
3: 'Van',
4: 'Person_sitting',
}
class_to_range = {
0: [0.5, 0.95, 10],
1: [0.25, 0.7, 10],
2: [0.25, 0.7, 10],
3: [0.5, 0.95, 10],
4: [0.25, 0.7, 10],
}
name_to_class = {v: n for n, v in class_to_name.items()}
if not isinstance(current_classes, (list, tuple)):
current_classes = [current_classes]
current_classes_int = []
for curcls in current_classes:
if isinstance(curcls, str):
current_classes_int.append(name_to_class[curcls])
else:
current_classes_int.append(curcls)
current_classes = current_classes_int
overlap_ranges = np.zeros([3, 3, len(current_classes)])
for i, curcls in enumerate(current_classes):
        overlap_ranges[:, :, i] = np.array(
            class_to_range[curcls])[:, np.newaxis]
result = ''
# check whether alpha is valid
compute_aos = False
for anno in dt_annos:
if anno['alpha'].shape[0] != 0:
if anno['alpha'][0] != -10:
compute_aos = True
break
mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval(
gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos)
for j, curcls in enumerate(current_classes):
# mAP threshold array: [num_minoverlap, metric, class]
# mAP result: [num_class, num_diff, num_minoverlap]
o_range = np.array(class_to_range[curcls])[[0, 2, 1]]
o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1)
result += print_str((f'{class_to_name[curcls]} '
'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range)))
result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, '
f'{mAPbbox[j, 1]:.2f}, '
f'{mAPbbox[j, 2]:.2f}'))
result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, '
f'{mAPbev[j, 1]:.2f}, '
f'{mAPbev[j, 2]:.2f}'))
result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, '
f'{mAP3d[j, 1]:.2f}, '
f'{mAP3d[j, 2]:.2f}'))
if compute_aos:
result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, '
f'{mAPaos[j, 1]:.2f}, '
f'{mAPaos[j, 2]:.2f}'))
return result
#####################
# Based on https://github.com/hongzhenwang/RRPN-revise
# Licensed under The MIT License
# Author: yanyan, scrin@foxmail.com
#####################
import math
import numba
import numpy as np
from numba import cuda
@numba.jit(nopython=True)
def div_up(m, n):
return m // n + (m % n > 0)
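# Ceiling division used for the CUDA grid size, e.g. div_up(100, 64) == 2.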
@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
def trangle_area(a, b, c):
return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) *
(b[0] - c[0])) / 2.0
@cuda.jit('(float32[:], int32)', device=True, inline=True)
def area(int_pts, num_of_inter):
area_val = 0.0
for i in range(num_of_inter - 2):
area_val += abs(
trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4],
int_pts[2 * i + 4:2 * i + 6]))
return area_val
@cuda.jit('(float32[:], int32)', device=True, inline=True)
def sort_vertex_in_convex_polygon(int_pts, num_of_inter):
if num_of_inter > 0:
center = cuda.local.array((2, ), dtype=numba.float32)
center[:] = 0.0
for i in range(num_of_inter):
center[0] += int_pts[2 * i]
center[1] += int_pts[2 * i + 1]
center[0] /= num_of_inter
center[1] /= num_of_inter
v = cuda.local.array((2, ), dtype=numba.float32)
vs = cuda.local.array((16, ), dtype=numba.float32)
for i in range(num_of_inter):
v[0] = int_pts[2 * i] - center[0]
v[1] = int_pts[2 * i + 1] - center[1]
d = math.sqrt(v[0] * v[0] + v[1] * v[1])
v[0] = v[0] / d
v[1] = v[1] / d
if v[1] < 0:
v[0] = -2 - v[0]
vs[i] = v[0]
j = 0
temp = 0
for i in range(1, num_of_inter):
if vs[i - 1] > vs[i]:
temp = vs[i]
tx = int_pts[2 * i]
ty = int_pts[2 * i + 1]
j = i
while j > 0 and vs[j - 1] > temp:
vs[j] = vs[j - 1]
int_pts[j * 2] = int_pts[j * 2 - 2]
int_pts[j * 2 + 1] = int_pts[j * 2 - 1]
j -= 1
vs[j] = temp
int_pts[j * 2] = tx
int_pts[j * 2 + 1] = ty
@cuda.jit(
'(float32[:], float32[:], int32, int32, float32[:])',
device=True,
inline=True)
def line_segment_intersection(pts1, pts2, i, j, temp_pts):
A = cuda.local.array((2, ), dtype=numba.float32)
B = cuda.local.array((2, ), dtype=numba.float32)
C = cuda.local.array((2, ), dtype=numba.float32)
D = cuda.local.array((2, ), dtype=numba.float32)
A[0] = pts1[2 * i]
A[1] = pts1[2 * i + 1]
B[0] = pts1[2 * ((i + 1) % 4)]
B[1] = pts1[2 * ((i + 1) % 4) + 1]
C[0] = pts2[2 * j]
C[1] = pts2[2 * j + 1]
D[0] = pts2[2 * ((j + 1) % 4)]
D[1] = pts2[2 * ((j + 1) % 4) + 1]
BA0 = B[0] - A[0]
BA1 = B[1] - A[1]
DA0 = D[0] - A[0]
CA0 = C[0] - A[0]
DA1 = D[1] - A[1]
CA1 = C[1] - A[1]
acd = DA1 * CA0 > CA1 * DA0
bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
if acd != bcd:
abc = CA1 * BA0 > BA1 * CA0
abd = DA1 * BA0 > BA1 * DA0
if abc != abd:
DC0 = D[0] - C[0]
DC1 = D[1] - C[1]
ABBA = A[0] * B[1] - B[0] * A[1]
CDDC = C[0] * D[1] - D[0] * C[1]
DH = BA1 * DC0 - BA0 * DC1
Dx = ABBA * DC0 - BA0 * CDDC
Dy = ABBA * DC1 - BA1 * CDDC
temp_pts[0] = Dx / DH
temp_pts[1] = Dy / DH
return True
return False
@cuda.jit(
'(float32[:], float32[:], int32, int32, float32[:])',
device=True,
inline=True)
def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
a = cuda.local.array((2, ), dtype=numba.float32)
b = cuda.local.array((2, ), dtype=numba.float32)
c = cuda.local.array((2, ), dtype=numba.float32)
d = cuda.local.array((2, ), dtype=numba.float32)
a[0] = pts1[2 * i]
a[1] = pts1[2 * i + 1]
b[0] = pts1[2 * ((i + 1) % 4)]
b[1] = pts1[2 * ((i + 1) % 4) + 1]
c[0] = pts2[2 * j]
c[1] = pts2[2 * j + 1]
d[0] = pts2[2 * ((j + 1) % 4)]
d[1] = pts2[2 * ((j + 1) % 4) + 1]
area_abc = trangle_area(a, b, c)
area_abd = trangle_area(a, b, d)
if area_abc * area_abd >= 0:
return False
area_cda = trangle_area(c, d, a)
area_cdb = area_cda + area_abc - area_abd
if area_cda * area_cdb >= 0:
return False
t = area_cda / (area_abd - area_abc)
dx = t * (b[0] - a[0])
dy = t * (b[1] - a[1])
temp_pts[0] = a[0] + dx
temp_pts[1] = a[1] + dy
return True
@cuda.jit('(float32, float32, float32[:])', device=True, inline=True)
def point_in_quadrilateral(pt_x, pt_y, corners):
ab0 = corners[2] - corners[0]
ab1 = corners[3] - corners[1]
ad0 = corners[6] - corners[0]
ad1 = corners[7] - corners[1]
ap0 = pt_x - corners[0]
ap1 = pt_y - corners[1]
abab = ab0 * ab0 + ab1 * ab1
abap = ab0 * ap0 + ab1 * ap1
adad = ad0 * ad0 + ad1 * ad1
adap = ad0 * ap0 + ad1 * ap1
return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
def quadrilateral_intersection(pts1, pts2, int_pts):
num_of_inter = 0
for i in range(4):
if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2):
int_pts[num_of_inter * 2] = pts1[2 * i]
int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1]
num_of_inter += 1
if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1):
int_pts[num_of_inter * 2] = pts2[2 * i]
int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1]
num_of_inter += 1
temp_pts = cuda.local.array((2, ), dtype=numba.float32)
for i in range(4):
for j in range(4):
has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts)
if has_pts:
int_pts[num_of_inter * 2] = temp_pts[0]
int_pts[num_of_inter * 2 + 1] = temp_pts[1]
num_of_inter += 1
return num_of_inter
@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
def rbbox_to_corners(corners, rbbox):
    # generate clockwise corners and rotate them clockwise
angle = rbbox[4]
a_cos = math.cos(angle)
a_sin = math.sin(angle)
center_x = rbbox[0]
center_y = rbbox[1]
x_d = rbbox[2]
y_d = rbbox[3]
corners_x = cuda.local.array((4, ), dtype=numba.float32)
corners_y = cuda.local.array((4, ), dtype=numba.float32)
corners_x[0] = -x_d / 2
corners_x[1] = -x_d / 2
corners_x[2] = x_d / 2
corners_x[3] = x_d / 2
corners_y[0] = -y_d / 2
corners_y[1] = y_d / 2
corners_y[2] = y_d / 2
corners_y[3] = -y_d / 2
    for i in range(4):
        corners[2 * i] = (a_cos * corners_x[i] + a_sin * corners_y[i] +
                          center_x)
        corners[2 * i + 1] = (-a_sin * corners_x[i] +
                              a_cos * corners_y[i] + center_y)
@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
def inter(rbbox1, rbbox2):
corners1 = cuda.local.array((8, ), dtype=numba.float32)
corners2 = cuda.local.array((8, ), dtype=numba.float32)
intersection_corners = cuda.local.array((16, ), dtype=numba.float32)
rbbox_to_corners(corners1, rbbox1)
rbbox_to_corners(corners2, rbbox2)
num_intersection = quadrilateral_intersection(corners1, corners2,
intersection_corners)
sort_vertex_in_convex_polygon(intersection_corners, num_intersection)
# print(intersection_corners.reshape([-1, 2])[:num_intersection])
return area(intersection_corners, num_intersection)
@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True)
def devRotateIoUEval(rbox1, rbox2, criterion=-1):
area1 = rbox1[2] * rbox1[3]
area2 = rbox2[2] * rbox2[3]
area_inter = inter(rbox1, rbox2)
if criterion == -1:
return area_inter / (area1 + area2 - area_inter)
elif criterion == 0:
return area_inter / area1
elif criterion == 1:
return area_inter / area2
else:
return area_inter
@cuda.jit(
'(int64, int64, float32[:], float32[:], float32[:], int32)',
fastmath=False)
def rotate_iou_kernel_eval(N,
K,
dev_boxes,
dev_query_boxes,
dev_iou,
criterion=-1):
threadsPerBlock = 8 * 8
row_start = cuda.blockIdx.x
col_start = cuda.blockIdx.y
tx = cuda.threadIdx.x
row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
dev_query_box_idx = threadsPerBlock * col_start + tx
dev_box_idx = threadsPerBlock * row_start + tx
if (tx < col_size):
block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
if (tx < row_size):
block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
cuda.syncthreads()
if tx < row_size:
for i in range(col_size):
offset = (
row_start * threadsPerBlock * K + col_start * threadsPerBlock +
tx * K + i)
dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
block_boxes[tx * 5:tx * 5 + 5],
criterion)
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
"""rotated box iou running in gpu. 500x faster than cpu version
(take 5ms in one example with numba.cuda code).
convert from [this project](
https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
Args:
boxes (float tensor: [N, 5]): rbboxes. format: centers, dims,
angles(clockwise when positive)
query_boxes (float tensor: [K, 5]): [description]
device_id (int, optional): Defaults to 0. [description]
Returns:
[type]: [description]
"""
boxes = boxes.astype(np.float32)
query_boxes = query_boxes.astype(np.float32)
N = boxes.shape[0]
K = query_boxes.shape[0]
iou = np.zeros((N, K), dtype=np.float32)
if N == 0 or K == 0:
return iou
threadsPerBlock = 8 * 8
cuda.select_device(device_id)
blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
stream = cuda.stream()
with stream.auto_synchronize():
boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
iou_dev = cuda.to_device(iou.reshape([-1]), stream)
rotate_iou_kernel_eval[blockspergrid, threadsPerBlock,
stream](N, K, boxes_dev, query_boxes_dev,
iou_dev, criterion)
iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
return iou.astype(boxes.dtype)
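# Usage sketch (requires a CUDA device; illustrative values): boxes are
# (center_x, center_y, x_dim, y_dim, angle); a box against itself gives
# an IoU of ~1.0.
#
#   boxes = np.array([[0., 0., 4., 2., 0.3]], dtype=np.float32)
#   rotate_iou_gpu_eval(boxes, boxes)  # -> approx. array([[1.]])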
import mmcv
import numpy as np
from terminaltables import AsciiTable
from .bbox_overlaps import bbox_overlaps
from .class_names import get_classes
def average_precision(recalls, precisions, mode='area'):
"""Calculate average precision (for single or multiple scales).
Args:
recalls (ndarray): shape (num_scales, num_dets) or (num_dets, )
precisions (ndarray): shape (num_scales, num_dets) or (num_dets, )
        mode (str): 'area' or '11points'. 'area' means calculating the area
            under the precision-recall curve; '11points' means averaging the
            precision at recalls [0, 0.1, ..., 1]
Returns:
float or ndarray: calculated average precision
"""
no_scale = False
if recalls.ndim == 1:
no_scale = True
recalls = recalls[np.newaxis, :]
precisions = precisions[np.newaxis, :]
assert recalls.shape == precisions.shape and recalls.ndim == 2
num_scales = recalls.shape[0]
ap = np.zeros(num_scales, dtype=np.float32)
if mode == 'area':
zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
ones = np.ones((num_scales, 1), dtype=recalls.dtype)
mrec = np.hstack((zeros, recalls, ones))
mpre = np.hstack((zeros, precisions, zeros))
for i in range(mpre.shape[1] - 1, 0, -1):
mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
for i in range(num_scales):
ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
ap[i] = np.sum(
(mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
elif mode == '11points':
for i in range(num_scales):
for thr in np.arange(0, 1 + 1e-3, 0.1):
precs = precisions[i, recalls[i, :] >= thr]
prec = precs.max() if precs.size > 0 else 0
ap[i] += prec
ap /= 11
else:
raise ValueError(
'Unrecognized mode, only "area" and "11points" are supported')
if no_scale:
ap = ap[0]
return ap
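# Usage sketch: 'area' mode integrates the interpolated precision-recall
# curve; with recalls [0.2, 0.5, 1.0] and precisions [1.0, 0.8, 0.4] the
# AP is 0.2 * 1.0 + 0.3 * 0.8 + 0.5 * 0.4 = 0.64.
#
#   average_precision(np.array([0.2, 0.5, 1.0]),
#                     np.array([1.0, 0.8, 0.4]))  # -> 0.64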
def tpfp_imagenet(det_bboxes,
gt_bboxes,
gt_ignore,
default_iou_thr,
area_ranges=None):
"""Check if detected bboxes are true positive or false positive.
Args:
det_bbox (ndarray): the detected bbox
gt_bboxes (ndarray): ground truth bboxes of this image
gt_ignore (ndarray): indicate if gts are ignored for evaluation or not
default_iou_thr (float): the iou thresholds for medium and large bboxes
area_ranges (list or None): gt bbox area ranges
Returns:
tuple: two arrays (tp, fp) whose elements are 0 and 1
"""
num_dets = det_bboxes.shape[0]
num_gts = gt_bboxes.shape[0]
if area_ranges is None:
area_ranges = [(None, None)]
num_scales = len(area_ranges)
    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp
    # of a certain scale.
tp = np.zeros((num_scales, num_dets), dtype=np.float32)
fp = np.zeros((num_scales, num_dets), dtype=np.float32)
if gt_bboxes.shape[0] == 0:
if area_ranges == [(None, None)]:
fp[...] = 1
else:
det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
det_bboxes[:, 3] - det_bboxes[:, 1])
for i, (min_area, max_area) in enumerate(area_ranges):
fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
return tp, fp
ious = bbox_overlaps(det_bboxes, gt_bboxes - 1)
gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
default_iou_thr)
# sort all detections by scores in descending order
sort_inds = np.argsort(-det_bboxes[:, -1])
for k, (min_area, max_area) in enumerate(area_ranges):
gt_covered = np.zeros(num_gts, dtype=bool)
# if no area range is specified, gt_area_ignore is all False
if min_area is None:
gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
else:
gt_areas = gt_w * gt_h
gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
for i in sort_inds:
max_iou = -1
matched_gt = -1
# find best overlapped available gt
for j in range(num_gts):
                # different from PASCAL VOC: allow finding other gts if the
                # best overlapped ones are already matched by other det bboxes
if gt_covered[j]:
continue
elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou:
max_iou = ious[i, j]
matched_gt = j
# there are 4 cases for a det bbox:
# 1. it matches a gt, tp = 1, fp = 0
# 2. it matches an ignored gt, tp = 0, fp = 0
# 3. it matches no gt and within area range, tp = 0, fp = 1
# 4. it matches no gt but is beyond area range, tp = 0, fp = 0
if matched_gt >= 0:
gt_covered[matched_gt] = 1
if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
tp[k, i] = 1
elif min_area is None:
fp[k, i] = 1
else:
bbox = det_bboxes[i, :4]
area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
if area >= min_area and area < max_area:
fp[k, i] = 1
return tp, fp
def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None):
"""Check if detected bboxes are true positive or false positive.
Args:
        det_bboxes (ndarray): detected bboxes of this image, of shape (m, 5)
        gt_bboxes (ndarray): ground truth bboxes of this image
        gt_ignore (ndarray): indicates whether each gt is ignored for
            evaluation or not
        iou_thr (float): the IoU threshold
        area_ranges (list or None): gt bbox area ranges
    Returns:
        tuple: (tp, fp), two arrays whose elements are 0 and 1
"""
num_dets = det_bboxes.shape[0]
num_gts = gt_bboxes.shape[0]
if area_ranges is None:
area_ranges = [(None, None)]
num_scales = len(area_ranges)
    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
    # a certain scale
tp = np.zeros((num_scales, num_dets), dtype=np.float32)
fp = np.zeros((num_scales, num_dets), dtype=np.float32)
# if there is no gt bboxes in this image, then all det bboxes
# within area range are false positives
if gt_bboxes.shape[0] == 0:
if area_ranges == [(None, None)]:
fp[...] = 1
else:
det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
det_bboxes[:, 3] - det_bboxes[:, 1])
for i, (min_area, max_area) in enumerate(area_ranges):
fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
return tp, fp
ious = bbox_overlaps(det_bboxes, gt_bboxes)
ious_max = ious.max(axis=1)
ious_argmax = ious.argmax(axis=1)
sort_inds = np.argsort(-det_bboxes[:, -1])
for k, (min_area, max_area) in enumerate(area_ranges):
gt_covered = np.zeros(num_gts, dtype=bool)
# if no area range is specified, gt_area_ignore is all False
if min_area is None:
gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool)
else:
gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
gt_bboxes[:, 3] - gt_bboxes[:, 1])
gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
for i in sort_inds:
if ious_max[i] >= iou_thr:
matched_gt = ious_argmax[i]
if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]):
if not gt_covered[matched_gt]:
gt_covered[matched_gt] = True
tp[k, i] = 1
else:
fp[k, i] = 1
# otherwise ignore this detected bbox, tp = 0, fp = 0
elif min_area is None:
fp[k, i] = 1
else:
bbox = det_bboxes[i, :4]
area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
if area >= min_area and area < max_area:
fp[k, i] = 1
return tp, fp
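# A minimal per-image call sketch for `tpfp_default` (toy boxes; the last
# column of each detection is its score):
# >>> det = np.array([[0., 0., 10., 10., 0.9]])
# >>> gt = np.array([[0., 0., 10., 10.]])
# >>> ignore = np.zeros(1, dtype=np.int32)
# >>> tp, fp = tpfp_default(det, gt, ignore, iou_thr=0.5)  # tp == [[1.]]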
def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id):
"""Get det results and gt information of a certain class."""
cls_dets = [det[class_id]
for det in det_results] # det bboxes of this class
cls_gts = [] # gt bboxes of this class
cls_gt_ignore = []
for j in range(len(gt_bboxes)):
gt_bbox = gt_bboxes[j]
cls_inds = (gt_labels[j] == class_id)
cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox
cls_gts.append(cls_gt)
if gt_ignore is None:
cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32))
else:
cls_gt_ignore.append(gt_ignore[j][cls_inds])
return cls_dets, cls_gts, cls_gt_ignore
def eval_map(det_results,
gt_bboxes,
gt_labels,
gt_ignore=None,
scale_ranges=None,
iou_thr=0.5,
dataset=None,
print_summary=True):
"""Evaluate mAP of a dataset.
Args:
det_results (list): a list of list, [[cls1_det, cls2_det, ...], ...]
gt_bboxes (list): ground truth bboxes of each image, a list of K*4
array.
gt_labels (list): ground truth labels of each image, a list of K array
gt_ignore (list): gt ignore indicators of each image, a list of K array
scale_ranges (list, optional): [(min1, max1), (min2, max2), ...]
iou_thr (float): IoU threshold
        dataset (None or str or list): dataset name or dataset classes; there
            are minor differences in metrics for different datasets, e.g.
            "voc07", "imagenet_det", etc.
print_summary (bool): whether to print the mAP summary
Returns:
tuple: (mAP, [dict, dict, ...])
"""
assert len(det_results) == len(gt_bboxes) == len(gt_labels)
if gt_ignore is not None:
assert len(gt_ignore) == len(gt_labels)
for i in range(len(gt_ignore)):
assert len(gt_labels[i]) == len(gt_ignore[i])
area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
if scale_ranges is not None else None)
num_scales = len(scale_ranges) if scale_ranges is not None else 1
eval_results = []
num_classes = len(det_results[0]) # positive class num
gt_labels = [
label if label.ndim == 1 else label[:, 0] for label in gt_labels
]
for i in range(num_classes):
# get gt and det bboxes of this class
cls_dets, cls_gts, cls_gt_ignore = get_cls_results(
det_results, gt_bboxes, gt_labels, gt_ignore, i)
# calculate tp and fp for each image
tpfp_func = (
tpfp_imagenet if dataset in ['det', 'vid'] else tpfp_default)
tpfp = [
tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr,
area_ranges) for j in range(len(cls_dets))
]
tp, fp = tuple(zip(*tpfp))
# calculate gt number of each scale, gts ignored or beyond scale
# are not counted
num_gts = np.zeros(num_scales, dtype=int)
for j, bbox in enumerate(cls_gts):
if area_ranges is None:
num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j]))
else:
gt_areas = (bbox[:, 2] - bbox[:, 0]) * (
bbox[:, 3] - bbox[:, 1])
for k, (min_area, max_area) in enumerate(area_ranges):
num_gts[k] += np.sum(
np.logical_not(cls_gt_ignore[j])
& (gt_areas >= min_area) & (gt_areas < max_area))
# sort all det bboxes by score, also sort tp and fp
cls_dets = np.vstack(cls_dets)
num_dets = cls_dets.shape[0]
sort_inds = np.argsort(-cls_dets[:, -1])
tp = np.hstack(tp)[:, sort_inds]
fp = np.hstack(fp)[:, sort_inds]
# calculate recall and precision with tp and fp
tp = np.cumsum(tp, axis=1)
fp = np.cumsum(fp, axis=1)
eps = np.finfo(np.float32).eps
recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
precisions = tp / np.maximum((tp + fp), eps)
# calculate AP
if scale_ranges is None:
recalls = recalls[0, :]
precisions = precisions[0, :]
num_gts = num_gts.item()
mode = 'area' if dataset != 'voc07' else '11points'
ap = average_precision(recalls, precisions, mode)
eval_results.append({
'num_gts': num_gts,
'num_dets': num_dets,
'recall': recalls,
'precision': precisions,
'ap': ap
})
if scale_ranges is not None:
# shape (num_classes, num_scales)
all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
all_num_gts = np.vstack(
[cls_result['num_gts'] for cls_result in eval_results])
mean_ap = []
for i in range(num_scales):
if np.any(all_num_gts[:, i] > 0):
mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
else:
mean_ap.append(0.0)
else:
aps = []
for cls_result in eval_results:
if cls_result['num_gts'] > 0:
aps.append(cls_result['ap'])
mean_ap = np.array(aps).mean().item() if aps else 0.0
if print_summary:
print_map_summary(mean_ap, eval_results, dataset, area_ranges)
return mean_ap, eval_results
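# A hedged end-to-end sketch with a toy single-image, single-class input
# (note that `get_cls_results` above compares labels directly against the
# 0-based class index):
# >>> det_results = [[np.array([[0., 0., 10., 10., 0.9]])]]  # [img][cls]
# >>> gt_bboxes = [np.array([[0., 0., 10., 10.]])]
# >>> gt_labels = [np.array([0])]
# >>> mean_ap, results = eval_map(det_results, gt_bboxes, gt_labels,
# ...                             iou_thr=0.5, print_summary=False)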
def print_map_summary(mean_ap, results, dataset=None, ranges=None):
"""Print mAP and results of each class.
Args:
mean_ap(float): calculated from `eval_map`
results(list): calculated from `eval_map`
dataset(None or str or list): dataset name or dataset classes.
        ranges(list or tuple): ranges of areas
"""
num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'],
np.ndarray) else 1
if ranges is not None:
assert len(ranges) == num_scales
num_classes = len(results)
recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
precisions = np.zeros((num_scales, num_classes), dtype=np.float32)
aps = np.zeros((num_scales, num_classes), dtype=np.float32)
num_gts = np.zeros((num_scales, num_classes), dtype=int)
for i, cls_result in enumerate(results):
if cls_result['recall'].size > 0:
recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1]
precisions[:, i] = np.array(
cls_result['precision'], ndmin=2)[:, -1]
aps[:, i] = cls_result['ap']
num_gts[:, i] = cls_result['num_gts']
if dataset is None:
label_names = [str(i) for i in range(0, num_classes)]
elif mmcv.is_str(dataset):
label_names = get_classes(dataset)
else:
label_names = dataset
if not isinstance(mean_ap, list):
mean_ap = [mean_ap]
header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap']
for i in range(num_scales):
if ranges is not None:
print('Area range ', ranges[i])
table_data = [header]
for j in range(num_classes):
row_data = [
label_names[j], num_gts[i, j], results[j]['num_dets'],
'{:.3f}'.format(recalls[i, j]),
'{:.3f}'.format(precisions[i, j]), '{:.3f}'.format(aps[i, j])
]
table_data.append(row_data)
table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])])
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print(table.table)
import numpy as np
from terminaltables import AsciiTable
from ..bbox import bbox_overlaps_2d
def _recalls(all_ious, proposal_nums, thrs):
img_num = all_ious.shape[0]
total_gt_num = sum([ious.shape[0] for ious in all_ious])
_ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
for k, proposal_num in enumerate(proposal_nums):
tmp_ious = np.zeros(0)
for i in range(img_num):
ious = all_ious[i][:, :proposal_num].copy()
gt_ious = np.zeros((ious.shape[0]))
if ious.size == 0:
tmp_ious = np.hstack((tmp_ious, gt_ious))
continue
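            # greedily match the best remaining (gt, proposal) pair, then
            # invalidate its row and column so neither is matched again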
for j in range(ious.shape[0]):
gt_max_overlaps = ious.argmax(axis=1)
max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
gt_idx = max_ious.argmax()
gt_ious[j] = max_ious[gt_idx]
box_idx = gt_max_overlaps[gt_idx]
ious[gt_idx, :] = -1
ious[:, box_idx] = -1
tmp_ious = np.hstack((tmp_ious, gt_ious))
_ious[k, :] = tmp_ious
_ious = np.fliplr(np.sort(_ious, axis=1))
recalls = np.zeros((proposal_nums.size, thrs.size))
for i, thr in enumerate(thrs):
recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
return recalls
def set_recall_param(proposal_nums, iou_thrs):
"""Check proposal_nums and iou_thrs and set correct format.
"""
if isinstance(proposal_nums, list):
_proposal_nums = np.array(proposal_nums)
elif isinstance(proposal_nums, int):
_proposal_nums = np.array([proposal_nums])
else:
_proposal_nums = proposal_nums
if iou_thrs is None:
_iou_thrs = np.array([0.5])
elif isinstance(iou_thrs, list):
_iou_thrs = np.array(iou_thrs)
elif isinstance(iou_thrs, float):
_iou_thrs = np.array([iou_thrs])
else:
_iou_thrs = iou_thrs
return _proposal_nums, _iou_thrs
def eval_recalls(gts,
proposals,
proposal_nums=None,
iou_thrs=None,
print_summary=True):
"""Calculate recalls.
Args:
gts(list or ndarray): a list of arrays of shape (n, 4)
proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5)
        proposal_nums(int or list of int or ndarray): top N proposals
        iou_thrs(float or list or ndarray): iou thresholds
        print_summary(bool): whether to print the recall summary
Returns:
ndarray: recalls of different ious and proposal nums
"""
img_num = len(gts)
assert img_num == len(proposals)
proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs)
all_ious = []
for i in range(img_num):
if proposals[i].ndim == 2 and proposals[i].shape[1] == 5:
scores = proposals[i][:, 4]
sort_idx = np.argsort(scores)[::-1]
img_proposal = proposals[i][sort_idx, :]
else:
img_proposal = proposals[i]
prop_num = min(img_proposal.shape[0], proposal_nums[-1])
if gts[i] is None or gts[i].shape[0] == 0:
ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32)
else:
ious = bbox_overlaps_2d(gts[i], img_proposal[:prop_num, :4])
all_ious.append(ious)
all_ious = np.array(all_ious)
recalls = _recalls(all_ious, proposal_nums, iou_thrs)
if print_summary:
print_recall_summary(recalls, proposal_nums, iou_thrs)
return recalls
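# A hedged toy call for `eval_recalls` (one image, one gt box, proposals
# carrying scores in the 5th column):
# >>> gts = [np.array([[0., 0., 10., 10.]])]
# >>> proposals = [np.array([[0., 0., 10., 10., 0.9]])]
# >>> eval_recalls(gts, proposals, proposal_nums=[1], iou_thrs=0.5,
# ...              print_summary=False)  # array([[1.]])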
def print_recall_summary(recalls,
proposal_nums,
iou_thrs,
row_idxs=None,
col_idxs=None):
"""Print recalls in a table.
Args:
recalls(ndarray): calculated from `bbox_recalls`
proposal_nums(ndarray or list): top N proposals
iou_thrs(ndarray or list): iou thresholds
row_idxs(ndarray): which rows(proposal nums) to print
col_idxs(ndarray): which cols(iou thresholds) to print
"""
proposal_nums = np.array(proposal_nums, dtype=np.int32)
iou_thrs = np.array(iou_thrs)
if row_idxs is None:
row_idxs = np.arange(proposal_nums.size)
if col_idxs is None:
col_idxs = np.arange(iou_thrs.size)
row_header = [''] + iou_thrs[col_idxs].tolist()
table_data = [row_header]
for i, num in enumerate(proposal_nums[row_idxs]):
row = [
'{:.3f}'.format(val)
for val in recalls[row_idxs[i], col_idxs].tolist()
]
row.insert(0, num)
table_data.append(row)
table = AsciiTable(table_data)
print(table.table)
def plot_num_recall(recalls, proposal_nums):
"""Plot Proposal_num-Recalls curve.
Args:
recalls(ndarray or list): shape (k,)
proposal_nums(ndarray or list): same shape as `recalls`
"""
if isinstance(proposal_nums, np.ndarray):
_proposal_nums = proposal_nums.tolist()
else:
_proposal_nums = proposal_nums
if isinstance(recalls, np.ndarray):
_recalls = recalls.tolist()
else:
_recalls = recalls
import matplotlib.pyplot as plt
f = plt.figure()
plt.plot([0] + _proposal_nums, [0] + _recalls)
plt.xlabel('Proposal num')
plt.ylabel('Recall')
    plt.axis([0, np.max(_proposal_nums), 0, 1])
f.show()
def plot_iou_recall(recalls, iou_thrs):
"""Plot IoU-Recalls curve.
Args:
recalls(ndarray or list): shape (k,)
iou_thrs(ndarray or list): same shape as `recalls`
"""
if isinstance(iou_thrs, np.ndarray):
_iou_thrs = iou_thrs.tolist()
else:
_iou_thrs = iou_thrs
if isinstance(recalls, np.ndarray):
_recalls = recalls.tolist()
else:
_recalls = recalls
import matplotlib.pyplot as plt
f = plt.figure()
plt.plot(_iou_thrs + [1.0], _recalls + [0.])
plt.xlabel('IoU')
plt.ylabel('Recall')
    plt.axis([np.min(_iou_thrs), 1, 0, 1])
f.show()
from .builder import build_optimizer
from .mix_optimizer import MixedOptimizer
from .registry import OPTIMIZERS
__all__ = ['OPTIMIZERS', 'build_optimizer', 'MixedOptimizer']
import copy
import re
import torch
from mmdet.utils import build_from_cfg, get_root_logger
from .registry import OPTIMIZERS
def build_optimizer(model, optimizer_cfg):
"""Build optimizer from configs.
Args:
model (:obj:`nn.Module`): The model with parameters to be optimized.
optimizer_cfg (dict): The config dict of the optimizer.
Positional fields are:
- type: class name of the optimizer.
- lr: base learning rate.
Optional fields are:
- any arguments of the corresponding optimizer type, e.g.,
weight_decay, momentum, etc.
            - paramwise_options: a dict with 4 accepted fields
(bias_lr_mult, bias_decay_mult, norm_decay_mult,
dwconv_decay_mult).
`bias_lr_mult` and `bias_decay_mult` will be multiplied to
the lr and weight decay respectively for all bias parameters
(except for the normalization layers), and
`norm_decay_mult` will be multiplied to the weight decay
for all weight and bias parameters of normalization layers.
`dwconv_decay_mult` will be multiplied to the weight decay
for all weight and bias parameters of depthwise conv layers.
Returns:
torch.optim.Optimizer: The initialized optimizer.
Example:
>>> import torch
>>> model = torch.nn.modules.Conv1d(1, 1, 1)
        >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9,
        ...                      weight_decay=0.0001)
>>> optimizer = build_optimizer(model, optimizer_cfg)
"""
if hasattr(model, 'module'):
model = model.module
    # deep copy so that popping fields below does not mutate the caller's cfg
    optimizer_cfg = copy.deepcopy(optimizer_cfg)
if isinstance(optimizer_cfg, list):
# Assume paramwise_options is None if optimizer_cfg is list
from .mix_optimizer import MixedOptimizer
logger = get_root_logger()
keys = [optimizer.pop('key') for optimizer in optimizer_cfg]
keys_params = {key: [] for key in keys}
keys_params_name = {key: [] for key in keys}
keys_optimizer = []
for name, param in model.named_parameters():
param_group = {'params': [param]}
find_flag = False
for key in keys:
if key in name:
keys_params[key].append(param_group)
keys_params_name[key].append(name)
find_flag = True
break
assert find_flag, 'key {} is not matched to any optimizer'.format(
name)
step_intervals = []
for key, single_cfg in zip(keys, optimizer_cfg):
optimizer_cls = getattr(torch.optim, single_cfg.pop('type'))
step_intervals.append(single_cfg.pop('step_interval', 1))
single_optim = optimizer_cls(keys_params[key], **single_cfg)
keys_optimizer.append(single_optim)
logger.info('{} optimizes key:\n {}\n'.format(
optimizer_cls.__name__, keys_params_name[key]))
mix_optimizer = MixedOptimizer(keys_optimizer, step_intervals)
return mix_optimizer
else:
paramwise_options = optimizer_cfg.pop('paramwise_options', None)
# if no paramwise option is specified, just use the global setting
if paramwise_options is None:
params = model.parameters()
else:
assert isinstance(paramwise_options, dict)
# get base lr and weight decay
base_lr = optimizer_cfg['lr']
base_wd = optimizer_cfg.get('weight_decay', None)
# weight_decay must be explicitly specified if mult is specified
if ('bias_decay_mult' in paramwise_options
or 'norm_decay_mult' in paramwise_options
or 'dwconv_decay_mult' in paramwise_options):
assert base_wd is not None
# get param-wise options
bias_lr_mult = paramwise_options.get('bias_lr_mult', 1.)
bias_decay_mult = paramwise_options.get('bias_decay_mult', 1.)
norm_decay_mult = paramwise_options.get('norm_decay_mult', 1.)
dwconv_decay_mult = paramwise_options.get('dwconv_decay_mult', 1.)
named_modules = dict(model.named_modules())
# set param-wise lr and weight decay
params = []
for name, param in model.named_parameters():
param_group = {'params': [param]}
if not param.requires_grad:
# FP16 training needs to copy gradient/weight between master
# weight copy and model weight, it is convenient to keep all
# parameters here to align with model.parameters()
params.append(param_group)
continue
# for norm layers, overwrite the weight decay of weight and bias
# TODO: obtain the norm layer prefixes dynamically
            if re.search(r'(bn|gn)(\d+)?\.(weight|bias)', name):
if base_wd is not None:
param_group['weight_decay'] = base_wd * norm_decay_mult
# for other layers, overwrite both lr and weight decay of bias
elif name.endswith('.bias'):
param_group['lr'] = base_lr * bias_lr_mult
if base_wd is not None:
param_group['weight_decay'] = base_wd * bias_decay_mult
module_name = name.replace('.weight', '').replace('.bias', '')
if module_name in named_modules and base_wd is not None:
module = named_modules[module_name]
# if this Conv2d is depthwise Conv2d
if isinstance(module, torch.nn.Conv2d) and \
module.in_channels == module.groups:
param_group['weight_decay'] = base_wd * dwconv_decay_mult
# otherwise use the global settings
params.append(param_group)
optimizer_cfg['params'] = params
return build_from_cfg(optimizer_cfg, OPTIMIZERS)
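# A hedged config sketch exercising `paramwise_options` (the model and the
# multipliers below are illustrative, not recommendations):
# >>> model = torch.nn.Conv2d(3, 8, 3)
# >>> optimizer_cfg = dict(
# ...     type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001,
# ...     paramwise_options=dict(bias_lr_mult=2., norm_decay_mult=0.))
# >>> optimizer = build_optimizer(model, optimizer_cfg)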
from torch.optim import Optimizer
from .registry import OPTIMIZERS
@OPTIMIZERS.register_module
class MixedOptimizer(Optimizer):
"""Mixed Optimizer that contains multiple optimizers
This optimizer applies the cocktail optimzation for multi-modality models.
"""
def __init__(self, optimizers, step_intervals=None):
self.optimizers = optimizers
self.param_groups = []
for optimizer in self.optimizers:
self.param_groups += optimizer.param_groups
if not isinstance(step_intervals, list):
step_intervals = [1] * len(self.optimizers)
self.step_intervals = step_intervals
self.num_step_updated = 0
def __getstate__(self):
return {
'num_step_updated':
self.num_step_updated,
'defaults': [optimizer.defaults for optimizer in self.optimizers],
'state': [optimizer.state for optimizer in self.optimizers],
'param_groups':
[optimizer.param_groups for optimizer in self.optimizers],
}
def __setstate__(self, state):
self.__dict__.update(state)
def __repr__(self):
format_string = self.__class__.__name__ + ' (\n'
for optimizer in self.optimizers:
            format_string += '\t' + repr(optimizer) + ',\n'
format_string += ')'
return format_string
def state_dict(self):
state_dicts = [optimizer.state_dict() for optimizer in self.optimizers]
return {
'num_step_updated':
self.num_step_updated,
'state': [state_dict['state'] for state_dict in state_dicts],
'param_groups':
[state_dict['param_groups'] for state_dict in state_dicts],
}
def load_state_dict(self, state_dict):
r"""Loads the optimizer state.
Arguments:
state_dict (dict): optimizer state. Should be an object returned
from a call to :meth:`state_dict`.
"""
assert len(state_dict['state']) == len(self.optimizers)
assert len(state_dict['param_groups']) == len(self.optimizers)
for i, (single_state, single_param_groups) in enumerate(
zip(state_dict['state'], state_dict['param_groups'])):
single_state_dict = dict(
state=single_state, param_groups=single_param_groups)
self.optimizers[i].load_state_dict(single_state_dict)
self.param_groups = []
for optimizer in self.optimizers:
self.param_groups += optimizer.param_groups
self.num_step_updated = state_dict['num_step_updated']
def zero_grad(self):
r"""Clears the gradients of all optimized :class:`torch.Tensor` s."""
for optimizer in self.optimizers:
optimizer.zero_grad()
def step(self, closure=None):
r"""Performs a single optimization step (parameter update).
Arguments:
closure (callable): A closure that reevaluates the model and
returns the loss. Optional for most optimizers.
"""
loss = None
if closure is not None:
loss = closure()
self.num_step_updated += 1
for step_interval, optimizer in zip(self.step_intervals,
self.optimizers):
if self.num_step_updated % step_interval == 0:
optimizer.step()
return loss
def add_param_group(self, param_group):
raise NotImplementedError
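# A hedged composition sketch: two branches trained with different
# optimizers, the second stepping every other iteration (the branch names
# and the plain torch optimizers are illustrative):
# >>> import torch
# >>> img_branch, pts_branch = torch.nn.Linear(4, 4), torch.nn.Linear(4, 4)
# >>> optim = MixedOptimizer(
# ...     [torch.optim.SGD(img_branch.parameters(), lr=0.02),
# ...      torch.optim.Adam(pts_branch.parameters(), lr=1e-3)],
# ...     step_intervals=[1, 2])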
import inspect
import torch
from mmdet.utils import Registry
OPTIMIZERS = Registry('optimizer')
def register_torch_optimizers():
torch_optimizers = []
for module_name in dir(torch.optim):
if module_name.startswith('__'):
continue
_optim = getattr(torch.optim, module_name)
if inspect.isclass(_optim) and issubclass(_optim,
torch.optim.Optimizer):
OPTIMIZERS.register_module(_optim)
torch_optimizers.append(module_name)
return torch_optimizers
TORCH_OPTIMIZERS = register_torch_optimizers()
from .bbox_nms import multiclass_nms
from .merge_augs import (merge_aug_bboxes, merge_aug_masks,
merge_aug_proposals, merge_aug_scores)
__all__ = [
'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
'merge_aug_scores', 'merge_aug_masks'
]
import torch
from mmdet.ops.nms import nms_wrapper
def multiclass_nms(multi_bboxes,
multi_scores,
score_thr,
nms_cfg,
max_num=-1,
score_factors=None):
"""NMS for multi-class bboxes.
Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_scores (Tensor): shape (n, #class+1), where the last column
contains scores of the background class, but this will be ignored.
        score_thr (float): bbox threshold, bboxes with scores lower than it
            will not be considered.
        nms_cfg (dict): NMS config, e.g. dict(type='nms', iou_thr=0.5)
        max_num (int): if there are more than max_num bboxes after NMS,
            only top max_num will be kept.
        score_factors (Tensor): The factors multiplied to scores before
            applying NMS
    Returns:
        tuple: (bboxes, labels), tensors of shape (k, 5) and (k, ). Labels
            are 0-based.
"""
# scores has num_classes + 1 (last one is BG)
num_classes = multi_scores.shape[1] - 1
bboxes, labels = [], []
nms_cfg_ = nms_cfg.copy()
nms_type = nms_cfg_.pop('type', 'nms')
nms_op = getattr(nms_wrapper, nms_type)
# the fg class id range: [0, num_classes-1]
for i in range(0, num_classes):
cls_inds = multi_scores[:, i] > score_thr
if not cls_inds.any():
continue
# get bboxes and scores of this class
if multi_bboxes.shape[1] == 4:
_bboxes = multi_bboxes[cls_inds, :]
else:
_bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4]
_scores = multi_scores[cls_inds, i]
if score_factors is not None:
_scores *= score_factors[cls_inds]
cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1)
cls_dets, _ = nms_op(cls_dets, **nms_cfg_)
cls_labels = multi_bboxes.new_full((cls_dets.shape[0], ),
i,
dtype=torch.long)
bboxes.append(cls_dets)
labels.append(cls_labels)
if bboxes:
bboxes = torch.cat(bboxes)
labels = torch.cat(labels)
        if max_num > 0 and bboxes.shape[0] > max_num:
_, inds = bboxes[:, -1].sort(descending=True)
inds = inds[:max_num]
bboxes = bboxes[inds]
labels = labels[inds]
else:
bboxes = multi_bboxes.new_zeros((0, 5))
labels = multi_bboxes.new_zeros((0, ), dtype=torch.long)
return bboxes, labels
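# A hedged call sketch with toy tensors; the nms_cfg keys assume the
# dict(type='nms', iou_thr=...) convention of mmdet's nms_wrapper:
# >>> multi_bboxes = torch.tensor([[0., 0., 10., 10.]])
# >>> multi_scores = torch.tensor([[0.9, 0.1]])  # 1 fg class + bg column
# >>> dets, labels = multiclass_nms(
# ...     multi_bboxes, multi_scores, score_thr=0.3,
# ...     nms_cfg=dict(type='nms', iou_thr=0.5), max_num=100)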
import numpy as np
import torch
from mmdet3d.ops import nms
from ..bbox import bbox_mapping_back
def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg):
"""Merge augmented proposals (multiscale, flip, etc.)
Args:
aug_proposals (list[Tensor]): proposals from different testing
schemes, shape (n, 5). Note that they are not rescaled to the
original image size.
        img_metas (list[dict]): list of image info dict where each dict has:
            'img_shape', 'scale_factor', 'flip', and may also contain
            'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
For details on the values of these keys see
`mmdet/datasets/pipelines/formatting.py:Collect`.
rpn_test_cfg (dict): rpn test config.
Returns:
Tensor: shape (n, 4), proposals corresponding to original image scale.
"""
recovered_proposals = []
for proposals, img_info in zip(aug_proposals, img_metas):
img_shape = img_info['img_shape']
scale_factor = img_info['scale_factor']
flip = img_info['flip']
_proposals = proposals.clone()
_proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
scale_factor, flip)
recovered_proposals.append(_proposals)
aug_proposals = torch.cat(recovered_proposals, dim=0)
merged_proposals, _ = nms(aug_proposals, rpn_test_cfg.nms_thr)
scores = merged_proposals[:, 4]
_, order = scores.sort(0, descending=True)
num = min(rpn_test_cfg.max_num, merged_proposals.shape[0])
order = order[:num]
merged_proposals = merged_proposals[order, :]
return merged_proposals
def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
"""Merge augmented detection bboxes and scores.
Args:
        aug_bboxes (list[Tensor]): shape (n, 4*#class)
        aug_scores (list[Tensor] or None): shape (n, #class)
        img_metas (list[list[dict]]): image info including 'img_shape',
            'scale_factor' and 'flip' for each augmentation.
        rcnn_test_cfg (dict): rcnn test config.
Returns:
tuple: (bboxes, scores)
"""
recovered_bboxes = []
for bboxes, img_info in zip(aug_bboxes, img_metas):
img_shape = img_info[0]['img_shape']
scale_factor = img_info[0]['scale_factor']
flip = img_info[0]['flip']
bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip)
recovered_bboxes.append(bboxes)
bboxes = torch.stack(recovered_bboxes).mean(dim=0)
if aug_scores is None:
return bboxes
else:
scores = torch.stack(aug_scores).mean(dim=0)
return bboxes, scores
def merge_aug_scores(aug_scores):
"""Merge augmented bbox scores."""
if isinstance(aug_scores[0], torch.Tensor):
return torch.mean(torch.stack(aug_scores), dim=0)
else:
return np.mean(aug_scores, axis=0)
def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None):
"""Merge augmented mask prediction.
Args:
        aug_masks (list[ndarray]): shape (n, #class, h, w)
        img_metas (list[list[dict]]): image info including 'flip' for each
            augmentation.
        rcnn_test_cfg (dict): rcnn test config.
        weights (list or None): per-augmentation weights for averaging.
    Returns:
        ndarray: the merged mask prediction
"""
recovered_masks = [
mask if not img_info[0]['flip'] else mask[..., ::-1]
for mask, img_info in zip(aug_masks, img_metas)
]
if weights is None:
merged_masks = np.mean(recovered_masks, axis=0)
else:
merged_masks = np.average(
np.array(recovered_masks), axis=0, weights=np.array(weights))
return merged_masks
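# A hedged toy call: two augmentations where the second was flipped; the
# flipped mask is mirrored back before averaging, so the merge recovers
# the original mask here:
# >>> mask = np.arange(4, dtype=np.float32).reshape(1, 1, 2, 2)
# >>> img_metas = [[dict(flip=False)], [dict(flip=True)]]
# >>> merge_aug_masks([mask, mask[..., ::-1]], img_metas, rcnn_test_cfg=None)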
from .dist_utils import DistOptimizerHook, allreduce_grads
from .misc import tensor2imgs # merge_batch, merge_hook_batch
from .misc import multi_apply, unmap
__all__ = [
'allreduce_grads',
'DistOptimizerHook',
'multi_apply',
'tensor2imgs',
'unmap', # 'merge_batch', 'merge_hook_batch'
]
import asyncio
import contextlib
import logging
import os
import time
from typing import List
import torch
logger = logging.getLogger(__name__)
DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False))
@contextlib.asynccontextmanager
async def completed(trace_name='',
name='',
sleep_interval=0.05,
streams: List[torch.cuda.Stream] = None):
"""
Async context manager that waits for work to complete on
given CUDA streams.
"""
if not torch.cuda.is_available():
yield
return
stream_before_context_switch = torch.cuda.current_stream()
if not streams:
streams = [stream_before_context_switch]
else:
streams = [s if s else stream_before_context_switch for s in streams]
end_events = [
torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams
]
if DEBUG_COMPLETED_TIME:
start = torch.cuda.Event(enable_timing=True)
stream_before_context_switch.record_event(start)
cpu_start = time.monotonic()
logger.debug('%s %s starting, streams: %s', trace_name, name, streams)
grad_enabled_before = torch.is_grad_enabled()
try:
yield
finally:
current_stream = torch.cuda.current_stream()
assert current_stream == stream_before_context_switch
if DEBUG_COMPLETED_TIME:
cpu_end = time.monotonic()
for i, stream in enumerate(streams):
event = end_events[i]
stream.record_event(event)
grad_enabled_after = torch.is_grad_enabled()
# observed change of torch.is_grad_enabled() during concurrent run of
# async_test_bboxes code
assert grad_enabled_before == grad_enabled_after, \
'Unexpected is_grad_enabled() value change'
are_done = [e.query() for e in end_events]
logger.debug('%s %s completed: %s streams: %s', trace_name, name,
are_done, streams)
with torch.cuda.stream(stream_before_context_switch):
while not all(are_done):
await asyncio.sleep(sleep_interval)
are_done = [e.query() for e in end_events]
logger.debug('%s %s completed: %s streams: %s', trace_name,
name, are_done, streams)
current_stream = torch.cuda.current_stream()
assert current_stream == stream_before_context_switch
if DEBUG_COMPLETED_TIME:
cpu_time = (cpu_end - cpu_start) * 1000
stream_times_ms = ''
for i, stream in enumerate(streams):
elapsed_time = start.elapsed_time(end_events[i])
                stream_times_ms += ' {} {:.2f} ms'.format(
                    stream, elapsed_time)
            logger.info('%s %s cpu_time %.2f ms%s', trace_name, name,
                        cpu_time, stream_times_ms)
@contextlib.asynccontextmanager
async def concurrent(streamqueue: asyncio.Queue,
trace_name='concurrent',
name='stream'):
"""Run code concurrently in different streams.
:param streamqueue: asyncio.Queue instance.
Queue tasks define the pool of streams used for concurrent execution.
"""
if not torch.cuda.is_available():
yield
return
initial_stream = torch.cuda.current_stream()
with torch.cuda.stream(initial_stream):
stream = await streamqueue.get()
assert isinstance(stream, torch.cuda.Stream)
try:
with torch.cuda.stream(stream):
logger.debug('%s %s is starting, stream: %s', trace_name, name,
stream)
yield
current = torch.cuda.current_stream()
assert current == stream
logger.debug('%s %s has finished, stream: %s', trace_name,
name, stream)
finally:
streamqueue.task_done()
streamqueue.put_nowait(stream)
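# A hedged usage sketch for `concurrent`: a small pool of CUDA streams
# shared through an asyncio.Queue (requires a CUDA device; the model and
# queue names are illustrative):
# >>> streamqueue = asyncio.Queue()
# >>> for _ in range(2):
# ...     streamqueue.put_nowait(torch.cuda.Stream())
# >>> async def forward(model, data):
# ...     async with concurrent(streamqueue):
# ...         return model(data)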
from collections import OrderedDict
import torch.distributed as dist
from mmcv.runner import OptimizerHook
from torch._utils import (_flatten_dense_tensors, _take_tensors,
_unflatten_dense_tensors)
def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
if bucket_size_mb > 0:
bucket_size_bytes = bucket_size_mb * 1024 * 1024
buckets = _take_tensors(tensors, bucket_size_bytes)
else:
buckets = OrderedDict()
for tensor in tensors:
tp = tensor.type()
if tp not in buckets:
buckets[tp] = []
buckets[tp].append(tensor)
buckets = buckets.values()
for bucket in buckets:
flat_tensors = _flatten_dense_tensors(bucket)
dist.all_reduce(flat_tensors)
flat_tensors.div_(world_size)
for tensor, synced in zip(
bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
tensor.copy_(synced)
def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
grads = [
param.grad.data for param in params
if param.requires_grad and param.grad is not None
]
world_size = dist.get_world_size()
if coalesce:
_allreduce_coalesced(grads, world_size, bucket_size_mb)
else:
for tensor in grads:
dist.all_reduce(tensor.div_(world_size))
class DistOptimizerHook(OptimizerHook):
def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
self.grad_clip = grad_clip
self.coalesce = coalesce
self.bucket_size_mb = bucket_size_mb
def after_train_iter(self, runner):
runner.optimizer.zero_grad()
runner.outputs['loss'].backward()
# allreduce_grads(runner.model.parameters(), self.coalesce,
# self.bucket_size_mb)
if self.grad_clip is not None:
self.clip_grads(runner.model.parameters())
runner.optimizer.step()
import numpy as np
import scipy
import torch
from scipy.spatial import Delaunay
def in_hull(p, hull):
"""
:param p: (N, K) test points
:param hull: (M, K) M corners of a box
:return (N) bool
"""
try:
if not isinstance(hull, Delaunay):
hull = Delaunay(hull)
flag = hull.find_simplex(p) >= 0
except scipy.spatial.qhull.QhullError:
print('Warning: not a hull %s' % str(hull))
        flag = np.zeros(p.shape[0], dtype=bool)
return flag
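# A toy check for `in_hull` with the unit square's corners as the hull:
# >>> square = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.]])
# >>> in_hull(np.array([[0.5, 0.5], [2.0, 2.0]]), square)
# array([ True, False])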
def enlarge_box3d(boxes3d, extra_width):
"""
:param boxes3d: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coords
"""
if isinstance(boxes3d, np.ndarray):
large_boxes3d = boxes3d.copy()
else:
large_boxes3d = boxes3d.clone()
large_boxes3d[:, 3:6] += extra_width * 2
    # bugfixed: here should be minus, not addition, in LiDAR coords, 20190508
large_boxes3d[:, 2] -= extra_width
return large_boxes3d
def rotate_pc_along_z(pc, rot_angle):
"""
params pc: (N, 3+C), (N, 3) is in the LiDAR coordinate
params rot_angle: rad scalar
Output pc: updated pc with XYZ rotated
"""
cosval = np.cos(rot_angle)
sinval = np.sin(rot_angle)
rotmat = np.array([[cosval, -sinval], [sinval, cosval]])
pc[:, 0:2] = np.dot(pc[:, 0:2], rotmat)
return pc
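# A worked check of the rotation convention above: `np.dot(pc, rotmat)`
# with rotmat = [[c, -s], [s, c]] rotates row-vector points clockwise
# about z, i.e. by -rot_angle in the counter-clockwise convention:
# >>> pc = np.array([[1., 0., 0.]])
# >>> rotate_pc_along_z(pc, np.pi / 2)  # -> [[0., -1., 0.]] up to float eps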
def rotate_pc_along_z_torch(pc, rot_angle):
"""
:param pc: (N, 512, 3 + C) in the LiDAR coordinate
:param rot_angle: (N)
:return:
TODO: merge with rotate_pc_along_y_torch in bbox_transform.py
"""
cosa = torch.cos(rot_angle).view(-1, 1) # (N, 1)
sina = torch.sin(rot_angle).view(-1, 1) # (N, 1)
raw_1 = torch.cat([cosa, -sina], dim=1) # (N, 2)
raw_2 = torch.cat([sina, cosa], dim=1) # (N, 2)
R = torch.cat((raw_1.unsqueeze(dim=1), raw_2.unsqueeze(dim=1)),
dim=1) # (N, 2, 2)
pc_temp = pc[:, :, 0:2] # (N, 512, 2)
pc[:, :, 0:2] = torch.matmul(pc_temp, R) # (N, 512, 2)
return pc