Merge pull request #2 from sshaoshuai/develop_gcx_release

Support PointPillar, Second, AxisAlignedTargetAssigner, MultiHead and multi-gpu testing

Merge pull request #2 from sshaoshuai/develop_gcx_release
Support PointPillar, Second, AxisAlignedTargetAssigner, MultiHead and multi-gpu testing
b7e2fb70 · Shaoshuai Shi · GitHub · c3dd2d54 · 8007ce04 · b7e2fb70
Unverified Commit b7e2fb70 authored Jun 24, 2020 by Shaoshuai Shi Committed by GitHub Jun 24, 2020
20 changed files
--- a/pcdet/datasets/__init__.py
+++ b/pcdet/datasets/__init__.py
@@ -2,13 +2,37 @@ import torch
 from torch.utils.data import DataLoader
 from .dataset import DatasetTemplate
 from .kitti.kitti_dataset import KittiDataset
+from torch.utils.data import DistributedSampler as _DistributedSampler
+from pcdet.utils import common_utils
 __all__ = {
    'DatasetTemplate': DatasetTemplate,
    'KittiDataset': KittiDataset,
 }
+class DistributedSampler(_DistributedSampler):
+    """Distributed sampler whose shuffling can be switched off.
+
+    Each replica receives every ``num_replicas``-th index starting at its
+    ``rank``. The index list is padded with its own leading entries so that
+    it divides evenly across replicas.
+    """
+    def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
+        super().__init__(dataset, num_replicas=num_replicas, rank=rank)
+        # Stored after the parent constructor so this value is the one used by __iter__.
+        self.shuffle = shuffle
+    def __iter__(self):
+        """Return an iterator over the indices assigned to this rank."""
+        dataset_len = len(self.dataset)
+        if not self.shuffle:
+            order = torch.arange(dataset_len).tolist()
+        else:
+            # Seed with the epoch so every replica draws the same permutation.
+            rng = torch.Generator()
+            rng.manual_seed(self.epoch)
+            order = torch.randperm(dataset_len, generator=rng).tolist()
+        # Pad by repeating the head of the list until it has total_size entries.
+        pad = self.total_size - len(order)
+        order = order + order[:pad]
+        assert len(order) == self.total_size
+        # Strided slice: rank, rank + num_replicas, rank + 2 * num_replicas, ...
+        shard = order[self.rank:self.total_size:self.num_replicas]
+        assert len(shard) == self.num_samples
+        return iter(shard)
 def build_dataloader(dataset_cfg, class_names, batch_size, dist, root_path=None, workers=4,
                     logger=None, training=True):
@@ -20,8 +44,14 @@ def build_dataloader(dataset_cfg, class_names, batch_size, dist, root_path=None,
        training=training,
        logger=logger,
    )
+    if dist:
-    sampler = torch.utils.data.distributed.DistributedSampler(dataset) if dist else None
+        if training:
+            sampler = torch.utils.data.distributed.DistributedSampler(dataset)
+        else:
+            rank, world_size = common_utils.get_dist_info()
+            sampler = DistributedSampler(dataset, world_size, rank, shuffle=False)
+    else:
+        sampler = None
    dataloader = DataLoader(
        dataset, batch_size=batch_size, pin_memory=True, num_workers=workers,
        shuffle=(sampler is None) and training, collate_fn=dataset.collate_batch,

--- a/pcdet/datasets/kitti/kitti_object_eval_python/LICENSE
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/LICENSE
+MIT License
+Copyright (c) 2018 
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/pcdet/datasets/kitti/kitti_object_eval_python/README.md
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/README.md
+# kitti-object-eval-python
+**Note**: This is borrowed from [traveller59/kitti-object-eval-python](https://github.com/traveller59/kitti-object-eval-python)
+Fast KITTI object detection evaluation in Python (finishes in less than 10 seconds); supports 2D/BEV/3D/AOS and COCO-style AP. If you use the command-line interface, numba needs some time to compile the JIT functions.
+## Dependencies
+Only Python 3.6+ is supported. Requires `numpy`, `skimage`, `numba`, and `fire`. If you have Anaconda, just install `cudatoolkit` with conda. Otherwise, please refer to this [page](https://github.com/numba/numba#custom-python-environments) to set up LLVM and CUDA for numba.
+* Install by conda:
+```
+conda install -c numba cudatoolkit=x.x  (8.0, 9.0, 9.1, depending on your environment)
+```
+## Usage
+* commandline interface:
+```
+python evaluate.py evaluate --label_path=/path/to/your_gt_label_folder --result_path=/path/to/your_result_folder --label_split_file=/path/to/val.txt --current_class=0 --coco=False
+```
+* python interface:
+```Python
+import kitti_common as kitti
+from eval import get_official_eval_result, get_coco_eval_result
+def _read_imageset_file(path):
+    with open(path, 'r') as f:
+        lines = f.readlines()
+    return [int(line) for line in lines]
+det_path = "/path/to/your_result_folder"
+dt_annos = kitti.get_label_annos(det_path)
+gt_path = "/path/to/your_gt_label_folder"
+gt_split_file = "/path/to/val.txt" # from https://xiaozhichen.github.io/files/mv3d/imagesets.tar.gz
+val_image_ids = _read_imageset_file(gt_split_file)
+gt_annos = kitti.get_label_annos(gt_path, val_image_ids)
+print(get_official_eval_result(gt_annos, dt_annos, 0)) # 6s on my computer
+print(get_coco_eval_result(gt_annos, dt_annos, 0)) # 18s on my computer
+```
--- a/pcdet/datasets/kitti/kitti_object_eval_python/eval.py
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/eval.py
--- a/pcdet/datasets/kitti/kitti_object_eval_python/evaluate.py
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/evaluate.py
+import time
+import fire
+# `import .module as name` is not valid Python syntax; a relative module
+# import has to be written in the `from . import module` form.
+from . import kitti_common as kitti
+from .eval import get_official_eval_result, get_coco_eval_result
+def _read_imageset_file(path):
+    """Read a split file and return one integer per line.
+
+    Args:
+        path: Path of a text file holding one integer image id per line.
+    Returns:
+        A list of ints, in file order.
+    """
+    with open(path, 'r') as split_file:
+        return [int(entry) for entry in split_file.readlines()]
+def evaluate(label_path,
+             result_path,
+             label_split_file,
+             current_class=0,
+             coco=False,
+             score_thresh=-1):
+    """Evaluate detection results against ground-truth labels.
+
+    Args:
+        label_path: Folder with the ground-truth label files.
+        result_path: Folder with the detection result files.
+        label_split_file: Text file listing the image ids to evaluate.
+        current_class: Class selector forwarded to the evaluation routine.
+        coco: If true use ``get_coco_eval_result``, otherwise
+            ``get_official_eval_result``.
+        score_thresh: When positive, detections scoring below it are
+            dropped before evaluation.
+    Returns:
+        Whatever the selected evaluation routine returns.
+    """
+    detections = kitti.get_label_annos(result_path)
+    if score_thresh > 0:
+        detections = kitti.filter_annos_low_score(detections, score_thresh)
+    image_ids = _read_imageset_file(label_split_file)
+    ground_truth = kitti.get_label_annos(label_path, image_ids)
+    eval_fn = get_coco_eval_result if coco else get_official_eval_result
+    return eval_fn(ground_truth, detections, current_class)
+# Script entry point: delegate command-line parsing to python-fire.
+if __name__ == '__main__':
+    fire.Fire()
--- a/pcdet/datasets/kitti/kitti_object_eval_python/kitti_common.py
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/kitti_common.py
+import concurrent.futures as futures
+import os
+import pathlib
+import re
+from collections import OrderedDict
+import numpy as np
+from skimage import io
+def get_image_index_str(img_idx):
+    """Return ``img_idx`` as a zero-padded decimal string at least six digits wide."""
+    return format(img_idx, "06d")
+def get_kitti_info_path(idx,
+                        prefix,
+                        info_type='image_2',
+                        file_tail='.png',
+                        training=True,
+                        relative_path=True):
+    """Build the path of one per-frame file and check that it exists.
+
+    Args:
+        idx: Integer frame index, formatted via ``get_image_index_str``.
+        prefix: Root directory under which the file is looked up.
+        info_type: Sub-folder name (e.g. ``'image_2'``).
+        file_tail: File extension including the dot.
+        training: Selects the ``training`` folder if true, else ``testing``.
+        relative_path: If true return the path relative to ``prefix``,
+            otherwise the path joined with ``prefix``.
+    Returns:
+        The path as a string.
+    Raises:
+        ValueError: If the file does not exist under ``prefix``.
+    """
+    file_name = get_image_index_str(idx) + file_tail
+    prefix = pathlib.Path(prefix)
+    split_dir = 'training' if training else 'testing'
+    file_path = pathlib.Path(split_dir) / info_type / file_name
+    full_path = prefix / file_path
+    if not full_path.exists():
+        raise ValueError("file not exist: {}".format(file_path))
+    return str(file_path) if relative_path else str(full_path)
+def get_image_path(idx, prefix, training=True, relative_path=True):
+    """Return the path of the ``image_2/<idx>.png`` file for frame ``idx``."""
+    return get_kitti_info_path(
+        idx, prefix, info_type='image_2', file_tail='.png',
+        training=training, relative_path=relative_path)
+def get_label_path(idx, prefix, training=True, relative_path=True):
+    """Return the path of the ``label_2/<idx>.txt`` file for frame ``idx``."""
+    return get_kitti_info_path(
+        idx, prefix, info_type='label_2', file_tail='.txt',
+        training=training, relative_path=relative_path)
+def get_velodyne_path(idx, prefix, training=True, relative_path=True):
+    """Return the path of the ``velodyne/<idx>.bin`` file for frame ``idx``."""
+    return get_kitti_info_path(
+        idx, prefix, info_type='velodyne', file_tail='.bin',
+        training=training, relative_path=relative_path)
+def get_calib_path(idx, prefix, training=True, relative_path=True):
+    """Return the path of the ``calib/<idx>.txt`` file for frame ``idx``."""
+    return get_kitti_info_path(
+        idx, prefix, info_type='calib', file_tail='.txt',
+        training=training, relative_path=relative_path)
+def _extend_matrix(mat):
+    """Append the row ``[0, 0, 0, 1]`` below ``mat`` and return the result."""
+    bottom_row = np.array([[0., 0., 0., 1.]])
+    return np.concatenate([mat, bottom_row], axis=0)
+def get_kitti_image_info(path,
+                         training=True,
+                         label_info=True,
+                         velodyne=False,
+                         calib=False,
+                         image_ids=7481,
+                         extend_matrix=True,
+                         num_worker=8,
+                         relative_path=True,
+                         with_imageshape=True):
+    """Collect per-frame info dicts for a set of frames.
+
+    Args:
+        path: Dataset root directory.
+        training: Forwarded to the path helpers to pick the split folder.
+        label_info: If true, parse the label file into ``info['annos']``
+            and add a difficulty array to it.
+        velodyne: If true, record ``info['velodyne_path']``.
+        calib: If true, parse the calibration file into ``info['calib/...']``.
+        image_ids: A list of frame indices, or a non-list value ``n`` that
+            is expanded to ``range(n)``.
+        extend_matrix: If true, calibration matrices are padded to 4x4.
+        num_worker: Number of threads used to process frames.
+        relative_path: If true, stored paths are relative to ``path``.
+        with_imageshape: If true, read each image and store its first two
+            shape entries as ``info['img_shape']``.
+    Returns:
+        A list with one info dict per entry of ``image_ids``, in order.
+    """
+    # image_infos = []
+    root_path = pathlib.Path(path)
+    if not isinstance(image_ids, list):
+        image_ids = list(range(image_ids))
+    def map_func(idx):
+        # Builds the info dict for a single frame; run in a worker thread.
+        image_info = {'image_idx': idx}
+        annotations = None
+        if velodyne:
+            image_info['velodyne_path'] = get_velodyne_path(
+                idx, path, training, relative_path)
+        image_info['img_path'] = get_image_path(idx, path, training,
+                                                relative_path)
+        if with_imageshape:
+            img_path = image_info['img_path']
+            # A relative path must be re-anchored at the root before reading.
+            if relative_path:
+                img_path = str(root_path / img_path)
+            image_info['img_shape'] = np.array(
+                io.imread(img_path).shape[:2], dtype=np.int32)
+        if label_info:
+            label_path = get_label_path(idx, path, training, relative_path)
+            if relative_path:
+                label_path = str(root_path / label_path)
+            annotations = get_label_anno(label_path)
+        if calib:
+            # Always resolved against the root because the file is opened here.
+            calib_path = get_calib_path(
+                idx, path, training, relative_path=False)
+            with open(calib_path, 'r') as f:
+                lines = f.readlines()
+            # Lines 0-3: twelve values after the leading tag, reshaped to 3x4.
+            P0 = np.array(
+                [float(info) for info in lines[0].split(' ')[1:13]]).reshape(
+                    [3, 4])
+            P1 = np.array(
+                [float(info) for info in lines[1].split(' ')[1:13]]).reshape(
+                    [3, 4])
+            P2 = np.array(
+                [float(info) for info in lines[2].split(' ')[1:13]]).reshape(
+                    [3, 4])
+            P3 = np.array(
+                [float(info) for info in lines[3].split(' ')[1:13]]).reshape(
+                    [3, 4])
+            if extend_matrix:
+                P0 = _extend_matrix(P0)
+                P1 = _extend_matrix(P1)
+                P2 = _extend_matrix(P2)
+                P3 = _extend_matrix(P3)
+            image_info['calib/P0'] = P0
+            image_info['calib/P1'] = P1
+            image_info['calib/P2'] = P2
+            image_info['calib/P3'] = P3
+            # Line 4: nine values after the leading tag, reshaped to 3x3.
+            R0_rect = np.array([
+                float(info) for info in lines[4].split(' ')[1:10]
+            ]).reshape([3, 3])
+            if extend_matrix:
+                # Embed the 3x3 block in a 4x4 matrix with a 1 at [3, 3].
+                rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype)
+                rect_4x4[3, 3] = 1.
+                rect_4x4[:3, :3] = R0_rect
+            else:
+                rect_4x4 = R0_rect
+            image_info['calib/R0_rect'] = rect_4x4
+            # Lines 5 and 6: twelve values each, reshaped to 3x4.
+            Tr_velo_to_cam = np.array([
+                float(info) for info in lines[5].split(' ')[1:13]
+            ]).reshape([3, 4])
+            Tr_imu_to_velo = np.array([
+                float(info) for info in lines[6].split(' ')[1:13]
+            ]).reshape([3, 4])
+            if extend_matrix:
+                Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam)
+                Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo)
+            image_info['calib/Tr_velo_to_cam'] = Tr_velo_to_cam
+            image_info['calib/Tr_imu_to_velo'] = Tr_imu_to_velo
+        if annotations is not None:
+            image_info['annos'] = annotations
+            # Adds annos['difficulty'] in place.
+            add_difficulty_to_annos(image_info)
+        return image_info
+    with futures.ThreadPoolExecutor(num_worker) as executor:
+        image_infos = executor.map(map_func, image_ids)
+    # The executor has shut down by now, so all results are ready to collect.
+    return list(image_infos)
+def filter_kitti_anno(image_anno,
+                      used_classes,
+                      used_difficulty=None,
+                      dontcare_iou=None):
+    """Filter one frame's annotations by class, difficulty and DontCare overlap.
+
+    Args:
+        image_anno: Dict of per-object arrays; must contain ``'name'`` and,
+            when the corresponding filters are used, ``'difficulty'`` and
+            ``'bbox'``.
+        used_classes: A class name or a list/tuple of class names to keep.
+        used_difficulty: Optional collection of difficulty values to keep.
+        dontcare_iou: Optional IoU threshold; only applied when
+            ``'DontCare'`` is among ``used_classes``.
+    Returns:
+        A new dict with the same keys and the surviving rows of every array.
+    """
+    if not isinstance(used_classes, (list, tuple)):
+        used_classes = [used_classes]
+    keep = [
+        idx for idx, cls in enumerate(image_anno['name'])
+        if cls in used_classes
+    ]
+    filtered = {field: image_anno[field][keep] for field in image_anno}
+    if used_difficulty is not None:
+        keep = [
+            idx for idx, level in enumerate(filtered['difficulty'])
+            if level in used_difficulty
+        ]
+        filtered = {field: filtered[field][keep] for field in image_anno}
+    if 'DontCare' in used_classes and dontcare_iou is not None:
+        dontcare_rows = [
+            idx for idx, cls in enumerate(filtered['name'])
+            if cls == 'DontCare'
+        ]
+        # bounding box format [y_min, x_min, y_max, x_max]
+        boxes = filtered['bbox']
+        overlaps = iou(boxes, boxes[dontcare_rows])
+        # Drop every box whose best overlap with a DontCare region exceeds the threshold.
+        if overlaps.size > 0:
+            survivors = np.logical_not(
+                np.amax(overlaps, axis=1) > dontcare_iou)
+            filtered = {
+                field: filtered[field][survivors] for field in image_anno
+            }
+    return filtered
+def filter_annos_low_score(image_annos, thresh):
+    """Keep only the objects whose score is at least ``thresh``.
+
+    Args:
+        image_annos: Iterable of annotation dicts, each with a ``'score'``
+            entry and per-object arrays under every key.
+        thresh: Minimum score (inclusive) for an object to be kept.
+    Returns:
+        A new list of dicts with the same keys and filtered arrays.
+    """
+    kept_annos = []
+    for anno in image_annos:
+        keep = [
+            idx for idx, score in enumerate(anno['score']) if score >= thresh
+        ]
+        kept_annos.append(
+            {field: values[keep] for field, values in anno.items()})
+    return kept_annos
+def kitti_result_line(result_dict, precision=4):
+    """Format one detection as a space-separated result line.
+
+    Fields are emitted in a fixed order. Fields missing from
+    ``result_dict`` fall back to their defaults; floats use ``precision``
+    decimal places.
+
+    Args:
+        result_dict: Mapping from field name to value.
+        precision: Number of decimals for floating-point fields.
+    Returns:
+        The formatted line as a string.
+    Raises:
+        ValueError: If a field without a default is explicitly given as None.
+    """
+    float_fmt = "{" + ":.{}f".format(precision) + "}"
+    all_field_default = OrderedDict([
+        ('name', None),
+        ('truncated', -1),
+        ('occluded', -1),
+        ('alpha', -10),
+        ('bbox', None),
+        ('dimensions', [-1, -1, -1]),
+        ('location', [-1000, -1000, -1000]),
+        ('rotation_y', -10),
+        ('score', None),
+    ])
+    # Start with every field unset, then overlay the caller's values.
+    merged = OrderedDict((field, None) for field in all_field_default)
+    for field, value in result_dict.items():
+        if all_field_default[field] is None and value is None:
+            raise ValueError("you must specify a value for {}".format(field))
+        merged[field] = value
+    scalar_float_fields = ('truncated', 'alpha', 'rotation_y', 'score')
+    vector_fields = ('bbox', 'dimensions', 'location')
+    tokens = []
+    for field, value in merged.items():
+        if field == 'name':
+            tokens.append(value)
+        elif field in scalar_float_fields:
+            if value is None:
+                tokens.append(str(all_field_default[field]))
+            else:
+                tokens.append(float_fmt.format(value))
+        elif field == 'occluded':
+            if value is None:
+                tokens.append(str(all_field_default[field]))
+            else:
+                tokens.append('{}'.format(value))
+        elif field in vector_fields:
+            if value is None:
+                tokens.extend(str(v) for v in all_field_default[field])
+            else:
+                tokens.extend(float_fmt.format(v) for v in value)
+        else:
+            raise ValueError("unknown key. supported key:{}".format(
+                merged.keys()))
+    return ' '.join(tokens)
+def add_difficulty_to_annos(info):
+    """Assign a difficulty level to every object and store it in the annos.
+
+    An object qualifies for a level when its occlusion and truncation do not
+    exceed that level's maxima and its 2D box height exceeds the level's
+    minimum. The assigned value is 0 (easy), 1 (moderate), 2 (hard) or -1
+    when no level applies.
+
+    Args:
+        info: Dict with an ``'annos'`` dict containing ``'dimensions'``,
+            ``'bbox'``, ``'occluded'`` and ``'truncated'`` arrays.
+    Returns:
+        The difficulty values as a list of ints. The same values are also
+        written to ``info['annos']['difficulty']`` as an int32 array.
+    """
+    # minimum height for evaluated groundtruth/detections
+    min_height = [40, 25, 25]
+    # maximum occlusion level of the groundtruth used for eval_utils
+    max_occlusion = [0, 1, 2]
+    # maximum truncation level of the groundtruth used for eval_utils
+    max_trunc = [0.15, 0.3, 0.5]
+    annos = info['annos']
+    dims = annos['dimensions']  # lhw format
+    bbox = annos['bbox']
+    height = bbox[:, 3] - bbox[:, 1]
+    occlusion = annos['occluded']
+    truncation = annos['truncated']
+    num_objects = len(dims)
+    # The builtin `bool` is used because the `np.bool` alias was removed from
+    # NumPy (1.24) and raises AttributeError there.
+    easy_mask = np.ones((num_objects, ), dtype=bool)
+    moderate_mask = np.ones((num_objects, ), dtype=bool)
+    hard_mask = np.ones((num_objects, ), dtype=bool)
+    for i, (h, o, t) in enumerate(zip(height, occlusion, truncation)):
+        if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]:
+            easy_mask[i] = False
+        if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]:
+            moderate_mask[i] = False
+        if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]:
+            hard_mask[i] = False
+    is_easy = easy_mask
+    # XOR isolates the objects that qualify for one level but not the next.
+    is_moderate = np.logical_xor(easy_mask, moderate_mask)
+    is_hard = np.logical_xor(hard_mask, moderate_mask)
+    diff = []
+    for i in range(num_objects):
+        if is_easy[i]:
+            diff.append(0)
+        elif is_moderate[i]:
+            diff.append(1)
+        elif is_hard[i]:
+            diff.append(2)
+        else:
+            diff.append(-1)
+    annos["difficulty"] = np.array(diff, np.int32)
+    return diff
+def get_label_anno(label_path):
+    """Parse one label/result text file into a dict of arrays.
+
+    Each non-blank line is split on single spaces into: name, truncated,
+    occluded, alpha, four bbox values, three dimension values, three
+    location values, rotation_y and an optional score.
+
+    Args:
+        label_path: Path of the text file to read.
+    Returns:
+        A dict with keys ``name``, ``truncated``, ``occluded``, ``alpha``,
+        ``bbox`` (N, 4), ``dimensions`` (N, 3), ``location`` (N, 3),
+        ``rotation_y`` and ``score``. ``score`` is all zeros when the first
+        line does not carry a 16th column.
+    """
+    with open(label_path, 'r') as f:
+        lines = f.readlines()
+    # Blank lines (e.g. a trailing newline) carry no object; skipping them
+    # avoids an IndexError when the columns are indexed below.
+    content = [line.strip().split(' ') for line in lines if line.strip()]
+    annotations = {}
+    annotations['name'] = np.array([x[0] for x in content])
+    annotations['truncated'] = np.array([float(x[1]) for x in content])
+    annotations['occluded'] = np.array([int(x[2]) for x in content])
+    annotations['alpha'] = np.array([float(x[3]) for x in content])
+    annotations['bbox'] = np.array(
+        [[float(info) for info in x[4:8]] for x in content]).reshape(-1, 4)
+    # dimensions will convert hwl format to standard lhw(camera) format.
+    annotations['dimensions'] = np.array(
+        [[float(info) for info in x[8:11]] for x in content]).reshape(
+            -1, 3)[:, [2, 0, 1]]
+    annotations['location'] = np.array(
+        [[float(info) for info in x[11:14]] for x in content]).reshape(-1, 3)
+    annotations['rotation_y'] = np.array(
+        [float(x[14]) for x in content]).reshape(-1)
+    if len(content) != 0 and len(content[0]) == 16:  # have score
+        annotations['score'] = np.array([float(x[15]) for x in content])
+    else:
+        annotations['score'] = np.zeros([len(annotations['bbox'])])
+    return annotations
+def get_label_annos(label_folder, image_ids=None):
+    """Load the annotations of several frames from ``label_folder``.
+
+    Args:
+        label_folder: Directory holding ``<six-digit index>.txt`` files.
+        image_ids: ``None`` to use every matching file in the folder
+            (sorted by index), a list of indices, or a non-list value ``n``
+            that is expanded to ``range(n)``.
+    Returns:
+        A list with one annotation dict per frame, in index order.
+    """
+    if image_ids is None:
+        name_pattern = re.compile(r'^\d{6}.txt$')
+        image_ids = sorted(
+            int(candidate.stem)
+            for candidate in pathlib.Path(label_folder).glob('*.txt')
+            if name_pattern.match(candidate.name))
+    if not isinstance(image_ids, list):
+        image_ids = list(range(image_ids))
+    folder = pathlib.Path(label_folder)
+    return [
+        get_label_anno(folder / (get_image_index_str(idx) + '.txt'))
+        for idx in image_ids
+    ]
+def area(boxes, add1=False):
+    """Compute the area of each box.
+
+    Args:
+        boxes: Numpy array with shape [N, 4] holding N boxes
+        add1: If true, add 1.0 to each side length before multiplying
+    Returns:
+        a numpy array with shape [N] holding the box areas
+    """
+    extent0 = boxes[:, 2] - boxes[:, 0]
+    extent1 = boxes[:, 3] - boxes[:, 1]
+    if add1:
+        return (extent0 + 1.0) * (extent1 + 1.0)
+    return extent0 * extent1
+def intersection(boxes1, boxes2, add1=False):
+    """Compute pairwise intersection areas between two box sets.
+
+    Args:
+        boxes1: a numpy array with shape [N, 4] holding N boxes
+        boxes2: a numpy array with shape [M, 4] holding M boxes
+        add1: If true, add 1.0 to each overlap extent before clamping
+    Returns:
+        a numpy array with shape [N, M] holding pairwise intersection areas
+    """
+    y_min1, x_min1, y_max1, x_max1 = np.split(boxes1, 4, axis=1)
+    y_min2, x_min2, y_max2, x_max2 = np.split(boxes2, 4, axis=1)
+    def _pairwise_overlap(lo1, hi1, lo2, hi2):
+        # [N, 1] against [1, M] broadcasts to all N x M pairs.
+        upper = np.minimum(hi1, np.transpose(hi2))
+        lower = np.maximum(lo1, np.transpose(lo2))
+        if add1:
+            upper += 1.0
+        # Clamp at zero so disjoint pairs contribute no overlap.
+        return np.maximum(np.zeros(lower.shape), upper - lower)
+    intersect_heights = _pairwise_overlap(y_min1, y_max1, y_min2, y_max2)
+    intersect_widths = _pairwise_overlap(x_min1, x_max1, x_min2, x_max2)
+    return intersect_heights * intersect_widths
+def iou(boxes1, boxes2, add1=False):
+    """Compute pairwise intersection-over-union between two box sets.
+
+    Args:
+        boxes1: a numpy array with shape [N, 4] holding N boxes.
+        boxes2: a numpy array with shape [M, 4] holding M boxes.
+        add1: Forwarded to ``intersection`` and ``area``.
+    Returns:
+        a numpy array with shape [N, M] holding pairwise iou scores.
+    """
+    overlap = intersection(boxes1, boxes2, add1)
+    areas1 = np.expand_dims(area(boxes1, add1), axis=1)
+    areas2 = np.expand_dims(area(boxes2, add1), axis=0)
+    return overlap / (areas1 + areas2 - overlap)
--- a/pcdet/datasets/kitti/kitti_object_eval_python/rotate_iou.py
+++ b/pcdet/datasets/kitti/kitti_object_eval_python/rotate_iou.py
+#####################
+# Based on https://github.com/hongzhenwang/RRPN-revise
+# Licensed under The MIT License
+# Author: yanyan, scrin@foxmail.com
+#####################
+import math
+import numba
+import numpy as np
+from numba import cuda
+@numba.jit(nopython=True)
+def div_up(m, n):
+    # Ceiling division: the quotient plus one when there is a remainder.
+    return m // n + (m % n > 0)
+@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
+def trangle_area(a, b, c):
+    # Signed area of triangle (a, b, c): half the 2D cross product of
+    # (a - c) and (b - c). The sign depends on the vertex order.
+    return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) *
+            (b[0] - c[0])) / 2.0
+@cuda.jit('(float32[:], int32)', device=True, inline=True)
+def area(int_pts, num_of_inter):
+    # Area of the polygon stored as interleaved (x, y) pairs in int_pts,
+    # computed as a triangle fan anchored at the first point. Yields 0.0
+    # when there are fewer than three points.
+    area_val = 0.0
+    for i in range(num_of_inter - 2):
+        area_val += abs(
+            trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4],
+                         int_pts[2 * i + 4:2 * i + 6]))
+    return area_val
+@cuda.jit('(float32[:], int32)', device=True, inline=True)
+def sort_vertex_in_convex_polygon(int_pts, num_of_inter):
+    # Reorders, in place, the first num_of_inter points of int_pts
+    # (interleaved x, y) by an angular key measured around their centroid.
+    if num_of_inter > 0:
+        # Centroid of the points.
+        center = cuda.local.array((2, ), dtype=numba.float32)
+        center[:] = 0.0
+        for i in range(num_of_inter):
+            center[0] += int_pts[2 * i]
+            center[1] += int_pts[2 * i + 1]
+        center[0] /= num_of_inter
+        center[1] /= num_of_inter
+        v = cuda.local.array((2, ), dtype=numba.float32)
+        # One sort key per point; sized for up to 16 points.
+        vs = cuda.local.array((16, ), dtype=numba.float32)
+        for i in range(num_of_inter):
+            v[0] = int_pts[2 * i] - center[0]
+            v[1] = int_pts[2 * i + 1] - center[1]
+            # NOTE(review): d is zero for a point that coincides with the
+            # centroid, which makes the divisions below divide by zero.
+            d = math.sqrt(v[0] * v[0] + v[1] * v[1])
+            v[0] = v[0] / d
+            v[1] = v[1] / d
+            # Key is the unit vector's x component; points with negative y
+            # are mapped to -2 - x so they sort below all the others.
+            if v[1] < 0:
+                v[0] = -2 - v[0]
+            vs[i] = v[0]
+        j = 0
+        temp = 0
+        # Insertion sort on the keys, moving the points along with them.
+        for i in range(1, num_of_inter):
+            if vs[i - 1] > vs[i]:
+                temp = vs[i]
+                tx = int_pts[2 * i]
+                ty = int_pts[2 * i + 1]
+                j = i
+                while j > 0 and vs[j - 1] > temp:
+                    vs[j] = vs[j - 1]
+                    int_pts[j * 2] = int_pts[j * 2 - 2]
+                    int_pts[j * 2 + 1] = int_pts[j * 2 - 1]
+                    j -= 1
+                vs[j] = temp
+                int_pts[j * 2] = tx
+                int_pts[j * 2 + 1] = ty
+@cuda.jit(
+    '(float32[:], float32[:], int32, int32, float32[:])',
+    device=True,
+    inline=True)
+def line_segment_intersection(pts1, pts2, i, j, temp_pts):
+    # Tests edge i of quadrilateral pts1 against edge j of quadrilateral
+    # pts2 (both stored as 8 interleaved x, y values). When the segments
+    # cross, writes the crossing point to temp_pts and returns True;
+    # otherwise returns False and leaves temp_pts untouched.
+    A = cuda.local.array((2, ), dtype=numba.float32)
+    B = cuda.local.array((2, ), dtype=numba.float32)
+    C = cuda.local.array((2, ), dtype=numba.float32)
+    D = cuda.local.array((2, ), dtype=numba.float32)
+    # Segment AB is edge i of pts1; `% 4` wraps the last edge to corner 0.
+    A[0] = pts1[2 * i]
+    A[1] = pts1[2 * i + 1]
+    B[0] = pts1[2 * ((i + 1) % 4)]
+    B[1] = pts1[2 * ((i + 1) % 4) + 1]
+    # Segment CD is edge j of pts2.
+    C[0] = pts2[2 * j]
+    C[1] = pts2[2 * j + 1]
+    D[0] = pts2[2 * ((j + 1) % 4)]
+    D[1] = pts2[2 * ((j + 1) % 4) + 1]
+    BA0 = B[0] - A[0]
+    BA1 = B[1] - A[1]
+    DA0 = D[0] - A[0]
+    CA0 = C[0] - A[0]
+    DA1 = D[1] - A[1]
+    CA1 = C[1] - A[1]
+    # Orientation tests: A and B must lie on opposite sides of CD ...
+    acd = DA1 * CA0 > CA1 * DA0
+    bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
+    if acd != bcd:
+        # ... and C and D on opposite sides of AB.
+        abc = CA1 * BA0 > BA1 * CA0
+        abd = DA1 * BA0 > BA1 * DA0
+        if abc != abd:
+            # Solve the two line equations for the crossing point.
+            DC0 = D[0] - C[0]
+            DC1 = D[1] - C[1]
+            ABBA = A[0] * B[1] - B[0] * A[1]
+            CDDC = C[0] * D[1] - D[0] * C[1]
+            DH = BA1 * DC0 - BA0 * DC1
+            Dx = ABBA * DC0 - BA0 * CDDC
+            Dy = ABBA * DC1 - BA1 * CDDC
+            temp_pts[0] = Dx / DH
+            temp_pts[1] = Dy / DH
+            return True
+    return False
+@cuda.jit(
+    '(float32[:], float32[:], int32, int32, float32[:])',
+    device=True,
+    inline=True)
+def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
+    # Alternative segment-crossing test based on signed triangle areas.
+    # Same contract as line_segment_intersection: writes the crossing point
+    # to temp_pts and returns True, or returns False.
+    # NOTE(review): no caller is visible in this part of the file.
+    a = cuda.local.array((2, ), dtype=numba.float32)
+    b = cuda.local.array((2, ), dtype=numba.float32)
+    c = cuda.local.array((2, ), dtype=numba.float32)
+    d = cuda.local.array((2, ), dtype=numba.float32)
+    a[0] = pts1[2 * i]
+    a[1] = pts1[2 * i + 1]
+    b[0] = pts1[2 * ((i + 1) % 4)]
+    b[1] = pts1[2 * ((i + 1) % 4) + 1]
+    c[0] = pts2[2 * j]
+    c[1] = pts2[2 * j + 1]
+    d[0] = pts2[2 * ((j + 1) % 4)]
+    d[1] = pts2[2 * ((j + 1) % 4) + 1]
+    area_abc = trangle_area(a, b, c)
+    area_abd = trangle_area(a, b, d)
+    # Same sign (or zero): c and d are not strictly on opposite sides of ab.
+    if area_abc * area_abd >= 0:
+        return False
+    area_cda = trangle_area(c, d, a)
+    # Derived from the other three areas instead of a fourth evaluation.
+    area_cdb = area_cda + area_abc - area_abd
+    if area_cda * area_cdb >= 0:
+        return False
+    # Fraction of the way from a to b at which the crossing lies.
+    t = area_cda / (area_abd - area_abc)
+    dx = t * (b[0] - a[0])
+    dy = t * (b[1] - a[1])
+    temp_pts[0] = a[0] + dx
+    temp_pts[1] = a[1] + dy
+    return True
+@cuda.jit('(float32, float32, float32[:])', device=True, inline=True)
+def point_in_quadrilateral(pt_x, pt_y, corners):
+    # Returns True when point P = (pt_x, pt_y) projects inside both edges
+    # that leave corner 0: AB (corner 0 -> 1) and AD (corner 0 -> 3).
+    # NOTE(review): this is an inside test only if AB and AD are
+    # perpendicular, i.e. the quadrilateral is a rectangle - confirm callers.
+    ab0 = corners[2] - corners[0]
+    ab1 = corners[3] - corners[1]
+    ad0 = corners[6] - corners[0]
+    ad1 = corners[7] - corners[1]
+    ap0 = pt_x - corners[0]
+    ap1 = pt_y - corners[1]
+    # Dot products: |AB|^2, AB.AP, |AD|^2, AD.AP.
+    abab = ab0 * ab0 + ab1 * ab1
+    abap = ab0 * ap0 + ab1 * ap1
+    adad = ad0 * ad0 + ad1 * ad1
+    adap = ad0 * ap0 + ad1 * ap1
+    return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
+@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
+def quadrilateral_intersection(pts1, pts2, int_pts):
+    # Collects candidate vertices of the overlap of two quadrilaterals into
+    # int_pts (interleaved x, y) and returns how many were written. The
+    # points are appended in discovery order, not sorted.
+    # NOTE(review): writes are not bounds-checked against len(int_pts).
+    num_of_inter = 0
+    # Corners of either quadrilateral that lie inside the other one.
+    for i in range(4):
+        if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2):
+            int_pts[num_of_inter * 2] = pts1[2 * i]
+            int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1]
+            num_of_inter += 1
+        if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1):
+            int_pts[num_of_inter * 2] = pts2[2 * i]
+            int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1]
+            num_of_inter += 1
+    temp_pts = cuda.local.array((2, ), dtype=numba.float32)
+    # Crossing points of every edge pair (4 x 4 combinations).
+    for i in range(4):
+        for j in range(4):
+            has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts)
+            if has_pts:
+                int_pts[num_of_inter * 2] = temp_pts[0]
+                int_pts[num_of_inter * 2 + 1] = temp_pts[1]
+                num_of_inter += 1
+    return num_of_inter
+@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
+def rbbox_to_corners(corners, rbbox):
+    # Writes the four corners of rbbox into `corners` as 8 interleaved
+    # (x, y) values. rbbox is read as
+    # [center_x, center_y, x extent, y extent, angle].
+    # generate clockwise corners and rotate it clockwise
+    angle = rbbox[4]
+    a_cos = math.cos(angle)
+    a_sin = math.sin(angle)
+    center_x = rbbox[0]
+    center_y = rbbox[1]
+    x_d = rbbox[2]
+    y_d = rbbox[3]
+    corners_x = cuda.local.array((4, ), dtype=numba.float32)
+    corners_y = cuda.local.array((4, ), dtype=numba.float32)
+    # Axis-aligned corners centred on the origin.
+    corners_x[0] = -x_d / 2
+    corners_x[1] = -x_d / 2
+    corners_x[2] = x_d / 2
+    corners_x[3] = x_d / 2
+    corners_y[0] = -y_d / 2
+    corners_y[1] = y_d / 2
+    corners_y[2] = y_d / 2
+    corners_y[3] = -y_d / 2
+    # Rotate each corner by `angle`, then translate to the box centre.
+    for i in range(4):
+        corners[2 *
+                i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x
+        corners[2 * i
+                + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y
+@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
+def inter(rbbox1, rbbox2):
+    # Returns the overlap area of two rotated boxes: convert both to
+    # corners, gather the overlap polygon's vertices, order them, then sum
+    # the triangle-fan area.
+    corners1 = cuda.local.array((8, ), dtype=numba.float32)
+    corners2 = cuda.local.array((8, ), dtype=numba.float32)
+    # Room for 8 (x, y) points.
+    intersection_corners = cuda.local.array((16, ), dtype=numba.float32)
+    rbbox_to_corners(corners1, rbbox1)
+    rbbox_to_corners(corners2, rbbox2)
+    num_intersection = quadrilateral_intersection(corners1, corners2,
+                                                  intersection_corners)
+    sort_vertex_in_convex_polygon(intersection_corners, num_intersection)
+    # print(intersection_corners.reshape([-1, 2])[:num_intersection])
+    return area(intersection_corners, num_intersection)
+@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True)
+def devRotateIoUEval(rbox1, rbox2, criterion=-1):
+    # Overlap measure between two rotated boxes, selected by `criterion`:
+    #   -1: intersection over union
+    #    0: intersection over the area of rbox1
+    #    1: intersection over the area of rbox2
+    #    anything else: the raw intersection area
+    area1 = rbox1[2] * rbox1[3]
+    area2 = rbox2[2] * rbox2[3]
+    area_inter = inter(rbox1, rbox2)
+    if criterion == -1:
+        return area_inter / (area1 + area2 - area_inter)
+    elif criterion == 0:
+        return area_inter / area1
+    elif criterion == 1:
+        return area_inter / area2
+    else:
+        return area_inter
+@cuda.jit('(int64, int64, float32[:], float32[:], float32[:], int32)', fastmath=False)
+def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1):
+    # CUDA kernel filling dev_iou, a flattened N x K array, with the overlap
+    # measure between every box and every query box. Boxes are flattened
+    # with 5 values each. Each block covers a tile of up to 64 boxes by 64
+    # query boxes; blockIdx.x picks the box tile, blockIdx.y the query tile.
+    threadsPerBlock = 8 * 8
+    row_start = cuda.blockIdx.x
+    col_start = cuda.blockIdx.y
+    tx = cuda.threadIdx.x
+    # The last tile in each direction may be only partly filled.
+    row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
+    col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
+    # Shared staging buffers: 64 boxes x 5 values for each side of the tile.
+    block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+    block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
+    dev_query_box_idx = threadsPerBlock * col_start + tx
+    dev_box_idx = threadsPerBlock * row_start + tx
+    # Each thread copies at most one query box and one box into shared memory.
+    if (tx < col_size):
+        block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
+        block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
+        block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
+        block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
+        block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
+    if (tx < row_size):
+        block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
+        block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
+        block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
+        block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
+        block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
+    # All loads must be complete before any thread reads the shared tiles.
+    cuda.syncthreads()
+    if tx < row_size:
+        # Thread tx owns one box of the tile and loops over the query boxes.
+        for i in range(col_size):
+            # Flat row-major index of (box row, query column) in the N x K output.
+            offset = row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i
+            # Note the argument order: the query box is rbox1, the box is rbox2.
+            dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
+                                           block_boxes[tx * 5:tx * 5 + 5], criterion)
+def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
+    """rotated box iou running in gpu. 500x faster than cpu version
+    (take 5ms in one example with numba.cuda code).
+    convert from [this project](
+        https://github.com/hongzhenwang/RRPN-revise/tree/master/pcdet/rotation).
+    Args:
+        boxes (float tensor: [N, 5]): rbboxes. format: centers, dims,
+            angles(clockwise when positive)
+        query_boxes (float tensor: [K, 5]): rbboxes in the same format.
+        criterion (int, optional): Defaults to -1. Overlap measure passed
+            to the kernel: -1 for IoU, 0 or 1 to divide the intersection by
+            the area of one of the two boxes, any other value for the raw
+            intersection area.
+        device_id (int, optional): Defaults to 0. CUDA device to run on.
+    Returns:
+        numpy array of shape [N, K] with the overlap of every box /
+        query box pair. All zeros (float32) when N or K is 0.
+    """
+    # NOTE(review): box_dtype is never read. `boxes` is rebound to a float32
+    # copy on the next line, so the final astype(boxes.dtype) always yields
+    # float32 rather than the caller's dtype - confirm which was intended.
+    box_dtype = boxes.dtype
+    boxes = boxes.astype(np.float32)
+    query_boxes = query_boxes.astype(np.float32)
+    N = boxes.shape[0]
+    K = query_boxes.shape[0]
+    iou = np.zeros((N, K), dtype=np.float32)
+    if N == 0 or K == 0:
+        return iou
+    # Must match the tile size hard-coded in rotate_iou_kernel_eval.
+    threadsPerBlock = 8 * 8
+    cuda.select_device(device_id)
+    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
+    stream = cuda.stream()
+    # Copies and the kernel launch are queued on one stream; leaving the
+    # block waits for them to finish.
+    with stream.auto_synchronize():
+        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
+        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
+        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
+        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](
+            N, K, boxes_dev, query_boxes_dev, iou_dev, criterion)
+        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
+    return iou.astype(boxes.dtype)
--- a/pcdet/models/backbones_2d/map_to_bev/__init__.py
+++ b/pcdet/models/backbones_2d/map_to_bev/__init__.py
 from .height_compression import HeightCompression
+from .pointpillar_scatter import PointPillarScatter
 __all__ = {
-    'HeightCompression': HeightCompression
+    'HeightCompression': HeightCompression,
+    'PointPillarScatter': PointPillarScatter
 }
--- a/pcdet/models/backbones_2d/map_to_bev/height_compression.py
+++ b/pcdet/models/backbones_2d/map_to_bev/height_compression.py
@@ -2,7 +2,7 @@ import torch.nn as nn
 class HeightCompression(nn.Module):
-    def __init__(self, model_cfg):
+    def __init__(self, model_cfg, **kwargs):
        super().__init__()
        self.model_cfg = model_cfg
        self.num_bev_features = self.model_cfg.NUM_BEV_FEATURES

--- a/pcdet/models/backbones_2d/map_to_bev/pointpillar_scatter.py
+++ b/pcdet/models/backbones_2d/map_to_bev/pointpillar_scatter.py
+import torch
+import torch.nn as nn
+class PointPillarScatter(nn.Module):
+    """Scatter per-pillar feature vectors onto a dense, zero-filled BEV grid."""
+    def __init__(self, model_cfg, grid_size, **kwargs):
+        super().__init__()
+        self.model_cfg = model_cfg
+        self.num_bev_features = self.model_cfg.NUM_BEV_FEATURES
+        self.nx, self.ny, self.nz = grid_size
+        # The flat cell index computed in forward() has no z stride.
+        assert self.nz == 1
+    def forward(self, batch_dict, **kwargs):
+        """Build batch_dict['spatial_features'] from the pillar features.
+        Args:
+            batch_dict: must hold 'pillar_features' (one row per pillar) and
+                'voxel_coords' (one row per pillar; column 0 is the sample
+                index, columns 1-3 feed the flat cell index).
+        Returns:
+            The same dict with 'spatial_features' of shape
+            (batch_size, num_bev_features * nz, ny, nx) added.
+        """
+        pillar_features = batch_dict['pillar_features']
+        coords = batch_dict['voxel_coords']
+        # The largest sample index present fixes the batch size.
+        batch_size = coords[:, 0].max().int().item() + 1
+        cells_per_sample = self.nz * self.nx * self.ny
+        canvases = []
+        for sample_idx in range(batch_size):
+            canvas = torch.zeros(
+                self.num_bev_features,
+                cells_per_sample,
+                dtype=pillar_features.dtype,
+                device=pillar_features.device)
+            in_sample = coords[:, 0] == sample_idx
+            sample_coords = coords[in_sample, :]
+            flat_idx = sample_coords[:, 1] + sample_coords[:, 2] * self.nx + sample_coords[:, 3]
+            flat_idx = flat_idx.type(torch.long)
+            # Features are stored channel-first, one column per grid cell.
+            canvas[:, flat_idx] = pillar_features[in_sample, :].t()
+            canvases.append(canvas)
+        stacked = torch.stack(canvases, 0)
+        batch_dict['spatial_features'] = stacked.view(
+            batch_size, self.num_bev_features * self.nz, self.ny, self.nx)
+        return batch_dict
--- a/pcdet/models/backbones_3d/vfe/__init__.py
+++ b/pcdet/models/backbones_3d/vfe/__init__.py
 from .vfe_template import VFETemplate
 from .mean_vfe import MeanVFE
+from .pillar_vfe import PillarVFE
 __all__ = {
    'VFETemplate': VFETemplate,
-    'MeanVFE': MeanVFE
+    'MeanVFE': MeanVFE,
+    'PillarVFE': PillarVFE
 }
--- a/pcdet/models/backbones_3d/vfe/pillar_vfe.py
+++ b/pcdet/models/backbones_3d/vfe/pillar_vfe.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .vfe_template import VFETemplate
+class PFNLayer(nn.Module):
+    """Linear (+ optional BatchNorm1d) + ReLU, followed by a max over dim 1."""
+    def __init__(self, in_channels, out_channels, use_norm=True, last_layer=False):
+        super().__init__()
+        self.last_vfe = last_layer
+        self.use_norm = use_norm
+        # A non-final layer concatenates the pooled feature back onto its
+        # output, so it only produces half of the channels itself.
+        if not self.last_vfe:
+            out_channels = out_channels // 2
+        # The bias is dropped when a norm layer follows the linear layer.
+        self.linear = nn.Linear(in_channels, out_channels, bias=not self.use_norm)
+        if self.use_norm:
+            self.norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01)
+    def forward(self, inputs):
+        """Return the pooled feature (last layer) or [x, pooled] concatenated on dim 2."""
+        x = self.linear(inputs)
+        total_points, voxel_points, channels = x.shape
+        if self.use_norm:
+            # BatchNorm1d runs over the flattened (dim0 * dim1, channels) view.
+            x = self.norm(x.view(-1, channels)).view(total_points, voxel_points, channels)
+        x = F.relu(x)
+        x_max = x.max(dim=1, keepdim=True)[0]
+        if self.last_vfe:
+            return x_max
+        pooled_per_point = x_max.repeat(1, inputs.shape[1], 1)
+        return torch.cat([x, pooled_per_point], dim=2)
+class PillarVFE(VFETemplate):
+    """Pillar feature encoder: decorates the points of every pillar with
+    offsets and pools them into one feature vector per pillar."""
+    def __init__(self, model_cfg, num_point_features, voxel_size, point_cloud_range):
+        super().__init__(model_cfg=model_cfg)
+        self.use_norm = self.model_cfg.USE_NORM
+        self.with_distance = self.model_cfg.WITH_DISTANCE
+        # NOTE(review): the key is spelled 'USE_ABSLOTE_XYZ' here; presumably
+        # the config files use the same spelling, so it is left untouched.
+        self.use_absolute_xyz = self.model_cfg.USE_ABSLOTE_XYZ
+        # f_cluster and f_center add 3 channels each; without absolute xyz
+        # the first 3 input channels are dropped again, for a net gain of 3.
+        num_point_features += 6 if self.use_absolute_xyz else 3
+        if self.with_distance:
+            num_point_features += 1
+        self.num_filters = self.model_cfg.NUM_FILTERS
+        assert len(self.num_filters) > 0
+        num_filters = [num_point_features] + list(self.num_filters)
+        pfn_layers = []
+        for i in range(len(num_filters) - 1):
+            in_filters = num_filters[i]
+            out_filters = num_filters[i + 1]
+            pfn_layers.append(
+                PFNLayer(in_filters, out_filters, self.use_norm, last_layer=(i >= len(num_filters) - 2))
+            )
+        self.pfn_layers = nn.ModuleList(pfn_layers)
+        self.voxel_x = voxel_size[0]
+        self.voxel_y = voxel_size[1]
+        self.voxel_z = voxel_size[2]
+        # Offset of the centre of cell 0 along each axis.
+        self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
+        self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
+        self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
+    def get_output_feature_dim(self):
+        """Return the channel count of the last PFN layer."""
+        return self.num_filters[-1]
+    def get_paddings_indicator(self, actual_num, max_num, axis=0):
+        """Return a boolean mask that is True for the first ``actual_num[i]``
+        of ``max_num`` slots along the dimension after ``axis``."""
+        actual_num = torch.unsqueeze(actual_num, axis + 1)
+        max_num_shape = [1] * len(actual_num.shape)
+        max_num_shape[axis + 1] = -1
+        max_num = torch.arange(max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape)
+        paddings_indicator = actual_num.int() > max_num
+        return paddings_indicator
+    def forward(self, batch_dict, **kwargs):
+        """Encode batch_dict['voxels'] into batch_dict['pillar_features'].
+        Args:
+            batch_dict: must hold 'voxels' (indexed as pillar, point, channel
+                with the first 3 channels used as x, y, z),
+                'voxel_num_points' and 'voxel_coords' (columns 3, 2, 1 are
+                used as the x, y, z cell indices).
+        Returns:
+            The same dict with 'pillar_features' added, one row per pillar.
+        """
+        voxel_features, voxel_num_points, coords = batch_dict['voxels'], batch_dict['voxel_num_points'], batch_dict['voxel_coords']
+        # Offset of every point from the mean of the points in its pillar.
+        points_mean = voxel_features[:, :, :3].sum(dim=1, keepdim=True) / voxel_num_points.type_as(voxel_features).view(-1, 1, 1)
+        f_cluster = voxel_features[:, :, :3] - points_mean
+        # Offset of every point from the centre of its cell.
+        f_center = torch.zeros_like(voxel_features[:, :, :3])
+        f_center[:, :, 0] = voxel_features[:, :, 0] - (coords[:, 3].to(voxel_features.dtype).unsqueeze(1) * self.voxel_x + self.x_offset)
+        f_center[:, :, 1] = voxel_features[:, :, 1] - (coords[:, 2].to(voxel_features.dtype).unsqueeze(1) * self.voxel_y + self.y_offset)
+        f_center[:, :, 2] = voxel_features[:, :, 2] - (coords[:, 1].to(voxel_features.dtype).unsqueeze(1) * self.voxel_z + self.z_offset)
+        if self.use_absolute_xyz:
+            features = [voxel_features, f_cluster, f_center]
+        else:
+            features = [voxel_features[..., 3:], f_cluster, f_center]
+        if self.with_distance:
+            points_dist = torch.norm(voxel_features[:, :, :3], 2, 2, keepdim=True)
+            features.append(points_dist)
+        features = torch.cat(features, dim=-1)
+        # Zero the slots beyond each pillar's point count so the decorated
+        # padding does not leak into the pooled feature.
+        voxel_count = features.shape[1]
+        mask = self.get_paddings_indicator(voxel_num_points, voxel_count, axis=0)
+        mask = torch.unsqueeze(mask, -1).type_as(voxel_features)
+        features *= mask
+        for pfn in self.pfn_layers:
+            features = pfn(features)
+        # The last PFN layer keeps a size-1 point dimension. Squeeze only
+        # that one: a bare squeeze() would also drop the pillar dimension
+        # when there is a single pillar.
+        features = features.squeeze(dim=1)
+        batch_dict['pillar_features'] = features
+        return batch_dict
--- a/pcdet/models/dense_heads/__init__.py
+++ b/pcdet/models/dense_heads/__init__.py
@@ -2,11 +2,12 @@ from .anchor_head_template import AnchorHeadTemplate
 from .anchor_head_single import AnchorHeadSingle
 from .point_intra_part_head import PointIntraPartOffsetHead
 from .point_head_simple import PointHeadSimple
+from .anchor_head_multi import AnchorHeadMulti
 __all__ = {
    'AnchorHeadTemplate': AnchorHeadTemplate,
    'AnchorHeadSingle': AnchorHeadSingle,
    'PointIntraPartOffsetHead': PointIntraPartOffsetHead,
-    'PointHeadSimple': PointHeadSimple
+    'PointHeadSimple': PointHeadSimple,
+    'AnchorHeadMulti': AnchorHeadMulti,
 }
--- a/pcdet/models/dense_heads/anchor_head_multi.py
+++ b/pcdet/models/dense_heads/anchor_head_multi.py
+import numpy as np
+import torch.nn as nn
+from .anchor_head_template import AnchorHeadTemplate
+import torch
+class SingleHead(nn.Module):
+    """One prediction branch: optional 1x1 encoder followed by 1x1 conv heads
+    for class scores, box codes and (optionally) direction bins."""
+    def __init__(self, model_cfg, input_channels, num_class, num_anchors_per_location, code_size, encode_conv_cfg=None):
+        """
+        Args:
+            model_cfg: config with dict-style ``get`` and attribute access;
+                reads USE_DIRECTION_CLASSIFIER, NUM_DIR_BINS, USE_MULTI_HEAD.
+            input_channels: channels of the incoming feature map.
+            num_class: class scores predicted per anchor.
+            num_anchors_per_location: anchors predicted per feature-map cell.
+            code_size: box regression values predicted per anchor.
+            encode_conv_cfg: optional mapping with 'stride' and 'layer_num'
+                describing the encoder placed in front of the heads.
+        """
+        super().__init__()
+        if encode_conv_cfg is not None:
+            stride = encode_conv_cfg['stride']
+            layer_num = encode_conv_cfg['layer_num']
+            num_filters = input_channels
+            # First conv applies the stride; the remaining layer_num - 1
+            # blocks are conv + BN + ReLU.
+            encode_conv = [nn.Conv2d(num_filters, num_filters, kernel_size=1, stride=stride, bias=False)]
+            for _ in range(layer_num - 1):
+                encode_conv.append(nn.Conv2d(num_filters, num_filters, 1, bias=False))
+                encode_conv.append(nn.BatchNorm2d(num_filters))
+                encode_conv.append(nn.ReLU(inplace=True))
+            self.encode_conv = nn.Sequential(*encode_conv)
+        else:
+            self.encode_conv = None
+        self.num_anchors_per_location = num_anchors_per_location
+        self.num_class = num_class
+        self.code_size = code_size
+        self.model_cfg = model_cfg
+        self.conv_cls = nn.Conv2d(
+            input_channels, self.num_anchors_per_location * self.num_class,
+            kernel_size=1
+        )
+        self.conv_box = nn.Conv2d(
+            input_channels, self.num_anchors_per_location * self.code_size,
+            kernel_size=1
+        )
+        # Test the flag's value, not just its presence: an explicit False
+        # must not create a conv whose output is never consumed.
+        if self.model_cfg.get('USE_DIRECTION_CLASSIFIER', False):
+            self.conv_dir_cls = nn.Conv2d(
+                input_channels,
+                self.num_anchors_per_location * self.model_cfg.NUM_DIR_BINS,
+                kernel_size=1
+            )
+        else:
+            self.conv_dir_cls = None
+        self.use_multihead = self.model_cfg.get('USE_MULTI_HEAD', False)
+        self.init_weights()
+    def init_weights(self):
+        """Set the classification bias to -log((1 - pi) / pi) with pi = 0.01."""
+        pi = 0.01
+        nn.init.constant_(self.conv_cls.bias, -np.log((1 - pi) / pi))
+    def forward(self, spatial_features_2d):
+        """Return a dict with 'cls_preds', 'box_preds' and 'dir_cls_preds'
+        ('dir_cls_preds' is None when the direction branch is disabled)."""
+        ret_dict = {}
+        if self.encode_conv is not None:
+            spatial_features_2d = self.encode_conv(spatial_features_2d)
+        cls_preds = self.conv_cls(spatial_features_2d)
+        box_preds = self.conv_box(spatial_features_2d)
+        if not self.use_multihead:
+            # Channels-last layout: (B, H, W, C).
+            box_preds = box_preds.permute(0, 2, 3, 1).contiguous()
+            cls_preds = cls_preds.permute(0, 2, 3, 1).contiguous()
+        else:
+            # Flatten to one row per anchor, ordered anchor-major then H, W.
+            H, W = box_preds.shape[2:]
+            batch_size = box_preds.shape[0]
+            box_preds = box_preds.view(-1, self.num_anchors_per_location,
+                                       self.code_size, H, W).permute(0, 1, 3, 4, 2).contiguous()
+            cls_preds = cls_preds.view(-1, self.num_anchors_per_location,
+                                       self.num_class, H, W).permute(0, 1, 3, 4, 2).contiguous()
+            box_preds = box_preds.view(batch_size, -1, self.code_size)
+            cls_preds = cls_preds.view(batch_size, -1, self.num_class).unsqueeze(-1)
+        if self.conv_dir_cls is not None:
+            dir_cls_preds = self.conv_dir_cls(spatial_features_2d)
+            if self.use_multihead:
+                dir_cls_preds = dir_cls_preds.view(
+                    -1, self.num_anchors_per_location, self.model_cfg.NUM_DIR_BINS, H, W).permute(0, 1, 3, 4, 2).contiguous()
+                dir_cls_preds = dir_cls_preds.view(batch_size, -1, self.model_cfg.NUM_DIR_BINS)
+            else:
+                dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).contiguous()
+        else:
+            dir_cls_preds = None
+        ret_dict['cls_preds'] = cls_preds
+        ret_dict['box_preds'] = box_preds
+        ret_dict['dir_cls_preds'] = dir_cls_preds
+        return ret_dict
+class AnchorHeadMulti(AnchorHeadTemplate):
+    """Anchor head built from several SingleHead branches that all read the
+    same 2D feature map; their predictions are concatenated along dim 1."""
+    def __init__(self, model_cfg, input_channels, num_class, grid_size, point_cloud_range, predict_boxes_when_training=True):
+        super().__init__(
+            model_cfg=model_cfg,
+            num_class=num_class,
+            grid_size=grid_size,
+            point_cloud_range=point_cloud_range,
+            predict_boxes_when_training=predict_boxes_when_training,
+        )
+        self.model_cfg = model_cfg
+        self.make_multihead(input_channels)
+    def make_multihead(self, input_channels):
+        """Create one SingleHead per entry of model_cfg.RPN_HEAD_CFGS."""
+        head_cfgs = self.model_cfg.RPN_HEAD_CFGS
+        # Class names of all heads, in head order; a name's position selects
+        # its entry in self.num_anchors_per_location.
+        all_names = [name for head_cfg in head_cfgs for name in head_cfg['head_cls_name']]
+        heads = []
+        for head_cfg in head_cfgs:
+            anchors_here = sum(
+                self.num_anchors_per_location[all_names.index(name)]
+                for name in head_cfg['head_cls_name']
+            )
+            heads.append(
+                SingleHead(self.model_cfg, input_channels, self.num_class, anchors_here, self.box_coder.code_size, head_cfg)
+            )
+        self.rpn_heads = nn.ModuleList(heads)
+    def forward(self, data_dict):
+        """Run every head; assign targets when training, decode boxes otherwise."""
+        spatial_features_2d = data_dict['spatial_features_2d']
+        head_outputs = [head(spatial_features_2d) for head in self.rpn_heads]
+        cls_preds = torch.cat([out['cls_preds'] for out in head_outputs], dim=1)
+        box_preds = torch.cat([out['box_preds'] for out in head_outputs], dim=1)
+        ret = {
+            'cls_preds': cls_preds,
+            'box_preds': box_preds,
+        }
+        dir_cls_preds = None
+        if self.model_cfg.get('USE_DIRECTION_CLASSIFIER', False):
+            dir_cls_preds = torch.cat([out['dir_cls_preds'] for out in head_outputs], dim=1)
+            ret['dir_cls_preds'] = dir_cls_preds
+        self.forward_ret_dict.update(ret)
+        if self.training:
+            targets_dict = self.assign_targets(gt_boxes=data_dict['gt_boxes'])
+            self.forward_ret_dict.update(targets_dict)
+            return data_dict
+        batch_cls_preds, batch_box_preds = self.generate_predicted_boxes(
+            batch_size=data_dict['batch_size'],
+            cls_preds=cls_preds, box_preds=box_preds, dir_cls_preds=dir_cls_preds
+        )
+        data_dict['batch_cls_preds'] = batch_cls_preds
+        data_dict['batch_box_preds'] = batch_box_preds
+        data_dict['cls_preds_normalized'] = False
+        return data_dict
--- a/pcdet/models/dense_heads/anchor_head_template.py
+++ b/pcdet/models/dense_heads/anchor_head_template.py
@@ -3,6 +3,7 @@ import torch
 import torch.nn as nn
 from .target_assigner.anchor_generator import AnchorGenerator
 from .target_assigner.atss_target_assigner import ATSSTargetAssigner
+from .target_assigner.axis_aligned_target_assigner import AxisAlignedTargetAssigner
 from ...utils import box_coder_utils, loss_utils, common_utils
@@ -45,8 +46,8 @@ class AnchorHeadTemplate(nn.Module):
                box_coder=self.box_coder,
                match_height=anchor_target_cfg.MATCH_HEIGHT
            )
-        elif anchor_target_cfg.NAME == 'Second':
+        elif anchor_target_cfg.NAME == 'AxisAlignedTargetAssigner':
-            target_assigner = SecondTargetAssigner(
+            target_assigner = AxisAlignedTargetAssigner(
                anchor_target_cfg=anchor_target_cfg,
                box_coder=self.box_coder,
                match_height=anchor_target_cfg.MATCH_HEIGHT

--- a/pcdet/models/dense_heads/target_assigner/axis_aligned_target_assigner.py
+++ b/pcdet/models/dense_heads/target_assigner/axis_aligned_target_assigner.py
+import torch
+from ....utils import box_utils
+from ....ops.iou3d_nms import iou3d_nms_utils
+class AxisAlignedTargetAssigner(object):
+    """Assign classification labels and box regression targets to anchors
+    by IoU matching, handling one anchor set per class."""
+    def __init__(self, anchor_target_cfg, box_coder, match_height=False):
+        super().__init__()
+        self.box_coder = box_coder
+        self.match_height = match_height
+        # A negative POS_FRACTION switches foreground/background subsampling off.
+        self.pos_fraction = anchor_target_cfg.POS_FRACTION if anchor_target_cfg.POS_FRACTION >= 0 else None
+        self.sample_size = anchor_target_cfg.SAMPLE_SIZE
+        self.matched_thresholds = anchor_target_cfg.MATCHED_THRESHOLDS
+        self.unmatched_thresholds = anchor_target_cfg.UNMATCHED_THRESHOLDS
+        self.norm_by_num_examples = anchor_target_cfg.NORM_BY_NUM_EXAMPLES
+    def assign_targets(self, all_anchors, gt_boxes_with_classes, use_multihead=False):
+        """
+        Args:
+            all_anchors: list with one anchor tensor per class. Entry ``i`` is
+                matched only against ground truth whose class id is ``i + 1``.
+                The last dim holds the box parameters; the first three dims
+                are used as the feature map size when not in multi-head mode.
+            gt_boxes_with_classes: (B, M, 8), 7 box values followed by the
+                class id. Trailing all-zero rows are treated as padding.
+            use_multihead: selects the multi-head anchor flattening order.
+        Returns:
+            dict with 'box_cls_labels' (B, num_anchors), 'box_reg_targets'
+            (B, num_anchors, code_size) and 'reg_weights' (B, num_anchors).
+        """
+        bbox_targets = []
+        cls_labels = []
+        reg_weights = []
+        batch_size = gt_boxes_with_classes.shape[0]
+        gt_classes = gt_boxes_with_classes[:, :, 7]
+        gt_boxes = gt_boxes_with_classes[:, :, :7]
+        for k in range(batch_size):
+            cur_gt = gt_boxes[k]
+            # Strip zero padding from the end; at least one row is always kept.
+            cnt = cur_gt.__len__() - 1
+            while cnt > 0 and cur_gt[cnt].sum() == 0:
+                cnt -= 1
+            cur_gt = cur_gt[:cnt + 1]
+            cur_gt_classes = gt_classes[k][:cnt + 1].int()
+            target_list = []
+            for class_index, anchors in enumerate(all_anchors):
+                # Vectorised class filter (class ids are 1-based).
+                mask = cur_gt_classes == class_index + 1
+                if use_multihead:
+                    anchors = anchors.permute(3, 4, 0, 1, 2, 5).contiguous().view(-1, anchors.shape[-1])
+                else:
+                    feature_map_size = anchors.shape[:3]
+                    anchors = anchors.view(-1, anchors.shape[-1])
+                single_target = self.assign_targets_single(
+                    anchors,
+                    cur_gt[mask],
+                    gt_classes=cur_gt_classes[mask],
+                    matched_threshold=self.matched_thresholds[class_index],
+                    unmatched_threshold=self.unmatched_thresholds[class_index]
+                )
+                target_list.append(single_target)
+            if use_multihead:
+                # Per-class targets are laid end to end.
+                target_dict = {
+                    'box_cls_labels': [t['box_cls_labels'].view(-1) for t in target_list],
+                    'box_reg_targets': [t['box_reg_targets'].view(-1, self.box_coder.code_size) for t in target_list],
+                    'reg_weights': [t['reg_weights'].view(-1) for t in target_list]
+                }
+                target_dict['box_reg_targets'] = torch.cat(target_dict['box_reg_targets'], dim=0)
+                target_dict['box_cls_labels'] = torch.cat(target_dict['box_cls_labels'], dim=0).view(-1)
+                target_dict['reg_weights'] = torch.cat(target_dict['reg_weights'], dim=0).view(-1)
+            else:
+                # Per-class targets are interleaved per feature-map location.
+                target_dict = {
+                    'box_cls_labels': [t['box_cls_labels'].view(*feature_map_size, -1) for t in target_list],
+                    'box_reg_targets': [t['box_reg_targets'].view(*feature_map_size, -1, self.box_coder.code_size) for t in target_list],
+                    'reg_weights': [t['reg_weights'].view(*feature_map_size, -1) for t in target_list]
+                }
+                target_dict['box_reg_targets'] = torch.cat(target_dict['box_reg_targets'], dim=-2).view(-1, self.box_coder.code_size)
+                target_dict['box_cls_labels'] = torch.cat(target_dict['box_cls_labels'], dim=-1).view(-1)
+                target_dict['reg_weights'] = torch.cat(target_dict['reg_weights'], dim=-1).view(-1)
+            bbox_targets.append(target_dict['box_reg_targets'])
+            cls_labels.append(target_dict['box_cls_labels'])
+            reg_weights.append(target_dict['reg_weights'])
+        bbox_targets = torch.stack(bbox_targets, dim=0)
+        cls_labels = torch.stack(cls_labels, dim=0)
+        reg_weights = torch.stack(reg_weights, dim=0)
+        all_targets_dict = {
+            'box_cls_labels': cls_labels,
+            'box_reg_targets': bbox_targets,
+            'reg_weights': reg_weights
+        }
+        return all_targets_dict
+    def assign_targets_single(self, anchors,
+                         gt_boxes,
+                         gt_classes,
+                         matched_threshold=0.6,
+                         unmatched_threshold=0.45
+                        ):
+        """Match one flattened anchor set against the boxes of one class.
+        Args:
+            anchors: (num_anchors, box_dim) anchors.
+            gt_boxes: (num_gt, box_dim) ground-truth boxes, may be empty.
+            gt_classes: (num_gt,) class ids written into the labels.
+            matched_threshold: anchors whose best overlap is at least this
+                value become foreground.
+            unmatched_threshold: anchors whose best overlap is below this
+                value become background.
+        Returns:
+            dict with 'box_cls_labels' (-1 ignored, 0 background, >0 class
+            id), 'box_reg_targets' and 'reg_weights'.
+        """
+        num_anchors = anchors.shape[0]
+        num_gt = gt_boxes.shape[0]
+        device = anchors.device
+        labels = torch.ones((num_anchors,), dtype=torch.int32, device=device) * -1
+        if len(gt_boxes) > 0 and anchors.shape[0] > 0:
+            anchor_by_gt_overlap = iou3d_nms_utils.boxes_iou3d_gpu(anchors, gt_boxes) if self.match_height else box_utils.boxes3d_nearest_bev_iou(anchors, gt_boxes)
+            anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(dim=1)
+            anchor_to_gt_max = anchor_by_gt_overlap[torch.arange(num_anchors, device=device),
+                                                    anchor_to_gt_argmax]
+            gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(dim=0)
+            gt_to_anchor_max = anchor_by_gt_overlap[
+                gt_to_anchor_argmax,
+                torch.arange(num_gt, device=device)]
+            # A box that overlaps no anchor must not force-match the zeros.
+            empty_gt_mask = gt_to_anchor_max == 0
+            gt_to_anchor_max[empty_gt_mask] = -1
+            # Every box claims the anchor(s) sharing its best overlap, even
+            # below the matched threshold.
+            anchors_with_max_overlap = torch.nonzero(
+                anchor_by_gt_overlap == gt_to_anchor_max)[:, 0]
+            gt_inds_force = anchor_to_gt_argmax[anchors_with_max_overlap]
+            labels[anchors_with_max_overlap] = gt_classes[gt_inds_force]
+            pos_inds = anchor_to_gt_max >= matched_threshold
+            gt_inds_over_thresh = anchor_to_gt_argmax[pos_inds]
+            labels[pos_inds] = gt_classes[gt_inds_over_thresh]
+            bg_inds = torch.nonzero(anchor_to_gt_max < unmatched_threshold)[:, 0]
+        else:
+            bg_inds = torch.arange(num_anchors, device=device)
+        fg_inds = torch.nonzero(labels > 0)[:, 0]
+        if self.pos_fraction is not None:
+            num_fg = int(self.pos_fraction * self.sample_size)
+            if len(fg_inds) > num_fg:
+                num_disabled = len(fg_inds) - num_fg
+                # randperm yields positions within fg_inds; map them back to
+                # anchor indices before touching ``labels``.
+                drop = torch.randperm(len(fg_inds), device=fg_inds.device)[:num_disabled]
+                disable_inds = fg_inds[drop]
+                labels[disable_inds] = -1
+                fg_inds = torch.nonzero(labels > 0)[:, 0]
+            num_bg = int(self.sample_size - (labels > 0).sum())
+            if len(bg_inds) > num_bg:
+                enable_inds = bg_inds[torch.randint(0, len(bg_inds), size=(num_bg,), device=bg_inds.device)]
+                labels[enable_inds] = 0
+            bg_inds = torch.nonzero(labels == 0)[:, 0]
+        else:
+            if len(gt_boxes) == 0 or anchors.shape[0] == 0:
+                labels[:] = 0
+            else:
+                labels[bg_inds] = 0
+                # Force-matched anchors may sit below the unmatched
+                # threshold; restore them after the background pass.
+                labels[anchors_with_max_overlap] = gt_classes[gt_inds_force]
+        bbox_targets = anchors.new_zeros((num_anchors, self.box_coder.code_size))
+        if len(gt_boxes) > 0 and anchors.shape[0] > 0:
+            fg_gt_boxes = gt_boxes[anchor_to_gt_argmax[fg_inds], :]
+            fg_anchors = anchors[fg_inds, :]
+            bbox_targets[fg_inds, :] = self.box_coder.encode_torch(fg_gt_boxes, fg_anchors)
+        reg_weights = anchors.new_zeros((num_anchors,))
+        if self.norm_by_num_examples:
+            num_examples = (labels >= 0).sum()
+            num_examples = num_examples if num_examples > 1.0 else 1.0
+            reg_weights[labels > 0] = 1.0 / num_examples
+        else:
+            reg_weights[labels > 0] = 1.0
+        ret_dict = {
+            'box_cls_labels': labels,
+            'box_reg_targets': bbox_targets,
+            'reg_weights': reg_weights,
+        }
+        return ret_dict
--- a/pcdet/models/detectors/__init__.py
+++ b/pcdet/models/detectors/__init__.py
@@ -2,13 +2,14 @@ from .detector3d_template import Detector3DTemplate
 from .second_net import SECONDNet
 from .PartA2_net import PartA2Net
 from .pv_rcnn import PVRCNN
+from .pointpillar import PointPillar
 __all__ = {
    'Detector3DTemplate': Detector3DTemplate,
    'SECONDNet': SECONDNet,
    'PartA2Net': PartA2Net,
-    'PVRCNN': PVRCNN
+    'PVRCNN': PVRCNN,
+    'PointPillar': PointPillar
 }

--- a/pcdet/models/detectors/detector3d_template.py
+++ b/pcdet/models/detectors/detector3d_template.py
@@ -49,7 +49,9 @@ class Detector3DTemplate(nn.Module):
        vfe_module = vfe.__all__[self.model_cfg.VFE.NAME](
            model_cfg=self.model_cfg.VFE,
-            num_point_features=model_info_dict['num_rawpoint_features']
+            num_point_features=model_info_dict['num_rawpoint_features'],
+            point_cloud_range=model_info_dict['point_cloud_range'],
+            voxel_size=model_info_dict['voxel_size']
        )
        model_info_dict['num_point_features'] = vfe_module.get_output_feature_dim()
        model_info_dict['module_list'].append(vfe_module)
@@ -75,7 +77,8 @@ class Detector3DTemplate(nn.Module):
            return None, model_info_dict
        map_to_bev_module = map_to_bev.__all__[self.model_cfg.MAP_TO_BEV.NAME](
-            model_cfg=self.model_cfg.MAP_TO_BEV
+            model_cfg=self.model_cfg.MAP_TO_BEV,
+            grid_size=model_info_dict['grid_size']
        )
        model_info_dict['module_list'].append(map_to_bev_module)
        model_info_dict['num_bev_features'] = map_to_bev_module.num_bev_features

--- a/pcdet/models/detectors/pointpillar.py
+++ b/pcdet/models/detectors/pointpillar.py
+from .detector3d_template import Detector3DTemplate
+class PointPillar(Detector3DTemplate):
+    """Detector that runs the modules returned by build_networks() in order."""
+    def __init__(self, model_cfg, num_class, dataset):
+        super().__init__(model_cfg=model_cfg, num_class=num_class, dataset=dataset)
+        self.module_list = self.build_networks()
+    def forward(self, batch_dict):
+        """Return (ret_dict, tb_dict, disp_dict) when training, otherwise
+        (pred_dicts, recall_dicts) from post-processing."""
+        for module in self.module_list:
+            batch_dict = module(batch_dict)
+        if not self.training:
+            pred_dicts, recall_dicts = self.post_processing(batch_dict)
+            return pred_dicts, recall_dicts
+        loss, tb_dict, disp_dict = self.get_training_loss()
+        return {'loss': loss}, tb_dict, disp_dict
+    def get_training_loss(self):
+        """Return the dense-head loss together with logging dicts."""
+        disp_dict = {}
+        loss_rpn, head_tb_dict = self.dense_head.get_loss()
+        # Entries reported by the head are merged last, so they win on a
+        # key clash with 'loss_rpn'.
+        tb_dict = {'loss_rpn': loss_rpn.item()}
+        tb_dict.update(head_tb_dict)
+        return loss_rpn, tb_dict, disp_dict
--- a/pcdet/utils/common_utils.py
+++ b/pcdet/utils/common_utils.py
@@ -6,6 +6,8 @@ import os
 import torch.multiprocessing as mp
 import torch.distributed as dist
 import subprocess
+import pickle
+import shutil
 def check_numpy_to_torch(x):
@@ -153,3 +155,42 @@ def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
    batch_size_each_gpu = batch_size // num_gpus
    rank = dist.get_rank()
    return batch_size_each_gpu, rank
+def get_dist_info():
+    """Return (rank, world_size) of this process, or (0, 1) when
+    torch.distributed is unavailable or not initialized."""
+    if torch.__version__ < '1.0':
+        # Releases before 1.0 are queried through the private flag.
+        initialized = dist._initialized
+    else:
+        initialized = dist.is_available() and dist.is_initialized()
+    if not initialized:
+        return 0, 1
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    return rank, world_size
+def merge_results_dist(result_part, size, tmpdir):
+    """Collect the per-rank result lists on rank 0 via pickle files.
+    Every rank writes its list to ``tmpdir``; rank 0 then reads all of
+    them, interleaves them element by element in rank order, and trims
+    the merged list to ``size``.
+    Args:
+        result_part: list of results produced by the calling rank.
+        size: number of results to keep; presumably the dataset length, so
+            that samples duplicated as padding are dropped (TODO confirm).
+        tmpdir: directory for the intermediate files. It is removed by
+            rank 0 afterwards and presumably has to be visible to all ranks.
+    Returns:
+        The merged list on rank 0, None on every other rank.
+    """
+    rank, world_size = get_dist_info()
+    os.makedirs(tmpdir, exist_ok=True)
+    dist.barrier()
+    part_path = os.path.join(tmpdir, 'result_part_{}.pkl'.format(rank))
+    # Close the file before the barrier so the data is on disk by the time
+    # rank 0 starts reading.
+    with open(part_path, 'wb') as f:
+        pickle.dump(result_part, f)
+    dist.barrier()
+    if rank != 0:
+        return None
+    part_list = []
+    for i in range(world_size):
+        part_file = os.path.join(tmpdir, 'result_part_{}.pkl'.format(i))
+        # These files were written just above by the ranks of this job.
+        with open(part_file, 'rb') as f:
+            part_list.append(pickle.load(f))
+    # Take item 0 of every rank, then item 1 of every rank, and so on.
+    ordered_results = []
+    for res in zip(*part_list):
+        ordered_results.extend(res)
+    ordered_results = ordered_results[:size]
+    shutil.rmtree(tmpdir)
+    return ordered_results