"pcdet/git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "594962844729353842207b9e3c50f8b2434484f9"
Unverified Commit 4c8009fc authored by Shaoshuai Shi's avatar Shaoshuai Shi Committed by GitHub
Browse files

Merge pull request #988 from yukang2017/focalsconv

Merge to support Focals Conv (CVPR 2022 paper).
parents dadda9ed fa330622
......@@ -5,7 +5,7 @@ from ...utils import common_utils
from ...utils import box_utils
def random_flip_along_x(gt_boxes, points):
def random_flip_along_x(gt_boxes, points, return_flip=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
......@@ -20,11 +20,12 @@ def random_flip_along_x(gt_boxes, points):
if gt_boxes.shape[1] > 7:
gt_boxes[:, 8] = -gt_boxes[:, 8]
if return_flip:
return gt_boxes, points, enable
return gt_boxes, points
def random_flip_along_y(gt_boxes, points):
def random_flip_along_y(gt_boxes, points, return_flip=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
......@@ -39,11 +40,12 @@ def random_flip_along_y(gt_boxes, points):
if gt_boxes.shape[1] > 7:
gt_boxes[:, 7] = -gt_boxes[:, 7]
if return_flip:
return gt_boxes, points, enable
return gt_boxes, points
def global_rotation(gt_boxes, points, rot_range):
def global_rotation(gt_boxes, points, rot_range, return_rot=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
......@@ -61,10 +63,12 @@ def global_rotation(gt_boxes, points, rot_range):
np.array([noise_rotation])
)[0][:, 0:2]
if return_rot:
return gt_boxes, points, noise_rotation
return gt_boxes, points
def global_scaling(gt_boxes, points, scale_range):
def global_scaling(gt_boxes, points, scale_range, return_scale=False):
"""
Args:
gt_boxes: (N, 7), [x, y, z, dx, dy, dz, heading]
......@@ -77,7 +81,8 @@ def global_scaling(gt_boxes, points, scale_range):
noise_scale = np.random.uniform(scale_range[0], scale_range[1])
points[:, :3] *= noise_scale
gt_boxes[:, :6] *= noise_scale
if return_scale:
return gt_boxes, points, noise_scale
return gt_boxes, points
......
......@@ -11,18 +11,18 @@ class DataAugmentor(object):
self.root_path = root_path
self.class_names = class_names
self.logger = logger
self.data_augmentor_queue = []
aug_config_list = augmentor_configs if isinstance(augmentor_configs, list) \
else augmentor_configs.AUG_CONFIG_LIST
for cur_cfg in aug_config_list:
if not isinstance(augmentor_configs, list):
if cur_cfg.NAME in augmentor_configs.DISABLE_AUG_LIST:
continue
cur_augmentor = getattr(self, cur_cfg.NAME)(config=cur_cfg)
self.data_augmentor_queue.append(cur_augmentor)
def gt_sampling(self, config=None):
db_sampler = database_sampler.DataBaseSampler(
root_path=self.root_path,
......@@ -31,54 +31,57 @@ class DataAugmentor(object):
logger=self.logger
)
return db_sampler
def __getstate__(self):
d = dict(self.__dict__)
del d['logger']
return d
def __setstate__(self, d):
self.__dict__.update(d)
def random_world_flip(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_world_flip, config=config)
gt_boxes, points = data_dict['gt_boxes'], data_dict['points']
for cur_axis in config['ALONG_AXIS_LIST']:
assert cur_axis in ['x', 'y']
gt_boxes, points = getattr(augmentor_utils, 'random_flip_along_%s' % cur_axis)(
gt_boxes, points,
gt_boxes, points, enable = getattr(augmentor_utils, 'random_flip_along_%s' % cur_axis)(
gt_boxes, points, return_flip=True
)
data_dict['flip_%s'%cur_axis] = enable
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_world_rotation(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_world_rotation, config=config)
rot_range = config['WORLD_ROT_ANGLE']
if not isinstance(rot_range, list):
rot_range = [-rot_range, rot_range]
gt_boxes, points = augmentor_utils.global_rotation(
data_dict['gt_boxes'], data_dict['points'], rot_range=rot_range
gt_boxes, points, noise_rot = augmentor_utils.global_rotation(
data_dict['gt_boxes'], data_dict['points'], rot_range=rot_range, return_rot=True
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
data_dict['noise_rot'] = noise_rot
return data_dict
def random_world_scaling(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_world_scaling, config=config)
gt_boxes, points = augmentor_utils.global_scaling(
data_dict['gt_boxes'], data_dict['points'], config['WORLD_SCALE_RANGE']
gt_boxes, points, noise_scale = augmentor_utils.global_scaling(
data_dict['gt_boxes'], data_dict['points'], config['WORLD_SCALE_RANGE'], return_scale=True
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
data_dict['noise_scale'] = noise_scale
return data_dict
def random_image_flip(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_image_flip, config=config)
......@@ -92,12 +95,12 @@ class DataAugmentor(object):
images, depth_maps, gt_boxes = getattr(augmentor_utils, 'random_image_flip_%s' % cur_axis)(
images, depth_maps, gt_boxes, calib,
)
data_dict['images'] = images
data_dict['depth_maps'] = depth_maps
data_dict['gt_boxes'] = gt_boxes
return data_dict
def random_world_translation(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_world_translation, config=config)
......@@ -128,11 +131,11 @@ class DataAugmentor(object):
gt_boxes, points = getattr(augmentor_utils, 'random_local_translation_along_%s' % cur_axis)(
gt_boxes, points, offset_range,
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_local_rotation(self, data_dict=None, config=None):
"""
Please check the correctness of it before using.
......@@ -145,11 +148,11 @@ class DataAugmentor(object):
gt_boxes, points = augmentor_utils.local_rotation(
data_dict['gt_boxes'], data_dict['points'], rot_range=rot_range
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_local_scaling(self, data_dict=None, config=None):
"""
Please check the correctness of it before using.
......@@ -159,18 +162,18 @@ class DataAugmentor(object):
gt_boxes, points = augmentor_utils.local_scaling(
data_dict['gt_boxes'], data_dict['points'], config['LOCAL_SCALE_RANGE']
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_world_frustum_dropout(self, data_dict=None, config=None):
"""
Please check the correctness of it before using.
"""
if data_dict is None:
return partial(self.random_world_frustum_dropout, config=config)
intensity_range = config['INTENSITY_RANGE']
gt_boxes, points = data_dict['gt_boxes'], data_dict['points']
for direction in config['DIRECTION']:
......@@ -178,18 +181,18 @@ class DataAugmentor(object):
gt_boxes, points = getattr(augmentor_utils, 'global_frustum_dropout_%s' % direction)(
gt_boxes, points, intensity_range,
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_local_frustum_dropout(self, data_dict=None, config=None):
"""
Please check the correctness of it before using.
"""
if data_dict is None:
return partial(self.random_local_frustum_dropout, config=config)
intensity_range = config['INTENSITY_RANGE']
gt_boxes, points = data_dict['gt_boxes'], data_dict['points']
for direction in config['DIRECTION']:
......@@ -197,21 +200,21 @@ class DataAugmentor(object):
gt_boxes, points = getattr(augmentor_utils, 'local_frustum_dropout_%s' % direction)(
gt_boxes, points, intensity_range,
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def random_local_pyramid_aug(self, data_dict=None, config=None):
"""
Refer to the paper:
Refer to the paper:
SE-SSD: Self-Ensembling Single-Stage Object Detector From Point Cloud
"""
if data_dict is None:
return partial(self.random_local_pyramid_aug, config=config)
gt_boxes, points = data_dict['gt_boxes'], data_dict['points']
gt_boxes, points, pyramids = augmentor_utils.local_pyramid_dropout(gt_boxes, points, config['DROP_PROB'])
gt_boxes, points, pyramids = augmentor_utils.local_pyramid_sparsify(gt_boxes, points,
config['SPARSIFY_PROB'],
......@@ -224,7 +227,7 @@ class DataAugmentor(object):
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
return data_dict
def forward(self, data_dict):
"""
Args:
......@@ -238,12 +241,12 @@ class DataAugmentor(object):
"""
for cur_augmentor in self.data_augmentor_queue:
data_dict = cur_augmentor(data_dict=data_dict)
data_dict['gt_boxes'][:, 6] = common_utils.limit_period(
data_dict['gt_boxes'][:, 6], offset=0.5, period=2 * np.pi
)
if 'calib' in data_dict:
data_dict.pop('calib')
# if 'calib' in data_dict:
# data_dict.pop('calib')
if 'road_plane' in data_dict:
data_dict.pop('road_plane')
if 'gt_boxes_mask' in data_dict:
......@@ -252,6 +255,6 @@ class DataAugmentor(object):
data_dict['gt_names'] = data_dict['gt_names'][gt_boxes_mask]
if 'gt_boxes2d' in data_dict:
data_dict['gt_boxes2d'] = data_dict['gt_boxes2d'][gt_boxes_mask]
data_dict.pop('gt_boxes_mask')
return data_dict
......@@ -3,25 +3,31 @@ import pickle
import os
import copy
import numpy as np
from skimage import io
import torch
import SharedArray
import torch.distributed as dist
from ...ops.iou3d_nms import iou3d_nms_utils
from ...utils import box_utils, common_utils
from ...utils import box_utils, common_utils, calibration_kitti
from pcdet.datasets.kitti.kitti_object_eval_python import kitti_common
class DataBaseSampler(object):
def __init__(self, root_path, sampler_cfg, class_names, logger=None):
self.root_path = root_path
self.class_names = class_names
self.sampler_cfg = sampler_cfg
self.img_aug_type = sampler_cfg.get('IMG_AUG_TYPE', None)
self.img_aug_iou_thresh = sampler_cfg.get('IMG_AUG_IOU_THRESH', 0.5)
self.logger = logger
self.db_infos = {}
for class_name in class_names:
self.db_infos[class_name] = []
self.use_shared_memory = sampler_cfg.get('USE_SHARED_MEMORY', False)
for db_info_path in sampler_cfg.DB_INFO_PATH:
db_info_path = self.root_path.resolve() / db_info_path
with open(str(db_info_path), 'rb') as f:
......@@ -30,7 +36,7 @@ class DataBaseSampler(object):
for func_name, val in sampler_cfg.PREPARE.items():
self.db_infos = getattr(self, func_name)(self.db_infos, val)
self.gt_database_data_key = self.load_db_to_shared_memory() if self.use_shared_memory else None
self.sample_groups = {}
......@@ -79,7 +85,7 @@ class DataBaseSampler(object):
if cur_rank % num_gpus == 0 and not os.path.exists(f"/dev/shm/{sa_key}"):
gt_database_data = np.load(db_data_path)
common_utils.sa_create(f"shm://{sa_key}", gt_database_data)
if num_gpus > 1:
dist.barrier()
self.logger.info('GT database has been saved to shared memory')
......@@ -153,12 +159,208 @@ class DataBaseSampler(object):
gt_boxes[:, 2] -= mv_height # lidar view
return gt_boxes, mv_height
def add_sampled_boxes_to_scene(self, data_dict, sampled_gt_boxes, total_valid_sampled_dict):
def copy_paste_to_image_kitti(self, data_dict, crop_feat, gt_number, point_idxes=None):
    """
    Paste sampled-object image crops back into the KITTI scene image and
    filter the point cloud to stay consistent with the pasted 2D regions.

    Args:
        data_dict: dict with 'images', 'gt_boxes', 'gt_boxes2d', 'points', 'calib'
        crop_feat: sequence of image crops, indexed like gt_boxes2d
        gt_number: int, number of original (non-sampled) gt boxes; indices
            >= gt_number belong to sampled objects
        point_idxes: (N,), per-point index of the sampled object a point
            belongs to, or -1 for raw scene points
    Returns:
        data_dict: dict with updated 'images', 'points', 'points_2d'
    """
    # Fixed augmentation strategy (hard-coded upstream)
    kitti_img_aug_type = 'by_depth'
    kitti_img_aug_use_type = 'annotation'

    image = data_dict['images']
    boxes3d = data_dict['gt_boxes']
    boxes2d = data_dict['gt_boxes2d']
    corners_lidar = box_utils.boxes_to_corners_3d(boxes3d)

    # Paste far-to-near so nearer objects overwrite farther ones
    if 'depth' in kitti_img_aug_type:
        paste_order = boxes3d[:, 0].argsort()
        paste_order = paste_order[::-1]
    else:
        # NOTE: np.int was removed in NumPy 1.24; use the builtin int
        paste_order = np.arange(len(boxes3d), dtype=int)
    if 'reverse' in kitti_img_aug_type:
        paste_order = paste_order[::-1]

    paste_mask = -255 * np.ones(image.shape[:2], dtype=int)
    fg_mask = np.zeros(image.shape[:2], dtype=int)
    overlap_mask = np.zeros(image.shape[:2], dtype=int)
    depth_mask = np.zeros((*image.shape[:2], 2), dtype=float)

    points_2d, depth_2d = data_dict['calib'].lidar_to_img(data_dict['points'][:, :3])
    points_2d[:, 0] = np.clip(points_2d[:, 0], a_min=0, a_max=image.shape[1] - 1)
    points_2d[:, 1] = np.clip(points_2d[:, 1], a_min=0, a_max=image.shape[0] - 1)
    points_2d = points_2d.astype(int)

    for _order in paste_order:
        _box2d = boxes2d[_order]
        image[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2]] = crop_feat[_order]
        overlap_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2]] += \
            (paste_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2]] > 0).astype(int)
        paste_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2]] = _order

        if 'cover' in kitti_img_aug_use_type:
            # HxWx2 for min and max depth of each box region
            depth_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2], 0] = corners_lidar[_order, :, 0].min()
            depth_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2], 1] = corners_lidar[_order, :, 0].max()

        # foreground area of original point cloud in image plane
        if _order < gt_number:
            fg_mask[_box2d[1]:_box2d[3], _box2d[0]:_box2d[2]] = 1

    data_dict['images'] = image

    # A sampled point survives if it projects into its own pasted region
    new_mask = paste_mask[points_2d[:, 1], points_2d[:, 0]] == (point_idxes + gt_number)
    # 'keep_raw' mode is disabled upstream (was `if False:`): raw points
    # survive only where the pasted layout is consistent with them
    raw_fg = (fg_mask == 1) & (paste_mask >= 0) & (paste_mask < gt_number)
    raw_bg = (fg_mask == 0) & (paste_mask < 0)
    raw_mask = raw_fg[points_2d[:, 1], points_2d[:, 0]] | raw_bg[points_2d[:, 1], points_2d[:, 0]]
    keep_mask = new_mask | raw_mask
    data_dict['points_2d'] = points_2d

    if 'annotation' in kitti_img_aug_use_type:
        data_dict['points'] = data_dict['points'][keep_mask]
        data_dict['points_2d'] = data_dict['points_2d'][keep_mask]
    elif 'projection' in kitti_img_aug_use_type:
        overlap_mask[overlap_mask >= 1] = 1
        data_dict['overlap_mask'] = overlap_mask
        if 'cover' in kitti_img_aug_use_type:
            data_dict['depth_mask'] = depth_mask

    return data_dict
def collect_image_crops_kitti(self, info, data_dict, obj_points, sampled_gt_boxes, sampled_gt_boxes2d, idx):
    """
    Align one sampled KITTI object to the current frame's calibration and
    cut the matching crop out of the donor frame's image.

    Args:
        info: dict, db-info of the sampled object ('image_idx', 'bbox', ...)
        data_dict: dict with the current frame's 'calib' and 'images'
        obj_points: (P, 3+) points of the sampled object (modified in place)
        sampled_gt_boxes: (M, 7+) sampled 3D boxes (row idx updated in place)
        sampled_gt_boxes2d: (M, 4) sampled 2D boxes (row idx updated in place)
        idx: int, index of this object in the sampled arrays
    Returns:
        new_box: (4,) int image box in the current frame
        img_crop2d: (h, w, 3) float crop from the donor image, scaled to [0, 1]
        obj_points: points re-projected into the current frame
        obj_idx: (P,) int array filled with idx
    """
    calib_file = kitti_common.get_calib_path(int(info['image_idx']), self.root_path, relative_path=False)
    sampled_calib = calibration_kitti.Calibration(calib_file)
    points_2d, depth_2d = sampled_calib.lidar_to_img(obj_points[:, :3])

    # Point refinement is always on (was `if True: # self.point_refine`):
    # align calibration metrics for points
    points_ract = data_dict['calib'].img_to_rect(points_2d[:, 0], points_2d[:, 1], depth_2d)
    points_lidar = data_dict['calib'].rect_to_lidar(points_ract)
    obj_points[:, :3] = points_lidar

    # align calibration metrics for boxes
    box3d_raw = sampled_gt_boxes[idx].reshape(1, -1)
    box3d_coords = box_utils.boxes_to_corners_3d(box3d_raw)[0]
    box3d_box, box3d_depth = sampled_calib.lidar_to_img(box3d_coords)
    box3d_coord_rect = data_dict['calib'].img_to_rect(box3d_box[:, 0], box3d_box[:, 1], box3d_depth)
    box3d_rect = box_utils.corners_rect_to_camera(box3d_coord_rect).reshape(1, -1)
    box3d_lidar = box_utils.boxes3d_kitti_camera_to_lidar(box3d_rect, data_dict['calib'])
    box2d = box_utils.boxes3d_kitti_camera_to_imageboxes(box3d_rect, data_dict['calib'],
                                                         data_dict['images'].shape[:2])
    sampled_gt_boxes[idx] = box3d_lidar[0]
    sampled_gt_boxes2d[idx] = box2d[0]

    # NOTE: np.int was removed in NumPy 1.24; use the builtin int
    obj_idx = idx * np.ones(len(obj_points), dtype=int)

    # copy crops from images
    img_path = self.root_path / f'training/image_2/{info["image_idx"]}.png'
    raw_image = io.imread(img_path)
    raw_image = raw_image.astype(np.float32)
    raw_center = info['bbox'].reshape(2, 2).mean(0)
    new_box = sampled_gt_boxes2d[idx].astype(int)
    new_shape = np.array([new_box[2] - new_box[0], new_box[3] - new_box[1]])
    raw_box = np.concatenate([raw_center - new_shape / 2, raw_center + new_shape / 2]).astype(int)
    raw_box[0::2] = np.clip(raw_box[0::2], a_min=0, a_max=raw_image.shape[1])
    raw_box[1::2] = np.clip(raw_box[1::2], a_min=0, a_max=raw_image.shape[0])
    # If clipping shrank the donor crop, shrink the destination box to match
    if (raw_box[2] - raw_box[0]) != new_shape[0] or (raw_box[3] - raw_box[1]) != new_shape[1]:
        new_center = new_box.reshape(2, 2).mean(0)
        new_shape = np.array([raw_box[2] - raw_box[0], raw_box[3] - raw_box[1]])
        new_box = np.concatenate([new_center - new_shape / 2, new_center + new_shape / 2]).astype(int)

    img_crop2d = raw_image[raw_box[1]:raw_box[3], raw_box[0]:raw_box[2]] / 255

    return new_box, img_crop2d, obj_points, obj_idx
def sample_gt_boxes_2d_kitti(self, data_dict, sampled_boxes, valid_mask):
    """
    Project sampled 3D boxes into the image plane and reject candidates
    whose 2D boxes overlap existing or other sampled boxes too much.

    Args:
        data_dict: dict with 'calib', 'images', 'gt_boxes2d' (plus
            'road_plane' when USE_ROAD_PLANE is enabled)
        sampled_boxes: (M, 7+) candidate lidar boxes from the sampler
        valid_mask: (M,) bool, candidates that already passed the BEV check
    Returns:
        sampled_boxes2d: (K, 4) image boxes of the kept candidates
        mv_height: (K,) road-plane height offsets, or None
        ret_valid_mask: (M,) bool, final keep mask
    """
    mv_height = None
    # filter out box2d iou > thres
    if self.sampler_cfg.get('USE_ROAD_PLANE', False):
        sampled_boxes, mv_height = self.put_boxes_on_road_planes(
            sampled_boxes, data_dict['road_plane'], data_dict['calib']
        )

    calib = data_dict['calib']
    boxes_cam = box_utils.boxes3d_lidar_to_kitti_camera(sampled_boxes, calib)
    boxes_img = box_utils.boxes3d_kitti_camera_to_imageboxes(
        boxes_cam, calib, data_dict['images'].shape[:2]
    )
    sampled_boxes2d = torch.Tensor(boxes_img)
    existed_boxes2d = torch.Tensor(data_dict['gt_boxes2d'])

    num = sampled_boxes2d.shape[0]
    iou_with_gt = box_utils.pairwise_iou(sampled_boxes2d, existed_boxes2d).cpu().numpy()
    iou_self = box_utils.pairwise_iou(sampled_boxes2d, sampled_boxes2d).cpu().numpy()
    iou_self[np.arange(num), np.arange(num)] = 0  # ignore self-overlap
    if iou_with_gt.shape[1] == 0:
        # no existing gt boxes: fall back to the pairwise matrix
        iou_with_gt = iou_self

    thresh = self.img_aug_iou_thresh
    ret_valid_mask = (
        (iou_with_gt.max(axis=1) < thresh)
        & (iou_self.max(axis=1) < thresh)
        & (valid_mask)
    )

    sampled_boxes2d = sampled_boxes2d[ret_valid_mask].cpu().numpy()
    if mv_height is not None:
        mv_height = mv_height[ret_valid_mask]
    return sampled_boxes2d, mv_height, ret_valid_mask
def sample_gt_boxes_2d(self, data_dict, sampled_boxes, valid_mask):
    """
    Dispatch 2D-box sampling to the dataset-specific implementation.

    Returns:
        (sampled_boxes2d, mv_height, ret_valid_mask) from the kitti variant.
    Raises:
        NotImplementedError: for any img_aug_type other than 'kitti'.
    """
    if self.img_aug_type != 'kitti':
        raise NotImplementedError
    sampled_boxes2d, mv_height, ret_valid_mask = self.sample_gt_boxes_2d_kitti(
        data_dict, sampled_boxes, valid_mask
    )
    return sampled_boxes2d, mv_height, ret_valid_mask
def initilize_image_aug_dict(self, data_dict, gt_boxes_mask):
    """
    Prepare the bookkeeping dict used by image-level GT sampling.
    (Name kept as-is: "initilize" is a typo but part of the public interface.)

    Args:
        data_dict: dict with 'gt_boxes2d' and 'images' (kitti mode only)
        gt_boxes_mask: (N,) bool mask of valid original gt boxes
    Returns:
        img_aug_gt_dict: dict with the original crops/boxes and empty lists
            for sampled-object crops, or None when image aug is disabled
    """
    img_aug_gt_dict = None
    if self.img_aug_type is None:
        pass
    elif self.img_aug_type == 'kitti':
        obj_index_list, crop_boxes2d = [], []
        # NOTE: np.int was removed in NumPy 1.24; use the builtin int
        gt_number = gt_boxes_mask.sum().astype(int)
        gt_boxes2d = data_dict['gt_boxes2d'][gt_boxes_mask].astype(int)
        gt_crops2d = [data_dict['images'][_x[1]:_x[3], _x[0]:_x[2]] for _x in gt_boxes2d]

        img_aug_gt_dict = {
            'obj_index_list': obj_index_list,
            'gt_crops2d': gt_crops2d,
            'gt_boxes2d': gt_boxes2d,
            'gt_number': gt_number,
            'crop_boxes2d': crop_boxes2d
        }
    else:
        raise NotImplementedError

    return img_aug_gt_dict
def collect_image_crops(self, img_aug_gt_dict, info, data_dict, obj_points, sampled_gt_boxes, sampled_gt_boxes2d, idx):
    """
    Collect the image crop and point indices for one sampled object and
    append them to the running img_aug_gt_dict.

    Returns:
        (img_aug_gt_dict, obj_points) with obj_points possibly re-projected.
    Raises:
        NotImplementedError: for any img_aug_type other than 'kitti'.
    """
    if self.img_aug_type != 'kitti':
        raise NotImplementedError
    new_box, img_crop2d, obj_points, obj_idx = self.collect_image_crops_kitti(
        info, data_dict, obj_points, sampled_gt_boxes, sampled_gt_boxes2d, idx
    )
    img_aug_gt_dict['crop_boxes2d'].append(new_box)
    img_aug_gt_dict['gt_crops2d'].append(img_crop2d)
    img_aug_gt_dict['obj_index_list'].append(obj_idx)
    return img_aug_gt_dict, obj_points
def copy_paste_to_image(self, img_aug_gt_dict, data_dict, points):
    """
    Paste all collected sampled-object crops into the scene image and merge
    the sampled 2D boxes into data_dict['gt_boxes2d'].

    Args:
        img_aug_gt_dict: dict built by initilize_image_aug_dict / collect_image_crops
        data_dict: current frame dict (modified in place and returned)
        points: (N, 3+) merged point cloud
    Returns:
        data_dict: updated frame dict
    """
    if self.img_aug_type == 'kitti':
        obj_points_idx = np.concatenate(img_aug_gt_dict['obj_index_list'], axis=0)
        # Map each point to its sampled-object index, -1 for raw scene points.
        # NOTE(review): assumes the sampled-object points occupy the front of
        # `points` — confirm against add_sampled_boxes_to_scene.
        # np.int was removed in NumPy 1.24; use the builtin int
        point_idxes = -1 * np.ones(len(points), dtype=int)
        point_idxes[:obj_points_idx.shape[0]] = obj_points_idx

        data_dict['gt_boxes2d'] = np.concatenate(
            [img_aug_gt_dict['gt_boxes2d'], np.array(img_aug_gt_dict['crop_boxes2d'])], axis=0
        )
        data_dict = self.copy_paste_to_image_kitti(
            data_dict, img_aug_gt_dict['gt_crops2d'], img_aug_gt_dict['gt_number'], point_idxes
        )
        if 'road_plane' in data_dict:
            data_dict.pop('road_plane')
    else:
        raise NotImplementedError
    return data_dict
def add_sampled_boxes_to_scene(self, data_dict, sampled_gt_boxes, total_valid_sampled_dict, mv_height=None, sampled_gt_boxes2d=None):
gt_boxes_mask = data_dict['gt_boxes_mask']
gt_boxes = data_dict['gt_boxes'][gt_boxes_mask]
gt_names = data_dict['gt_names'][gt_boxes_mask]
points = data_dict['points']
if self.sampler_cfg.get('USE_ROAD_PLANE', False):
if self.sampler_cfg.get('USE_ROAD_PLANE', False) and mv_height is None:
sampled_gt_boxes, mv_height = self.put_boxes_on_road_planes(
sampled_gt_boxes, data_dict['road_plane'], data_dict['calib']
)
......@@ -166,11 +368,15 @@ class DataBaseSampler(object):
data_dict.pop('road_plane')
obj_points_list = []
# convert sampled 3D boxes to image plane
img_aug_gt_dict = self.initilize_image_aug_dict(data_dict, gt_boxes_mask)
if self.use_shared_memory:
gt_database_data = SharedArray.attach(f"shm://{self.gt_database_data_key}")
gt_database_data.setflags(write=0)
else:
gt_database_data = None
gt_database_data = None
for idx, info in enumerate(total_valid_sampled_dict):
if self.use_shared_memory:
......@@ -187,6 +393,11 @@ class DataBaseSampler(object):
# mv height
obj_points[:, 2] -= mv_height[idx]
if self.img_aug_type is not None:
img_aug_gt_dict, obj_points = self.collect_image_crops(
img_aug_gt_dict, info, data_dict, obj_points, sampled_gt_boxes, sampled_gt_boxes2d, idx
)
obj_points_list.append(obj_points)
obj_points = np.concatenate(obj_points_list, axis=0)
......@@ -202,6 +413,10 @@ class DataBaseSampler(object):
data_dict['gt_boxes'] = gt_boxes
data_dict['gt_names'] = gt_names
data_dict['points'] = points
if self.img_aug_type is not None:
data_dict = self.copy_paste_to_image(img_aug_gt_dict, data_dict, points)
return data_dict
def __call__(self, data_dict):
......@@ -217,6 +432,9 @@ class DataBaseSampler(object):
gt_names = data_dict['gt_names'].astype(str)
existed_boxes = gt_boxes
total_valid_sampled_dict = []
sampled_mv_height = []
sampled_gt_boxes2d = []
for class_name, sample_group in self.sample_groups.items():
if self.limit_whole_scene:
num_gt = np.sum(class_name == gt_names)
......@@ -226,14 +444,21 @@ class DataBaseSampler(object):
sampled_boxes = np.stack([x['box3d_lidar'] for x in sampled_dict], axis=0).astype(np.float32)
if self.sampler_cfg.get('DATABASE_WITH_FAKELIDAR', False):
sampled_boxes = box_utils.boxes3d_kitti_fakelidar_to_lidar(sampled_boxes)
assert not self.sampler_cfg.get('DATABASE_WITH_FAKELIDAR', False), 'Please use latest codes to generate GT_DATABASE'
iou1 = iou3d_nms_utils.boxes_bev_iou_cpu(sampled_boxes[:, 0:7], existed_boxes[:, 0:7])
iou2 = iou3d_nms_utils.boxes_bev_iou_cpu(sampled_boxes[:, 0:7], sampled_boxes[:, 0:7])
iou2[range(sampled_boxes.shape[0]), range(sampled_boxes.shape[0])] = 0
iou1 = iou1 if iou1.shape[1] > 0 else iou2
valid_mask = ((iou1.max(axis=1) + iou2.max(axis=1)) == 0).nonzero()[0]
valid_mask = ((iou1.max(axis=1) + iou2.max(axis=1)) == 0)
if self.img_aug_type is not None:
sampled_boxes2d, mv_height, valid_mask = self.sample_gt_boxes_2d(data_dict, sampled_boxes, valid_mask)
sampled_gt_boxes2d.append(sampled_boxes2d)
if mv_height is not None:
sampled_mv_height.append(mv_height)
valid_mask = valid_mask.nonzero()[0]
valid_sampled_dict = [sampled_dict[x] for x in valid_mask]
valid_sampled_boxes = sampled_boxes[valid_mask]
......@@ -241,8 +466,14 @@ class DataBaseSampler(object):
total_valid_sampled_dict.extend(valid_sampled_dict)
sampled_gt_boxes = existed_boxes[gt_boxes.shape[0]:, :]
if total_valid_sampled_dict.__len__() > 0:
data_dict = self.add_sampled_boxes_to_scene(data_dict, sampled_gt_boxes, total_valid_sampled_dict)
sampled_gt_boxes2d = np.concatenate(sampled_gt_boxes2d, axis=0) if len(sampled_gt_boxes2d) > 0 else None
sampled_mv_height = np.concatenate(sampled_mv_height, axis=0) if len(sampled_mv_height) > 0 else None
data_dict = self.add_sampled_boxes_to_scene(
data_dict, sampled_gt_boxes, total_valid_sampled_dict, sampled_mv_height, sampled_gt_boxes2d
)
data_dict.pop('gt_boxes_mask')
return data_dict
......@@ -9,7 +9,6 @@ from .augmentor.data_augmentor import DataAugmentor
from .processor.data_processor import DataProcessor
from .processor.point_feature_encoder import PointFeatureEncoder
class DatasetTemplate(torch_data.Dataset):
def __init__(self, dataset_cfg=None, class_names=None, training=True, root_path=None, logger=None):
super().__init__()
......@@ -44,7 +43,7 @@ class DatasetTemplate(torch_data.Dataset):
self.depth_downsample_factor = self.data_processor.depth_downsample_factor
else:
self.depth_downsample_factor = None
@property
def mode(self):
return 'train' if self.training else 'test'
......@@ -123,14 +122,17 @@ class DatasetTemplate(torch_data.Dataset):
if self.training:
assert 'gt_boxes' in data_dict, 'gt_boxes should be provided for training'
gt_boxes_mask = np.array([n in self.class_names for n in data_dict['gt_names']], dtype=np.bool_)
if 'calib' in data_dict:
calib = data_dict['calib']
data_dict = self.data_augmentor.forward(
data_dict={
**data_dict,
'gt_boxes_mask': gt_boxes_mask
}
)
if 'calib' in data_dict:
data_dict['calib'] = calib
if data_dict.get('gt_boxes', None) is not None:
selected = common_utils.keep_arrays_by_name(data_dict['gt_names'], self.class_names)
data_dict['gt_boxes'] = data_dict['gt_boxes'][selected]
......@@ -204,8 +206,7 @@ class DatasetTemplate(torch_data.Dataset):
pad_h = common_utils.get_pad_params(desired_size=max_h, cur_size=image.shape[0])
pad_w = common_utils.get_pad_params(desired_size=max_w, cur_size=image.shape[1])
pad_width = (pad_h, pad_w)
# Pad with nan, to be replaced later in the pipeline.
pad_value = np.nan
pad_value = 0
if key == "images":
pad_width = (pad_h, pad_w, (0, 0))
......@@ -219,6 +220,20 @@ class DatasetTemplate(torch_data.Dataset):
images.append(image_pad)
ret[key] = np.stack(images, axis=0)
elif key in ['calib']:
ret[key] = val
elif key in ["points_2d"]:
max_len = max([len(_val) for _val in val])
pad_value = 0
points = []
for _points in val:
pad_width = ((0, max_len-len(_points)), (0,0))
points_pad = np.pad(_points,
pad_width=pad_width,
mode='constant',
constant_values=pad_value)
points.append(points_pad)
ret[key] = np.stack(points, axis=0)
else:
ret[key] = np.stack(val, axis=0)
except:
......
......@@ -421,6 +421,7 @@ class KittiDataset(DatasetTemplate):
if "calib_matricies" in get_item_list:
input_dict["trans_lidar_to_cam"], input_dict["trans_cam_to_img"] = kitti_utils.calib_to_matricies(calib)
input_dict['calib'] = calib
data_dict = self.prepare_data(data_dict=input_dict)
data_dict['image_shape'] = img_shape
......
from .pointnet2_backbone import PointNet2Backbone, PointNet2MSG
from .spconv_backbone import VoxelBackBone8x, VoxelResBackBone8x
from .spconv_backbone_focal import VoxelBackBone8xFocal
from .spconv_unet import UNetV2
__all__ = {
......@@ -8,4 +9,5 @@ __all__ = {
'PointNet2Backbone': PointNet2Backbone,
'PointNet2MSG': PointNet2MSG,
'VoxelResBackBone8x': VoxelResBackBone8x,
'VoxelBackBone8xFocal': VoxelBackBone8xFocal,
}
import torch.nn as nn
class BasicBlock1D(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
"""
Initializes convolutional block
Args:
in_channels: int, Number of input channels
out_channels: int, Number of output channels
**kwargs: Dict, Extra arguments for nn.Conv2d
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv = nn.Conv1d(in_channels=in_channels,
out_channels=out_channels,
**kwargs)
self.bn = nn.BatchNorm1d(out_channels)
self.relu = nn.ReLU(inplace=True)
def forward(self, features):
"""
Applies convolutional block
Args:
features: (B, C_in, H, W), Input features
Returns:
x: (B, C_out, H, W), Output features
"""
x = self.conv(features)
x = self.bn(x)
x = self.relu(x)
return x
class BasicBlock2D(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        """
        Conv2d -> BatchNorm2d -> ReLU convolutional block.

        Args:
            in_channels: int, Number of input channels
            out_channels: int, Number of output channels
            **kwargs: Dict, Extra arguments for nn.Conv2d
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.conv = nn.Conv2d(in_channels=in_channels,
                              out_channels=out_channels,
                              **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, features):
        """
        Run the block on a batch of feature maps.

        Args:
            features: (B, C_in, H, W), Input features
        Returns:
            (B, C_out, H_out, W_out), Activated, normalized conv output
        """
        return self.relu(self.bn(self.conv(features)))
import torch
import torch.nn as nn
from .basic_blocks import BasicBlock2D
from .sem_deeplabv3 import SemDeepLabV3
class PyramidFeat2D(nn.Module):
    """2D image feature network: pretrained semantic backbone + per-layer channel reduction."""

    def __init__(self, optimize, model_cfg):
        """
        Initialize the 2D feature network from a pretrained model.

        Args:
            optimize: bool, whether extracted features stay in the autograd graph
            model_cfg: EasyDict, Dense classification network config
        """
        super().__init__()
        self.model_cfg = model_cfg
        self.is_optimize = optimize

        # Backbone producing the intermediate feature maps
        self.ifn = SemDeepLabV3(
            num_classes=model_cfg.num_class,
            backbone_name=model_cfg.backbone,
            **model_cfg.args
        )

        # One channel-reduction block per extracted backbone layer
        self.reduce_blocks = torch.nn.ModuleList()
        self.out_channels = {}
        reduce_cfg = model_cfg.channel_reduce
        for idx, ch_in in enumerate(reduce_cfg["in_channels"]):
            ch_out = reduce_cfg["out_channels"][idx]
            self.out_channels[model_cfg.args['feat_extract_layer'][idx]] = ch_out
            self.reduce_blocks.append(BasicBlock2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=reduce_cfg["kernel_size"][idx],
                stride=reduce_cfg["stride"][idx],
                bias=reduce_cfg["bias"][idx],
            ))

    def get_output_feature_dim(self):
        """Return {layer_name: reduced channel count}."""
        return self.out_channels

    def forward(self, images):
        """
        Extract (channel-reduced) 2D features for each configured backbone layer.

        Args:
            images: (N, 3, H_in, W_in), Input images
        Returns:
            batch_dict: {"<layer>_feat2d": (N, C, H_out, W_out)} feature maps
        """
        batch_dict = {}
        ifn_result = self.ifn(images)

        for idx, layer in enumerate(self.model_cfg.args['feat_extract_layer']):
            feat = ifn_result[layer]
            # Channel reduce
            if self.reduce_blocks[idx] is not None:
                feat = self.reduce_blocks[idx](feat)
            batch_dict[layer + "_feat2d"] = feat

        if self.training:
            # detach feature from graph if not optimize
            if "logits" in ifn_result:
                ifn_result["logits"].detach_()
            if not self.is_optimize:
                # NOTE(review): this detaches (in place) only the features of
                # the last extracted layer, mirroring the original control flow
                feat.detach_()

        return batch_dict

    def get_loss(self):
        """
        Gets loss

        Returns:
            loss: None (no auxiliary loss implemented)
            tb_dict: None
        """
        return None, None
from collections import OrderedDict
from pathlib import Path
from torch import hub
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
class SegTemplate(nn.Module):
def __init__(self, constructor, feat_extract_layer, num_classes, pretrained_path=None, aux_loss=None):
    """
    Build a segmentation backbone used for feature extraction.

    Args:
        constructor: function, Model constructor
        feat_extract_layer: list[str], Backbone layers to extract features from
        num_classes: int, Number of classes
        pretrained_path: string, (Optional) Path of the model to load weights from
        aux_loss: bool, Flag to include auxillary loss
    """
    super().__init__()
    self.num_classes = num_classes
    self.pretrained_path = pretrained_path
    self.pretrained = pretrained_path is not None
    self.aux_loss = aux_loss

    if self.pretrained:
        # Preprocess Module: standard ImageNet normalization constants,
        # applied to the input images in forward()
        self.norm_mean = torch.Tensor([0.485, 0.456, 0.406])
        self.norm_std = torch.Tensor([0.229, 0.224, 0.225])

    # Model
    self.model = self.get_model(constructor=constructor)
    self.feat_extract_layer = feat_extract_layer
    # Ask the backbone's layer-getter to also return the requested layers
    requested = {layer: layer for layer in feat_extract_layer}
    self.model.backbone.return_layers.update(requested)
def get_model(self, constructor):
    """
    Build the segmentation model and optionally load pretrained weights.

    Args:
        constructor: function, Model constructor (torchvision-style, accepts
            pretrained / pretrained_backbone / num_classes / aux_loss kwargs)
    Returns:
        model: nn.Module, Model moved to GPU
    """
    # Get model
    model = constructor(pretrained=False,
                        pretrained_backbone=False,
                        num_classes=self.num_classes,
                        aux_loss=self.aux_loss)

    # Update weights
    if self.pretrained_path is not None:
        model_dict = model.state_dict()

        # Download pretrained model if not available yet
        checkpoint_path = Path(self.pretrained_path)
        if not checkpoint_path.exists():
            checkpoint = checkpoint_path.name
            save_dir = checkpoint_path.parent
            save_dir.mkdir(parents=True, exist_ok=True)
            url = f'https://download.pytorch.org/models/{checkpoint}'
            hub.load_state_dict_from_url(url, save_dir)

        # Get pretrained state dict. Load on CPU first so this also works on
        # CPU-only machines; the .cuda() below moves everything to the GPU.
        pretrained_dict = torch.load(self.pretrained_path, map_location='cpu')
        # pretrained_dict = self.filter_pretrained_dict(model_dict=model_dict, pretrained_dict=pretrained_dict)

        # Update current model state dict
        model_dict.update(pretrained_dict)
        model.load_state_dict(model_dict, strict=False)

    return model.cuda()
def filter_pretrained_dict(self, model_dict, pretrained_dict):
"""
Removes layers from pretrained state dict that are not used or changed in model
Args:
model_dict: dict, Default model state dictionary
pretrained_dict: dict, Pretrained model state dictionary
Returns:
pretrained_dict: dict, Pretrained model state dictionary with removed weights
"""
# Removes aux classifier weights if not used
if "aux_classifier.0.weight" in pretrained_dict and "aux_classifier.0.weight" not in model_dict:
pretrained_dict = {key: value for key, value in pretrained_dict.items()
if "aux_classifier" not in key}
# Removes final conv layer from weights if number of classes are different
model_num_classes = model_dict["classifier.4.weight"].shape[0]
pretrained_num_classes = pretrained_dict["classifier.4.weight"].shape[0]
if model_num_classes != pretrained_num_classes:
pretrained_dict.pop("classifier.4.weight")
pretrained_dict.pop("classifier.4.bias")
return pretrained_dict
def forward(self, images):
"""
Forward pass
Args:
images: (N, 3, H_in, W_in), Input images
Returns
result: dict[torch.Tensor], Depth distribution result
features: (N, C, H_out, W_out), Image features
logits: (N, num_classes, H_out, W_out), Classification logits
aux: (N, num_classes, H_out, W_out), Auxillary classification logits
"""
# Preprocess images
if self.pretrained:
images = (images - self.norm_mean[None, :, None, None].type_as(images)) / self.norm_std[None, :, None, None].type_as(images)
x = images.cuda()
# Extract features
result = OrderedDict()
features = self.model.backbone(x)
for _layer in self.feat_extract_layer:
result[_layer] = features[_layer]
return result
if 'features' in features.keys():
feat_shape = features['features'].shape[-2:]
else:
feat_shape = features['layer1'].shape[-2:]
# Prediction classification logits
x = features["out"] # comment the classifier to reduce memory
# x = self.model.classifier(x)
# x = F.interpolate(x, size=feat_shape, mode='bilinear', align_corners=False)
result["logits"] = x
# Prediction auxillary classification logits
if self.model.aux_classifier is not None:
x = features["aux"]
x = self.model.aux_classifier(x)
x = F.interpolate(x, size=feat_shape, mode='bilinear', align_corners=False)
result["aux"] = x
return result
class SemDeepLabV3(SegTemplate):
    def __init__(self, backbone_name, **kwargs):
        """
        Initializes a DeepLabV3 semantic segmentation model.
        Args:
            backbone_name: string, ResNet Backbone Name [ResNet50/ResNet101]
        """
        # Map backbone names to their torchvision constructors.
        constructors = {
            "ResNet50": torchvision.models.segmentation.deeplabv3_resnet50,
            "ResNet101": torchvision.models.segmentation.deeplabv3_resnet101,
        }
        if backbone_name not in constructors:
            raise NotImplementedError
        super().__init__(constructor=constructors[backbone_name], **kwargs)
import torch
import torch.nn as nn
import spconv.pytorch as spconv
from pcdet.ops.roiaware_pool3d.roiaware_pool3d_utils import points_in_boxes_gpu
from pcdet.models.backbones_3d.focal_sparse_conv.focal_sparse_utils import split_voxels, check_repeat, FocalLoss
from pcdet.utils import common_utils
class FocalSparseConv(spconv.SparseModule):
    """Focal sparse convolution (Focals Conv, CVPR 2022).
    Scores each input voxel with a learned importance branch; high-importance
    (foreground) voxels are dilated with extra kernel-offset voxels while the
    rest are kept unchanged, then a submanifold conv is applied. Can optionally
    fuse projected 2d image features into the voxel features.
    """
    expansion = 1
    def __init__(self, inplanes, planes, voxel_stride, norm_fn=None, indice_key=None,
                 image_channel=3, kernel_size=3, padding=1, mask_multi=False, use_img=False,
                 topk=False, threshold=0.5, skip_mask_kernel=False, enlarge_voxel_channels=-1,
                 point_cloud_range=[-3, -40, 0, 1, 40, 70.4],
                 voxel_size = [0.1, 0.05, 0.05]):
        # NOTE(review): the mutable list defaults are shared across instances;
        # they are only read here (converted to tensors), so this is benign.
        super(FocalSparseConv, self).__init__()
        # Main submanifold convolution applied after voxel splitting/merging.
        self.conv = spconv.SubMConv3d(inplanes, planes, kernel_size=kernel_size, stride=1, bias=False, indice_key=indice_key)
        self.bn1 = norm_fn(planes)
        self.relu = nn.ReLU(True)
        # One importance logit per kernel position; the last channel scores the
        # center voxel itself (see split_voxels).
        offset_channels = kernel_size**3
        self.topk = topk
        self.threshold = threshold
        self.voxel_stride = voxel_stride
        self.focal_loss = FocalLoss()
        self.mask_multi = mask_multi
        self.skip_mask_kernel = skip_mask_kernel
        self.use_img = use_img
        voxel_channel = enlarge_voxel_channels if enlarge_voxel_channels>0 else inplanes
        in_channels = image_channel + voxel_channel if use_img else voxel_channel
        # Optional channel-widening conv applied before importance prediction.
        self.conv_enlarge = spconv.SparseSequential(spconv.SubMConv3d(inplanes, enlarge_voxel_channels,
                            kernel_size=3, stride=1, padding=1, bias=False, indice_key=indice_key+'_enlarge'),
                            norm_fn(enlarge_voxel_channels),
                            nn.ReLU(True)) if enlarge_voxel_channels>0 else None
        # Importance-prediction branch.
        self.conv_imp = spconv.SubMConv3d(in_channels, offset_channels, kernel_size=3, stride=1, padding=1, bias=False, indice_key=indice_key+'_imp')
        _step = int(kernel_size//2)
        # All integer offsets inside the kernel, with the center removed.
        kernel_offsets = [[i, j, k] for i in range(-_step, _step+1) for j in range(-_step, _step+1) for k in range(-_step, _step+1)]
        kernel_offsets.remove([0, 0, 0])
        self.kernel_offsets = torch.Tensor(kernel_offsets).cuda()
        # Used to flip index order between (z, y, x) and (x, y, z).
        self.inv_idx = torch.Tensor([2, 1, 0]).long().cuda()
        self.point_cloud_range = torch.Tensor(point_cloud_range).cuda()
        self.voxel_size = torch.Tensor(voxel_size).cuda()
    def construct_multimodal_features(self, x, x_rgb, batch_dict, fuse_sum=False):
        """
        Construct the multimodal features with both lidar sparse features and image features.
        Args:
            x: [N, C] lidar sparse features
            x_rgb: [b, c, h, w] image features
            batch_dict: input and output information during forward
            fuse_sum: bool, manner for fusion, True - sum, False - concat
        Return:
            image_with_voxelfeatures: [N, C] fused multimodal features
        """
        batch_index = x.indices[:, 0]
        spatial_indices = x.indices[:, 1:] * self.voxel_stride
        # Voxel centers in world coordinates (same axis order as the indices).
        voxels_3d = spatial_indices * self.voxel_size + self.point_cloud_range[:3]
        calibs = batch_dict['calib']
        batch_size = batch_dict['batch_size']
        h, w = batch_dict['images'].shape[2:]
        # Bring the image feature map to input-image resolution so projected
        # pixel coordinates can index it directly.
        if not x_rgb.shape == batch_dict['images'].shape:
            x_rgb = nn.functional.interpolate(x_rgb, (h, w), mode='bilinear')
        image_with_voxelfeatures = []
        voxels_2d_int_list = []
        filter_idx_list = []
        for b in range(batch_size):
            x_rgb_batch = x_rgb[b]
            calib = calibs[b]
            voxels_3d_batch = voxels_3d[batch_index==b]
            voxel_features_sparse = x.features[batch_index==b]
            # Reverse the point cloud transformations to the original coords.
            # (noise_scale / noise_rot / flip_* are recorded by the data
            # augmentor; presumably one value per sample — confirm in dataset.)
            if 'noise_scale' in batch_dict:
                voxels_3d_batch[:, :3] /= batch_dict['noise_scale'][b]
            if 'noise_rot' in batch_dict:
                voxels_3d_batch = common_utils.rotate_points_along_z(voxels_3d_batch[:, self.inv_idx].unsqueeze(0), -batch_dict['noise_rot'][b].unsqueeze(0))[0, :, self.inv_idx]
            if 'flip_x' in batch_dict:
                voxels_3d_batch[:, 1] *= -1 if batch_dict['flip_x'][b] else 1
            if 'flip_y' in batch_dict:
                voxels_3d_batch[:, 2] *= -1 if batch_dict['flip_y'][b] else 1
            # Project voxel centers into the image plane; keep in-image ones.
            voxels_2d, _ = calib.lidar_to_img(voxels_3d_batch[:, self.inv_idx].cpu().numpy())
            voxels_2d_int = torch.Tensor(voxels_2d).to(x_rgb_batch.device).long()
            filter_idx = (0<=voxels_2d_int[:, 1]) * (voxels_2d_int[:, 1] < h) * (0<=voxels_2d_int[:, 0]) * (voxels_2d_int[:, 0] < w)
            filter_idx_list.append(filter_idx)
            voxels_2d_int = voxels_2d_int[filter_idx]
            voxels_2d_int_list.append(voxels_2d_int)
            # Voxels projecting outside the image receive zero image features.
            image_features_batch = torch.zeros((voxel_features_sparse.shape[0], x_rgb_batch.shape[0]), device=x_rgb_batch.device)
            image_features_batch[filter_idx] = x_rgb_batch[:, voxels_2d_int[:, 1], voxels_2d_int[:, 0]].permute(1, 0)
            if fuse_sum:
                image_with_voxelfeature = image_features_batch + voxel_features_sparse
            else:
                image_with_voxelfeature = torch.cat([image_features_batch, voxel_features_sparse], dim=1)
            image_with_voxelfeatures.append(image_with_voxelfeature)
        image_with_voxelfeatures = torch.cat(image_with_voxelfeatures)
        return image_with_voxelfeatures
    def _gen_sparse_features(self, x, imps_3d, batch_dict, voxels_3d):
        """
        Generate the output sparse features from the focal sparse conv.
        Args:
            x: [N, C], lidar sparse features
            imps_3d: [N, kernelsize**3], the predicted importance values
            batch_dict: input and output information during forward
            voxels_3d: [N, 3], the 3d positions of voxel centers
        Returns:
            (foreground sparse tensor, background sparse tensor,
             focal loss value (0 at eval), kernel importance mask)
        """
        batch_size = x.batch_size
        voxel_features_fore = []
        voxel_indices_fore = []
        voxel_features_back = []
        voxel_indices_back = []
        box_of_pts_cls_targets = []
        mask_voxels = []
        mask_kernel_list = []
        for b in range(batch_size):
            if self.training:
                # Supervision target for the center-voxel importance:
                # whether the voxel center lies inside any gt box.
                index = x.indices[:, 0]
                batch_index = index==b
                mask_voxel = imps_3d[batch_index, -1].sigmoid()
                voxels_3d_batch = voxels_3d[batch_index].unsqueeze(0)
                mask_voxels.append(mask_voxel)
                gt_boxes = batch_dict['gt_boxes'][b, :, :-1].unsqueeze(0)
                box_of_pts_batch = points_in_boxes_gpu(voxels_3d_batch[:, :, self.inv_idx], gt_boxes).squeeze(0)
                box_of_pts_cls_targets.append(box_of_pts_batch>=0)
            features_fore, indices_fore, features_back, indices_back, mask_kernel = split_voxels(x, b, imps_3d, voxels_3d, self.kernel_offsets, mask_multi=self.mask_multi, topk=self.topk, threshold=self.threshold)
            mask_kernel_list.append(mask_kernel)
            voxel_features_fore.append(features_fore)
            voxel_indices_fore.append(indices_fore)
            voxel_features_back.append(features_back)
            voxel_indices_back.append(indices_back)
        voxel_features_fore = torch.cat(voxel_features_fore, dim=0)
        voxel_indices_fore = torch.cat(voxel_indices_fore, dim=0)
        voxel_features_back = torch.cat(voxel_features_back, dim=0)
        voxel_indices_back = torch.cat(voxel_indices_back, dim=0)
        mask_kernel = torch.cat(mask_kernel_list, dim=0)
        x_fore = spconv.SparseConvTensor(voxel_features_fore, voxel_indices_fore, x.spatial_shape, x.batch_size)
        x_back = spconv.SparseConvTensor(voxel_features_back, voxel_indices_back, x.spatial_shape, x.batch_size)
        loss_box_of_pts = 0
        if self.training:
            mask_voxels = torch.cat(mask_voxels)
            box_of_pts_cls_targets = torch.cat(box_of_pts_cls_targets)
            # Two-channel (background, foreground) scores for the focal loss.
            # NOTE(review): FocalLoss applies softmax to these probabilities
            # again — matches the released implementation; confirm intended.
            mask_voxels_two_classes = torch.cat([1-mask_voxels.unsqueeze(-1), mask_voxels.unsqueeze(-1)], dim=1)
            loss_box_of_pts = self.focal_loss(mask_voxels_two_classes, box_of_pts_cls_targets.long())
        return x_fore, x_back, loss_box_of_pts, mask_kernel
    def combine_out(self, x_fore, x_back, remove_repeat=False):
        """
        Combine the foreground and background sparse features together.
        Args:
            x_fore: [N1, C], foreground sparse features
            x_back: [N2, C], background sparse features
            remove_repeat: bool, whether to remove the spatial replicate features.
        """
        x_fore_features = torch.cat([x_fore.features, x_back.features], dim=0)
        x_fore_indices = torch.cat([x_fore.indices, x_back.indices], dim=0)
        if remove_repeat:
            # Deduplicate per batch element; duplicated voxel indices have
            # their features summed inside check_repeat.
            index = x_fore_indices[:, 0]
            features_out_list = []
            indices_coords_out_list = []
            for b in range(x_fore.batch_size):
                batch_index = index==b
                features_out, indices_coords_out, _ = check_repeat(x_fore_features[batch_index], x_fore_indices[batch_index], flip_first=False)
                features_out_list.append(features_out)
                indices_coords_out_list.append(indices_coords_out)
            x_fore_features = torch.cat(features_out_list, dim=0)
            x_fore_indices = torch.cat(indices_coords_out_list, dim=0)
        # Reuse x_fore as the container for the combined output.
        x_fore = x_fore.replace_feature(x_fore_features)
        x_fore.indices = x_fore_indices
        return x_fore
    def forward(self, x, batch_dict, x_rgb=None):
        """Run one focal sparse convolution.
        Args:
            x: input sparse tensor
            batch_dict: batch data (calib/images/gt_boxes as needed)
            x_rgb: optional [B, C, H, W] image features, required if use_img
        Returns:
            (output sparse tensor, batch_dict, importance-supervision loss)
        """
        spatial_indices = x.indices[:, 1:] * self.voxel_stride
        voxels_3d = spatial_indices * self.voxel_size + self.point_cloud_range[:3]
        if self.use_img:
            # Predict importance from fused voxel+image features.
            features_multimodal = self.construct_multimodal_features(x, x_rgb, batch_dict)
            x_predict = spconv.SparseConvTensor(features_multimodal, x.indices, x.spatial_shape, x.batch_size)
        else:
            x_predict = self.conv_enlarge(x) if self.conv_enlarge else x
        imps_3d = self.conv_imp(x_predict).features
        x_fore, x_back, loss_box_of_pts, mask_kernel = self._gen_sparse_features(x, imps_3d, batch_dict, voxels_3d)
        if not self.skip_mask_kernel:
            # Scale dilated voxels by their predicted kernel importance.
            x_fore = x_fore.replace_feature(x_fore.features * mask_kernel.unsqueeze(-1))
        out = self.combine_out(x_fore, x_back, remove_repeat=True)
        out = self.conv(out)
        if self.use_img:
            # Second fusion after the conv, element-wise sum this time.
            out = out.replace_feature(self.construct_multimodal_features(out, x_rgb, batch_dict, True))
        out = out.replace_feature(self.bn1(out.features))
        out = out.replace_feature(self.relu(out.features))
        return out, batch_dict, loss_box_of_pts
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class FocalLoss(nn.Module):
    """Multi-class focal loss (Lin et al., "Focal Loss for Dense Object
    Detection"): cross entropy down-weighted by (1 - p)^gamma so well
    classified examples contribute less.
    Args:
        gamma: float, focusing parameter (gamma=0 recovers cross entropy)
        eps: float, clamp value keeping log() finite
    """
    def __init__(self, gamma=2.0, eps=1e-7):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.eps = eps
    def one_hot(self, index, classes):
        """Return a float one-hot encoding of `index` with `classes` channels.
        Rewritten without torch.autograd.Variable / the `volatile` flag, both
        of which were removed from modern PyTorch and made the old code crash.
        """
        size = index.size() + (classes,)
        view = index.size() + (1,)
        # torch.zeros gives an initialized buffer on index's device
        # (torch.Tensor(*size) previously allocated uninitialized memory).
        mask = torch.zeros(size, device=index.device)
        # Same scatter dimension (1) as the original implementation.
        return mask.scatter_(1, index.view(*view), 1.)
    def forward(self, input, target):
        """
        Args:
            input: (N, C) raw class scores (softmax is applied here)
            target: (N,) integer class labels
        Returns:
            scalar tensor, mean focal loss over all N*C entries
        """
        y = self.one_hot(target, input.size(-1))
        logit = F.softmax(input, dim=-1)
        logit = logit.clamp(self.eps, 1. - self.eps)
        loss = -1 * y * torch.log(logit)  # cross entropy
        loss = loss * (1 - logit) ** self.gamma  # focal loss
        return loss.mean()
def sort_by_indices(features, indices, features_add=None):
    """
    To sort the sparse features with its indices in a convenient manner.
    Args:
        features: [N, C], sparse features
        indices: [N, 4], indices of sparse features
        features_add: [N, C], additional features to sort
    Returns:
        (features, indices, features_add) reordered by a linearized spatial
        key; rows with identical indices become adjacent.
    """
    idx = indices[:, 1:]
    # Linearize the three spatial coordinates into one sortable key.
    # Multipliers must be max+1: the previous code used the bare max, which
    # made distinct coordinates collide (e.g. (0, max, c) and (1, 0, c)).
    base_1 = idx[:, 1].max() + 1
    base_2 = idx[:, 2].max() + 1
    idx_sum = idx.select(1, 0) * base_1 * base_2 + idx.select(1, 1) * base_2 + idx.select(1, 2)
    _, ind = idx_sum.sort()
    features = features[ind]
    indices = indices[ind]
    if features_add is not None:
        features_add = features_add[ind]
    return features, indices, features_add
def check_repeat(features, indices, features_add=None, sort_first=True, flip_first=True):
    """
    Check that whether there are replicate indices in the sparse features,
    remove the replicate features if any.
    Rows sharing the same (batch, z, y, x) index are merged: features are
    summed, features_add (if given) is averaged, one index row survives.
    Args:
        features: [N, C], sparse features
        indices: [N, 4], integer indices
        features_add: [N], optional per-row values, merged by mean
        sort_first: bool, sort spatially first so duplicates become adjacent
        flip_first: bool, reverse row order before deduplicating
    """
    if sort_first:
        features, indices, features_add = sort_by_indices(features, indices, features_add)
    if flip_first:
        features, indices = features.flip([0]), indices.flip([0])
        if features_add is not None:
            features_add = features_add.flip([0])
    idx = indices[:, 1:].int()
    # Collision-free linearized key: multipliers are max+1 (the previous bare
    # max let distinct coordinates share a key and be merged wrongly).
    base_1 = idx[:, 1].max() + 1
    base_2 = idx[:, 2].max() + 1
    idx_sum = idx.select(1, 0) * base_1 * base_2 + idx.select(1, 1) * base_2 + idx.select(1, 2)
    _unique, inverse, counts = torch.unique_consecutive(idx_sum, return_inverse=True, return_counts=True, dim=0)
    if _unique.shape[0] < indices.shape[0]:
        perm = torch.arange(inverse.size(0), dtype=inverse.dtype, device=inverse.device)
        features_new = torch.zeros((_unique.shape[0], features.shape[-1]), device=features.device)
        # Sum features of duplicated rows into their group slot.
        features_new.index_add_(0, inverse.long(), features)
        features = features_new
        # scatter_ keeps the last write per group, i.e. the position of each
        # group's final member; gather the surviving index rows from there.
        perm_ = inverse.new_empty(_unique.size(0)).scatter_(0, inverse, perm)
        indices = indices[perm_].int()
        if features_add is not None:
            features_add_new = torch.zeros((_unique.shape[0],), device=features_add.device)
            features_add_new.index_add_(0, inverse.long(), features_add)
            features_add = features_add_new / counts
    return features, indices, features_add
def split_voxels(x, b, imps_3d, voxels_3d, kernel_offsets, mask_multi=True, topk=True, threshold=0.5):
    """
    Generate and split the voxels into foreground and background sparse features, based on the predicted importance values.
    Args:
        x: [N, C], input sparse features
        b: int, batch index
        imps_3d: [N, kernelsize**3], the prediced importance values
        voxels_3d: [N, 3], the 3d positions of voxel centers
        kernel_offsets: [kernelsize**3 - 1, 3], the offset coords in an kernel (center excluded)
        mask_multi: bool, whether to multiply the predicted mask to features
        topk: bool, whether to use topk or threshold for selection
        threshold: float, threshold value
    Returns:
        (features_fore, coords_fore, features_back, coords_back, mask_kernel_fore)
    """
    index = x.indices[:, 0]
    batch_index = index==b
    indices_ori = x.indices[batch_index]
    features_ori = x.features[batch_index]
    # The last importance channel scores the voxel itself; the remaining
    # channels score the kernel-offset neighbours that may be created.
    mask_voxel = imps_3d[batch_index, -1].sigmoid()
    mask_kernel = imps_3d[batch_index, :-1].sigmoid()
    if mask_multi:
        features_ori *= mask_voxel.unsqueeze(-1)
    # Foreground selection: ratio-based top-k, or a hard importance threshold.
    if topk:
        _, indices = mask_voxel.sort(descending=True)
        indices_fore = indices[:int(mask_voxel.shape[0]*threshold)]
        indices_back = indices[int(mask_voxel.shape[0]*threshold):]
    else:
        indices_fore = mask_voxel > threshold
        indices_back = mask_voxel <= threshold
    features_fore = features_ori[indices_fore]
    coords_fore = indices_ori[indices_fore]
    mask_kernel_fore = mask_kernel[indices_fore]
    # Only neighbours whose kernel importance clears the threshold are created.
    mask_kernel_bool = mask_kernel_fore>=threshold
    voxel_kerels_imp = kernel_offsets.unsqueeze(0).repeat(mask_kernel_bool.shape[0],1, 1)
    mask_kernel_fore = mask_kernel[indices_fore][mask_kernel_bool]
    indices_fore_kernels = coords_fore[:, 1:].unsqueeze(1).repeat(1, kernel_offsets.shape[0], 1)
    indices_with_imp = indices_fore_kernels + voxel_kerels_imp
    selected_indices = indices_with_imp[mask_kernel_bool]
    # Keep only offsets that land inside the spatial grid.
    # NOTE(review): the lower bound is `> 0`, so coordinate 0 is rejected —
    # looks like an off-by-one (`>= 0`); kept as-is to match the release.
    spatial_indices = (selected_indices[:, 0] >0) * (selected_indices[:, 1] >0) * (selected_indices[:, 2] >0) * \
                      (selected_indices[:, 0] < x.spatial_shape[0]) * (selected_indices[:, 1] < x.spatial_shape[1]) * (selected_indices[:, 2] < x.spatial_shape[2])
    selected_indices = selected_indices[spatial_indices]
    mask_kernel_fore = mask_kernel_fore[spatial_indices]
    # Prepend the batch index; created voxels start from zero features and are
    # merged with existing duplicates (features summed) by check_repeat below.
    selected_indices = torch.cat([torch.ones((selected_indices.shape[0], 1), device=features_fore.device)*b, selected_indices], dim=1)
    selected_features = torch.zeros((selected_indices.shape[0], features_ori.shape[1]), device=features_fore.device)
    features_fore_cat = torch.cat([features_fore, selected_features], dim=0)
    coords_fore = torch.cat([coords_fore, selected_indices], dim=0)
    # Original voxels get weight 1; created neighbours keep their predicted
    # kernel importance (averaged across duplicates inside check_repeat).
    mask_kernel_fore = torch.cat([torch.ones(features_fore.shape[0], device=features_fore.device), mask_kernel_fore], dim=0)
    features_fore, coords_fore, mask_kernel_fore = check_repeat(features_fore_cat, coords_fore, features_add=mask_kernel_fore)
    features_back = features_ori[indices_back]
    coords_back = indices_ori[indices_back]
    return features_fore, coords_fore, features_back, coords_back, mask_kernel_fore
from functools import partial
import torch
import spconv.pytorch as spconv
import torch.nn as nn
from .focal_sparse_conv.focal_sparse_conv import FocalSparseConv
from .focal_sparse_conv.SemanticSeg.pyramid_ffn import PyramidFeat2D
class objDict:
    """Helper for bulk-assigning keyword data onto an object's attributes."""
    @staticmethod
    def to_object(obj: object, **data):
        """Copy every key/value pair in `data` onto `obj` as attributes."""
        vars(obj).update(data)
class ConfigDict:
    """Minimal attribute bag that also supports dict-style read access."""
    def __init__(self, name):
        self.name = name
    def __getitem__(self, key):
        # cfg["key"] is an alias for cfg.key
        return getattr(self, key)
class SparseSequentialBatchdict(spconv.SparseSequential):
    """SparseSequential that threads `batch_dict` through FocalSparseConv
    children and accumulates the extra loss they return."""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def forward(self, input, batch_dict=None):
        """Apply children in order; returns (output, batch_dict, summed loss)."""
        total_loss = 0
        for module in self._modules.values():
            if module is None:
                continue
            if isinstance(module, FocalSparseConv):
                # Focal convs consume/produce the batch dict and report a loss.
                input, batch_dict, extra_loss = module(input, batch_dict)
                total_loss = total_loss + extra_loss
            else:
                input = module(input)
        return input, batch_dict, total_loss
def post_act_block(in_channels, out_channels, kernel_size, indice_key=None, stride=1, padding=0,
                   conv_type='subm', norm_fn=None):
    """Build a sparse conv -> norm -> ReLU block of the requested conv type
    ('subm' submanifold, 'spconv' strided sparse, 'inverseconv' inverse)."""
    builders = {
        'subm': lambda: spconv.SubMConv3d(in_channels, out_channels, kernel_size,
                                          bias=False, indice_key=indice_key),
        'spconv': lambda: spconv.SparseConv3d(in_channels, out_channels, kernel_size,
                                              stride=stride, padding=padding,
                                              bias=False, indice_key=indice_key),
        'inverseconv': lambda: spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size,
                                                          indice_key=indice_key, bias=False),
    }
    if conv_type not in builders:
        raise NotImplementedError
    return spconv.SparseSequential(
        builders[conv_type](),
        norm_fn(out_channels),
        nn.ReLU(True),
    )
class SparseBasicBlock(spconv.SparseModule):
    """ResNet-style basic block built from two 3x3x3 submanifold sparse convs."""
    expansion = 1
    def __init__(self, inplanes, planes, stride=1, norm_fn=None, downsample=None, indice_key=None):
        super(SparseBasicBlock, self).__init__()
        assert norm_fn is not None
        # NOTE(review): always True given the assert above, so both convs carry
        # a bias even though a norm layer follows; confirm whether
        # `bias = False` was intended (kept as-is for checkpoint compatibility).
        bias = norm_fn is not None
        self.conv1 = spconv.SubMConv3d(
            inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=bias, indice_key=indice_key
        )
        self.bn1 = norm_fn(planes)
        self.relu = nn.ReLU(True)
        self.conv2 = spconv.SubMConv3d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=bias, indice_key=indice_key
        )
        self.bn2 = norm_fn(planes)
        self.downsample = downsample
        self.stride = stride
    def forward(self, x):
        """conv-bn-relu -> conv-bn, residual add, final relu (sparse features)."""
        identity = x
        out = self.conv1(x)
        out = out.replace_feature(self.bn1(out.features))
        out = out.replace_feature(self.relu(out.features))
        out = self.conv2(out)
        out = out.replace_feature(self.bn2(out.features))
        if self.downsample is not None:
            # Project the identity when the shortcut needs reshaping.
            identity = self.downsample(x)
        out = out.replace_feature(out.features + identity.features)
        out = out.replace_feature(self.relu(out.features))
        return out
class VoxelBackBone8xFocal(nn.Module):
    """VoxelBackBone8x variant with FocalSparseConv layers (Focals Conv)
    appended to each stage, optionally fusing DeepLabV3 image features."""
    def __init__(self, model_cfg, input_channels, grid_size, **kwargs):
        super().__init__()
        self.model_cfg = model_cfg
        norm_fn = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)
        # Reversed grid with one extra slot along the first (z) axis.
        # NOTE(review): assumes grid_size is a numpy array (elementwise add).
        self.sparse_shape = grid_size[::-1] + [1, 0, 0]
        self.conv_input = spconv.SparseSequential(
            spconv.SubMConv3d(input_channels, 16, 3, padding=1, bias=False, indice_key='subm1'),
            norm_fn(16),
            nn.ReLU(True),
        )
        block = post_act_block
        # Focals Conv options (see FocalSparseConv for their meaning).
        use_img = model_cfg.get('USE_IMG', False)
        topk = model_cfg.get('TOPK', True)
        threshold = model_cfg.get('THRESHOLD', 0.5)
        kernel_size = model_cfg.get('KERNEL_SIZE', 3)
        mask_multi = model_cfg.get('MASK_MULTI', False)
        skip_mask_kernel = model_cfg.get('SKIP_MASK_KERNEL', False)
        skip_mask_kernel_image = model_cfg.get('SKIP_MASK_KERNEL_IMG', False)
        enlarge_voxel_channels = model_cfg.get('ENLARGE_VOXEL_CHANNELS', -1)
        img_pretrain = model_cfg.get('IMG_PRETRAIN', "../checkpoints/deeplabv3_resnet50_coco-cd0a2569.pth")
        if use_img:
            # 2d semantic branch (DeepLabV3) producing the fused image features.
            model_cfg_seg=dict(
                name='SemDeepLabV3',
                backbone='ResNet50',
                num_class=21, # pretrained on COCO
                args={"feat_extract_layer": ["layer1"],
                    "pretrained_path": img_pretrain},
                channel_reduce={
                    "in_channels": [256],
                    "out_channels": [16],
                    "kernel_size": [1],
                    "stride": [1],
                    "bias": [False]
                }
            )
            cfg_dict = ConfigDict('SemDeepLabV3')
            objDict.to_object(cfg_dict, **model_cfg_seg)
            self.semseg = PyramidFeat2D(optimize=True, model_cfg=cfg_dict)
            # Multimodal focal conv fusing reduced image features at stride 1.
            self.conv_focal_multimodal = FocalSparseConv(16, 16, image_channel=model_cfg_seg['channel_reduce']['out_channels'][0],
                                        topk=topk, threshold=threshold, use_img=True, skip_mask_kernel=skip_mask_kernel_image,
                                        voxel_stride=1, norm_fn=norm_fn, indice_key='spconv_focal_multimodal')
        # Lidar-only focal conv factory shared by all stages.
        special_spconv_fn = partial(FocalSparseConv, mask_multi=mask_multi, enlarge_voxel_channels=enlarge_voxel_channels,
                                    topk=topk, threshold=threshold, kernel_size=kernel_size, padding=kernel_size//2,
                                    skip_mask_kernel=skip_mask_kernel)
        self.use_img = use_img
        self.conv1 = SparseSequentialBatchdict(
            block(16, 16, 3, norm_fn=norm_fn, padding=1, indice_key='subm1'),
            special_spconv_fn(16, 16, voxel_stride=1, norm_fn=norm_fn, indice_key='focal1'),
        )
        self.conv2 =SparseSequentialBatchdict(
            # [1600, 1408, 41] <- [800, 704, 21]
            block(16, 32, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'),
            block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
            block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
            special_spconv_fn(32, 32, voxel_stride=2, norm_fn=norm_fn, indice_key='focal2'),
        )
        self.conv3 = SparseSequentialBatchdict(
            # [800, 704, 21] <- [400, 352, 11]
            block(32, 64, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
            special_spconv_fn(64, 64, voxel_stride=4, norm_fn=norm_fn, indice_key='focal3'),
        )
        self.conv4 = SparseSequentialBatchdict(
            # [400, 352, 11] <- [200, 176, 5]
            block(64, 64, 3, norm_fn=norm_fn, stride=2, padding=(0, 1, 1), indice_key='spconv4', conv_type='spconv'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
            block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
        )
        last_pad = 0
        last_pad = self.model_cfg.get('last_pad', last_pad)
        self.conv_out = spconv.SparseSequential(
            # [200, 150, 5] -> [200, 150, 2]
            spconv.SparseConv3d(64, 128, (3, 1, 1), stride=(2, 1, 1), padding=last_pad,
                                bias=False, indice_key='spconv_down2'),
            norm_fn(128),
            nn.ReLU(True),
        )
        self.num_point_features = 128
        # Channel counts of the multi-scale outputs for downstream heads.
        self.backbone_channels = {
            'x_conv1': 16,
            'x_conv2': 32,
            'x_conv3': 64,
            'x_conv4': 64
        }
        self.forward_ret_dict = {}
    def get_loss(self, tb_dict=None):
        """Return the accumulated focal-conv loss from the last forward pass.
        NOTE(review): assumes forward ran in training mode so the stored value
        is a tensor (`.item()` fails on the python 0 produced at eval time).
        """
        loss = self.forward_ret_dict['loss_box_of_pts']
        if tb_dict is None:
            tb_dict = {}
        tb_dict['loss_box_of_pts'] = loss.item()
        return loss, tb_dict
    def forward(self, batch_dict):
        """
        Args:
            batch_dict:
                batch_size: int
                vfe_features: (num_voxels, C)
                voxel_coords: (num_voxels, 4), [batch_idx, z_idx, y_idx, x_idx]
        Returns:
            batch_dict:
                encoded_spconv_tensor: sparse tensor
        """
        voxel_features, voxel_coords = batch_dict['voxel_features'], batch_dict['voxel_coords']
        batch_size = batch_dict['batch_size']
        input_sp_tensor = spconv.SparseConvTensor(
            features=voxel_features,
            indices=voxel_coords.int(),
            spatial_shape=self.sparse_shape,
            batch_size=batch_size
        )
        loss_img = 0
        x = self.conv_input(input_sp_tensor)
        x_conv1, batch_dict, loss1 = self.conv1(x, batch_dict)
        if self.use_img:
            # Image branch: 2d features from DeepLabV3, fused at stride 1.
            x_image = self.semseg(batch_dict['images'])['layer1_feat2d']
            x_conv1, batch_dict, loss_img = self.conv_focal_multimodal(x_conv1, batch_dict, x_image)
        x_conv2, batch_dict, loss2 = self.conv2(x_conv1, batch_dict)
        x_conv3, batch_dict, loss3 = self.conv3(x_conv2, batch_dict)
        x_conv4, batch_dict, loss4 = self.conv4(x_conv3, batch_dict)
        # Stage losses summed here; read back via get_loss().
        self.forward_ret_dict['loss_box_of_pts'] = loss1 + loss2 + loss3 + loss4 + loss_img
        # for detection head
        # [200, 176, 5] -> [200, 176, 2]
        out = self.conv_out(x_conv4)
        batch_dict.update({
            'encoded_spconv_tensor': out,
            'encoded_spconv_tensor_stride': 8
        })
        batch_dict.update({
            'multi_scale_3d_features': {
                'x_conv1': x_conv1,
                'x_conv2': x_conv2,
                'x_conv3': x_conv3,
                'x_conv4': x_conv4,
            }
        })
        batch_dict.update({
            'multi_scale_3d_strides': {
                'x_conv1': 1,
                'x_conv2': 2,
                'x_conv3': 4,
                'x_conv4': 8,
            }
        })
        return batch_dict
......@@ -151,7 +151,7 @@ class DDNTemplate(nn.Module):
x = images
if self.pretrained:
# Create a mask for padded pixels
mask = torch.isnan(x)
mask = (x == 0)
# Match ResNet pretrained preprocessing
x = normalize(x, mean=self.norm_mean, std=self.norm_std)
......
......@@ -28,4 +28,9 @@ class PVRCNN(Detector3DTemplate):
loss_rcnn, tb_dict = self.roi_head.get_loss(tb_dict)
loss = loss_rpn + loss_point + loss_rcnn
if hasattr(self.backbone_3d, 'get_loss'):
loss_backbone3d, tb_dict = self.backbone_3d.get_loss(tb_dict)
loss += loss_backbone3d
return loss, tb_dict, disp_dict
......@@ -29,4 +29,9 @@ class VoxelRCNN(Detector3DTemplate):
loss_rcnn, tb_dict = self.roi_head.get_loss(tb_dict)
loss = loss + loss_rpn + loss_rcnn
if hasattr(self.backbone_3d, 'get_loss'):
loss_backbone3d, tb_dict = self.backbone_3d.get_loss(tb_dict)
loss += loss_backbone3d
return loss, tb_dict, disp_dict
......@@ -52,6 +52,43 @@ def boxes_to_corners_3d(boxes3d):
return corners3d.numpy() if is_numpy else corners3d
def corners_rect_to_camera(corners):
    """
    Recover a 7-DoF camera-frame box from its 8 rect-camera corners.
        7 -------- 4
       /|         /|
      6 -------- 5 .
      | |        | |
      . 3 -------- 0
      |/         |/
      2 -------- 1
    Args:
        corners: (8, 3) [x0, y0, z0, ...], (x, y, z) is the point coordinate in image rect
    Returns:
        boxes_rect: (7,) [x, y, z, l, h, w, r] in rect camera coords
    """
    # Corner index pairs forming the edges along each box dimension.
    height_pairs = [(0, 4), (1, 5), (2, 6), (3, 7)]
    width_pairs = [(0, 1), (2, 3), (4, 5), (6, 7)]
    length_pairs = [(0, 3), (1, 2), (4, 7), (5, 6)]

    def _mean_edge_len(pairs):
        # Average the four parallel edge lengths of one dimension.
        return sum(np.linalg.norm(corners[a] - corners[b]) for a, b in pairs) / 4

    height = _mean_edge_len(height_pairs)
    width = _mean_edge_len(width_pairs)
    length = _mean_edge_len(length_pairs)
    # Heading from the summed length-edge directions projected to the x/z plane.
    heading_vec = np.zeros(2, dtype=np.float32)
    for a, b in length_pairs:
        delta = corners[a] - corners[b]
        heading_vec[0] += delta[0]
        heading_vec[1] += delta[2]
    rotation_y = -np.arctan2(heading_vec[1], heading_vec[0])
    # Shift the centroid down to the box bottom (KITTI convention: y points down).
    center_point = corners.mean(axis=0)
    center_point[1] += height / 2
    return np.concatenate([center_point, np.array([length, height, width, rotation_y])])
def mask_boxes_outside_range_numpy(boxes, limit_range, min_num_corners=1):
"""
......@@ -296,3 +333,49 @@ def boxes3d_nearest_bev_iou(boxes_a, boxes_b):
boxes_bev_b = boxes3d_lidar_to_aligned_bev_boxes(boxes_b)
return boxes_iou_normal(boxes_bev_a, boxes_bev_b)
def area(box) -> torch.Tensor:
    """
    Computes the area of all the boxes, given as (xmin, ymin, xmax, ymax) rows.
    Returns:
        torch.Tensor: a vector with areas of each box.
    """
    widths = box[:, 2] - box[:, 0]
    heights = box[:, 3] - box[:, 1]
    return widths * heights
# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def pairwise_iou(boxes1, boxes2) -> torch.Tensor:
    """
    Given two lists of boxes of size N and M,
    compute the IoU (intersection over union)
    between __all__ N x M pairs of boxes.
    The box order must be (xmin, ymin, xmax, ymax).
    Args:
        boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
    Returns:
        Tensor: IoU, sized [N,M].
    """
    # Per-box areas, computed inline as (x2 - x1) * (y2 - y1).
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    # Intersection rectangle per pair, broadcast to [N, M, 2].
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])
    wh = (bottom_right - top_left).clamp_(min=0)  # [N,M,2]
    inter = wh.prod(dim=2)  # [N,M]
    del wh
    # Non-overlapping pairs get exactly 0 (also avoids 0/0 for empty boxes).
    return torch.where(
        inter > 0,
        inter / (area1[:, None] + area2 - inter),
        torch.zeros(1, dtype=inter.dtype, device=inter.device),
    )
CLASS_NAMES: ['Car']
DATA_CONFIG:
_BASE_CONFIG_: cfgs/dataset_configs/kitti_dataset.yaml
GET_ITEM_LIST: ["images", "points", "calib_matricies", "gt_boxes2d"]
DATA_AUGMENTOR:
DISABLE_AUG_LIST: ['placeholder']
AUG_CONFIG_LIST:
- NAME: gt_sampling
# AUG_WITH_IMAGE: True # use PC-Image Aug
IMG_AUG_TYPE: kitti
USE_ROAD_PLANE: True
DB_INFO_PATH:
- kitti_dbinfos_train.pkl
PREPARE: {
filter_by_min_points: ['Car:5'],
filter_by_difficulty: [-1],
}
SAMPLE_GROUPS: ['Car:15']
NUM_POINT_FEATURES: 4
DATABASE_WITH_FAKELIDAR: False
REMOVE_EXTRA_WIDTH: [0.0, 0.0, 0.0]
LIMIT_WHOLE_SCENE: False
- NAME: random_world_flip
ALONG_AXIS_LIST: ['x']
- NAME: random_world_rotation
WORLD_ROT_ANGLE: [-0.78539816, 0.78539816]
- NAME: random_world_scaling
WORLD_SCALE_RANGE: [0.95, 1.05]
MODEL:
NAME: VoxelRCNN
VFE:
NAME: MeanVFE
BACKBONE_3D:
NAME: VoxelBackBone8xFocal
USE_IMG: True
IMG_PRETRAIN: "../checkpoints/deeplabv3_resnet50_coco-cd0a2569.pth"
MAP_TO_BEV:
NAME: HeightCompression
NUM_BEV_FEATURES: 256
BACKBONE_2D:
NAME: BaseBEVBackbone
LAYER_NUMS: [5, 5]
LAYER_STRIDES: [1, 2]
NUM_FILTERS: [64, 128]
UPSAMPLE_STRIDES: [1, 2]
NUM_UPSAMPLE_FILTERS: [128, 128]
DENSE_HEAD:
NAME: AnchorHeadSingle
CLASS_AGNOSTIC: False
USE_DIRECTION_CLASSIFIER: True
DIR_OFFSET: 0.78539
DIR_LIMIT_OFFSET: 0.0
NUM_DIR_BINS: 2
ANCHOR_GENERATOR_CONFIG: [
{
'class_name': 'Car',
'anchor_sizes': [[3.9, 1.6, 1.56]],
'anchor_rotations': [0, 1.57],
'anchor_bottom_heights': [-1.78],
'align_center': False,
'feature_map_stride': 8,
'matched_threshold': 0.6,
'unmatched_threshold': 0.45
},
]
TARGET_ASSIGNER_CONFIG:
NAME: AxisAlignedTargetAssigner
POS_FRACTION: -1.0
SAMPLE_SIZE: 512
NORM_BY_NUM_EXAMPLES: False
MATCH_HEIGHT: False
BOX_CODER: ResidualCoder
LOSS_CONFIG:
LOSS_WEIGHTS: {
'cls_weight': 1.0,
'loc_weight': 2.0,
'dir_weight': 0.2,
'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
}
ROI_HEAD:
NAME: VoxelRCNNHead
CLASS_AGNOSTIC: True
SHARED_FC: [256, 256]
CLS_FC: [256, 256]
REG_FC: [256, 256]
DP_RATIO: 0.3
NMS_CONFIG:
TRAIN:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
NMS_PRE_MAXSIZE: 9000
NMS_POST_MAXSIZE: 512
NMS_THRESH: 0.8
TEST:
NMS_TYPE: nms_gpu
MULTI_CLASSES_NMS: False
USE_FAST_NMS: False
SCORE_THRESH: 0.0
NMS_PRE_MAXSIZE: 2048
NMS_POST_MAXSIZE: 100
NMS_THRESH: 0.7
ROI_GRID_POOL:
FEATURES_SOURCE: ['x_conv2', 'x_conv3', 'x_conv4']
PRE_MLP: True
GRID_SIZE: 6
POOL_LAYERS:
x_conv2:
MLPS: [[32, 32]]
QUERY_RANGES: [[4, 4, 4]]
POOL_RADIUS: [0.4]
NSAMPLE: [16]
POOL_METHOD: max_pool
x_conv3:
MLPS: [[32, 32]]
QUERY_RANGES: [[4, 4, 4]]
POOL_RADIUS: [0.8]
NSAMPLE: [16]
POOL_METHOD: max_pool
x_conv4:
MLPS: [[32, 32]]
QUERY_RANGES: [[4, 4, 4]]
POOL_RADIUS: [1.6]
NSAMPLE: [16]
POOL_METHOD: max_pool
TARGET_CONFIG:
BOX_CODER: ResidualCoder
ROI_PER_IMAGE: 128
FG_RATIO: 0.5
SAMPLE_ROI_BY_EACH_CLASS: True
CLS_SCORE_TYPE: roi_iou
CLS_FG_THRESH: 0.75
CLS_BG_THRESH: 0.25
CLS_BG_THRESH_LO: 0.1
HARD_BG_RATIO: 0.8
REG_FG_THRESH: 0.55
LOSS_CONFIG:
CLS_LOSS: BinaryCrossEntropy
REG_LOSS: smooth-l1
CORNER_LOSS_REGULARIZATION: True
GRID_3D_IOU_LOSS: False
LOSS_WEIGHTS: {
'rcnn_cls_weight': 1.0,
'rcnn_reg_weight': 1.0,
'rcnn_corner_weight': 1.0,
'rcnn_iou3d_weight': 1.0,
'code_weights': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
}
POST_PROCESSING:
RECALL_THRESH_LIST: [0.3, 0.5, 0.7]
SCORE_THRESH: 0.3
OUTPUT_RAW_SCORE: False
EVAL_METRIC: kitti
NMS_CONFIG:
MULTI_CLASSES_NMS: False
NMS_TYPE: nms_gpu
NMS_THRESH: 0.1
NMS_PRE_MAXSIZE: 4096
NMS_POST_MAXSIZE: 500
OPTIMIZATION:
BATCH_SIZE_PER_GPU: 2
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle
LR: 0.01
WEIGHT_DECAY: 0.01
MOMENTUM: 0.9
MOMS: [0.95, 0.85]
PCT_START: 0.4
DIV_FACTOR: 10
DECAY_STEP_LIST: [35, 45]
LR_DECAY: 0.1
LR_CLIP: 0.0000001
LR_WARMUP: False
WARMUP_EPOCH: 1
GRAD_NORM_CLIP: 10
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment