Commit 4cd43886 authored by lishj6

init

parent a9a1fe81
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet3d.core.bbox import BaseInstance3DBoxes
from mmdet3d.core.points import BasePoints
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
from mmdet3d.datasets.pipelines import DefaultFormatBundle3D
@PIPELINES.register_module()
class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields for voxels,
including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
"gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
"""
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
# Format 3D data
results = super(CustomDefaultFormatBundle3D, self).__call__(results)
results['gt_map_masks'] = DC(
to_tensor(results['gt_map_masks']), stack=True)
return results
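# Illustrative usage sketch (the pipeline entry below is an assumption, not a
# config shipped in this commit): the bundle is typically placed near the end
# of the training pipeline, after the transforms that produce 'gt_map_masks'.
# train_pipeline = [
#     ...,
#     dict(type='CustomDefaultFormatBundle3D', class_names=class_names),
# ]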
import numpy as np
from numpy import random
import mmcv
from mmdet.datasets.builder import PIPELINES
from mmcv.parallel import DataContainer as DC
@PIPELINES.register_module()
class PadMultiViewImage(object):
"""Pad the multi-view image.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, results):
"""Pad images according to ``self.size``."""
if self.size is not None:
padded_img = [mmcv.impad(
img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
elif self.size_divisor is not None:
padded_img = [mmcv.impad_to_multiple(
img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
results['img'] = padded_img
results['img_shape'] = [img.shape for img in padded_img]
results['pad_shape'] = [img.shape for img in padded_img]
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_img(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
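# Illustrative config sketch (the value is an assumption): pad each camera
# image so that its height and width are divisible by 32, as FPN-style
# backbones usually expect.
# dict(type='PadMultiViewImage', size_divisor=32)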
@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB,
default is true.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']]
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_rgb=self.to_rgb)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
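# Illustrative config sketch (the mean/std values are the commonly used
# Caffe-style ImageNet statistics, an assumption rather than a value taken
# from this commit):
# img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# dict(type='NormalizeMultiviewImage', **img_norm_cfg)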
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
imgs = results['img']
new_imgs = []
for img in imgs:
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
new_imgs.append(img)
results['img'] = new_imgs
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
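# Illustrative ordering sketch (an assumption, not a config from this commit):
# the distortion expects float32 images, so the loading transform should set
# to_float32=True and the distortion should run before normalization.
# train_pipeline = [
#     dict(type='LoadMultiViewImageFromFiles', to_float32=True),
#     dict(type='PhotoMetricDistortionMultiViewImage'),
#     dict(type='NormalizeMultiviewImage', **img_norm_cfg),
#     ...,
# ]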
@PIPELINES.register_module()
class CustomCollect3D(object):
"""Collect data from the loader relevant to the specific task.
    This is usually the last stage of the data loader pipeline. Typically
    ``keys`` is set to some subset of "img", "proposals", "gt_bboxes",
    "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
    The "img_meta" item is always populated. The contents of the "img_meta"
    dictionary depend on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'depth2img': transform from depth to image
- 'cam2img': transform from camera to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is \
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is \
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img','lidar2cam',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow', 'scene_token',
'can_bus',
)):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_metas = {}
for key in self.meta_keys:
if key in results:
img_metas[key] = results[key]
data['img_metas'] = DC(img_metas, cpu_only=True)
for key in self.keys:
if key not in results:
data[key] = None
else:
data[key] = results[key]
return data
def __repr__(self):
"""str: Return a string that describes the module."""
return self.__class__.__name__ + \
f'(keys={self.keys}, meta_keys={self.meta_keys})'
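# Example config sketch (the keys are assumptions): collect the tensors the
# model consumes and pack everything listed in ``meta_keys`` into 'img_metas'.
# dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])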
@PIPELINES.register_module()
class RandomScaleImageMultiViewImage(object):
"""Random scale the image
Args:
scales
"""
def __init__(self, scales=[]):
self.scales = scales
assert len(self.scales)==1
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
rand_ind = np.random.permutation(range(len(self.scales)))[0]
rand_scale = self.scales[rand_ind]
y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
scale_factor = np.eye(4)
scale_factor[0, 0] *= rand_scale
scale_factor[1, 1] *= rand_scale
results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in
enumerate(results['img'])]
lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
results['lidar2img'] = lidar2img
results['img_shape'] = [img.shape for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
return results
def __repr__(self):
repr_str = self.__class__.__name__
        repr_str += f'(scales={self.scales})'
        return repr_str
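# Example config sketch (the scale value is an assumption): resize every view
# to half resolution and rescale the lidar2img projection matrices to match.
# dict(type='RandomScaleImageMultiViewImage', scales=[0.5])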
from .group_sampler import DistributedGroupSampler
from .distributed_sampler import DistributedSampler
from .sampler import SAMPLER, build_sampler
import math
import torch
from torch.utils.data import DistributedSampler as _DistributedSampler
from .sampler import SAMPLER
@SAMPLER.register_module()
class DistributedSampler(_DistributedSampler):
def __init__(self,
dataset=None,
num_replicas=None,
rank=None,
shuffle=True,
seed=0):
super().__init__(
dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
# for the compatibility from PyTorch 1.3+
self.seed = seed if seed is not None else 0
def __iter__(self):
# deterministically shuffle based on epoch
if self.shuffle:
assert False
else:
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
# in case that indices is shorter than half of total_size
indices = (indices *
math.ceil(self.total_size / len(indices)))[:self.total_size]
assert len(indices) == self.total_size
# subsample
per_replicas = self.total_size//self.num_replicas
# indices = indices[self.rank:self.total_size:self.num_replicas]
indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas]
assert len(indices) == self.num_samples
return iter(indices)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import numpy as np
import torch
from mmcv.runner import get_dist_info
from torch.utils.data import Sampler
from .sampler import SAMPLER
import random
@SAMPLER.register_module()
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
seed (int, optional): random seed used to shuffle the sampler if
``shuffle=True``. This number should be identical across all
processes in the distributed group. Default: 0.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None,
seed=0):
_rank, _num_replicas = get_dist_info()
if num_replicas is None:
num_replicas = _num_replicas
if rank is None:
rank = _rank
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.seed = seed if seed is not None else 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch + self.seed)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
# add .numpy() to avoid bug when selecting indice in parrots.
# TODO: check whether torch.randperm() can be replaced by
# numpy.random.permutation().
indice = indice[list(
torch.randperm(int(size), generator=g).numpy())].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
# pad indice
tmp = indice.copy()
for _ in range(extra // size):
indice.extend(tmp)
indice.extend(tmp[:extra % size])
indices.extend(indice)
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
from mmcv.utils.registry import Registry, build_from_cfg
SAMPLER = Registry('sampler')
def build_sampler(cfg, default_args):
return build_from_cfg(cfg, SAMPLER, default_args)
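# Usage sketch (the ``dataset`` variable is an assumption): build a registered
# sampler from a config dict, passing the dataset through ``default_args`` so
# that every sampler type receives it.
# sampler = build_sampler(
#     dict(type='DistributedGroupSampler', samples_per_gpu=1),
#     default_args=dict(dataset=dataset))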
from .modeling import *
# Copyright 2021 Toyota Research Institute. All rights reserved.
#import functools
from collections import OrderedDict
import numpy as np
import seaborn as sns
from torch.utils.data import Dataset
from tqdm import tqdm
#from detectron2.data import MetadataCatalog
from detectron2.structures.boxes import BoxMode
from nuscenes.eval.detection.utils import category_to_detection_name
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes
#from tridet.data import collect_dataset_dicts
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
from projects.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
from projects.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color
# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
# tracker if required
DATASET_NAME_TO_VERSION = {
"nusc_train": "v1.0-trainval",
"nusc_val": "v1.0-trainval",
"nusc_val-subsample-8": "v1.0-trainval",
"nusc_trainval": "v1.0-trainval",
"nusc_test": "v1.0-test",
"nusc_mini_train": "v1.0-mini",
"nusc_mini_val": "v1.0-mini",
}
CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')
ATTRIBUTE_IDS = {
'vehicle.moving': 0,
'vehicle.parked': 1,
'vehicle.stopped': 2,
'pedestrian.moving': 0,
'pedestrian.standing': 1,
'pedestrian.sitting_lying_down': 2,
'cycle.with_rider': 0,
'cycle.without_rider': 1,
}
CATEGORY_IDS = OrderedDict({
'barrier': 0,
'bicycle': 1,
'bus': 2,
'car': 3,
'construction_vehicle': 4,
'motorcycle': 5,
'pedestrian': 6,
'traffic_cone': 7,
'trailer': 8,
'truck': 9,
})
COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
COLORMAP = OrderedDict({
'barrier': COLORS[8], # yellow
'bicycle': COLORS[0], # blue
'bus': COLORS[6], # pink
'car': COLORS[2], # green
'construction_vehicle': COLORS[7], # gray
'motorcycle': COLORS[4], # purple
'pedestrian': COLORS[1], # orange
'traffic_cone': COLORS[3], # red
'trailer': COLORS[9], # skyblue
'truck': COLORS[5], # brown
})
MAX_NUM_ATTRIBUTES = 3
def _compute_iou(box1, box2):
"""
Parameters
----------
box1, box2:
(x1, y1, x2, y2)
"""
xx1 = max(box1[0], box2[0])
yy1 = max(box1[1], box2[1])
xx2 = min(box1[2], box2[2])
yy2 = min(box1[3], box2[3])
if xx1 >= xx2 or yy1 >= yy2:
return 0.
inter = (xx2 - xx1) * (yy2 - yy1)
a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
return inter / (a1 + a2 - inter)
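# Quick sanity check for _compute_iou (values worked out by hand):
# _compute_iou([0, 0, 2, 2], [1, 1, 3, 3]) -> 1 / 7 ≈ 0.143
# (intersection is 1x1 = 1, union is 4 + 4 - 1 = 7)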
class NuscenesDataset(Dataset):
def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
self.data_root = data_root
assert name in DATASET_NAME_TO_VERSION
version = DATASET_NAME_TO_VERSION[name]
self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)
self.datum_names = datum_names
self.min_num_lidar_points = min_num_lidar_points
self.min_box_visibility = min_box_visibility
self.dataset_item_info = self._build_dataset_item_info(name)
# Index instance tokens to their IDs
self._instance_token_to_id = self._index_instance_tokens()
# Construct the mapping from datum_token (image id) to index
print("Generating the mapping from image id to idx...")
self.datumtoken2idx = {}
for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
self.datumtoken2idx[datum_token] = idx
print("Done.")
def _build_dataset_item_info(self, name):
scenes_in_split = self._get_split_scenes(name)
dataset_items = []
for _, scene_token in tqdm(scenes_in_split):
scene = self.nusc.get('scene', scene_token)
sample_token = scene['first_sample_token']
for sample_idx in range(scene['nbr_samples']):
if name.endswith('subsample-8') and sample_idx % 8 > 0:
# Sample-level subsampling.
continue
sample = self.nusc.get('sample', sample_token)
for datum_name, datum_token in sample['data'].items():
if datum_name not in self.datum_names:
continue
dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
sample_token = sample['next']
return dataset_items
def _get_split_scenes(self, name):
scenes_in_splits = create_splits_scenes()
if name == "nusc_trainval":
scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
elif name == "nusc_val-subsample-8":
scenes = scenes_in_splits["val"]
else:
assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
split = name[5:]
assert split in scenes_in_splits, f"Invalid dataset: {split}"
scenes = scenes_in_splits[split]
# Mapping from scene name to token.
name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
return [(name, name_to_token[name]) for name in scenes]
def __len__(self):
return len(self.dataset_item_info)
def _build_id(self, scene_name, sample_idx, datum_name):
sample_id = f"{scene_name}_{sample_idx:03d}"
image_id = f"{sample_id}_{datum_name}"
return image_id, sample_id
def _index_instance_tokens(self):
"""Index instance tokens for uniquely identifying instances across samples"""
instance_token_to_id = {}
for record in self.nusc.sample_annotation:
instance_token = record['instance_token']
if instance_token not in instance_token_to_id:
next_instance_id = len(instance_token_to_id)
instance_token_to_id[instance_token] = next_instance_id
return instance_token_to_id
def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
annotations = []
for _ann in annotation_list:
ann = self.nusc.get('sample_annotation', _ann.token)
if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
continue
annotation = OrderedDict()
# --------
# Category
# --------
category = category_to_detection_name(ann['category_name'])
if category is None:
continue
annotation['category_id'] = CATEGORY_IDS[category]
# ------
# 3D box
# ------
# NOTE: ann['rotation'], ann['translation'] is in global frame.
pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame
# DEBUG:
# pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
# pose_WO_2 = pose_WS * pose_SO
# assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]
# --------------------------------------
# 2D box -- project 8 corners of 3D bbox
# --------------------------------------
corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
l, t = corners[:, 0].min(), corners[:, 1].min()
r, b = corners[:, 0].max(), corners[:, 1].max()
x1 = max(0, l)
y1 = max(0, t)
x2 = min(image_shape[1], r)
y2 = min(image_shape[0], b)
iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
if iou < self.min_box_visibility:
continue
annotation['bbox'] = [x1, y1, x2, y2]
annotation['bbox_mode'] = BoxMode.XYXY_ABS
# --------
# Track ID
# --------
annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]
# ---------
# Attribute
# ---------
attr_tokens = ann['attribute_tokens']
            assert len(attr_tokens) < 2  # NOTE: Allow at most one attribute.
attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
if attr_tokens:
attribute = self.nusc.get('attribute', attr_tokens[0])['name']
attribute_id = ATTRIBUTE_IDS[attribute]
annotation['attribute_id'] = attribute_id
# -----
# Speed
# -----
vel_global = self.nusc.box_velocity(ann['token'])
speed = np.linalg.norm(vel_global) # NOTE: This can be NaN.
# DEBUG:
# speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
annotation['speed'] = speed
annotations.append(annotation)
return annotations
def _get_ego_velocity(self, current, max_time_diff=1.5):
"""Velocity of ego-vehicle in m/s.
"""
has_prev = current['prev'] != ''
has_next = current['next'] != ''
# Cannot estimate velocity for a single annotation.
if not has_prev and not has_next:
return np.array([np.nan, np.nan, np.nan])
if has_prev:
first = self.nusc.get('sample_data', current['prev'])
else:
first = current
if has_next:
last = self.nusc.get('sample_data', current['next'])
else:
last = current
pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
pos_diff = np.float32(pos_last) - np.float32(pos_first)
time_last = 1e-6 * last['timestamp']
time_first = 1e-6 * first['timestamp']
time_diff = time_last - time_first
if has_next and has_prev:
# If doing centered difference, allow for up to double the max_time_diff.
max_time_diff *= 2
if time_diff > max_time_diff:
# If time_diff is too big, don't return an estimate.
return np.array([np.nan, np.nan, np.nan])
else:
return pos_diff / time_diff
def __getitem__(self, idx):
datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
datum = self.nusc.get('sample_data', datum_token)
assert datum['is_key_frame']
filename, _annotations, K = self.nusc.get_sample_data(datum_token)
image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
height, width = datum['height'], datum['width']
d2_dict = OrderedDict(
file_name=filename,
height=height,
width=width,
image_id=image_id,
sample_id=sample_id,
sample_token=sample_token
)
# Intrinsics
d2_dict['intrinsics'] = list(K.flatten())
# Get pose of the sensor (S) from vehicle (V) frame
_pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
# Get ego-pose of the vehicle (V) from global/world (W) frame
_pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
pose_WS = pose_WV * pose_VS
d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
return d2_dict
def getitem_by_datumtoken(self, datum_token):
# idx = self.datumtoken2idx[datum_token]
# ret = self.__getitem__(idx)
datum = self.nusc.get('sample_data', datum_token)
sample_token = datum['sample_token']
filename, _annotations, K = self.nusc.get_sample_data(datum_token)
height, width = datum['height'], datum['width']
d2_dict = OrderedDict(
file_name=filename,
height=height,
width=width,
image_id=0,
sample_id=0,
sample_token=sample_token
)
# Intrinsics
d2_dict['intrinsics'] = list(K.flatten())
# Get pose of the sensor (S) from vehicle (V) frame
_pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
# Get ego-pose of the vehicle (V) from global/world (W) frame
_pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
pose_WS = pose_WV * pose_VS
d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
return d2_dict
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from detectron2:
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py
import numpy as np
import torch
from detectron2.data import transforms as T
from detectron2.structures import Boxes, BoxMode, Instances
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
__all__ = ["transform_instance_annotations", "annotations_to_instances"]
def transform_instance_annotations(
annotation,
transforms,
image_size,
):
"""Adapted from:
https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254
The changes from original:
- The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional.
- Add optional 3D bounding box support.
- If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory.
===============================================================================================================
Apply transforms to box, segmentation and keypoints annotations of a single instance.
It will use `transforms.apply_box` for the box, and
`transforms.apply_coords` for segmentation polygons & keypoints.
If you need anything more specially designed for each data structure,
you'll need to implement your own version of this function or the transforms.
Args:
annotation (dict): dict of instance annotations for a single instance.
It will be modified in-place.
transforms (TransformList or list[Transform]):
image_size (tuple): the height, width of the transformed image
keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
Returns:
dict:
the same input dict with fields "bbox", "segmentation", "keypoints"
transformed according to `transforms`.
The "bbox_mode" field will be set to XYXY_ABS.
"""
if isinstance(transforms, (tuple, list)):
transforms = T.TransformList(transforms)
# (dennis.park) Here 2D bounding box is optional.
if "bbox" in annotation:
assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not."
# bbox is 1d (per-instance bounding box)
bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
bbox = transforms.apply_box(np.array([bbox]))[0]
# clip transformed bbox to image size
bbox = bbox.clip(min=0)
bbox = np.minimum(bbox, list(image_size + image_size)[::-1])
annotation["bbox"] = bbox
annotation["bbox_mode"] = BoxMode.XYXY_ABS
# Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed.
if "bbox3d" in annotation:
bbox3d = np.array(annotation["bbox3d"])
annotation['bbox3d'] = transforms.apply_box3d(bbox3d)
return annotation
def _create_empty_instances(image_size):
target = Instances(image_size)
target.gt_boxes = Boxes([])
target.gt_classes = torch.tensor([], dtype=torch.int64)
target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32))
return target
def annotations_to_instances(
annos,
image_size,
intrinsics=None,
):
"""
Create an :class:`Instances` object used by the models,
from instance annotations in the dataset dict.
Args:
annos (list[dict]): a list of instance annotations in one image, each
element for one instance.
image_size (tuple): height, width
Returns:
Instances:
It will contain fields "gt_boxes", "gt_classes",
"gt_masks", "gt_keypoints", if they can be obtained from `annos`.
This is the format that builtin models expect.
"""
if len(annos) == 0:
return _create_empty_instances(image_size)
boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
target = Instances(image_size)
target.gt_boxes = Boxes(boxes)
classes = [obj["category_id"] for obj in annos]
classes = torch.tensor(classes, dtype=torch.int64)
target.gt_classes = classes
if len(annos) and "bbox3d" in annos[0]:
assert intrinsics is not None
target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics)
if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]:
raise ValueError(
f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}."
)
# NOTE: add nuscenes attributes here
# NOTE: instances will be filtered later
# NuScenes attributes
if len(annos) and "attribute_id" in annos[0]:
attributes = [obj["attribute_id"] for obj in annos]
target.gt_attributes = torch.tensor(attributes, dtype=torch.int64)
# Speed (magnitude of velocity)
if len(annos) and "speed" in annos[0]:
speeds = [obj["speed"] for obj in annos]
target.gt_speeds = torch.tensor(speeds, dtype=torch.float32)
    if len(annos) and "attribute_id" in annos[0] and "speed" in annos[0]:
        assert len(boxes) == len(classes) == len(attributes) == len(speeds), \
            'the numbers of annotations should be the same'
return target
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
# https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py
import torch
from torch import nn
class IOULoss(nn.Module):
"""
Intersetion Over Union (IoU) loss which supports three
different IoU computations:
* IoU
* Linear IoU
* gIoU
"""
def __init__(self, loc_loss_type='iou'):
super(IOULoss, self).__init__()
self.loc_loss_type = loc_loss_type
def forward(self, pred, target, weight=None):
"""
Args:
            pred: Nx4 predicted (left, top, right, bottom) distances
            target: Nx4 target (left, top, right, bottom) distances
            weight: N loss weight for each instance
"""
pred_left = pred[:, 0]
pred_top = pred[:, 1]
pred_right = pred[:, 2]
pred_bottom = pred[:, 3]
target_left = target[:, 0]
target_top = target[:, 1]
target_right = target[:, 2]
target_bottom = target[:, 3]
        target_area = (target_left + target_right) * \
            (target_top + target_bottom)
        pred_area = (pred_left + pred_right) * \
            (pred_top + pred_bottom)
        w_intersect = torch.min(pred_left, target_left) + \
            torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
            torch.min(pred_top, target_top)
        g_w_intersect = torch.max(pred_left, target_left) + \
            torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
            torch.max(pred_top, target_top)
        ac_union = g_w_intersect * g_h_intersect
        area_intersect = w_intersect * h_intersect
        area_union = target_area + pred_area - area_intersect
        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_union - area_union) / ac_union
if self.loc_loss_type == 'iou':
losses = -torch.log(ious)
elif self.loc_loss_type == 'linear_iou':
losses = 1 - ious
elif self.loc_loss_type == 'giou':
losses = 1 - gious
else:
raise NotImplementedError
if weight is not None:
return (losses * weight).sum()
else:
return losses.sum()
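# Worked example (hand-checked): for a single location with identical
# (l, t, r, b) = (1, 1, 1, 1) predictions and targets, the intersection,
# union and enclosing box all have area 4, so ious = gious = 1; the
# 'linear_iou' and 'giou' losses are 0 and the 'iou' loss is -log(1) = 0.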
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet
# https://github.com/aim-uofa/AdelaiDet/
import logging
import torch
from torch import nn
LOG = logging.getLogger(__name__)
class Scale(nn.Module):
def __init__(self, init_value=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.FloatTensor([init_value]))
def forward(self, input):
return input * self.scale
class Offset(nn.Module):
def __init__(self, init_value=0.):
super(Offset, self).__init__()
self.bias = nn.Parameter(torch.FloatTensor([init_value]))
def forward(self, input):
return input + self.bias
class ModuleListDial(nn.ModuleList):
def __init__(self, modules=None):
super(ModuleListDial, self).__init__(modules)
self.cur_position = 0
def forward(self, x):
result = self[self.cur_position](x)
self.cur_position += 1
if self.cur_position >= len(self):
self.cur_position = 0
return result
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from fvcore:
# https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py
import torch
def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor:
"""
Smooth L1 loss defined in the Fast R-CNN paper as:
| 0.5 * x ** 2 / beta if abs(x) < beta
smoothl1(x) = |
| abs(x) - 0.5 * beta otherwise,
where x = input - target.
Smooth L1 loss is related to Huber loss, which is defined as:
| 0.5 * x ** 2 if abs(x) < beta
huber(x) = |
| beta * (abs(x) - 0.5 * beta) otherwise
Smooth L1 loss is equal to huber(x) / beta. This leads to the following
differences:
- As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
converges to a constant 0 loss.
- As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
converges to L2 loss.
- For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
slope of 1. For Huber loss, the slope of the L1 segment is beta.
Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
portion replaced with a quadratic function such that at abs(x) = beta, its
slope is 1. The quadratic segment smooths the L1 loss near x = 0.
Args:
input (Tensor): input tensor of any shape
target (Tensor): target value tensor with the same shape as input
beta (float): L1 to L2 change point.
For beta values < 1e-5, L1 loss is computed.
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
Returns:
The loss with the reduction option applied.
Note:
PyTorch's builtin "Smooth L1 loss" implementation does not actually
implement Smooth L1 loss, nor does it implement Huber loss. It implements
the special case of both in which they are equal (beta=1).
See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
"""
# (dennis.park) Make it work with mixed precision training.
beta = torch.as_tensor(beta).to(input.dtype)
if beta < 1e-5:
# if beta == 0, then torch.where will result in nan gradients when
# the chain rule is applied due to pytorch implementation details
# (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
# zeros, rather than "no gradient"). To avoid this issue, we define
# small values of beta to be exactly l1 loss.
loss = torch.abs(input - target)
else:
n = torch.abs(input - target)
cond = n < beta
a = 0.5 * n**2
b = n - 0.5 * beta
a, b = a.to(input.dtype), b.to(input.dtype)
loss = torch.where(cond, a, b)
# loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
if reduction == "mean":
loss = loss.mean()
elif reduction == "sum":
loss = loss.sum()
return loss
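# Worked examples (hand-checked), with beta = 1.0 and the default reduction:
#   smooth_l1_loss(torch.tensor([0.5]), torch.tensor([0.0]), beta=1.0) -> 0.5 * 0.5**2 = 0.125
#   smooth_l1_loss(torch.tensor([2.0]), torch.tensor([0.0]), beta=1.0) -> 2.0 - 0.5 = 1.5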
from .nuscenes_dd3d import NuscenesDD3D
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
from torch import nn
#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.layers import ShapeSpec
from mmcv.runner import force_fp32
from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss
from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss
#from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
#from tridet.modeling.feature_extractor import build_feature_extractor
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level
#@META_ARCH_REGISTRY.register()
class DD3D(nn.Module):
def __init__(self,
num_classes,
in_channels,
strides,
fcos2d_cfg=dict(),
fcos2d_loss_cfg=dict(),
fcos3d_cfg=dict(),
fcos3d_loss_cfg=dict(),
target_assign_cfg=dict(),
box3d_on=True,
feature_locations_offset="none"):
super().__init__()
# NOTE: do not need backbone
# self.backbone = build_feature_extractor(cfg)
# backbone_output_shape = self.backbone.output_shape()
# self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys())
self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides]
self.feature_locations_offset = feature_locations_offset
self.fcos2d_head = FCOS2DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
**fcos2d_cfg)
self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg)
# NOTE: inference later
# self.fcos2d_inference = FCOS2DInference(cfg)
if box3d_on:
self.fcos3d_head = FCOS3DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
**fcos3d_cfg)
self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg)
# NOTE: inference later
# self.fcos3d_inference = FCOS3DInference(cfg)
self.only_box2d = False
else:
self.only_box2d = True
self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes,
input_shape=self.backbone_output_shape,
box3d_on=box3d_on,
**target_assign_cfg)
# NOTE: inference later
# self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS
# self.do_nms = cfg.DD3D.INFERENCE.DO_NMS
# self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS
# self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH
# nuScenes inference aggregates detections over all 6 cameras.
# self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE
self.num_classes = num_classes
# NOTE: do not need normalize
# self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
# self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
# NOTE:
# @property
# def device(self):
# return self.pixel_mean.device
# def preprocess_image(self, x):
# return (x - self.pixel_mean) / self.pixel_std
    @force_fp32(apply_to=('features', ))
def forward(self, features, batched_inputs):
# NOTE:
# images = [x["image"].to(self.device) for x in batched_inputs]
# images = [self.preprocess_image(x) for x in images]
# NOTE: directly use inv_intrinsics
# if 'intrinsics' in batched_inputs[0]:
# intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
# else:
# intrinsics = None
# images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
if 'inv_intrinsics' in batched_inputs[0]:
inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
else:
inv_intrinsics = None
# NOTE:
# gt_dense_depth = None
# if 'depth' in batched_inputs[0]:
# gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
# gt_dense_depth = ImageList.from_tensors(
# gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
# )
# NOTE: directly input feature
# features = self.backbone(images.tensor)
# features = [features[f] for f in self.in_features]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
else:
gt_instances = None
locations = self.compute_locations(features)
logits, box2d_reg, centerness, _ = self.fcos2d_head(features)
if not self.only_box2d:
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
# NOTE: directly use inv_intrinsics
# inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
if self.training:
assert gt_instances is not None
feature_shapes = [x.shape[-2:] for x in features]
training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
# NOTE:
# if gt_dense_depth is not None:
# training_targets.update({"dense_depth": gt_dense_depth})
losses = {}
fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
losses.update(fcos2d_loss)
if not self.only_box2d:
fcos3d_loss = self.fcos3d_loss(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
fcos2d_info, training_targets
)
losses.update(fcos3d_loss)
return losses
else:
# TODO: do not support inference now
raise NotImplementedError
pred_instances, fcos2d_info = self.fcos2d_inference(
logits, box2d_reg, centerness, locations, images.image_sizes
)
if not self.only_box2d:
# This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place.
self.fcos3d_inference(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
fcos2d_info
)
# 3D score == 2D score x confidence.
score_key = "scores_3d"
else:
score_key = "scores"
# Transpose to "image-first", i.e. (B, L)
pred_instances = list(zip(*pred_instances))
pred_instances = [Instances.cat(instances) for instances in pred_instances]
# 2D NMS and pick top-K.
if self.do_nms:
pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
if not self.only_box2d and self.do_bev_nms:
# Bird-eye-view NMS.
dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
if 'pose' in batched_inputs[0]:
poses = [x['pose'] for x in batched_inputs]
else:
poses = [x['extrinsics'] for x in batched_inputs]
pred_instances = nuscenes_sample_aggregate(
pred_instances,
dummy_group_idxs,
self.num_classes,
poses,
iou_threshold=self.bev_nms_iou_thresh,
include_boxes3d_global=False
)
if self.postprocess_in_inference:
processed_results = []
for results_per_image, input_per_image, image_size in \
zip(pred_instances, batched_inputs, images.image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = resize_instances(results_per_image, height, width)
processed_results.append({"instances": r})
else:
processed_results = [{"instances": x} for x in pred_instances]
return processed_results
def compute_locations(self, features):
locations = []
in_strides = [x.stride for x in self.backbone_output_shape]
for level, feature in enumerate(features):
h, w = feature.size()[-2:]
locations_per_level = compute_locations_per_level(
h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset
)
locations.append(locations_per_level)
return locations
def forward_train(self, features, batched_inputs):
self.train()
return self.forward(features, batched_inputs)
# Copyright 2021 Toyota Research Institute. All rights reserved.
import logging
import torch
import torch.nn as nn
from projects.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss
LOG = logging.getLogger(__name__)
class DisentangledBox3DLoss(nn.Module):
def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
super().__init__()
self.smooth_l1_loss_beta = smooth_l1_loss_beta
self.max_loss_per_group = max_loss_per_group
def forward(self, box3d_pred, box3d_targets, locations, weights=None):
box3d_pred = box3d_pred.to(torch.float32)
box3d_targets = box3d_targets.to(torch.float32)
target_corners = box3d_targets.corners
disentangled_losses = {}
for component_key in ["quat", "proj_ctr", "depth", "size"]:
disentangled_boxes = box3d_targets.clone()
setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
pred_corners = disentangled_boxes.to(torch.float32).corners
loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)
            # Bound the loss
            loss = loss.clamp(max=self.max_loss_per_group)
if weights is not None:
# loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
else:
loss = loss.reshape(-1, 24).mean()
disentangled_losses["loss_box3d_" + component_key] = loss
entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)
return disentangled_losses, entangled_l1_dist
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
# https://github.com/aim-uofa/AdelaiDet
import torch
from fvcore.nn import sigmoid_focal_loss
from torch import nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, batched_nms, cat, get_norm
from detectron2.structures import Boxes, Instances
from detectron2.utils.comm import get_world_size
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss
from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
INF = 100000000
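# FCOS centerness target for a location with regression targets (l, t, r, b):
#   centerness = sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
# which is what compute_ctrness_targets below evaluates per location.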
def compute_ctrness_targets(reg_targets):
if len(reg_targets) == 0:
return reg_targets.new_zeros(len(reg_targets))
left_right = reg_targets[:, [0, 2]]
top_bottom = reg_targets[:, [1, 3]]
ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
(top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
return torch.sqrt(ctrness)
class FCOS2DHead(nn.Module):
def __init__(self,
num_classes,
input_shape,
num_cls_convs=4,
num_box_convs=4,
norm='BN',
use_deformable=False,
use_scale=True,
box2d_scale_init_factor=1.0,
version='v2'):
super().__init__()
self.num_classes = num_classes
self.in_strides = [shape.stride for shape in input_shape]
self.num_levels = len(input_shape)
self.use_scale = use_scale
self.box2d_scale_init_factor = box2d_scale_init_factor
self._version = version
in_channels = [s.channels for s in input_shape]
assert len(set(in_channels)) == 1, "Each level must have the same channel!"
in_channels = in_channels[0]
if use_deformable:
raise ValueError("Not supported yet.")
head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs}
for head_name, num_convs in head_configs.items():
tower = []
if self._version == "v1":
for _ in range(num_convs):
conv_func = nn.Conv2d
tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
if norm == "GN":
raise NotImplementedError()
elif norm == "NaiveGN":
raise NotImplementedError()
elif norm == "BN":
tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)]))
elif norm == "SyncBN":
raise NotImplementedError()
tower.append(nn.ReLU())
elif self._version == "v2":
for _ in range(num_convs):
if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
# NOTE: need to add norm here!
# Each FPN level has its own batchnorm layer.
# NOTE: do not use dd3d train.py!
# "BN" is converted to "SyncBN" in distributed training (see train.py)
norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
else:
norm_layer = get_norm(norm, in_channels)
tower.append(
Conv2d(
in_channels,
in_channels,
kernel_size=3,
stride=1,
padding=1,
bias=norm_layer is None,
norm=norm_layer,
activation=F.relu
)
)
else:
raise ValueError(f"Invalid FCOS2D version: {self._version}")
self.add_module(f'{head_name}_tower', nn.Sequential(*tower))
self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1)
self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1)
if self.use_scale:
if self._version == "v1":
self.scales_reg = nn.ModuleList([
Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
])
else:
self.scales_box2d_reg = nn.ModuleList([
Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
])
self.init_weights()
def init_weights(self):
for tower in [self.cls_tower, self.box2d_tower]:
for l in tower.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
if l.bias is not None:
torch.nn.init.constant_(l.bias, 0)
predictors = [self.cls_logits, self.box2d_reg, self.centerness]
for modules in predictors:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
logits = []
box2d_reg = []
centerness = []
extra_output = {"cls_tower_out": []}
for l, feature in enumerate(x):
cls_tower_out = self.cls_tower(feature)
bbox_tower_out = self.box2d_tower(feature)
# 2D box
logits.append(self.cls_logits(cls_tower_out))
centerness.append(self.centerness(bbox_tower_out))
box_reg = self.box2d_reg(bbox_tower_out)
if self.use_scale:
# TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
if self._version == "v1":
box_reg = self.scales_reg[l](box_reg)
else:
box_reg = self.scales_box2d_reg[l](box_reg)
# Note that we use relu, as in the improved FCOS, instead of exp.
box2d_reg.append(F.relu(box_reg))
extra_output['cls_tower_out'].append(cls_tower_out)
return logits, box2d_reg, centerness, extra_output
class FCOS2DLoss(nn.Module):
def __init__(self,
num_classes,
focal_loss_alpha=0.25,
focal_loss_gamma=2.0,
loc_loss_type='giou',
):
super().__init__()
self.focal_loss_alpha = focal_loss_alpha
self.focal_loss_gamma = focal_loss_gamma
self.box2d_reg_loss_fn = IOULoss(loc_loss_type)
self.num_classes = num_classes
@force_fp32(apply_to=('logits', 'box2d_reg', 'centerness'))
def forward(self, logits, box2d_reg, centerness, targets):
labels = targets['labels']
box2d_reg_targets = targets['box2d_reg_targets']
pos_inds = targets["pos_inds"]
if len(labels) != box2d_reg_targets.shape[0]:
raise ValueError(
f"The size of 'labels' and 'box2d_reg_targets' does not match: a={len(labels)}, b={box2d_reg_targets.shape[0]}"
)
# Flatten predictions
logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits])
box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg])
centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness])
# -------------------
# Classification loss
# -------------------
num_pos_local = pos_inds.numel()
num_gpus = get_world_size()
total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
num_pos_avg = max(total_num_pos / num_gpus, 1.0)
# prepare one_hot
cls_target = torch.zeros_like(logits)
cls_target[pos_inds, labels[pos_inds]] = 1
loss_cls = sigmoid_focal_loss(
logits,
cls_target,
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction="sum",
) / num_pos_avg
# NOTE: The rest of losses only consider foreground pixels.
box2d_reg_pred = box2d_reg_pred[pos_inds]
box2d_reg_targets = box2d_reg_targets[pos_inds]
centerness_pred = centerness_pred[pos_inds]
# Compute centerness targets here using 2D regression targets of foreground pixels.
centerness_targets = compute_ctrness_targets(box2d_reg_targets)
# Denominator for all foreground losses.
ctrness_targets_sum = centerness_targets.sum()
loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
# NOTE: change the return after reduce_sum
if pos_inds.numel() == 0:
losses = {
"loss_cls": loss_cls,
"loss_box2d_reg": box2d_reg_pred.sum() * 0.,
"loss_centerness": centerness_pred.sum() * 0.,
}
return losses, {}
# ----------------------
# 2D box regression loss
# ----------------------
loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom
# ---------------
# Centerness loss
# ---------------
loss_centerness = F.binary_cross_entropy_with_logits(
centerness_pred, centerness_targets, reduction="sum"
) / num_pos_avg
loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness}
extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets}
return loss_dict, extra_info
class FCOS2DInference():
def __init__(self, cfg):
self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR
self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH
self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK
self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK
self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH
self.num_classes = cfg.DD3D.NUM_CLASSES
def __call__(self, logits, box2d_reg, centerness, locations, image_sizes):
pred_instances = [] # List[List[Instances]], shape = (L, B)
extra_info = []
for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \
enumerate(zip(logits, box2d_reg, centerness, locations)):
instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map(
logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes
) # List of Instances; one for each image.
for instances_per_im in instances_per_lvl:
instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl
pred_instances.append(instances_per_lvl)
extra_info.append(extra_info_per_lvl)
return pred_instances, extra_info
def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes):
N, C, _, __ = logits.shape
# put in the same format as locations
scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid()
box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4)
centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()
# if self.thresh_with_ctr is True, we multiply the classification
# scores with centerness scores before applying the threshold.
if self.thresh_with_ctr:
scores = scores * centerness[:, :, None]
candidate_mask = scores > self.pre_nms_thresh
pre_nms_topk = candidate_mask.reshape(N, -1).sum(1)
pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk)
if not self.thresh_with_ctr:
scores = scores * centerness[:, :, None]
results = []
all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], []
for i in range(N):
scores_per_im = scores[i]
candidate_mask_per_im = candidate_mask[i]
scores_per_im = scores_per_im[candidate_mask_per_im]
candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False)
fg_inds_per_im = candidate_inds_per_im[:, 0]
class_inds_per_im = candidate_inds_per_im[:, 1]
# Cache info here.
all_fg_inds_per_im.append(fg_inds_per_im)
all_class_inds_per_im.append(class_inds_per_im)
box2d_reg_per_im = box2d_reg[i][fg_inds_per_im]
locations_per_im = locations[fg_inds_per_im]
pre_nms_topk_per_im = pre_nms_topk[i]
if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item():
scores_per_im, topk_indices = \
scores_per_im.topk(pre_nms_topk_per_im, sorted=False)
class_inds_per_im = class_inds_per_im[topk_indices]
box2d_reg_per_im = box2d_reg_per_im[topk_indices]
locations_per_im = locations_per_im[topk_indices]
else:
topk_indices = None
all_topk_indices.append(topk_indices)
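# Decode the per-location (l, t, r, b) offsets into (x1, y1, x2, y2) boxes around each candidate location.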
detections = torch.stack([
locations_per_im[:, 0] - box2d_reg_per_im[:, 0],
locations_per_im[:, 1] - box2d_reg_per_im[:, 1],
locations_per_im[:, 0] + box2d_reg_per_im[:, 2],
locations_per_im[:, 1] + box2d_reg_per_im[:, 3],
],
dim=1)
instances = Instances(image_sizes[i])
instances.pred_boxes = Boxes(detections)
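# scores_per_im already includes the centerness factor, so sqrt() stores the geometric mean of classification score and centerness.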
instances.scores = torch.sqrt(scores_per_im)
instances.pred_classes = class_inds_per_im
instances.locations = locations_per_im
results.append(instances)
extra_info = {
"fg_inds_per_im": all_fg_inds_per_im,
"class_inds_per_im": all_class_inds_per_im,
"topk_indices": all_topk_indices
}
return results, extra_info
def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"):
results = []
for instances in instances_per_im:
if self.nms_thresh > 0:
# Multiclass NMS.
keep = batched_nms(
instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes,
self.nms_thresh
)
instances = instances[keep]
num_detections = len(instances)
# Limit to max_per_image detections **over all classes**
if num_detections > self.post_nms_topk > 0:
scores = instances.scores
# image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1)
image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1)
keep = scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
instances = instances[keep]
results.append(instances)
return results
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.layers import Conv2d, cat, get_norm
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale
from .disentangled_box3d_loss import DisentangledBox3DLoss
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d
EPS = 1e-7
def predictions_to_boxes3d(
quat,
proj_ctr,
depth,
size,
locations,
inv_intrinsics,
canon_box_sizes,
min_depth,
max_depth,
scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=True,
quat_is_allocentric=True,
depth_is_distance=False
):
# Normalize to make quat unit norm.
quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
# Make sure again it's numerically unit-norm.
quat = quat / quat.norm(dim=1, keepdim=True)
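# With scale_depth_by_focal_lengths, the raw depth is divided by ||(1/fx, 1/fy)|| * factor, so the same network output decodes to a larger metric depth for longer focal lengths.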
if scale_depth_by_focal_lengths:
pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1)
depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor)
if depth_is_distance:
depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS)
depth = depth.reshape(-1, 1).clamp(min_depth, max_depth)
proj_ctr = proj_ctr + locations
if quat_is_allocentric:
quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics)
size = (size.tanh() + 1.) * canon_box_sizes # max size = 2 * canon_size
return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)
class FCOS3DHead(nn.Module):
def __init__(self,
num_classes,
input_shape,
num_convs=4,
norm='BN',
use_scale=True,
depth_scale_init_factor=0.3,
proj_ctr_scale_init_factor=1.0,
use_per_level_predictors=False,
class_agnostic=False,
use_deformable=False,
mean_depth_per_level=None,
std_depth_per_level=None,
):
super().__init__()
self.num_classes = num_classes
self.in_strides = [shape.stride for shape in input_shape]
self.num_levels = len(input_shape)
self.use_scale = use_scale
self.depth_scale_init_factor = depth_scale_init_factor
self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor
self.use_per_level_predictors = use_per_level_predictors
self.register_buffer("mean_depth_per_level", torch.Tensor(mean_depth_per_level))
self.register_buffer("std_depth_per_level", torch.Tensor(std_depth_per_level))
in_channels = [s.channels for s in input_shape]
assert len(set(in_channels)) == 1, "Each level must have the same number of channels!"
in_channels = in_channels[0]
if use_deformable:
raise ValueError("Not supported yet.")
box3d_tower = []
for i in range(num_convs):
if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
# NOTE: need to add norm here!
# Each FPN level has its own batchnorm layer.
# NOTE: this project does not use dd3d's train.py, where "BN" is converted to "SyncBN" for distributed training.
norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
else:
norm_layer = get_norm(norm, in_channels)
box3d_tower.append(
Conv2d(
in_channels,
in_channels,
kernel_size=3,
stride=1,
padding=1,
bias=norm_layer is None,
norm=norm_layer,
activation=F.relu
)
)
self.add_module('box3d_tower', nn.Sequential(*box3d_tower))
num_classes = self.num_classes if not class_agnostic else 1
num_levels = self.num_levels if use_per_level_predictors else 1
# 3D box branches.
self.box3d_quat = nn.ModuleList([
Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_ctr = nn.ModuleList([
Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_depth = nn.ModuleList([
Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale))
for _ in range(num_levels)
])
self.box3d_size = nn.ModuleList([
Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_conf = nn.ModuleList([
Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
if self.use_scale:
self.scales_proj_ctr = nn.ModuleList([
Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides
])
# (pre-)compute (mean, std) of depth for each level, and determine the init value here.
self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
self.scales_depth = nn.ModuleList([
Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level
])
self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level])
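# Assuming Scale multiplies and Offset adds a learnable scalar, the per-level depth at init decodes roughly as mean_depth[l] + std_depth[l] * depth_scale_init_factor * raw_output.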
self._init_weights()
def _init_weights(self):
for l in self.box3d_tower.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
if l.bias is not None:
torch.nn.init.constant_(l.bias, 0)
predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf]
for modules in predictors:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], []
dense_depth = None
for l, features in enumerate(x):
box3d_tower_out = self.box3d_tower(features)
_l = l if self.use_per_level_predictors else 0
# 3D box
quat = self.box3d_quat[_l](box3d_tower_out)
proj_ctr = self.box3d_ctr[_l](box3d_tower_out)
depth = self.box3d_depth[_l](box3d_tower_out)
size3d = self.box3d_size[_l](box3d_tower_out)
conf3d = self.box3d_conf[_l](box3d_tower_out)
if self.use_scale:
# TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
proj_ctr = self.scales_proj_ctr[l](proj_ctr)
size3d = self.scales_size[l](size3d)
conf3d = self.scales_conf[l](conf3d)
depth = self.offsets_depth[l](self.scales_depth[l](depth))
box3d_quat.append(quat)
box3d_ctr.append(proj_ctr)
box3d_depth.append(depth)
box3d_size.append(size3d)
box3d_conf.append(conf3d)
return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth
class FCOS3DLoss(nn.Module):
def __init__(self,
num_classes,
min_depth=0.1,
max_depth=80.0,
box3d_loss_weight=2.0,
conf3d_loss_weight=1.0,
conf_3d_temperature=1.0,
smooth_l1_loss_beta=0.05,
max_loss_per_group=20,
predict_allocentric_rot=True,
scale_depth_by_focal_lengths=True,
scale_depth_by_focal_lengths_factor=500.0,
class_agnostic=False,
predict_distance=False,
canon_box_sizes=None):
super().__init__()
self.canon_box_sizes = canon_box_sizes
self.min_depth = min_depth
self.max_depth = max_depth
self.predict_allocentric_rot = predict_allocentric_rot
self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths
self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor
self.predict_distance = predict_distance
self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group)
self.box3d_loss_weight = box3d_loss_weight
self.conf3d_loss_weight = conf3d_loss_weight
self.conf_3d_temperature = conf_3d_temperature
self.num_classes = num_classes
self.class_agnostic = class_agnostic
@force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size','box3d_conf', 'inv_intrinsics'))
def forward(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info,
targets
):
labels = targets['labels']
box3d_targets = targets['box3d_targets']
pos_inds = targets["pos_inds"]
if pos_inds.numel() == 0:
losses = {
"loss_box3d_quat": torch.stack([x.sum() * 0. for x in box3d_quat]).sum(),
"loss_box3d_proj_ctr": torch.stack([x.sum() * 0. for x in box3d_ctr]).sum(),
"loss_box3d_depth": torch.stack([x.sum() * 0. for x in box3d_depth]).sum(),
"loss_box3d_size": torch.stack([x.sum() * 0. for x in box3d_size]).sum(),
"loss_conf3d": torch.stack([x.sum() * 0. for x in box3d_conf]).sum()
}
return losses
if len(labels) != len(box3d_targets):
raise ValueError(
f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}"
)
num_classes = self.num_classes if not self.class_agnostic else 1
box3d_quat_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4, num_classes) for x in box3d_quat])
box3d_ctr_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 2, num_classes) for x in box3d_ctr])
box3d_depth_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_depth])
box3d_size_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 3, num_classes) for x in box3d_size])
box3d_conf_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_conf])
# ----------------------
# 3D box disentangled loss
# ----------------------
box3d_targets = box3d_targets[pos_inds]
box3d_quat_pred = box3d_quat_pred[pos_inds]
box3d_ctr_pred = box3d_ctr_pred[pos_inds]
box3d_depth_pred = box3d_depth_pred[pos_inds]
box3d_size_pred = box3d_size_pred[pos_inds]
box3d_conf_pred = box3d_conf_pred[pos_inds]
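# Select, for each foreground location, the prediction slice of its ground-truth class; class-agnostic heads have only a single slice to squeeze.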
if self.class_agnostic:
box3d_quat_pred = box3d_quat_pred.squeeze(-1)
box3d_ctr_pred = box3d_ctr_pred.squeeze(-1)
box3d_depth_pred = box3d_depth_pred.squeeze(-1)
box3d_size_pred = box3d_size_pred.squeeze(-1)
box3d_conf_pred = box3d_conf_pred.squeeze(-1)
else:
I = labels[pos_inds][..., None, None]
box3d_quat_pred = torch.gather(box3d_quat_pred, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
box3d_ctr_pred = torch.gather(box3d_ctr_pred, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
box3d_depth_pred = torch.gather(box3d_depth_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
box3d_size_pred = torch.gather(box3d_size_pred, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
box3d_conf_pred = torch.gather(box3d_conf_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[labels[pos_inds]]
locations = targets["locations"][pos_inds]
im_inds = targets["im_inds"][pos_inds]
inv_intrinsics = inv_intrinsics[im_inds]
box3d_pred = predictions_to_boxes3d(
box3d_quat_pred,
box3d_ctr_pred,
box3d_depth_pred,
box3d_size_pred,
locations,
inv_intrinsics,
canon_box_sizes,
self.min_depth,
self.max_depth,
scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
quat_is_allocentric=self.predict_allocentric_rot,
depth_is_distance=self.predict_distance
)
centerness_targets = fcos2d_info["centerness_targets"]
loss_denom = fcos2d_info["loss_denom"]
losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets)
losses_box3d = {k: self.box3d_loss_weight * v / loss_denom for k, v in losses_box3d.items()}
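# The 3D confidence target is exp(-L1_error / temperature): a small disentangled-box L1 error yields a target close to 1.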
conf_3d_targets = torch.exp(-1. / self.conf_3d_temperature * box3d_l1_error)
loss_conf3d = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none')
loss_conf3d = self.conf3d_loss_weight * (loss_conf3d * centerness_targets).sum() / loss_denom
losses = {"loss_conf3d": loss_conf3d, **losses_box3d}
return losses
class FCOS3DInference():
def __init__(self, cfg):
self.canon_box_sizes = cfg.DD3D.FCOS3D.CANONICAL_BOX3D_SIZES
self.min_depth = cfg.DD3D.FCOS3D.MIN_DEPTH
self.max_depth = cfg.DD3D.FCOS3D.MAX_DEPTH
self.predict_allocentric_rot = cfg.DD3D.FCOS3D.PREDICT_ALLOCENTRIC_ROT
self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS
self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR
self.predict_distance = cfg.DD3D.FCOS3D.PREDICT_DISTANCE
self.num_classes = cfg.DD3D.NUM_CLASSES
self.class_agnostic = cfg.DD3D.FCOS3D.CLASS_AGNOSTIC_BOX3D
def __call__(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
):
# pred_instances: # List[List[Instances]], shape = (L, B)
for lvl, (box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl) in \
enumerate(zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)):
# In-place modification: update per-level pred_instances.
self.forward_for_single_feature_map(
box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl, inv_intrinsics,
pred_instances[lvl], fcos2d_info[lvl]
) # List of Instances; one for each image.
def forward_for_single_feature_map(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
):
N = box3d_quat.shape[0]
num_classes = self.num_classes if not self.class_agnostic else 1
box3d_quat = box3d_quat.permute(0, 2, 3, 1).reshape(N, -1, 4, num_classes)
box3d_ctr = box3d_ctr.permute(0, 2, 3, 1).reshape(N, -1, 2, num_classes)
box3d_depth = box3d_depth.permute(0, 2, 3, 1).reshape(N, -1, num_classes)
box3d_size = box3d_size.permute(0, 2, 3, 1).reshape(N, -1, 3, num_classes)
box3d_conf = box3d_conf.permute(0, 2, 3, 1).reshape(N, -1, num_classes).sigmoid()
for i in range(N):
fg_inds_per_im = fcos2d_info['fg_inds_per_im'][i]
class_inds_per_im = fcos2d_info['class_inds_per_im'][i]
topk_indices = fcos2d_info['topk_indices'][i]
box3d_quat_per_im = box3d_quat[i][fg_inds_per_im]
box3d_ctr_per_im = box3d_ctr[i][fg_inds_per_im]
box3d_depth_per_im = box3d_depth[i][fg_inds_per_im]
box3d_size_per_im = box3d_size[i][fg_inds_per_im]
box3d_conf_per_im = box3d_conf[i][fg_inds_per_im]
if self.class_agnostic:
box3d_quat_per_im = box3d_quat_per_im.squeeze(-1)
box3d_ctr_per_im = box3d_ctr_per_im.squeeze(-1)
box3d_depth_per_im = box3d_depth_per_im.squeeze(-1)
box3d_size_per_im = box3d_size_per_im.squeeze(-1)
box3d_conf_per_im = box3d_conf_per_im.squeeze(-1)
else:
I = class_inds_per_im[..., None, None]
box3d_quat_per_im = torch.gather(box3d_quat_per_im, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
box3d_ctr_per_im = torch.gather(box3d_ctr_per_im, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
box3d_depth_per_im = torch.gather(box3d_depth_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
box3d_size_per_im = torch.gather(box3d_size_per_im, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
box3d_conf_per_im = torch.gather(box3d_conf_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
if topk_indices is not None:
box3d_quat_per_im = box3d_quat_per_im[topk_indices]
box3d_ctr_per_im = box3d_ctr_per_im[topk_indices]
box3d_depth_per_im = box3d_depth_per_im[topk_indices]
box3d_size_per_im = box3d_size_per_im[topk_indices]
box3d_conf_per_im = box3d_conf_per_im[topk_indices]
# scores_per_im = pred_instances[i].scores.square()
# NOTE: Before refactoring, the squared score was used. Is raw 2D score better?
scores_per_im = pred_instances[i].scores
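# The final 3D score is the 2D detection score multiplied by the predicted 3D confidence.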
scores_3d_per_im = scores_per_im * box3d_conf_per_im
canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[pred_instances[i].pred_classes]
inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3)
locations = pred_instances[i].locations
pred_boxes3d = predictions_to_boxes3d(
box3d_quat_per_im,
box3d_ctr_per_im,
box3d_depth_per_im,
box3d_size_per_im,
locations,
inv_K,
canon_box_sizes,
self.min_depth,
self.max_depth,
scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
quat_is_allocentric=self.predict_allocentric_rot,
depth_is_distance=self.predict_distance
)
# In-place modification: add fields to instances.
pred_instances[i].pred_boxes3d = pred_boxes3d
pred_instances[i].scores_3d = scores_3d_per_im
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from fvcore.nn.smooth_l1_loss import smooth_l1_loss
from torch import nn
from detectron2.layers import Conv2d, cat
#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.utils import comm as d2_comm
from mmdet.models.builder import HEADS
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES
from .core import DD3D
#from tridet.modeling.dd3d.postprocessing import get_group_idxs, nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
INF = 100000000.
class NuscenesDD3DTargetPreparer(DD3DTargetPreparer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True"
def __call__(self, locations, gt_instances, feature_shapes):
num_loc_list = [len(loc) for loc in locations]
# compute locations to size ranges
loc_to_size_range = []
for l, loc_per_level in enumerate(locations):
loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))
loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
locations = torch.cat(locations, dim=0)
training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)
training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
training_targets["im_inds"] = [
locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
]
box2d = training_targets.pop("box2d", None)
# transpose im first training_targets to level first ones
training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"}
training_targets["fpn_levels"] = [
loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
]
# Flatten targets: (L x B x H x W, TARGET_SIZE)
labels = cat([x.reshape(-1) for x in training_targets["labels"]])
box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])
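# Foreground locations are those whose assigned label is not the background index (num_classes).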
pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
targets = {
"labels": labels,
"box2d_reg_targets": box2d_reg_targets,
"locations": locations,
"target_inds": target_inds,
"im_inds": im_inds,
"fpn_levels": fpn_levels,
"pos_inds": pos_inds
}
if self.dd3d_enabled:
box3d_targets = Boxes3D.cat(training_targets["box3d"])
targets.update({"box3d_targets": box3d_targets})
if box2d is not None:
# Original format is B x L x (H x W, 4)
# Need to be in L x (B, 4, H, W).
batched_box2d = []
for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
# B x (H x W, 4)
h, w = feature_shapes[lvl]
batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
batched_box2d.append(batched_box2d_lvl)
targets.update({"batched_box2d": batched_box2d})
# Nuscenes targets -- attribute / speed
attributes = cat([x.reshape(-1) for x in training_targets["attributes"]])
speeds = cat([x.reshape(-1) for x in training_targets["speeds"]])
targets.update({'attributes': attributes, 'speeds': speeds})
return targets
def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
labels = []
box2d_reg = []
if self.dd3d_enabled:
box3d = []
target_inds = []
xs, ys = locations[:, 0], locations[:, 1]
# NuScenes targets -- attribute / speed
attributes, speeds = [], []
num_targets = 0
for im_i in range(len(targets)):
targets_per_im = targets[im_i]
bboxes = targets_per_im.gt_boxes.tensor
labels_per_im = targets_per_im.gt_classes
# no gt
if bboxes.numel() == 0:
labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
# reg_targets.append(locations.new_zeros((locations.size(0), 4)))
box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
if self.dd3d_enabled:
box3d.append(
Boxes3D(
locations.new_zeros(locations.size(0), 4),
locations.new_zeros(locations.size(0), 2),
locations.new_zeros(locations.size(0), 1),
locations.new_zeros(locations.size(0), 3),
locations.new_zeros(locations.size(0), 3, 3),
).to(torch.float32)
)
# NOTE: attributes and speeds.
attributes.append(labels_per_im.new_zeros(locations.size(0)))
speeds.append(labels_per_im.new_zeros(locations.size(0)))
continue
area = targets_per_im.gt_boxes.area()
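# (l, t, r, b): distances from every location to the left/top/right/bottom sides of each GT box, i.e. the FCOS 2D regression targets.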
l = xs[:, None] - bboxes[:, 0][None]
t = ys[:, None] - bboxes[:, 1][None]
r = bboxes[:, 2][None] - xs[:, None]
b = bboxes[:, 3][None] - ys[:, None]
# reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)
if self.center_sample:
is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
else:
is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0
max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
# limit the regression range for each location
is_cared_in_the_level = \
(max_reg_targets_per_im >= size_ranges[:, [0]]) & \
(max_reg_targets_per_im <= size_ranges[:, [1]])
locations_to_gt_area = area[None].repeat(len(locations), 1)
locations_to_gt_area[is_in_boxes == 0] = INF
locations_to_gt_area[is_cared_in_the_level == 0] = INF
# if there are still more than one objects for a location,
# we choose the one with minimal area
locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
target_inds_per_im = locations_to_gt_inds + num_targets
num_targets += len(targets_per_im)
labels_per_im = labels_per_im[locations_to_gt_inds]
labels_per_im[locations_to_min_area == INF] = self.num_classes
labels.append(labels_per_im)
box2d_reg.append(box2d_reg_per_im)
target_inds.append(target_inds_per_im)
if self.dd3d_enabled:
# 3D box targets
box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
box3d.append(box3d_per_im)
# NuScenes targets -- attribute / speed
attributes_per_im = targets_per_im.gt_attributes[locations_to_gt_inds]
speeds_per_im = targets_per_im.gt_speeds[locations_to_gt_inds]
attributes.append(attributes_per_im)
speeds.append(speeds_per_im)
ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
if self.dd3d_enabled:
ret.update({"box3d": box3d})
# NuScenes targets -- attribute / speed
ret.update({"attributes": attributes, "speeds": speeds})
return ret
class NuscenesLoss(nn.Module):
def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
super().__init__()
self.attr_loss_weight = attr_loss_weight
self.speed_loss_weight = speed_loss_weight
@force_fp32(apply_to=('attr_logits', 'speeds'))
def forward(self, attr_logits, speeds, fcos2d_info, targets):
# Flatten predictions
attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])
pos_inds = targets['pos_inds']
losses = {}
# 1. Attributes
attr_logits = attr_logits[pos_inds]
target_attr = targets['attributes'][pos_inds]
valid_attr_mask = target_attr != MAX_NUM_ATTRIBUTES # No attrs associated with class, or just attr missing.
if pos_inds.numel() == 0:
attr_weights = attr_logits.new_tensor(0.0) #torch.tensor(0.0).cuda()
else:
attr_weights = fcos2d_info['centerness_targets'][valid_attr_mask]
# Denominator for all foreground losses -- re-computed for features with valid attributes.
# attr_loss_denom = max(reduce_sum(attr_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
# NOTE: computing attr_weights_sum first and then passing it to reduce_sum() works; the commented-out one-liner above does not.
attr_weights_sum = attr_weights.sum()
attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
if valid_attr_mask.sum() == 0:
losses.update({"loss_attr": attr_logits.sum() * 0.})
else:
attr_logits = attr_logits[valid_attr_mask]
target_attr = target_attr[valid_attr_mask]
xent = F.cross_entropy(attr_logits, target_attr, reduction='none')  # per-element loss, so it can be weighted by centerness below
loss_attr = (xent * attr_weights).sum() / attr_loss_denom
losses.update({"loss_attr": self.attr_loss_weight * loss_attr})
# 2. Speed
speeds = speeds[pos_inds]
target_speeds = targets['speeds'][pos_inds]
# NOTE: some GT speeds are NaN.
valid_gt_mask = torch.logical_not(torch.isnan(target_speeds))
if pos_inds.numel() == 0:
speed_weights = speeds.new_tensor(0.0) #torch.tensor(0.0).cuda()
else:
speed_weights = fcos2d_info['centerness_targets'][valid_gt_mask]
# Denominator for all foreground losses -- re-computed for features with valid speeds.
# speed_loss_denom = max(reduce_sum(speed_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
speed_weights_sum = speed_weights.sum()
speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
# NOTE: the early return below is kept after reduce_sum() so that every rank still participates in the collective.
if pos_inds.numel() == 0:
losses = {"loss_attr": attr_logits.sum() * 0., "loss_speed": speeds.sum() * 0.}
# NOTE: This is probably unreachable, because training filters out images with empty annotations.
# NOTE: If it is reached, attr_weights can be unavailable for the reduce_sum() call.
return losses
if valid_gt_mask.sum() == 0:
losses.update({"loss_speed": speeds.sum() * 0.})
# return losses
else:
speeds = speeds[valid_gt_mask]
target_speeds = target_speeds[valid_gt_mask]
l1_error = smooth_l1_loss(speeds, target_speeds, beta=0.05)
loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom
losses.update({"loss_speed": self.speed_loss_weight * loss_speed})
return losses
class NuscenesInference():
def __init__(self, cfg):
pass
def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info):
"""Add 'pred_attribute', 'pred_speed' to Instances in 'pred_instances'."""
N = attr_logits[0].shape[0]
for lvl, (attr_logits_lvl, speed_lvl, info_lvl, instances_lvl) in \
enumerate(zip(attr_logits, speeds, fcos2d_info, pred_instances)):
attr_logits_lvl = attr_logits_lvl.permute(0, 2, 3, 1).reshape(N, -1, MAX_NUM_ATTRIBUTES)
speed_lvl = speed_lvl.permute(0, 2, 3, 1).reshape(N, -1)
for i in range(N):
fg_inds_per_im = info_lvl['fg_inds_per_im'][i]
topk_indices = info_lvl['topk_indices'][i]
attr_logits_per_im = attr_logits_lvl[i][fg_inds_per_im]
speed_per_im = speed_lvl[i][fg_inds_per_im]
if topk_indices is not None:
attr_logits_per_im = attr_logits_per_im[topk_indices]
speed_per_im = speed_per_im[topk_indices]
if len(attr_logits_per_im) == 0:
instances_lvl[i].pred_attributes = instances_lvl[i].pred_classes.new_tensor([])
instances_lvl[i].pred_speeds = instances_lvl[i].scores.new_tensor([])
else:
instances_lvl[i].pred_attributes = attr_logits_per_im.argmax(dim=1)
instances_lvl[i].pred_speeds = speed_per_im
@HEADS.register_module()
class NuscenesDD3D(DD3D):
def __init__(self,
num_classes,
in_channels,
strides,
fcos2d_cfg=dict(),
fcos2d_loss_cfg=dict(),
fcos3d_cfg=dict(),
fcos3d_loss_cfg=dict(),
target_assign_cfg=dict(),
nusc_loss_weight=dict(),
box3d_on=True,
feature_locations_offset="none"):
super().__init__(num_classes,
in_channels,
strides,
fcos2d_cfg=fcos2d_cfg,
fcos2d_loss_cfg=fcos2d_loss_cfg,
fcos3d_cfg=fcos3d_cfg,
fcos3d_loss_cfg=fcos3d_loss_cfg,
target_assign_cfg=target_assign_cfg,
box3d_on=box3d_on,
feature_locations_offset=feature_locations_offset)
# backbone_output_shape = self.backbone_output_shape
# in_channels = backbone_output_shape[0].channels
# --------------------------------------------------------------------------
# NuScenes predictions -- attribute / speed, computed from cls_tower output.
# --------------------------------------------------------------------------
self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True)
self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu)
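# The speed head applies a ReLU activation, so predicted speeds are non-negative.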
# init weights
for modules in [self.attr_logits, self.speed]:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
# Re-define target preparer
del self.prepare_targets
self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes,
input_shape=self.backbone_output_shape,
box3d_on=box3d_on,
**target_assign_cfg)
self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight)
# NOTE: inference later
# self.nuscenes_inference = NuscenesInference(cfg)
# self.num_images_per_sample = cfg.MODEL.FCOS3D.NUSC_NUM_IMAGES_PER_SAMPLE
# NOTE: inference later
# self.num_images_per_sample = cfg.DD3D.NUSC.INFERENCE.NUM_IMAGES_PER_SAMPLE
# assert self.num_images_per_sample == 6
# assert cfg.DATALOADER.TEST.NUM_IMAGES_PER_GROUP == 6
# NOTE: NuScenes evaluator allows max. 500 detections per sample.
# self.max_num_dets_per_sample = cfg.DD3D.NUSC.INFERENCE.MAX_NUM_DETS_PER_SAMPLE
@force_fp32(apply_to=('features', ))
def forward(self, features, batched_inputs):
# NOTE:
# images = [x["image"].to(self.device) for x in batched_inputs]
# images = [self.preprocess_image(x) for x in images]
# NOTE: directly use inv_intrinsics
# if 'intrinsics' in batched_inputs[0]:
# intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
# else:
# intrinsics = None
# images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
if 'inv_intrinsics' in batched_inputs[0]:
inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
else:
inv_intrinsics = None
# NOTE:
# gt_dense_depth = None
# if 'depth' in batched_inputs[0]:
# gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
# gt_dense_depth = ImageList.from_tensors(
# gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
# )
# NOTE: directly input feature
# features = self.backbone(images.tensor)
# features = [features[f] for f in self.in_features]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
else:
gt_instances = None
locations = self.compute_locations(features)
logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features)
if not self.only_box2d:
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
# NOTE: directly use inv_intrinsics
# inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
# --------------------------------------------------------------------------
# NuScenes predictions -- attribute / speed, computed from cls_tower output.
# --------------------------------------------------------------------------
attr_logits, speeds = [], []
for x in fcos2d_extra_output['cls_tower_out']:
attr_logits.append(self.attr_logits(x))
speeds.append(self.speed(x))
if self.training:
assert gt_instances is not None
feature_shapes = [x.shape[-2:] for x in features]
training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
# NOTE:
# if gt_dense_depth is not None:
# training_targets.update({"dense_depth": gt_dense_depth})
losses = {}
fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
losses.update(fcos2d_loss)
if not self.only_box2d:
fcos3d_loss = self.fcos3d_loss(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
fcos2d_info, training_targets
)
losses.update(fcos3d_loss)
# Nuscenes loss -- attribute / speed
nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets)
losses.update(nuscenes_loss)
return losses
else:
# TODO: do not support inference now
raise NotImplementedError
pred_instances, fcos2d_info = self.fcos2d_inference(
logits, box2d_reg, centerness, locations, images.image_sizes
)
if not self.only_box2d:
# This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances'.
self.fcos3d_inference(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
fcos2d_info
)
score_key = "scores_3d"
else:
score_key = "scores"
# This adds 'pred_attributes', 'pred_speed' to Instances in 'pred_instances'.
self.nuscenes_inference(attr_logits, speeds, pred_instances, fcos2d_info)
# Transpose to "image-first", i.e. (B, L)
pred_instances = list(zip(*pred_instances))
pred_instances = [Instances.cat(instances) for instances in pred_instances]
# 2D NMS and pick top-K.
if self.do_nms:
pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
if not self.only_box2d and self.do_bev_nms:
# Bird-eye-view NMS.
dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
if 'pose' in batched_inputs[0]:
poses = [x['pose'] for x in batched_inputs]
else:
poses = [x['extrinsics'] for x in batched_inputs]
pred_instances = nuscenes_sample_aggregate(
pred_instances,
dummy_group_idxs,
self.num_classes,
poses,
iou_threshold=self.bev_nms_iou_thresh,
include_boxes3d_global=False
)
if self.postprocess_in_inference:
processed_results = []
for results_per_image, input_per_image, image_size in \
zip(pred_instances, batched_inputs, images.image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = resize_instances(results_per_image, height, width)
processed_results.append({"instances": r})
# ----------------------------------------------------------
# NuScenes specific: cross-image (i.e. sample-level) BEV NMS.
# ----------------------------------------------------------
sample_tokens = [x['sample_token'] for x in batched_inputs]
group_idxs = get_group_idxs(sample_tokens, self.num_images_per_sample)
instances = [x['instances'] for x in processed_results]
global_poses = [x['pose'] for x in batched_inputs]
filtered_instances = nuscenes_sample_aggregate(
instances,
group_idxs,
self.num_classes,
global_poses,
self.bev_nms_iou_thresh,
max_num_dets_per_sample=self.max_num_dets_per_sample
)
processed_results = [{"instances": x} for x in filtered_instances]
else:
processed_results = [{"instances": x} for x in pred_instances]
return processed_results