将子模块转换为普通目录

007f2e68 · 雍大凯 · 19472568 · 007f2e68 · 007f2e68 · 007f2e68
Commit 007f2e68 authored Apr 08, 2026 by 雍大凯
20 changed files
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/__init__.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/__init__.py
+from .group_sampler import DistributedGroupSampler
+from .distributed_sampler import DistributedSampler
+from .sampler import SAMPLER, build_sampler
+
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py
+import math
+
+import torch
+from torch.utils.data import DistributedSampler as _DistributedSampler
+from .sampler import SAMPLER
+
+
+@SAMPLER.register_module()
+class DistributedSampler(_DistributedSampler):
+    """Distributed sampler that gives every rank one contiguous index range.
+
+    The index list ``0 .. len(dataset) - 1`` is padded (by repetition) up to
+    ``total_size`` and then cut into ``num_replicas`` consecutive chunks;
+    rank ``r`` receives chunk ``r``. Shuffling is not implemented: iterating
+    a sampler built with ``shuffle=True`` raises ``AssertionError``.
+
+    Args:
+        dataset: Dataset to sample from; only ``len(dataset)`` is read here.
+        num_replicas (int, optional): Forwarded to the PyTorch base sampler.
+        rank (int, optional): Forwarded to the PyTorch base sampler.
+        shuffle (bool): Forwarded to the PyTorch base sampler. Must be
+            ``False`` for iteration to succeed.
+        seed (int, optional): Stored as ``self.seed``; ``None`` becomes 0.
+    """
+
+    def __init__(self,
+                 dataset=None,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=True,
+                 seed=0):
+        super().__init__(
+            dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
+        # for the compatibility from PyTorch 1.3+
+        self.seed = seed if seed is not None else 0
+
+    def __iter__(self):
+        """Return an iterator over this rank's contiguous slice of indices.
+
+        Raises:
+            AssertionError: If the sampler was created with ``shuffle=True``.
+        """
+        if self.shuffle:
+            # Raised explicitly rather than via `assert False`: asserts are
+            # stripped under `python -O`, which would leave `indices` unbound.
+            raise AssertionError(
+                'DistributedSampler does not support shuffle=True; '
+                'construct it with shuffle=False.')
+
+        indices = list(range(len(self.dataset)))
+        if not indices:
+            # Nothing to pad from; avoids dividing by zero below.
+            return iter(indices)
+
+        # add extra samples to make it evenly divisible
+        # in case that indices is shorter than half of total_size
+        repeats = math.ceil(self.total_size / len(indices))
+        indices = (indices * repeats)[:self.total_size]
+        assert len(indices) == self.total_size
+
+        # subsample: a consecutive block per rank instead of the interleaved
+        # split indices[self.rank:self.total_size:self.num_replicas]
+        per_replicas = self.total_size // self.num_replicas
+        start = self.rank * per_replicas
+        indices = indices[start:start + per_replicas]
+        assert len(indices) == self.num_samples
+
+        return iter(indices)
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+import torch
+from mmcv.runner import get_dist_info
+from torch.utils.data import Sampler
+from .sampler import SAMPLER
+import random
+from IPython import embed
+
+
+@SAMPLER.register_module()
+class DistributedGroupSampler(Sampler):
+    """Group-aware sampler for distributed training.
+
+    Samples are grouped by the integer array ``dataset.flag``. On every
+    iteration each non-empty group is shuffled and padded (by repetition) to a
+    multiple of ``samples_per_gpu * num_replicas``; the concatenated list is
+    then shuffled in chunks of ``samples_per_gpu`` and the chunk range that
+    belongs to ``rank`` is returned.
+
+    .. note::
+        Dataset is assumed to be of constant size.
+
+    Arguments:
+        dataset: Dataset used for sampling. Must expose a ``flag`` attribute.
+        samples_per_gpu (int): Chunk size used for padding and shuffling.
+        num_replicas (optional): Number of processes participating in
+            distributed training. Defaults to the value from
+            ``get_dist_info()``.
+        rank (optional): Rank of the current process within num_replicas.
+            Defaults to the value from ``get_dist_info()``.
+        seed (int, optional): Base seed; the generator is seeded with
+            ``epoch + seed`` so it should be identical across all processes.
+            ``None`` is treated as 0. Default: 0.
+    """
+
+    def __init__(self,
+                 dataset,
+                 samples_per_gpu=1,
+                 num_replicas=None,
+                 rank=None,
+                 seed=0):
+        default_rank, default_world_size = get_dist_info()
+
+        self.dataset = dataset
+        self.samples_per_gpu = samples_per_gpu
+        self.num_replicas = (
+            default_world_size if num_replicas is None else num_replicas)
+        self.rank = default_rank if rank is None else rank
+        self.epoch = 0
+        self.seed = 0 if seed is None else seed
+
+        assert hasattr(self.dataset, 'flag')
+        self.flag = self.dataset.flag
+        self.group_sizes = np.bincount(self.flag)
+
+        # Per-replica sample count: every group is rounded up to a whole
+        # number of `samples_per_gpu` chunks per replica.
+        self.num_samples = sum(
+            int(
+                math.ceil(group_size * 1.0 / self.samples_per_gpu /
+                          self.num_replicas)) * self.samples_per_gpu
+            for group_size in self.group_sizes)
+        self.total_size = self.num_samples * self.num_replicas
+
+    def __iter__(self):
+        """Return an iterator over the indices assigned to this rank."""
+        # deterministic for a given (epoch, seed) pair
+        rng = torch.Generator()
+        rng.manual_seed(self.epoch + self.seed)
+
+        chunk = self.samples_per_gpu
+        padded = []
+        for group_id, size in enumerate(self.group_sizes):
+            if size <= 0:
+                continue
+            members = np.where(self.flag == group_id)[0]
+            assert len(members) == size
+            # .numpy() is kept to avoid a bug when selecting indices in
+            # parrots.
+            # TODO: check whether torch.randperm() can be replaced by
+            # numpy.random.permutation().
+            order = list(torch.randperm(int(size), generator=rng).numpy())
+            shuffled = members[order].tolist()
+
+            padded_len = int(
+                math.ceil(size * 1.0 / chunk / self.num_replicas)
+            ) * chunk * self.num_replicas
+            extra = padded_len - len(shuffled)
+            # pad by repeating the shuffled group, then a partial copy
+            repeats, remainder = divmod(int(extra), int(size))
+            padded.extend(
+                shuffled + shuffled * repeats + shuffled[:remainder])
+
+        assert len(padded) == self.total_size
+
+        # shuffle whole chunks so samples of one chunk stay adjacent
+        chunk_order = torch.randperm(
+            len(padded) // chunk, generator=rng).tolist()
+        reordered = []
+        for chunk_idx in chunk_order:
+            reordered.extend(padded[chunk_idx * chunk:(chunk_idx + 1) * chunk])
+
+        # subsample
+        start = self.num_samples * self.rank
+        selected = reordered[start:start + self.num_samples]
+        assert len(selected) == self.num_samples
+
+        return iter(selected)
+
+    def __len__(self):
+        """Number of samples this rank yields per epoch."""
+        return self.num_samples
+
+    def set_epoch(self, epoch):
+        """Record the epoch used to seed the next ``__iter__`` call."""
+        self.epoch = epoch
+
+
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/sampler.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/datasets/samplers/sampler.py
+from mmcv.utils.registry import Registry, build_from_cfg
+
+SAMPLER = Registry('sampler')
+
+
+def build_sampler(cfg, default_args):
+    """Build a sampler from ``cfg`` using the ``SAMPLER`` registry.
+
+    Both arguments are passed straight through to ``build_from_cfg``.
+    """
+    sampler = build_from_cfg(cfg, SAMPLER, default_args)
+    return sampler
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/__init__.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/__init__.py
+from .modeling import *
\ No newline at end of file
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/__init__.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/__init__.py
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/nuscenes.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/nuscenes.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+#import functools
+from collections import OrderedDict
+
+import numpy as np
+import seaborn as sns
+from torch.utils.data import Dataset
+from tqdm import tqdm
+
+#from detectron2.data import MetadataCatalog
+from detectron2.structures.boxes import BoxMode
+from nuscenes.eval.detection.utils import category_to_detection_name
+from nuscenes.nuscenes import NuScenes
+from nuscenes.utils.splits import create_splits_scenes
+
+#from tridet.data import collect_dataset_dicts
+from projects.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
+from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
+from projects.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
+from projects.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color
+
+#  https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
+#     - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
+#     - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
+#     - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
+#         tracker if required
+# Maps each supported dataset name to the nuScenes version string handed to
+# the `NuScenes` constructor.
+DATASET_NAME_TO_VERSION = {
+    "nusc_train": "v1.0-trainval",
+    "nusc_val": "v1.0-trainval",
+    "nusc_val-subsample-8": "v1.0-trainval",
+    "nusc_trainval": "v1.0-trainval",
+    "nusc_test": "v1.0-test",
+    "nusc_mini_train": "v1.0-mini",
+    "nusc_mini_val": "v1.0-mini",
+}
+
+# Default set of camera channels kept when building the item list.
+CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')
+
+# Attribute name -> integer id. Ids restart at 0 within each family
+# (vehicle / pedestrian / cycle), so they are only unique per family.
+ATTRIBUTE_IDS = {
+    'vehicle.moving': 0,
+    'vehicle.parked': 1,
+    'vehicle.stopped': 2,
+    'pedestrian.moving': 0,
+    'pedestrian.standing': 1,
+    'pedestrian.sitting_lying_down': 2,
+    'cycle.with_rider': 0,
+    'cycle.without_rider': 1,
+}
+
+# Detection category name -> contiguous class id.
+CATEGORY_IDS = OrderedDict({
+    'barrier': 0,
+    'bicycle': 1,
+    'bus': 2,
+    'car': 3,
+    'construction_vehicle': 4,
+    'motorcycle': 5,
+    'pedestrian': 6,
+    'traffic_cone': 7,
+    'trailer': 8,
+    'truck': 9,
+})
+
+# Ten colours from seaborn's "bright" palette, converted with
+# `float_to_uint8_color`, and the per-category colour assignment.
+COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
+COLORMAP = OrderedDict({
+    'barrier': COLORS[8],  # yellow
+    'bicycle': COLORS[0],  # blue
+    'bus': COLORS[6],  # pink
+    'car': COLORS[2],  # green
+    'construction_vehicle': COLORS[7],  # gray
+    'motorcycle': COLORS[4],  # purple
+    'pedestrian': COLORS[1],  # orange
+    'traffic_cone': COLORS[3],  # red
+    'trailer': COLORS[9],  # skyblue
+    'truck': COLORS[5],  # brown
+})
+
+# Largest number of attributes in any family above; also used as the
+# "no attribute" id assigned to annotations without an attribute token.
+MAX_NUM_ATTRIBUTES = 3
+
+
+def _compute_iou(box1, box2):
+    """Intersection-over-union of two axis-aligned boxes.
+
+    Parameters
+    ----------
+    box1, box2:
+        (x1, y1, x2, y2)
+
+    Returns
+    -------
+    ``0.`` when the boxes do not overlap with positive area, otherwise
+    ``intersection / (area1 + area2 - intersection)``.
+    """
+    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
+    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])
+
+    # Empty or degenerate overlap.
+    if left >= right or top >= bottom:
+        return 0.
+
+    overlap = (right - left) * (bottom - top)
+    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
+    union = area1 + area2 - overlap
+    return overlap / union
+
+
+class NuscenesDataset(Dataset):
+    """Per-camera-image view of a nuScenes split.
+
+    One item corresponds to one camera ``sample_data`` record of one sample.
+    Items are returned as an ``OrderedDict`` holding the file name, image
+    size, ids, flattened intrinsics, sensor pose / extrinsics, ego speed and
+    the filtered instance annotations.
+
+    Args:
+        name (str): A key of ``DATASET_NAME_TO_VERSION``.
+        data_root (str): Passed to ``NuScenes`` as ``dataroot``.
+        datum_names: Sensor channels to keep; others are skipped.
+        min_num_lidar_points (int): Annotations with fewer lidar + radar
+            points than this are dropped.
+        min_box_visibility (float): Annotations whose projected 2D box has a
+            smaller IoU than this with its image-clipped version are dropped.
+        **unused: Accepted and ignored.
+    """
+
+    def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
+        self.data_root = data_root
+        assert name in DATASET_NAME_TO_VERSION
+        version = DATASET_NAME_TO_VERSION[name]
+        self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)
+
+        self.datum_names = datum_names
+        self.min_num_lidar_points = min_num_lidar_points
+        self.min_box_visibility = min_box_visibility
+
+        self.dataset_item_info = self._build_dataset_item_info(name)
+
+        # Index instance tokens to their IDs
+        self._instance_token_to_id = self._index_instance_tokens()
+
+        # Construct the mapping from datum_token (image id) to index
+        print("Generating the mapping from image id to idx...")
+        self.datumtoken2idx = {item[0]: idx for idx, item in enumerate(self.dataset_item_info)}
+        print("Done.")
+
+    def _build_dataset_item_info(self, name):
+        """Collect one tuple per kept camera image in the split.
+
+        Returns:
+            list[tuple]: ``(datum_token, sample_token, scene_name,
+            sample_idx, datum_name)`` for every kept image.
+        """
+        scenes_in_split = self._get_split_scenes(name)
+        subsample = name.endswith('subsample-8')
+
+        dataset_items = []
+        for _, scene_token in tqdm(scenes_in_split):
+            scene = self.nusc.get('scene', scene_token)
+            next_token = scene['first_sample_token']
+            for sample_idx in range(scene['nbr_samples']):
+                sample_token = next_token
+                sample = self.nusc.get('sample', sample_token)
+                # Advance the linked list on EVERY iteration. Skipping this
+                # for sub-sampled-out samples would leave the token out of
+                # step with `sample_idx` and select the wrong samples.
+                next_token = sample['next']
+
+                if subsample and sample_idx % 8 > 0:
+                    # Sample-level subsampling.
+                    continue
+
+                for datum_name, datum_token in sample['data'].items():
+                    if datum_name not in self.datum_names:
+                        continue
+                    dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
+        return dataset_items
+
+    def _get_split_scenes(self, name):
+        """Return ``(scene_name, scene_token)`` pairs for dataset ``name``."""
+        scenes_in_splits = create_splits_scenes()
+        if name == "nusc_trainval":
+            scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
+        elif name == "nusc_val-subsample-8":
+            scenes = scenes_in_splits["val"]
+        else:
+            assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
+            split = name[5:]
+            assert split in scenes_in_splits, f"Invalid dataset: {split}"
+            scenes = scenes_in_splits[split]
+
+        # Mapping from scene name to token.
+        name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
+        return [(scene_name, name_to_token[scene_name]) for scene_name in scenes]
+
+    def __len__(self):
+        return len(self.dataset_item_info)
+
+    def _build_id(self, scene_name, sample_idx, datum_name):
+        """Return ``(image_id, sample_id)`` strings for one image."""
+        sample_id = f"{scene_name}_{sample_idx:03d}"
+        image_id = f"{sample_id}_{datum_name}"
+        return image_id, sample_id
+
+    def _index_instance_tokens(self):
+        """Index instance tokens for uniquely identifying instances across samples"""
+        instance_token_to_id = {}
+        for record in self.nusc.sample_annotation:
+            instance_token = record['instance_token']
+            if instance_token not in instance_token_to_id:
+                next_instance_id = len(instance_token_to_id)
+                instance_token_to_id[instance_token] = next_instance_id
+        return instance_token_to_id
+
+    def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
+        """Convert the boxes of one image into annotation dicts.
+
+        Args:
+            annotation_list: Boxes as returned by ``nusc.get_sample_data``;
+                each must expose ``token``, ``orientation``, ``center`` and
+                ``wlh``.
+            K: Camera intrinsics used to project the 3D box corners.
+            image_shape: ``(height, width)`` of the image.
+            pose_WS: Currently unused; kept for interface compatibility.
+
+        Returns:
+            list[OrderedDict]: One dict per kept annotation with keys
+            ``category_id``, ``bbox3d``, ``bbox``, ``bbox_mode``,
+            ``track_id``, ``attribute_id`` and ``speed``.
+        """
+        annotations = []
+        for _ann in annotation_list:
+            ann = self.nusc.get('sample_annotation', _ann.token)
+            if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
+                continue
+            annotation = OrderedDict()
+
+            # --------
+            # Category
+            # --------
+            category = category_to_detection_name(ann['category_name'])
+            if category is None:
+                continue
+            annotation['category_id'] = CATEGORY_IDS[category]
+
+            # ------
+            # 3D box
+            # ------
+            # NOTE: ann['rotation'], ann['translation'] is in global frame,
+            # whereas the box built here comes from `_ann`.
+            bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
+            annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]
+
+            # --------------------------------------
+            # 2D box -- project 8 corners of 3D bbox
+            # --------------------------------------
+            corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
+            left, top = corners[:, 0].min(), corners[:, 1].min()
+            right, bottom = corners[:, 0].max(), corners[:, 1].max()
+
+            # Clip the projected box to the image.
+            x1 = max(0, left)
+            y1 = max(0, top)
+            x2 = min(image_shape[1], right)
+            y2 = min(image_shape[0], bottom)
+
+            # IoU between the unclipped and clipped box acts as a visibility
+            # score.
+            iou = _compute_iou([left, top, right, bottom], [x1, y1, x2, y2])
+            if iou < self.min_box_visibility:
+                continue
+
+            annotation['bbox'] = [x1, y1, x2, y2]
+            annotation['bbox_mode'] = BoxMode.XYXY_ABS
+
+            # --------
+            # Track ID
+            # --------
+            annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]
+
+            # ---------
+            # Attribute
+            # ---------
+            attr_tokens = ann['attribute_tokens']
+            assert len(attr_tokens) < 2  # NOTE: Allow only single attribute.
+            attribute_id = MAX_NUM_ATTRIBUTES  # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
+            if attr_tokens:
+                attribute = self.nusc.get('attribute', attr_tokens[0])['name']
+                attribute_id = ATTRIBUTE_IDS[attribute]
+            annotation['attribute_id'] = attribute_id
+
+            # -----
+            # Speed
+            # -----
+            vel_global = self.nusc.box_velocity(ann['token'])
+            speed = np.linalg.norm(vel_global)  # NOTE: This can be NaN.
+            annotation['speed'] = speed
+
+            annotations.append(annotation)
+
+        return annotations
+
+    def _get_ego_velocity(self, current, max_time_diff=1.5):
+        """Velocity of ego-vehicle in m/s.
+
+        Uses the ego poses of the previous and/or next ``sample_data`` record
+        of ``current``. Returns a vector of NaNs when neither neighbour
+        exists or when the time gap exceeds ``max_time_diff`` (doubled when
+        both neighbours are used).
+        """
+        has_prev = current['prev'] != ''
+        has_next = current['next'] != ''
+
+        # Cannot estimate velocity for a single annotation.
+        if not has_prev and not has_next:
+            return np.array([np.nan, np.nan, np.nan])
+
+        if has_prev:
+            first = self.nusc.get('sample_data', current['prev'])
+        else:
+            first = current
+
+        if has_next:
+            last = self.nusc.get('sample_data', current['next'])
+        else:
+            last = current
+
+        pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
+        pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
+        pos_diff = np.float32(pos_last) - np.float32(pos_first)
+
+        # Timestamps are scaled by 1e-6 before differencing.
+        time_last = 1e-6 * last['timestamp']
+        time_first = 1e-6 * first['timestamp']
+        time_diff = time_last - time_first
+
+        if has_next and has_prev:
+            # If doing centered difference, allow for up to double the max_time_diff.
+            max_time_diff *= 2
+
+        if time_diff > max_time_diff:
+            # If time_diff is too big, don't return an estimate.
+            return np.array([np.nan, np.nan, np.nan])
+        else:
+            return pos_diff / time_diff
+
+    def _build_d2_dict(self, datum_token, datum, sample_token, image_id, sample_id):
+        """Assemble the output record shared by both item accessors."""
+        filename, _annotations, K = self.nusc.get_sample_data(datum_token)
+        height, width = datum['height'], datum['width']
+        d2_dict = OrderedDict(
+            file_name=filename,
+            height=height,
+            width=width,
+            image_id=image_id,
+            sample_id=sample_id,
+            sample_token=sample_token
+        )
+
+        # Intrinsics
+        d2_dict['intrinsics'] = list(K.flatten())
+
+        # Get pose of the sensor (S) from vehicle (V) frame
+        _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
+        pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
+
+        # Get ego-pose of the vehicle (V) from global/world (W) frame
+        _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
+        pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
+        pose_WS = pose_WV * pose_VS
+
+        d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
+        d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
+
+        d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
+
+        d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
+
+        return d2_dict
+
+    def __getitem__(self, idx):
+        """Return the record for item ``idx``; the datum must be a key frame."""
+        datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
+        datum = self.nusc.get('sample_data', datum_token)
+        assert datum['is_key_frame']
+
+        image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
+        return self._build_d2_dict(datum_token, datum, sample_token, image_id, sample_id)
+
+    def getitem_by_datumtoken(self, datum_token):
+        """Return the record for an arbitrary ``sample_data`` token.
+
+        The token is looked up directly rather than through
+        ``datumtoken2idx``; ``image_id`` and ``sample_id`` are set to 0 and
+        no key-frame check is made.
+        """
+        datum = self.nusc.get('sample_data', datum_token)
+        return self._build_d2_dict(datum_token, datum, datum['sample_token'], 0, 0)
\ No newline at end of file
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/transform_utils.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/datasets/transform_utils.py
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# Adapted from detectron2:
+#   https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py
+import numpy as np
+import torch
+
+from detectron2.data import transforms as T
+from detectron2.structures import Boxes, BoxMode, Instances
+
+from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+
+__all__ = ["transform_instance_annotations", "annotations_to_instances"]
+
+
+def transform_instance_annotations(
+    annotation,
+    transforms,
+    image_size,
+):
+    """Apply ``transforms`` to the 2D / 3D boxes of a single instance.
+
+    Adapted from:
+        https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254
+
+    Differences from the detectron2 original:
+        - The 2D bounding box ("bbox" field) is optional here.
+        - An optional 3D bounding box ("bbox3d" field) is supported.
+
+    Args:
+        annotation (dict): instance annotations of a single instance.
+            It is modified in-place.
+        transforms (TransformList or list[Transform]): a plain list / tuple
+            is wrapped in a ``TransformList`` first.
+        image_size (tuple): the height, width of the transformed image.
+
+    Returns:
+        dict:
+            the same input dict. If present, "bbox" is transformed with
+            ``transforms.apply_box``, clipped to the image, and "bbox_mode"
+            is set to XYXY_ABS; if present, "bbox3d" is transformed with
+            ``transforms.apply_box3d``.
+    """
+    if isinstance(transforms, (tuple, list)):
+        transforms = T.TransformList(transforms)
+
+    # (dennis.park) Here 2D bounding box is optional.
+    if "bbox" in annotation:
+        assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not."
+        # bbox is 1d (per-instance bounding box)
+        xyxy = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
+        xyxy = transforms.apply_box(np.array([xyxy]))[0]
+        # clip transformed bbox to image size; the upper bound is
+        # (h, w, h, w) reversed, i.e. (w, h, w, h)
+        upper_bound = list(image_size + image_size)[::-1]
+        annotation["bbox"] = np.minimum(xyxy.clip(min=0), upper_bound)
+        annotation["bbox_mode"] = BoxMode.XYXY_ABS
+
+    # Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed.
+    if "bbox3d" in annotation:
+        annotation['bbox3d'] = transforms.apply_box3d(np.array(annotation["bbox3d"]))
+
+    return annotation
+
+
+def _create_empty_instances(image_size):
+    """Return an ``Instances`` for ``image_size`` with empty ground-truth fields.
+
+    The fields set are ``gt_boxes``, ``gt_classes`` and ``gt_boxes3d`` (the
+    latter built from no vectors and a 3x3 identity matrix).
+    """
+    empty = Instances(image_size)
+
+    empty.gt_boxes = Boxes([])
+    empty.gt_classes = torch.tensor([], dtype=torch.int64)
+    identity = torch.eye(3, dtype=torch.float32)
+    empty.gt_boxes3d = Boxes3D.from_vectors([], identity)
+
+    return empty
+
+
+def annotations_to_instances(
+    annos,
+    image_size,
+    intrinsics=None,
+):
+    """
+    Create an :class:`Instances` object used by the models,
+    from instance annotations in the dataset dict.
+
+    Optional fields ("bbox3d", "attribute_id", "speed") are detected from the
+    FIRST annotation only; if it has one, every annotation must have it.
+
+    Args:
+        annos (list[dict]): a list of instance annotations in one image, each
+            element for one instance. Each must contain "bbox", "bbox_mode"
+            and "category_id".
+        image_size (tuple): height, width
+        intrinsics: camera intrinsics forwarded to ``Boxes3D.from_vectors``;
+            required when the annotations carry "bbox3d".
+
+    Returns:
+        Instances:
+            It will contain fields "gt_boxes", "gt_classes", and, when the
+            annotations provide them, "gt_boxes3d", "gt_attributes" and
+            "gt_speeds".
+
+    Raises:
+        ValueError: if the numbers of 3D and 2D boxes differ.
+    """
+    if len(annos) == 0:
+        return _create_empty_instances(image_size)
+
+    first = annos[0]
+
+    boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
+    target = Instances(image_size)
+    target.gt_boxes = Boxes(boxes)
+
+    classes = torch.tensor([obj["category_id"] for obj in annos], dtype=torch.int64)
+    target.gt_classes = classes
+
+    # Lengths of every per-instance field actually built, for the final
+    # consistency check.
+    field_lengths = [len(boxes), len(classes)]
+
+    if "bbox3d" in first:
+        assert intrinsics is not None
+        target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics)
+        if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]:
+            raise ValueError(
+                f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}."
+            )
+
+    # NOTE: add nuscenes attributes here
+    # NOTE: instances will be filtered later
+    # NuScenes attributes
+    if "attribute_id" in first:
+        attributes = [obj["attribute_id"] for obj in annos]
+        target.gt_attributes = torch.tensor(attributes, dtype=torch.int64)
+        field_lengths.append(len(attributes))
+
+    # Speed (magnitude of velocity)
+    if "speed" in first:
+        speeds = [obj["speed"] for obj in annos]
+        target.gt_speeds = torch.tensor(speeds, dtype=torch.float32)
+        field_lengths.append(len(speeds))
+
+    # Only compare fields that exist: `attributes` / `speeds` are unbound for
+    # annotations without those keys, so referencing them unconditionally
+    # would raise UnboundLocalError.
+    assert len(set(field_lengths)) == 1, \
+        'the numbers of annotations should be the same'
+    return target
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/iou_loss.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/iou_loss.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# Adapted from AdelaiDet:
+#   https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py
+import torch
+from torch import nn
+
+
+class IOULoss(nn.Module):
+    """
+    Intersection over Union (IoU) loss which supports three
+    different IoU computations, selected by ``loc_loss_type``:
+
+    * ``'iou'``: ``-log(IoU)``
+    * ``'linear_iou'``: ``1 - IoU``
+    * ``'giou'``: ``1 - gIoU``
+
+    Box widths / heights are computed as ``left + right`` / ``top + bottom``,
+    i.e. the four columns are treated as extents rather than corner
+    coordinates (presumably distances from a shared reference point --
+    confirm against the caller).
+    """
+    def __init__(self, loc_loss_type='iou'):
+        super().__init__()
+        self.loc_loss_type = loc_loss_type
+
+    def forward(self, pred, target, weight=None):
+        """
+        Args:
+            pred: Nx4 predicted bounding boxes
+            target: Nx4 target bounding boxes
+            weight: N loss weight for each instance
+
+        Returns:
+            The summed (optionally weighted) loss.
+
+        Raises:
+            NotImplementedError: for an unknown ``loc_loss_type``.
+        """
+        p_left, p_top, p_right, p_bottom = (pred[:, k] for k in range(4))
+        t_left, t_top, t_right, t_bottom = (target[:, k] for k in range(4))
+
+        target_area = (t_left + t_right) * (t_top + t_bottom)
+        pred_area = (p_left + p_right) * (p_top + p_bottom)
+
+        # Overlap extents (min) and enclosing-box extents (max).
+        inter_w = torch.min(p_left, t_left) + torch.min(p_right, t_right)
+        inter_h = torch.min(p_bottom, t_bottom) + torch.min(p_top, t_top)
+        enclose_w = torch.max(p_left, t_left) + torch.max(p_right, t_right)
+        enclose_h = torch.max(p_bottom, t_bottom) + torch.max(p_top, t_top)
+
+        area_intersect = inter_w * inter_h
+        area_union = target_area + pred_area - area_intersect
+
+        # +1 smoothing on numerator and denominator.
+        ious = (area_intersect + 1.0) / (area_union + 1.0)
+
+        loss_type = self.loc_loss_type
+        if loss_type == 'iou':
+            losses = -torch.log(ious)
+        elif loss_type == 'linear_iou':
+            losses = 1 - ious
+        elif loss_type == 'giou':
+            enclose_area = enclose_w * enclose_h
+            gious = ious - (enclose_area - area_union) / enclose_area
+            losses = 1 - gious
+        else:
+            raise NotImplementedError
+
+        if weight is None:
+            return losses.sum()
+        return (losses * weight).sum()
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/normalization.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/normalization.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# Adapted from AdelaiDet
+#   https://github.com/aim-uofa/AdelaiDet/
+import logging
+
+import torch
+from torch import nn
+
+LOG = logging.getLogger(__name__)
+
+
+class Scale(nn.Module):
+    """Multiply the input by a single learnable scalar."""
+
+    def __init__(self, init_value=1.0):
+        super().__init__()
+        initial = torch.FloatTensor([init_value])
+        self.scale = nn.Parameter(initial)
+
+    def forward(self, input):
+        """Return ``input * self.scale``."""
+        scaled = input * self.scale
+        return scaled
+
+
+class Offset(nn.Module):
+    """Add a single learnable scalar bias to the input."""
+
+    def __init__(self, init_value=0.):
+        super().__init__()
+        initial = torch.FloatTensor([init_value])
+        self.bias = nn.Parameter(initial)
+
+    def forward(self, input):
+        """Return ``input + self.bias``."""
+        shifted = input + self.bias
+        return shifted
+
+
+class ModuleListDial(nn.ModuleList):
+    """Module list that applies its members round-robin, one per call.
+
+    Each ``forward`` call runs the module at ``cur_position`` and then moves
+    the position forward by one, wrapping to 0 after the last module.
+    """
+
+    def __init__(self, modules=None):
+        super().__init__(modules)
+        self.cur_position = 0
+
+    def forward(self, x):
+        """Apply the current module to ``x`` and advance the dial."""
+        current = self[self.cur_position]
+        output = current(x)
+        # Advance only after a successful call; wrap at the end of the list.
+        following = self.cur_position + 1
+        self.cur_position = 0 if following >= len(self) else following
+        return output
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# Adapted from fvcore:
+#   https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py
+
+import torch
+
+
+def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor:
+    """
+    Smooth L1 loss defined in the Fast R-CNN paper as:
+
+                  | 0.5 * x ** 2 / beta   if abs(x) < beta
+    smoothl1(x) = |
+                  | abs(x) - 0.5 * beta   otherwise,
+
+    where x = input - target.
+
+    Smooth L1 loss is related to Huber loss, which is defined as:
+
+                | 0.5 * x ** 2                  if abs(x) < beta
+     huber(x) = |
+                | beta * (abs(x) - 0.5 * beta)  otherwise
+
+    Smooth L1 loss is equal to huber(x) / beta. This leads to the following
+    differences:
+
+     - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
+       converges to a constant 0 loss.
+     - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
+       converges to L2 loss.
+     - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
+       slope of 1. For Huber loss, the slope of the L1 segment is beta.
+
+    Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
+    portion replaced with a quadratic function such that at abs(x) = beta, its
+    slope is 1. The quadratic segment smooths the L1 loss near x = 0.
+
+    Args:
+        input (Tensor): input tensor of any shape
+        target (Tensor): target value tensor with the same shape as input
+        beta (float): L1 to L2 change point.
+            For beta values < 1e-5, L1 loss is computed.
+        reduction: 'none' | 'mean' | 'sum'
+                 'none': No reduction will be applied to the output.
+                 'mean': The output will be averaged.
+                 'sum': The output will be summed.
+                 Any other value also leaves the output unreduced.
+
+    Returns:
+        The loss with the reduction option applied.
+
+    Note:
+        PyTorch's builtin "Smooth L1 loss" implementation does not actually
+        implement Smooth L1 loss, nor does it implement Huber loss. It implements
+        the special case of both in which they are equal (beta=1).
+        See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
+    """
+    # (dennis.park) Make it work with mixed precision training.
+    beta = torch.as_tensor(beta).to(input.dtype)
+    if beta < 1e-5:
+        # if beta == 0, then torch.where will result in nan gradients when
+        # the chain rule is applied due to pytorch implementation details
+        # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
+        # zeros, rather than "no gradient"). To avoid this issue, we define
+        # small values of beta to be exactly l1 loss.
+        loss = torch.abs(input - target)
+    else:
+        n = torch.abs(input - target)
+        cond = n < beta
+        # The quadratic segment is divided by beta so that it meets the linear
+        # segment (value 0.5 * beta, slope 1) at n == beta, as documented above.
+        a = 0.5 * n**2 / beta
+        b = n - 0.5 * beta
+        # Both branches are cast to the input dtype so torch.where sees
+        # matching operand types under mixed precision.
+        a, b = a.to(input.dtype), b.to(input.dtype)
+        loss = torch.where(cond, a, b)
+
+    if reduction == "mean":
+        loss = loss.mean()
+    elif reduction == "sum":
+        loss = loss.sum()
+    return loss
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/__init__.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/__init__.py
+from .nuscenes_dd3d import NuscenesDD3D
\ No newline at end of file
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/core.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/core.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import torch
+from torch import nn
+
+#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
+from detectron2.structures import Instances
+from detectron2.layers import ShapeSpec
+from mmcv.runner import force_fp32
+
+from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss
+from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss
+#from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate
+from .prepare_targets import DD3DTargetPreparer
+#from tridet.modeling.feature_extractor import build_feature_extractor
+from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
+from projects.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level
+
+
+#@META_ARCH_REGISTRY.register()
+class DD3D(nn.Module):
+    """DD3D detection heads and losses operating on externally computed features.
+
+    This port owns no backbone and does no image normalization: ``forward``
+    receives the per-level feature maps directly. It builds an FCOS 2D head and
+    loss, optionally an FCOS 3D head and loss, and a target preparer. Only the
+    training path is implemented; calling ``forward`` in eval mode raises
+    ``NotImplementedError``.
+
+    Args:
+        num_classes: number of classes, forwarded to heads, losses and the
+            target preparer.
+        in_channels: channel count used for every feature level.
+        strides: iterable with one stride per feature level.
+        fcos2d_cfg: extra keyword arguments for ``FCOS2DHead``.
+        fcos2d_loss_cfg: extra keyword arguments for ``FCOS2DLoss``.
+        fcos3d_cfg: extra keyword arguments for ``FCOS3DHead``.
+        fcos3d_loss_cfg: extra keyword arguments for ``FCOS3DLoss``.
+        target_assign_cfg: extra keyword arguments for ``DD3DTargetPreparer``.
+        box3d_on: when False, the 3D head and loss are not built and only the
+            2D losses are returned.
+        feature_locations_offset: forwarded as ``offset`` when computing the
+            per-level feature locations.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 strides,
+                 fcos2d_cfg=dict(),
+                 fcos2d_loss_cfg=dict(),
+                 fcos3d_cfg=dict(),
+                 fcos3d_loss_cfg=dict(),
+                 target_assign_cfg=dict(),
+                 box3d_on=True,
+                 feature_locations_offset="none"):
+        super().__init__()
+        # NOTE: do not need backbone
+        # self.backbone = build_feature_extractor(cfg)
+        # backbone_output_shape = self.backbone.output_shape()
+        # self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys())
+
+        # The cfg dict defaults above are only unpacked with ``**`` and never
+        # mutated here, so sharing the default instances across calls is harmless.
+        self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides]
+
+        self.feature_locations_offset = feature_locations_offset
+
+        self.fcos2d_head = FCOS2DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
+                                     **fcos2d_cfg)
+        self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg)
+        # NOTE: inference later
+        # self.fcos2d_inference = FCOS2DInference(cfg)
+
+        if box3d_on:
+            self.fcos3d_head = FCOS3DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
+                                          **fcos3d_cfg)
+            self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg)
+            # NOTE: inference later
+            # self.fcos3d_inference = FCOS3DInference(cfg)
+            self.only_box2d = False
+        else:
+            self.only_box2d = True
+
+        self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes,
+                                                  input_shape=self.backbone_output_shape,
+                                                  box3d_on=box3d_on,
+                                                  **target_assign_cfg)
+
+        # NOTE: inference later
+        # self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS
+
+        # self.do_nms = cfg.DD3D.INFERENCE.DO_NMS
+        # self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS
+        # self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH
+
+        # nuScenes inference aggregates detections over all 6 cameras.
+        # self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE
+        self.num_classes = num_classes
+
+        # NOTE: do not need normalize
+        # self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
+        # self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
+
+    # NOTE:
+    # @property
+    # def device(self):
+    #     return self.pixel_mean.device
+
+    # def preprocess_image(self, x):
+    #     return (x - self.pixel_mean) / self.pixel_std
+
+    # ``apply_to`` must be a tuple: with the bare string 'features' the decorator's
+    # membership test degenerates into a substring check on argument names.
+    @force_fp32(apply_to=('features',))
+    def forward(self, features, batched_inputs):
+        """Run the heads on ``features`` and return the training losses.
+
+        Args:
+            features: indexable sequence of per-level feature tensors; the last
+                two dimensions of each are used as its spatial size.
+            batched_inputs: list of per-sample dicts. If the first dict has an
+                ``'inv_intrinsics'`` key, those entries are stacked along a new
+                leading dimension; if it has an ``'instances'`` key, those
+                entries are used as ground truth. Both are moved to the device
+                of ``features[0]``.
+
+        Returns:
+            dict of losses: the 2D losses, plus the 3D losses unless the module
+            was built with ``box3d_on=False``.
+
+        Raises:
+            NotImplementedError: when the module is not in training mode.
+        """
+        # NOTE:
+        # images = [x["image"].to(self.device) for x in batched_inputs]
+        # images = [self.preprocess_image(x) for x in images]
+
+        # NOTE: directly use inv_intrinsics
+        # if 'intrinsics' in batched_inputs[0]:
+        #     intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
+        # else:
+        #     intrinsics = None
+        # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
+        if 'inv_intrinsics' in batched_inputs[0]:
+            inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
+            inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
+        else:
+            inv_intrinsics = None
+
+        # NOTE:
+        # gt_dense_depth = None
+        # if 'depth' in batched_inputs[0]:
+        #     gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
+        #     gt_dense_depth = ImageList.from_tensors(
+        #         gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
+        #     )
+
+        # NOTE: directly input feature
+        # features = self.backbone(images.tensor)
+        # features = [features[f] for f in self.in_features]
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
+        else:
+            gt_instances = None
+
+        locations = self.compute_locations(features)
+        logits, box2d_reg, centerness, _ = self.fcos2d_head(features)
+        if not self.only_box2d:
+            box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
+        # NOTE: directly use inv_intrinsics
+        # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
+
+        if self.training:
+            assert gt_instances is not None
+            feature_shapes = [x.shape[-2:] for x in features]
+            training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
+            # NOTE:
+            # if gt_dense_depth is not None:
+            #    training_targets.update({"dense_depth": gt_dense_depth})
+
+            losses = {}
+            fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
+            losses.update(fcos2d_loss)
+
+            if not self.only_box2d:
+                fcos3d_loss = self.fcos3d_loss(
+                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
+                    fcos2d_info, training_targets
+                )
+                losses.update(fcos3d_loss)
+            return losses
+        else:
+            # TODO: do not support inference now
+            raise NotImplementedError
+
+            # NOTE: everything below is unreachable. It is the upstream inference
+            # path, kept as a porting reference; it relies on names that are not
+            # defined in this module (``images``, ``nuscenes_sample_aggregate``)
+            # and on attributes whose assignment is commented out in ``__init__``.
+            pred_instances, fcos2d_info = self.fcos2d_inference(
+                logits, box2d_reg, centerness, locations, images.image_sizes
+            )
+            if not self.only_box2d:
+                # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place.
+                self.fcos3d_inference(
+                    box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
+                    fcos2d_info
+                )
+
+                # 3D score == 2D score x confidence.
+                score_key = "scores_3d"
+            else:
+                score_key = "scores"
+
+            # Transpose to "image-first", i.e. (B, L)
+            pred_instances = list(zip(*pred_instances))
+            pred_instances = [Instances.cat(instances) for instances in pred_instances]
+
+            # 2D NMS and pick top-K.
+            if self.do_nms:
+                pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
+
+            if not self.only_box2d and self.do_bev_nms:
+                # Bird-eye-view NMS.
+                dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
+                if 'pose' in batched_inputs[0]:
+                    poses = [x['pose'] for x in batched_inputs]
+                else:
+                    poses = [x['extrinsics'] for x in batched_inputs]
+                pred_instances = nuscenes_sample_aggregate(
+                    pred_instances,
+                    dummy_group_idxs,
+                    self.num_classes,
+                    poses,
+                    iou_threshold=self.bev_nms_iou_thresh,
+                    include_boxes3d_global=False
+                )
+
+            if self.postprocess_in_inference:
+                processed_results = []
+                for results_per_image, input_per_image, image_size in \
+                        zip(pred_instances, batched_inputs, images.image_sizes):
+                    height = input_per_image.get("height", image_size[0])
+                    width = input_per_image.get("width", image_size[1])
+                    r = resize_instances(results_per_image, height, width)
+                    processed_results.append({"instances": r})
+            else:
+                processed_results = [{"instances": x} for x in pred_instances]
+
+            return processed_results
+
+    def compute_locations(self, features):
+        """Return one locations tensor per feature level.
+
+        Each level's spatial size, dtype and device are taken from the feature
+        tensor, and its stride from ``self.backbone_output_shape``.
+        """
+        locations = []
+        in_strides = [x.stride for x in self.backbone_output_shape]
+        for level, feature in enumerate(features):
+            h, w = feature.size()[-2:]
+            locations_per_level = compute_locations_per_level(
+                h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset
+            )
+            locations.append(locations_per_level)
+        return locations
+
+    def forward_train(self, features, batched_inputs):
+        """Switch the module to training mode, then call ``forward``.
+
+        Note that ``self.train()`` is a side effect: the module stays in
+        training mode after this call returns.
+        """
+        self.train()
+        return self.forward(features, batched_inputs)
\ No newline at end of file
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import logging
+
+import torch
+import torch.nn as nn
+
+from projects.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss
+
+LOG = logging.getLogger(__name__)
+
+
+class DisentangledBox3DLoss(nn.Module):
+    """Disentangled corner loss for 3D boxes.
+
+    For each box component ("quat", "proj_ctr", "depth", "size") a copy of the
+    target boxes is made, that single component is replaced with the predicted
+    one, and a smooth-L1 loss is taken between the resulting corners and the
+    target corners.
+
+    Args:
+        smooth_l1_loss_beta: ``beta`` passed to ``smooth_l1_loss``.
+        max_loss_per_group: upper bound applied element-wise to each
+            component's corner loss before it is reduced.
+    """
+
+    def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
+        super().__init__()
+        self.smooth_l1_loss_beta = smooth_l1_loss_beta
+        self.max_loss_per_group = max_loss_per_group
+
+    def forward(self, box3d_pred, box3d_targets, locations, weights=None):
+        """Compute the per-component losses and the entangled corner distance.
+
+        Args:
+            box3d_pred: predicted boxes; must expose ``to``, ``corners`` and the
+                four component attributes.
+            box3d_targets: target boxes with the same interface plus ``clone``.
+            locations: unused by this method.
+            weights: optional per-box weights. When given, each box's mean
+                corner loss is multiplied by its weight and the products are
+                summed; otherwise the mean over all boxes is returned.
+
+        Returns:
+            A tuple ``(disentangled_losses, entangled_l1_dist)`` where the first
+            is a dict keyed ``"loss_box3d_<component>"`` and the second is the
+            detached per-box mean absolute corner difference between targets and
+            the full prediction.
+        """
+        box3d_pred = box3d_pred.to(torch.float32)
+        box3d_targets = box3d_targets.to(torch.float32)
+
+        target_corners = box3d_targets.corners
+
+        disentangled_losses = {}
+        for component_key in ["quat", "proj_ctr", "depth", "size"]:
+            disentangled_boxes = box3d_targets.clone()
+            setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
+            pred_corners = disentangled_boxes.to(torch.float32).corners
+
+            loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)
+
+            # Bound the loss. ``Tensor.clamp`` is out-of-place, so the result
+            # must be assigned back for the bound to take effect.
+            loss = loss.clamp(max=self.max_loss_per_group)
+
+            # reshape(-1, 24) groups the corner values per box
+            # (presumably 8 corners x 3 coordinates -- TODO confirm).
+            if weights is not None:
+                # loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
+                loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
+            else:
+                loss = loss.reshape(-1, 24).mean()
+
+            disentangled_losses["loss_box3d_" + component_key] = loss
+
+        entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)
+
+        return disentangled_losses, entangled_l1_dist
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/fcos2d.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/fcos2d.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+# Adapted from AdelaiDet:
+#   https://github.com/aim-uofa/AdelaiDet
+import torch
+from fvcore.nn import sigmoid_focal_loss
+from torch import nn
+from torch.nn import functional as F
+
+from detectron2.layers import Conv2d, batched_nms, cat, get_norm
+from detectron2.structures import Boxes, Instances
+from detectron2.utils.comm import get_world_size
+from mmcv.runner import force_fp32
+
+from projects.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss
+from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale
+from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
+
+INF = 100000000
+
+
+def compute_ctrness_targets(reg_targets):
+    """Compute centerness targets from 2D box regression targets.
+
+    Columns 0 and 2 of ``reg_targets`` form one pair and columns 1 and 3 the
+    other. For each row the result is
+    ``sqrt(min/max of the first pair * min/max of the second pair)``.
+    An empty input yields an empty 1-D tensor.
+    """
+    num_targets = len(reg_targets)
+    if num_targets == 0:
+        return reg_targets.new_zeros(num_targets)
+
+    horizontal = reg_targets[:, [0, 2]]
+    vertical = reg_targets[:, [1, 3]]
+    horizontal_ratio = horizontal.min(dim=-1)[0] / horizontal.max(dim=-1)[0]
+    vertical_ratio = vertical.min(dim=-1)[0] / vertical.max(dim=-1)[0]
+    return torch.sqrt(horizontal_ratio * vertical_ratio)
+
+
+class FCOS2DHead(nn.Module):
+    """FCOS 2D head: classification logits, 2D box regression and centerness.
+
+    Two conv towers (``cls_tower`` and ``box2d_tower``) are shared across all
+    feature levels. Three 3x3 conv predictors sit on top: ``cls_logits``
+    (``num_classes`` channels, fed by the cls tower), and ``box2d_reg``
+    (4 channels) and ``centerness`` (1 channel), both fed by the box2d tower.
+
+    Args:
+        num_classes: number of output channels of ``cls_logits``.
+        input_shape: per-level shape specs; ``.stride`` and ``.channels`` are
+            read. All levels must have the same channel count.
+        num_cls_convs: number of conv layers in the cls tower.
+        num_box_convs: number of conv layers in the box2d tower.
+        norm: normalization name. In "v1" only "BN" is implemented.
+        use_deformable: must be False; True raises ``ValueError``.
+        use_scale: when True, a learnable per-level ``Scale`` is applied to the
+            box regression output.
+        box2d_scale_init_factor: multiplied by each level's stride to get the
+            initial value of that level's ``Scale``.
+        version: "v1" or "v2"; selects how the towers are built and the
+            attribute name holding the scales.
+    """
+    def __init__(self, 
+                 num_classes, 
+                 input_shape,
+                 num_cls_convs=4,
+                 num_box_convs=4,
+                 norm='BN',
+                 use_deformable=False,
+                 use_scale=True,
+                 box2d_scale_init_factor=1.0,
+                 version='v2'):
+        super().__init__()
+
+        self.num_classes = num_classes
+        self.in_strides = [shape.stride for shape in input_shape]
+        self.num_levels = len(input_shape)
+
+        self.use_scale = use_scale
+        self.box2d_scale_init_factor = box2d_scale_init_factor
+
+        self._version = version
+
+        in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
+        in_channels = in_channels[0]
+
+        if use_deformable:
+            raise ValueError("Not supported yet.")
+
+        head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs}
+
+        # Build one tower per head; each is registered as '<head_name>_tower'.
+        for head_name, num_convs in head_configs.items():
+            tower = []
+            if self._version == "v1":
+                # v1: plain nn.Conv2d (with bias) -> optional norm -> ReLU as separate modules.
+                for _ in range(num_convs):
+                    conv_func = nn.Conv2d
+                    tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
+                    if norm == "GN":
+                        raise NotImplementedError()
+                    elif norm == "NaiveGN":
+                        raise NotImplementedError()
+                    elif norm == "BN":
+                        # One BatchNorm2d per level, selected in turn by ModuleListDial.
+                        tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)]))
+                    elif norm == "SyncBN":
+                        raise NotImplementedError()
+                    # Any other norm value falls through: no norm layer is added.
+                    tower.append(nn.ReLU())
+            elif self._version == "v2":
+                # v2: detectron2 Conv2d wrapper that bundles conv, norm and activation.
+                for _ in range(num_convs):
+                    if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
+                        # NOTE: need to add norm here!
+                        # Each FPN level has its own batchnorm layer.
+                        # NOTE: do not use dd3d train.py!
+                        # "BN" is converted to "SyncBN" in distributed training (see train.py)
+                        norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
+                    else:
+                        norm_layer = get_norm(norm, in_channels)
+                    tower.append(
+                        Conv2d(
+                            in_channels,
+                            in_channels,
+                            kernel_size=3,
+                            stride=1,
+                            padding=1,
+                            # The conv only carries a bias when no norm layer follows it.
+                            bias=norm_layer is None,
+                            norm=norm_layer,
+                            activation=F.relu
+                        )
+                    )
+            else:
+                raise ValueError(f"Invalid FCOS2D version: {self._version}")
+            self.add_module(f'{head_name}_tower', nn.Sequential(*tower))
+
+        self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
+        self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1)
+        self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1)
+
+        if self.use_scale:
+            # The attribute name differs between versions ('scales_reg' vs
+            # 'scales_box2d_reg'), which also changes the parameter names.
+            if self._version == "v1":
+                self.scales_reg = nn.ModuleList([
+                    Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
+                ])
+            else:
+                self.scales_box2d_reg = nn.ModuleList([
+                    Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
+                ])
+
+        self.init_weights()
+
+    def init_weights(self):
+        """Initialize conv weights of the towers and predictors; zero their biases.
+
+        Tower convs use Kaiming-normal (fan_out, relu); predictor convs use
+        Kaiming-uniform with ``a=1``.
+        """
+
+        for tower in [self.cls_tower, self.box2d_tower]:
+            for l in tower.modules():
+                if isinstance(l, nn.Conv2d):
+                    torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
+                    if l.bias is not None:
+                        torch.nn.init.constant_(l.bias, 0)
+
+        predictors = [self.cls_logits, self.box2d_reg, self.centerness]
+
+        for modules in predictors:
+            for l in modules.modules():
+                if isinstance(l, nn.Conv2d):
+                    torch.nn.init.kaiming_uniform_(l.weight, a=1)
+                    if l.bias is not None:  # depth head may not have bias.
+                        torch.nn.init.constant_(l.bias, 0)
+
+    def forward(self, x):
+        """Run the head on every feature level.
+
+        Args:
+            x: iterable of per-level feature tensors, in level order (the level
+                index selects the per-level ``Scale``).
+
+        Returns:
+            ``(logits, box2d_reg, centerness, extra_output)``: three lists with
+            one tensor per level, and a dict whose ``"cls_tower_out"`` entry
+            lists the cls tower output of each level.
+        """
+        logits = []
+        box2d_reg = []
+        centerness = []
+
+        extra_output = {"cls_tower_out": []}
+
+        for l, feature in enumerate(x):
+            cls_tower_out = self.cls_tower(feature)
+            bbox_tower_out = self.box2d_tower(feature)
+
+            # 2D box
+            logits.append(self.cls_logits(cls_tower_out))
+            centerness.append(self.centerness(bbox_tower_out))
+            box_reg = self.box2d_reg(bbox_tower_out)
+            if self.use_scale:
+                # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
+                if self._version == "v1":
+                    box_reg = self.scales_reg[l](box_reg)
+                else:
+                    box_reg = self.scales_box2d_reg[l](box_reg)
+            # Note that we use relu, as in the improved FCOS, instead of exp.
+            box2d_reg.append(F.relu(box_reg))
+
+            extra_output['cls_tower_out'].append(cls_tower_out)
+
+        return logits, box2d_reg, centerness, extra_output
+
+
+class FCOS2DLoss(nn.Module):
+    """Training losses for the FCOS 2D head: classification, box regression, centerness.
+
+    Args:
+        num_classes: number of classes; used to flatten the logits.
+        focal_loss_alpha: ``alpha`` passed to ``sigmoid_focal_loss``.
+        focal_loss_gamma: ``gamma`` passed to ``sigmoid_focal_loss``.
+        loc_loss_type: loss type passed to ``IOULoss``.
+    """
+    def __init__(self,
+                 num_classes,
+                 focal_loss_alpha=0.25,
+                 focal_loss_gamma=2.0,
+                 loc_loss_type='giou',
+                 ):
+        super().__init__()
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+
+        self.box2d_reg_loss_fn = IOULoss(loc_loss_type)
+
+        self.num_classes = num_classes
+
+    @force_fp32(apply_to=('logits', 'box2d_reg', 'centerness'))
+    def forward(self, logits, box2d_reg, centerness, targets):
+        """Compute the 2D losses.
+
+        Args:
+            logits: list of per-level classification logits, channels in dim 1.
+            box2d_reg: list of per-level box regression maps (4 channels).
+            centerness: list of per-level centerness maps.
+            targets: dict with keys ``'labels'``, ``'box2d_reg_targets'`` and
+                ``'pos_inds'``; the first two must have the same length.
+
+        Returns:
+            ``(loss_dict, extra_info)``. ``loss_dict`` has keys ``"loss_cls"``,
+            ``"loss_box2d_reg"`` and ``"loss_centerness"``. ``extra_info`` has
+            ``"loss_denom"`` and ``"centerness_targets"``, or is an empty dict
+            when there are no positive locations.
+
+        Raises:
+            ValueError: if labels and box regression targets differ in length.
+        """
+        labels = targets['labels']
+        box2d_reg_targets = targets['box2d_reg_targets']
+        pos_inds = targets["pos_inds"]
+
+        if len(labels) != box2d_reg_targets.shape[0]:
+            raise ValueError(
+                f"The size of 'labels' and 'box2d_reg_targets' does not match: a={len(labels)}, b={box2d_reg_targets.shape[0]}"
+            )
+
+        # Flatten predictions
+        # Channels are moved last, then every level is flattened and concatenated.
+        logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits])
+        box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg])
+        centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness])
+
+        # -------------------
+        # Classification loss
+        # -------------------
+        # Normalizer: positives summed via reduce_sum (presumably across all
+        # processes -- confirm in utils.comm), divided by the world size,
+        # floored at 1.
+        num_pos_local = pos_inds.numel()
+        num_gpus = get_world_size()
+        total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
+        num_pos_avg = max(total_num_pos / num_gpus, 1.0)
+
+        # prepare one_hot
+        cls_target = torch.zeros_like(logits)
+        cls_target[pos_inds, labels[pos_inds]] = 1
+
+        loss_cls = sigmoid_focal_loss(
+            logits,
+            cls_target,
+            alpha=self.focal_loss_alpha,
+            gamma=self.focal_loss_gamma,
+            reduction="sum",
+        ) / num_pos_avg
+
+        # NOTE: The rest of losses only consider foreground pixels.
+        box2d_reg_pred = box2d_reg_pred[pos_inds]
+        box2d_reg_targets = box2d_reg_targets[pos_inds]
+
+        centerness_pred = centerness_pred[pos_inds]
+
+        # Compute centerness targets here using 2D regression targets of foreground pixels.
+        centerness_targets = compute_ctrness_targets(box2d_reg_targets)
+
+        # Denominator for all foreground losses.
+        ctrness_targets_sum = centerness_targets.sum()
+        loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
+
+        # NOTE: change the return after reduce_sum
+        # The early exit sits after both reduce_sum calls, so this process still
+        # issues them when it has no positives (NOTE(review): presumably to keep
+        # collective calls matched across ranks -- confirm).
+        if pos_inds.numel() == 0:
+            # Zero-valued losses that still depend on the predictions.
+            losses = {
+                "loss_cls": loss_cls,
+                "loss_box2d_reg": box2d_reg_pred.sum() * 0.,
+                "loss_centerness": centerness_pred.sum() * 0.,
+            }
+            return losses, {}
+
+        # ----------------------
+        # 2D box regression loss
+        # ----------------------
+        # Centerness targets are passed as the per-location weights.
+        loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom
+
+        # ---------------
+        # Centerness loss
+        # ---------------
+        loss_centerness = F.binary_cross_entropy_with_logits(
+            centerness_pred, centerness_targets, reduction="sum"
+        ) / num_pos_avg
+
+        loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness}
+        extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets}
+
+        return loss_dict, extra_info
+
+
+class FCOS2DInference():
+    """Decode FCOS 2D head outputs into per-image ``Instances`` and apply NMS / top-k.
+
+    All thresholds are read from ``cfg.DD3D.FCOS2D.INFERENCE`` and the class
+    count from ``cfg.DD3D.NUM_CLASSES``.
+    """
+    def __init__(self, cfg):
+        self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR
+        self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH
+        self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK
+        self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK
+        self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH
+        self.num_classes = cfg.DD3D.NUM_CLASSES
+
+    def __call__(self, logits, box2d_reg, centerness, locations, image_sizes):
+        """Decode every feature level.
+
+        Returns:
+            ``(pred_instances, extra_info)``: a level-major list of per-image
+            ``Instances`` lists, and one bookkeeping dict per level. Each
+            ``Instances`` gets an ``fpn_levels`` field filled with its level index.
+        """
+
+        pred_instances = []  # List[List[Instances]], shape = (L, B)
+        extra_info = []
+        for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \
+            enumerate(zip(logits, box2d_reg, centerness, locations)):
+
+            instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map(
+                logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes
+            )  # List of Instances; one for each image.
+
+            for instances_per_im in instances_per_lvl:
+                instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl
+
+            pred_instances.append(instances_per_lvl)
+            extra_info.append(extra_info_per_lvl)
+
+        return pred_instances, extra_info
+
+    def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes):
+        """Decode one feature level into one ``Instances`` per image.
+
+        Candidates are (location, class) pairs whose score exceeds
+        ``pre_nms_thresh``; at most ``pre_nms_topk`` are kept per image.
+
+        Returns:
+            ``(results, extra_info)``. ``extra_info`` holds, per image, the
+            candidate location indices, candidate class indices, and the top-k
+            indices (``None`` when no top-k selection was applied).
+        """
+        N, C, _, __ = logits.shape
+
+        # put in the same format as locations
+        scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid()
+        box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4)
+        centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()
+
+        # if self.thresh_with_ctr is True, we multiply the classification
+        # scores with centerness scores before applying the threshold.
+        if self.thresh_with_ctr:
+            scores = scores * centerness[:, :, None]
+
+        candidate_mask = scores > self.pre_nms_thresh
+
+        # Per-image number of candidates, capped at the configured top-k.
+        pre_nms_topk = candidate_mask.reshape(N, -1).sum(1)
+        pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk)
+
+        # Otherwise centerness is folded in after thresholding; either way the
+        # scores used below are class score x centerness.
+        if not self.thresh_with_ctr:
+            scores = scores * centerness[:, :, None]
+
+        results = []
+        all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], []
+        for i in range(N):
+            scores_per_im = scores[i]
+            candidate_mask_per_im = candidate_mask[i]
+            scores_per_im = scores_per_im[candidate_mask_per_im]
+
+            # Column 0 is the location index, column 1 the class index.
+            candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False)
+            fg_inds_per_im = candidate_inds_per_im[:, 0]
+            class_inds_per_im = candidate_inds_per_im[:, 1]
+
+            # Cache info here.
+            all_fg_inds_per_im.append(fg_inds_per_im)
+            all_class_inds_per_im.append(class_inds_per_im)
+
+            box2d_reg_per_im = box2d_reg[i][fg_inds_per_im]
+            locations_per_im = locations[fg_inds_per_im]
+
+            pre_nms_topk_per_im = pre_nms_topk[i]
+
+            # Only run top-k when there are more candidates than the cap.
+            if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item():
+                scores_per_im, topk_indices = \
+                    scores_per_im.topk(pre_nms_topk_per_im, sorted=False)
+
+                class_inds_per_im = class_inds_per_im[topk_indices]
+                box2d_reg_per_im = box2d_reg_per_im[topk_indices]
+                locations_per_im = locations_per_im[topk_indices]
+            else:
+                topk_indices = None
+
+            all_topk_indices.append(topk_indices)
+
+            # Box corners: the first two regression values are subtracted from
+            # the location, the last two are added to it.
+            detections = torch.stack([
+                locations_per_im[:, 0] - box2d_reg_per_im[:, 0],
+                locations_per_im[:, 1] - box2d_reg_per_im[:, 1],
+                locations_per_im[:, 0] + box2d_reg_per_im[:, 2],
+                locations_per_im[:, 1] + box2d_reg_per_im[:, 3],
+            ],
+                                     dim=1)
+
+            instances = Instances(image_sizes[i])
+            instances.pred_boxes = Boxes(detections)
+            # Stored score is the square root of class score x centerness.
+            instances.scores = torch.sqrt(scores_per_im)
+            instances.pred_classes = class_inds_per_im
+            instances.locations = locations_per_im
+
+            results.append(instances)
+
+        extra_info = {
+            "fg_inds_per_im": all_fg_inds_per_im,
+            "class_inds_per_im": all_class_inds_per_im,
+            "topk_indices": all_topk_indices
+        }
+        return results, extra_info
+
+    def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"):
+        """Apply per-class NMS, then keep at most ``post_nms_topk`` detections per image.
+
+        NMS is skipped when ``nms_thresh`` is not positive; the top-k limit is
+        skipped when ``post_nms_topk`` is not positive.
+
+        Args:
+            instances_per_im: iterable of per-image ``Instances``.
+            score_key_for_nms: name of the field used to rank boxes during NMS.
+
+        Returns:
+            list of filtered ``Instances``, one per image.
+        """
+        results = []
+        for instances in instances_per_im:
+            if self.nms_thresh > 0:
+                # Multiclass NMS.
+                keep = batched_nms(
+                    instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes,
+                    self.nms_thresh
+                )
+                instances = instances[keep]
+            num_detections = len(instances)
+
+            # Limit to max_per_image detections **over all classes**
+            if num_detections > self.post_nms_topk > 0:
+                # NOTE(review): the top-k cut always ranks by `scores`, even when
+                # NMS used a different `score_key_for_nms` -- confirm this is intended.
+                scores = instances.scores
+                # image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1)
+                image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1)
+                # `>=` keeps ties at the threshold, so more than post_nms_topk may survive.
+                keep = scores >= image_thresh.item()
+                keep = torch.nonzero(keep).squeeze(1)
+                instances = instances[keep]
+            results.append(instances)
+        return results
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/fcos3d.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/fcos3d.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from detectron2.layers import Conv2d, cat, get_norm
+from mmcv.runner import force_fp32
+
+from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale
+from .disentangled_box3d_loss import DisentangledBox3DLoss
+from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+from projects.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d
+
+EPS = 1e-7
+
+
+def predictions_to_boxes3d(
+    quat,
+    proj_ctr,
+    depth,
+    size,
+    locations,
+    inv_intrinsics,
+    canon_box_sizes,
+    min_depth,
+    max_depth,
+    scale_depth_by_focal_lengths_factor,
+    scale_depth_by_focal_lengths=True,
+    quat_is_allocentric=True,
+    depth_is_distance=False
+):
+    """Decode raw per-location network outputs into a ``Boxes3D``.
+
+    Args:
+        quat: raw quaternion predictions; normalized along ``dim=1``.
+        proj_ctr: predicted offset of the projected 3D center, added to ``locations``.
+        depth: raw depth predictions; reshaped to ``(-1, 1)`` and clamped.
+        size: raw size predictions; mapped to ``(tanh(size) + 1) * canon_box_sizes``.
+        locations: image locations the predictions were made at.
+        inv_intrinsics: batch of inverse camera matrices, indexed as ``[:, r, c]``.
+        canon_box_sizes: canonical box size per prediction.
+        min_depth: lower clamp bound for the decoded depth.
+        max_depth: upper clamp bound for the decoded depth.
+        scale_depth_by_focal_lengths_factor: constant used together with the
+            pixel size when ``scale_depth_by_focal_lengths`` is True.
+        scale_depth_by_focal_lengths: if True, divide depth by
+            ``pixel_size * scale_depth_by_focal_lengths_factor``.
+        quat_is_allocentric: if True, convert the quaternion with
+            ``allocentric_to_egocentric``.
+        depth_is_distance: if True, the prediction is divided by the norm of the
+            unprojected location before clamping.
+
+    Returns:
+        ``Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)``.
+    """
+    # Normalize to make quat unit norm.
+    quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
+    # Make sure again it's numerically unit-norm. The norm is clamped here as
+    # well: an all-zero quaternion is still zero after the first pass, and an
+    # unclamped division would turn it into NaN (0 / 0).
+    quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
+
+    if scale_depth_by_focal_lengths:
+        # Pixel size is the norm of the two diagonal entries of the inverse intrinsics.
+        pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1)
+        depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor)
+
+    if depth_is_distance:
+        depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS)
+
+    depth = depth.reshape(-1, 1).clamp(min_depth, max_depth)
+
+    # The center is predicted as an offset from the location it was predicted at.
+    proj_ctr = proj_ctr + locations
+
+    if quat_is_allocentric:
+        quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics)
+
+    size = (size.tanh() + 1.) * canon_box_sizes  # max size = 2 * canon_size
+
+    return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)
+
+
+class FCOS3DHead(nn.Module):
+    """Per-location 3D box prediction head applied to every FPN level.
+
+    One tower of ``num_convs`` 3x3 convolutions is run on each level and feeds
+    five 3x3 predictor convolutions: quaternion (4 channels per class),
+    projected center (2), depth (1), size (3) and 3D confidence (1). With
+    ``class_agnostic=True`` a single set of channels is predicted.
+
+    Args:
+        num_classes: number of object classes.
+        input_shape: one entry per FPN level; each entry must expose ``.stride``
+            and ``.channels``. All levels must have the same channel count.
+        num_convs: number of conv layers in the tower.
+        norm: normalization name forwarded to ``get_norm``.
+        use_scale: if True, apply learnable per-level scales to center, size,
+            confidence and depth, plus a learnable per-level depth offset.
+        depth_scale_init_factor: multiplied with each entry of
+            ``std_depth_per_level`` to initialize the depth scale.
+        proj_ctr_scale_init_factor: multiplied with each level's stride to
+            initialize the projected-center scale.
+        use_per_level_predictors: if True, each level gets its own predictor
+            convolutions; otherwise one set is shared by all levels.
+        class_agnostic: if True, predict one box per location instead of one
+            per class.
+        use_deformable: not supported; ``True`` raises ``ValueError``.
+        mean_depth_per_level: per-level depth means, used to initialize the
+            depth offsets. Required when ``use_scale`` is True.
+        std_depth_per_level: per-level depth standard deviations, used to
+            initialize the depth scales. Required when ``use_scale`` is True.
+
+    Raises:
+        ValueError: if ``use_scale`` is True and a depth statistic is missing,
+            or if ``use_deformable`` is True.
+    """
+    def __init__(self,
+                 num_classes,
+                 input_shape,
+                 num_convs=4,
+                 norm='BN',
+                 use_scale=True,
+                 depth_scale_init_factor=0.3,
+                 proj_ctr_scale_init_factor=1.0,
+                 use_per_level_predictors=False,
+                 class_agnostic=False,
+                 use_deformable=False,
+                 mean_depth_per_level=None,
+                 std_depth_per_level=None,
+                 ):
+        super().__init__()
+        self.num_classes = num_classes
+        self.in_strides = [shape.stride for shape in input_shape]
+        self.num_levels = len(input_shape)
+
+        self.use_scale = use_scale
+        self.depth_scale_init_factor = depth_scale_init_factor
+        self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor
+        self.use_per_level_predictors = use_per_level_predictors
+
+        # The depth statistics initialize the depth scale/offset modules, so
+        # they are mandatory whenever scaling is enabled. Fail with a clear
+        # message instead of the opaque TypeError from torch.Tensor(None).
+        if use_scale and (mean_depth_per_level is None or std_depth_per_level is None):
+            raise ValueError(
+                "'mean_depth_per_level' and 'std_depth_per_level' must be provided when use_scale=True."
+            )
+        # Without scaling the statistics are never read; a missing one is
+        # registered as a None buffer.
+        self.register_buffer(
+            "mean_depth_per_level",
+            None if mean_depth_per_level is None else torch.Tensor(mean_depth_per_level)
+        )
+        self.register_buffer(
+            "std_depth_per_level",
+            None if std_depth_per_level is None else torch.Tensor(std_depth_per_level)
+        )
+
+        in_channels = [s.channels for s in input_shape]
+        assert len(set(in_channels)) == 1, "Each level must have the same channel!"
+        in_channels = in_channels[0]
+
+        if use_deformable:
+            raise ValueError("Not supported yet.")
+
+        box3d_tower = []
+        for _ in range(num_convs):
+            if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
+                # NOTE: need to add norm here!
+                # Each FPN level has its own batchnorm layer.
+                # NOTE: do not use dd3d train.py!
+                # "BN" is converted to "SyncBN" in distributed training (see train.py)
+                norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
+            else:
+                norm_layer = get_norm(norm, in_channels)
+            box3d_tower.append(
+                Conv2d(
+                    in_channels,
+                    in_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    # The conv only carries a bias when no norm layer follows it.
+                    bias=norm_layer is None,
+                    norm=norm_layer,
+                    activation=F.relu
+                )
+            )
+        self.add_module('box3d_tower', nn.Sequential(*box3d_tower))
+
+        num_classes = self.num_classes if not class_agnostic else 1
+        num_levels = self.num_levels if use_per_level_predictors else 1
+
+        # 3D box branches.
+        self.box3d_quat = nn.ModuleList([
+            Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+            for _ in range(num_levels)
+        ])
+        self.box3d_ctr = nn.ModuleList([
+            Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+            for _ in range(num_levels)
+        ])
+        # The depth predictor has no bias when a learnable offset is applied afterwards.
+        self.box3d_depth = nn.ModuleList([
+            Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale))
+            for _ in range(num_levels)
+        ])
+        self.box3d_size = nn.ModuleList([
+            Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+            for _ in range(num_levels)
+        ])
+        self.box3d_conf = nn.ModuleList([
+            Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
+            for _ in range(num_levels)
+        ])
+
+        if self.use_scale:
+            self.scales_proj_ctr = nn.ModuleList([
+                Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides
+            ])
+            # (pre-)compute (mean, std) of depth for each level, and determine the init value here.
+            self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
+            self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
+
+            self.scales_depth = nn.ModuleList([
+                Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level
+            ])
+            self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level])
+
+        self._init_weights()
+
+    def _init_weights(self):
+        """Initialize conv weights of the tower and predictors; zero every bias."""
+        for layer in self.box3d_tower.modules():
+            if isinstance(layer, nn.Conv2d):
+                torch.nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
+                if layer.bias is not None:
+                    torch.nn.init.constant_(layer.bias, 0)
+
+        predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf]
+
+        for modules in predictors:
+            for layer in modules.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.kaiming_uniform_(layer.weight, a=1)
+                    if layer.bias is not None:  # depth head may not have bias.
+                        torch.nn.init.constant_(layer.bias, 0)
+
+    def forward(self, x):
+        """Run the head on every feature level.
+
+        Args:
+            x: iterable of per-level feature maps.
+
+        Returns:
+            ``(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf,
+            dense_depth)``: the first five are lists with one tensor per level;
+            ``dense_depth`` is always ``None``.
+        """
+        box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], []
+        dense_depth = None
+        for lvl, features in enumerate(x):
+            box3d_tower_out = self.box3d_tower(features)
+
+            # Shared predictors live at index 0.
+            pred_idx = lvl if self.use_per_level_predictors else 0
+
+            # 3D box
+            quat = self.box3d_quat[pred_idx](box3d_tower_out)
+            proj_ctr = self.box3d_ctr[pred_idx](box3d_tower_out)
+            depth = self.box3d_depth[pred_idx](box3d_tower_out)
+            size3d = self.box3d_size[pred_idx](box3d_tower_out)
+            conf3d = self.box3d_conf[pred_idx](box3d_tower_out)
+
+            if self.use_scale:
+                # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
+                # Scales and offsets are always per level, even with shared predictors.
+                proj_ctr = self.scales_proj_ctr[lvl](proj_ctr)
+                size3d = self.scales_size[lvl](size3d)
+                conf3d = self.scales_conf[lvl](conf3d)
+                depth = self.offsets_depth[lvl](self.scales_depth[lvl](depth))
+
+            box3d_quat.append(quat)
+            box3d_ctr.append(proj_ctr)
+            box3d_depth.append(depth)
+            box3d_size.append(size3d)
+            box3d_conf.append(conf3d)
+
+        return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth
+
+
+class FCOS3DLoss(nn.Module):
+    """Training loss for the 3D box head: disentangled box regression plus 3D confidence.
+
+    The regression terms come from ``DisentangledBox3DLoss``; the confidence
+    target is ``exp(-l1_error / conf_3d_temperature)`` and is trained with
+    binary cross-entropy on the confidence logits.
+    """
+    def __init__(self,
+                 num_classes,
+                 min_depth=0.1,
+                 max_depth=80.0,
+                 box3d_loss_weight=2.0,
+                 conf3d_loss_weight=1.0,
+                 conf_3d_temperature=1.0,
+                 smooth_l1_loss_beta=0.05,
+                 max_loss_per_group=20,
+                 predict_allocentric_rot=True,
+                 scale_depth_by_focal_lengths=True,
+                 scale_depth_by_focal_lengths_factor=500.0,
+                 class_agnostic=False,
+                 predict_distance=False,
+                 canon_box_sizes=None):
+        super().__init__()
+        # Class handling.
+        self.num_classes = num_classes
+        self.class_agnostic = class_agnostic
+
+        # Settings forwarded to `predictions_to_boxes3d` when decoding.
+        self.canon_box_sizes = canon_box_sizes
+        self.min_depth = min_depth
+        self.max_depth = max_depth
+        self.predict_allocentric_rot = predict_allocentric_rot
+        self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths
+        self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor
+        self.predict_distance = predict_distance
+
+        # Loss function and weights.
+        self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group)
+        self.box3d_loss_weight = box3d_loss_weight
+        self.conf3d_loss_weight = conf3d_loss_weight
+        self.conf_3d_temperature = conf_3d_temperature
+
+    @force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size','box3d_conf', 'inv_intrinsics'))
+    def forward(
+        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info,
+        targets
+    ):
+        """Compute the 3D losses at the positive locations.
+
+        Args:
+            box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf: lists of
+                per-level prediction maps from the 3D head.
+            dense_depth: not used by this method.
+            inv_intrinsics: inverse camera matrices, indexed by image index.
+            fcos2d_info: dict providing ``"centerness_targets"`` and ``"loss_denom"``.
+            targets: dict providing ``"labels"``, ``"box3d_targets"``,
+                ``"pos_inds"``, ``"locations"`` and ``"im_inds"``.
+
+        Returns:
+            Dict of losses: ``"loss_conf3d"`` plus the entries returned by the
+            box regression loss. Without positives, five zero-valued losses are
+            returned that stay connected to the predictions.
+
+        Raises:
+            ValueError: if ``labels`` and ``box3d_targets`` differ in length.
+        """
+        labels = targets['labels']
+        box3d_targets = targets['box3d_targets']
+        pos_inds = targets["pos_inds"]
+
+        if pos_inds.numel() == 0:
+            # Multiplying by zero keeps every prediction in the autograd graph.
+            def zero_loss(per_level):
+                return torch.stack([x.sum() * 0. for x in per_level]).sum()
+
+            return {
+                "loss_box3d_quat": zero_loss(box3d_quat),
+                "loss_box3d_proj_ctr": zero_loss(box3d_ctr),
+                "loss_box3d_depth": zero_loss(box3d_depth),
+                "loss_box3d_size": zero_loss(box3d_size),
+                "loss_conf3d": zero_loss(box3d_conf)
+            }
+
+        if len(labels) != len(box3d_targets):
+            raise ValueError(
+                f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}"
+            )
+
+        num_classes = 1 if self.class_agnostic else self.num_classes
+        pos_labels = labels[pos_inds]
+        class_index = None if self.class_agnostic else pos_labels[..., None, None]
+
+        def flatten(per_level, *param_dims):
+            # Channels last, then flatten levels, batch and space into one axis;
+            # the class axis is kept as the trailing one.
+            return cat([x.permute(0, 2, 3, 1).reshape(-1, *param_dims, num_classes) for x in per_level])
+
+        def select_class(pred):
+            # Drop the trailing class axis, picking each location's target class
+            # unless the head is class agnostic.
+            if self.class_agnostic:
+                return pred.squeeze(-1)
+            if pred.dim() == 3:
+                index = class_index.repeat(1, pred.shape[1], 1)
+            else:
+                index = class_index.squeeze(-1)
+            return torch.gather(pred, dim=pred.dim() - 1, index=index).squeeze(-1)
+
+        # ----------------------
+        # 3D box disentangled loss
+        # ----------------------
+        box3d_targets = box3d_targets[pos_inds]
+
+        box3d_quat_pred = select_class(flatten(box3d_quat, 4)[pos_inds])
+        box3d_ctr_pred = select_class(flatten(box3d_ctr, 2)[pos_inds])
+        box3d_depth_pred = select_class(flatten(box3d_depth)[pos_inds])
+        box3d_size_pred = select_class(flatten(box3d_size, 3)[pos_inds])
+        box3d_conf_pred = select_class(flatten(box3d_conf)[pos_inds])
+
+        canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[pos_labels]
+
+        locations = targets["locations"][pos_inds]
+        # Each positive location uses the intrinsics of the image it belongs to.
+        inv_intrinsics = inv_intrinsics[targets["im_inds"][pos_inds]]
+
+        box3d_pred = predictions_to_boxes3d(
+            box3d_quat_pred,
+            box3d_ctr_pred,
+            box3d_depth_pred,
+            box3d_size_pred,
+            locations,
+            inv_intrinsics,
+            canon_box_sizes,
+            self.min_depth,
+            self.max_depth,
+            scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
+            scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
+            quat_is_allocentric=self.predict_allocentric_rot,
+            depth_is_distance=self.predict_distance
+        )
+
+        centerness_targets = fcos2d_info["centerness_targets"]
+        loss_denom = fcos2d_info["loss_denom"]
+        losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets)
+
+        losses_box3d = {name: self.box3d_loss_weight * value / loss_denom for name, value in losses_box3d.items()}
+
+        # The confidence target decays exponentially with the box L1 error.
+        conf_3d_targets = torch.exp(-1. / self.conf_3d_temperature * box3d_l1_error)
+        conf_bce = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none')
+        loss_conf3d = self.conf3d_loss_weight * (conf_bce * centerness_targets).sum() / loss_denom
+
+        return {"loss_conf3d": loss_conf3d, **losses_box3d}
+
+
+class FCOS3DInference():
+    """Attach decoded 3D boxes and 3D scores to the 2D detections of every FPN level."""
+    def __init__(self, cfg):
+        fcos3d_cfg = cfg.DD3D.FCOS3D
+        self.canon_box_sizes = fcos3d_cfg.CANONICAL_BOX3D_SIZES
+        self.min_depth = fcos3d_cfg.MIN_DEPTH
+        self.max_depth = fcos3d_cfg.MAX_DEPTH
+        self.predict_allocentric_rot = fcos3d_cfg.PREDICT_ALLOCENTRIC_ROT
+        self.scale_depth_by_focal_lengths = fcos3d_cfg.SCALE_DEPTH_BY_FOCAL_LENGTHS
+        self.scale_depth_by_focal_lengths_factor = fcos3d_cfg.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR
+        self.predict_distance = fcos3d_cfg.PREDICT_DISTANCE
+
+        self.num_classes = cfg.DD3D.NUM_CLASSES
+        self.class_agnostic = fcos3d_cfg.CLASS_AGNOSTIC_BOX3D
+
+    def __call__(
+        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
+    ):
+        """Process every level; ``pred_instances`` is modified in place and nothing is returned."""
+        # pred_instances: # List[List[Instances]], shape = (L, B)
+        per_level_preds = zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)
+        for lvl, level_preds in enumerate(per_level_preds):
+            # In-place modification: update per-level pred_instances.
+            self.forward_for_single_feature_map(
+                *level_preds, inv_intrinsics, pred_instances[lvl], fcos2d_info[lvl]
+            )  # List of Instances; one for each image.
+
+    def _select_per_image(self, pred_per_im, fg_inds, class_inds, topk_indices):
+        """Pick foreground rows, reduce the trailing class axis, then apply the optional top-k selection."""
+        picked = pred_per_im[fg_inds]
+        if self.class_agnostic:
+            picked = picked.squeeze(-1)
+        else:
+            class_index = class_inds[..., None, None]
+            if picked.dim() == 3:
+                index = class_index.repeat(1, picked.shape[1], 1)
+            else:
+                index = class_index.squeeze(-1)
+            picked = torch.gather(picked, dim=picked.dim() - 1, index=index).squeeze(-1)
+        if topk_indices is not None:
+            picked = picked[topk_indices]
+        return picked
+
+    def forward_for_single_feature_map(
+        self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
+    ):
+        """Decode 3D boxes for one level and store them on that level's instances.
+
+        Adds ``pred_boxes3d`` and ``scores_3d`` (2D score times sigmoid of the
+        3D confidence) to each entry of ``pred_instances``. Returns nothing.
+        """
+        batch_size = box3d_quat.shape[0]
+        num_classes = 1 if self.class_agnostic else self.num_classes
+
+        def flatten(pred, *param_dims):
+            # Channels last, then merge the spatial axes; the class axis stays last.
+            return pred.permute(0, 2, 3, 1).reshape(batch_size, -1, *param_dims, num_classes)
+
+        box3d_quat = flatten(box3d_quat, 4)
+        box3d_ctr = flatten(box3d_ctr, 2)
+        box3d_depth = flatten(box3d_depth)
+        box3d_size = flatten(box3d_size, 3)
+        box3d_conf = flatten(box3d_conf).sigmoid()
+
+        for i in range(batch_size):
+            selection = (
+                fcos2d_info['fg_inds_per_im'][i],
+                fcos2d_info['class_inds_per_im'][i],
+                fcos2d_info['topk_indices'][i],
+            )
+
+            box3d_quat_per_im = self._select_per_image(box3d_quat[i], *selection)
+            box3d_ctr_per_im = self._select_per_image(box3d_ctr[i], *selection)
+            box3d_depth_per_im = self._select_per_image(box3d_depth[i], *selection)
+            box3d_size_per_im = self._select_per_image(box3d_size[i], *selection)
+            box3d_conf_per_im = self._select_per_image(box3d_conf[i], *selection)
+
+            instances = pred_instances[i]
+
+            # scores_per_im = pred_instances[i].scores.square()
+            # NOTE: Before refactoring, the squared score was used. Is raw 2D score better?
+            scores_3d_per_im = instances.scores * box3d_conf_per_im
+
+            canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[instances.pred_classes]
+            # Every detection of this image shares the same inverse intrinsics.
+            inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3)
+            pred_boxes3d = predictions_to_boxes3d(
+                box3d_quat_per_im,
+                box3d_ctr_per_im,
+                box3d_depth_per_im,
+                box3d_size_per_im,
+                instances.locations,
+                inv_K,
+                canon_box_sizes,
+                self.min_depth,
+                self.max_depth,
+                scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
+                scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
+                quat_is_allocentric=self.predict_allocentric_rot,
+                depth_is_distance=self.predict_distance
+            )
+
+            # In-place modification: add fields to instances.
+            pred_instances[i].pred_boxes3d = pred_boxes3d
+            pred_instances[i].scores_3d = scores_3d_per_im
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import torch
+import torch.nn.functional as F
+from fvcore.nn.smooth_l1_loss import smooth_l1_loss
+from torch import nn
+
+from detectron2.layers import Conv2d, cat
+#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
+from detectron2.structures import Instances
+from detectron2.utils import comm as d2_comm
+from mmdet.models.builder import HEADS
+from mmcv.runner import force_fp32
+
+from projects.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES
+from .core import DD3D
+#from tridet.modeling.dd3d.postprocessing import get_group_idxs, nuscenes_sample_aggregate
+from .prepare_targets import DD3DTargetPreparer
+from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
+from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
+
+INF = 100000000.
+
+
+class NuscenesDD3DTargetPreparer(DD3DTargetPreparer):
+    """Target preparer that also emits nuScenes attribute and speed targets.
+
+    Requires ``dd3d_enabled`` to be True on the base preparer.
+    """
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True"
+
+    def __call__(self, locations, gt_instances, feature_shapes):
+        """Build flattened training targets for all levels and images.
+
+        Args:
+            locations: list with one tensor of locations per FPN level.
+            gt_instances: ground-truth instances, one entry per image.
+            feature_shapes: per-level ``(h, w)``; only read when the computed
+                targets contain a ``"box2d"`` entry.
+
+        Returns:
+            Dict with ``labels``, ``box2d_reg_targets``, ``locations``,
+            ``target_inds``, ``im_inds``, ``fpn_levels``, ``pos_inds``,
+            ``box3d_targets``, ``attributes``, ``speeds`` and, when available,
+            ``batched_box2d``.
+        """
+        num_loc_list = [len(loc) for loc in locations]
+        num_images = len(gt_instances)
+
+        # Every location gets the size range of the level it belongs to.
+        size_range_per_loc = []
+        for lvl, level_locations in enumerate(locations):
+            level_range = level_locations.new_tensor(self.sizes_of_interest[lvl])
+            size_range_per_loc.append(level_range[None].expand(num_loc_list[lvl], -1))
+
+        size_range_per_loc = torch.cat(size_range_per_loc, dim=0)
+        locations = torch.cat(locations, dim=0)
+
+        training_targets = self.compute_targets_for_locations(
+            locations, gt_instances, size_range_per_loc, num_loc_list
+        )
+
+        training_targets["locations"] = [locations.clone() for _ in range(num_images)]
+        training_targets["im_inds"] = [
+            locations.new_ones(locations.size(0), dtype=torch.long) * im_idx for im_idx in range(num_images)
+        ]
+
+        # "box2d" is handled separately further down, so it is taken out
+        # before the image-first -> level-first transpose.
+        box2d = training_targets.pop("box2d", None)
+
+        # transpose im first training_targets to level first ones
+        training_targets = {key: self._transpose(value, num_loc_list) for key, value in training_targets.items()}
+
+        training_targets["fpn_levels"] = [
+            loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
+        ]
+
+        def flatten(key, *trailing_dims):
+            return cat([x.reshape(-1, *trailing_dims) for x in training_targets[key]])
+
+        # Flatten targets: (L x B x H x W, TARGET_SIZE)
+        labels = flatten("labels")
+        box2d_reg_targets = flatten("box2d_reg", 4)
+
+        target_inds = flatten("target_inds")
+        locations = flatten("locations", 2)
+        im_inds = flatten("im_inds")
+        fpn_levels = flatten("fpn_levels")
+
+        # Label `num_classes` marks background; everything else is a positive.
+        pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
+
+        targets = {
+            "labels": labels,
+            "box2d_reg_targets": box2d_reg_targets,
+            "locations": locations,
+            "target_inds": target_inds,
+            "im_inds": im_inds,
+            "fpn_levels": fpn_levels,
+            "pos_inds": pos_inds
+        }
+
+        if self.dd3d_enabled:
+            targets["box3d_targets"] = Boxes3D.cat(training_targets["box3d"])
+
+            if box2d is not None:
+                # Original format is B x L x (H x W, 4)
+                # Need to be in L x (B, 4, H, W).
+                batched_box2d = []
+                for lvl, box2d_per_im in enumerate(zip(*box2d)):
+                    # B x (H x W, 4)
+                    h, w = feature_shapes[lvl]
+                    batched_box2d.append(torch.stack([x.T.reshape(4, h, w) for x in box2d_per_im], dim=0))
+                targets["batched_box2d"] = batched_box2d
+
+        # Nuscenes targets -- attribute / speed
+        targets["attributes"] = flatten("attributes")
+        targets["speeds"] = flatten("speeds")
+
+        return targets
+
+    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
+        """Assign one ground-truth object (or background) to every location, per image.
+
+        A location is matched to a box when it lies inside it (or inside its
+        center-sampling region) and the largest side regression target falls
+        within the location's size range. Among several candidates the box
+        with the smallest area wins.
+
+        Returns:
+            Dict of per-image lists: ``labels``, ``box2d_reg``, ``target_inds``,
+            ``box3d`` (when ``dd3d_enabled``), ``attributes`` and ``speeds``.
+        """
+        num_locations = locations.size(0)
+        xs, ys = locations[:, 0], locations[:, 1]
+
+        labels, box2d_reg, target_inds = [], [], []
+        if self.dd3d_enabled:
+            box3d = []
+        # NuScenes targets  -- attribute / speed
+        attributes, speeds = [], []
+
+        # Running offset that makes target indices unique across the batch.
+        num_targets = 0
+        for gt_per_im in targets:
+            gt_boxes = gt_per_im.gt_boxes.tensor
+            gt_classes = gt_per_im.gt_classes
+
+            # no gt
+            if gt_boxes.numel() == 0:
+                # Everything is background; target index -1 means "no object".
+                labels.append(gt_classes.new_zeros(num_locations) + self.num_classes)
+                box2d_reg.append(locations.new_zeros((num_locations, 4)))
+                target_inds.append(gt_classes.new_zeros(num_locations) - 1)
+
+                if self.dd3d_enabled:
+                    box3d.append(
+                        Boxes3D(
+                            locations.new_zeros(num_locations, 4),
+                            locations.new_zeros(num_locations, 2),
+                            locations.new_zeros(num_locations, 1),
+                            locations.new_zeros(num_locations, 3),
+                            locations.new_zeros(num_locations, 3, 3),
+                        ).to(torch.float32)
+                    )
+                # NOTE: attributes and speeds.
+                attributes.append(gt_classes.new_zeros(num_locations))
+                speeds.append(gt_classes.new_zeros(num_locations))
+                continue
+
+            area = gt_per_im.gt_boxes.area()
+
+            # Distances from each location to the four sides of each box.
+            left = xs[:, None] - gt_boxes[:, 0][None]
+            top = ys[:, None] - gt_boxes[:, 1][None]
+            right = gt_boxes[:, 2][None] - xs[:, None]
+            bottom = gt_boxes[:, 3][None] - ys[:, None]
+            ltrb = torch.stack([left, top, right, bottom], dim=2)
+
+            if self.center_sample:
+                is_in_boxes = self.get_sample_region(gt_boxes, num_loc_list, xs, ys)
+            else:
+                is_in_boxes = ltrb.min(dim=2)[0] > 0
+
+            # limit the regression range for each location
+            max_side = ltrb.max(dim=2)[0]
+            is_cared_in_the_level = (max_side >= size_ranges[:, [0]]) & (max_side <= size_ranges[:, [1]])
+
+            # Rule out invalid (location, box) pairs by giving them an "infinite" area.
+            area_per_loc = area[None].repeat(len(locations), 1)
+            area_per_loc[is_in_boxes == 0] = INF
+            area_per_loc[is_cared_in_the_level == 0] = INF
+
+            # if there are still more than one objects for a location,
+            # we choose the one with minimal area
+            min_area, matched_gt = area_per_loc.min(dim=1)
+
+            labels_per_im = gt_classes[matched_gt]
+            labels_per_im[min_area == INF] = self.num_classes
+
+            labels.append(labels_per_im)
+            box2d_reg.append(ltrb[range(len(locations)), matched_gt])
+            target_inds.append(matched_gt + num_targets)
+            num_targets += len(gt_per_im)
+
+            if self.dd3d_enabled:
+                # 3D box targets
+                box3d.append(gt_per_im.gt_boxes3d[matched_gt])
+
+            # NuScenes targets  -- attribute / speed
+            attributes.append(gt_per_im.gt_attributes[matched_gt])
+            speeds.append(gt_per_im.gt_speeds[matched_gt])
+
+        ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
+        if self.dd3d_enabled:
+            ret["box3d"] = box3d
+
+        # NuScenes targets  -- attribute / speed
+        ret["attributes"] = attributes
+        ret["speeds"] = speeds
+
+        return ret
+
+
+class NuscenesLoss(nn.Module):
+    """Attribute classification and speed regression losses for nuScenes."""
+    def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
+        super().__init__()
+        self.attr_loss_weight = attr_loss_weight
+        self.speed_loss_weight = speed_loss_weight
+
+    @force_fp32(apply_to=('attr_logits', 'speeds'))
+    def forward(self, attr_logits, speeds, fcos2d_info, targets):
+        """Compute ``loss_attr`` and ``loss_speed`` at the positive locations.
+
+        Args:
+            attr_logits: list of per-level attribute logit maps.
+            speeds: list of per-level speed prediction maps.
+            fcos2d_info: dict providing ``"centerness_targets"``.
+            targets: dict providing ``"pos_inds"``, ``"attributes"`` and ``"speeds"``.
+
+        Returns:
+            Dict with ``"loss_attr"`` and ``"loss_speed"``.
+        """
+        pos_inds = targets['pos_inds']
+        no_positives = pos_inds.numel() == 0
+
+        # Flatten predictions
+        flat_attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
+        flat_speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])
+
+        losses = {}
+
+        # 1. Attributes
+        pos_attr_logits = flat_attr_logits[pos_inds]
+        pos_attr_targets = targets['attributes'][pos_inds]
+        # No attrs associated with class, or just attr missing.
+        has_attr = pos_attr_targets != MAX_NUM_ATTRIBUTES
+
+        if no_positives:
+            attr_weights = pos_attr_logits.new_tensor(0.0)  # torch.tensor(0.0).cuda()
+        else:
+            attr_weights = fcos2d_info['centerness_targets'][has_attr]
+        # Denominator for all foreground losses -- re-computed for features with valid attributes.
+        # NOTE: the sum is computed into a variable first and then passed to
+        # reduce_sum(); passing `attr_weights.sum()` inline did not work.
+        attr_weights_sum = attr_weights.sum()
+        attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
+
+        if has_attr.sum() == 0:
+            losses["loss_attr"] = pos_attr_logits.sum() * 0.
+        else:
+            # NOTE(review): F.cross_entropy is called with its default reduction, so
+            # `xent` looks like a single averaged value that is then multiplied by
+            # every weight rather than a per-sample loss -- confirm this is intended.
+            xent = F.cross_entropy(pos_attr_logits[has_attr], pos_attr_targets[has_attr])
+            loss_attr = (xent * attr_weights).sum() / attr_loss_denom
+            losses["loss_attr"] = self.attr_loss_weight * loss_attr
+
+        # 2. Speed
+        pos_speeds = flat_speeds[pos_inds]
+        pos_speed_targets = targets['speeds'][pos_inds]
+        # NOTE: some GT speeds are NaN.
+        has_speed = torch.logical_not(torch.isnan(pos_speed_targets))
+
+        if no_positives:
+            speed_weights = pos_speeds.new_tensor(0.0)  # torch.tensor(0.0).cuda()
+        else:
+            speed_weights = fcos2d_info['centerness_targets'][has_speed]
+        # Denominator for all foreground losses -- re-computed for features with valid speeds.
+        speed_weights_sum = speed_weights.sum()
+        speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
+
+        # NOTE: this early exit is deliberately placed after both reduce_sum() calls.
+        if no_positives:
+            # NOTE: This is probably un-reachable, because the training filter images with empty annotations.
+            return {"loss_attr": pos_attr_logits.sum() * 0., "loss_speed": pos_speeds.sum() * 0.}
+
+        if has_speed.sum() == 0:
+            losses["loss_speed"] = pos_speeds.sum() * 0.
+        else:
+            l1_error = smooth_l1_loss(pos_speeds[has_speed], pos_speed_targets[has_speed], beta=0.05)
+            loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom
+            losses["loss_speed"] = self.speed_loss_weight * loss_speed
+
+        return losses
+
+
+class NuscenesInference():
+    """Attach NuScenes attribute and speed predictions to detected instances."""
+    def __init__(self, cfg):
+        # `cfg` is accepted but nothing is read from it.
+        pass
+
+    def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info):
+        """Set 'pred_attributes' and 'pred_speeds' on each entry of 'pred_instances'.
+
+        The four arguments are walked in lockstep, one item per level. For every
+        image the level's predictions are filtered with `fg_inds_per_im` and, when
+        it is not None, with `topk_indices` taken from `fcos2d_info`.
+        """
+        batch_size = attr_logits[0].shape[0]
+        per_level = zip(attr_logits, speeds, fcos2d_info, pred_instances)
+        for level_logits, level_speed, level_info, level_instances in per_level:
+            # Move channels last, then flatten the spatial dimensions.
+            flat_logits = level_logits.permute(0, 2, 3, 1).reshape(batch_size, -1, MAX_NUM_ATTRIBUTES)
+            flat_speed = level_speed.permute(0, 2, 3, 1).reshape(batch_size, -1)
+
+            for img_idx in range(batch_size):
+                keep = level_info['fg_inds_per_im'][img_idx]
+                topk = level_info['topk_indices'][img_idx]
+
+                img_logits = flat_logits[img_idx][keep]
+                img_speed = flat_speed[img_idx][keep]
+                if topk is not None:
+                    img_logits = img_logits[topk]
+                    img_speed = img_speed[topk]
+
+                target = level_instances[img_idx]
+                if len(img_logits) != 0:
+                    target.pred_attributes = img_logits.argmax(dim=1)
+                    target.pred_speeds = img_speed
+                else:
+                    # Nothing survived filtering: store empty tensors that follow
+                    # the dtype/device of the existing fields.
+                    target.pred_attributes = target.pred_classes.new_tensor([])
+                    target.pred_speeds = target.scores.new_tensor([])
+
+
+@HEADS.register_module()
+class NuscenesDD3D(DD3D):
+    """DD3D head with two extra NuScenes branches: attribute logits and speed.
+
+    Both branches are 3x3 convolutions applied to the `cls_tower_out` features
+    returned by the 2D head. Only the training path is implemented; calling
+    `forward` outside training mode raises `NotImplementedError`.
+    """
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 strides,
+                 fcos2d_cfg=dict(),
+                 fcos2d_loss_cfg=dict(),
+                 fcos3d_cfg=dict(),
+                 fcos3d_loss_cfg=dict(),
+                 target_assign_cfg=dict(),
+                 nusc_loss_weight=dict(),
+                 box3d_on=True,
+                 feature_locations_offset="none"):
+        """
+        Args:
+            num_classes: forwarded to the parent class and to the target preparer.
+            in_channels: forwarded to the parent class; also the number of input
+                channels of the attribute and speed convolutions.
+            strides: forwarded to the parent class.
+            fcos2d_cfg, fcos2d_loss_cfg, fcos3d_cfg, fcos3d_loss_cfg: forwarded
+                unchanged to the parent class.
+            target_assign_cfg: forwarded to the parent class and unpacked as keyword
+                arguments of `NuscenesDD3DTargetPreparer`.
+            nusc_loss_weight: unpacked as keyword arguments of `NuscenesLoss`.
+            box3d_on: forwarded to the parent class and to the target preparer.
+            feature_locations_offset: forwarded to the parent class.
+
+        The dict defaults are shared between calls; they are only read here.
+        """
+        super().__init__(num_classes,
+                        in_channels,
+                        strides,
+                        fcos2d_cfg=fcos2d_cfg,
+                        fcos2d_loss_cfg=fcos2d_loss_cfg,
+                        fcos3d_cfg=fcos3d_cfg,
+                        fcos3d_loss_cfg=fcos3d_loss_cfg,
+                        target_assign_cfg=target_assign_cfg,
+                        box3d_on=box3d_on,
+                        feature_locations_offset=feature_locations_offset)
+
+        # --------------------------------------------------------------------------
+        # NuScenes predictions -- attribute / speed, computed from cls_tower output.
+        # --------------------------------------------------------------------------
+        self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True)
+        self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu)
+
+        # init weights
+        for head in (self.attr_logits, self.speed):
+            for layer in head.modules():
+                if isinstance(layer, nn.Conv2d):
+                    torch.nn.init.kaiming_uniform_(layer.weight, a=1)
+                    if layer.bias is not None:
+                        torch.nn.init.constant_(layer.bias, 0)
+
+        # Replace the target preparer created by the parent with the NuScenes one.
+        del self.prepare_targets
+        self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes,
+                                                          input_shape=self.backbone_output_shape,
+                                                          box3d_on=box3d_on,
+                                                          **target_assign_cfg)
+
+        self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight)
+        # NOTE: inference is not supported yet, so the inference-only members
+        # (`nuscenes_inference`, `num_images_per_sample`, `max_num_dets_per_sample`)
+        # are deliberately not created here.
+
+    # `apply_to` must be a tuple of argument names; note the trailing comma.
+    @force_fp32(apply_to=('features',))
+    def forward(self, features, batched_inputs):
+        """Compute the training losses from already-extracted features.
+
+        Args:
+            features: sequence of per-level feature maps. `features[0].device` is
+                the device the ground truth is moved to.
+            batched_inputs: sequence of per-image dicts. `"instances"` is required;
+                `"inv_intrinsics"` is optional and stacked along a new first
+                dimension when present.
+
+        Returns:
+            dict with the 2D losses, the 3D losses (unless `self.only_box2d`) and
+            the NuScenes attribute / speed losses.
+
+        Raises:
+            NotImplementedError: if the module is not in training mode.
+        """
+        if not self.training:
+            # TODO: port the inference path. It needs the image sizes as well as
+            # `self.nuscenes_inference`, `self.num_images_per_sample` and
+            # `self.max_num_dets_per_sample`, none of which exist on this class.
+            raise NotImplementedError("NuscenesDD3D only supports training; inference is not implemented.")
+
+        device = features[0].device
+
+        # NOTE: the inverse intrinsics are taken directly from the inputs.
+        if 'inv_intrinsics' in batched_inputs[0]:
+            inv_intrinsics = torch.stack([x['inv_intrinsics'].to(device) for x in batched_inputs], dim=0)
+        else:
+            inv_intrinsics = None
+
+        if "instances" in batched_inputs[0]:
+            gt_instances = [x["instances"].to(device) for x in batched_inputs]
+        else:
+            gt_instances = None
+        assert gt_instances is not None
+
+        # NOTE: features are passed in directly; no backbone is run here.
+        locations = self.compute_locations(features)
+        logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features)
+        if not self.only_box2d:
+            box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
+
+        # --------------------------------------------------------------------------
+        # NuScenes predictions -- attribute / speed, computed from cls_tower output.
+        # --------------------------------------------------------------------------
+        attr_logits, speeds = [], []
+        for x in fcos2d_extra_output['cls_tower_out']:
+            attr_logits.append(self.attr_logits(x))
+            speeds.append(self.speed(x))
+
+        feature_shapes = [x.shape[-2:] for x in features]
+        training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
+
+        losses = {}
+        fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
+        losses.update(fcos2d_loss)
+
+        if not self.only_box2d:
+            fcos3d_loss = self.fcos3d_loss(
+                box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
+                fcos2d_info, training_targets
+            )
+            losses.update(fcos3d_loss)
+
+        # Nuscenes loss -- attribute / speed
+        nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets)
+        losses.update(nuscenes_loss)
+        return losses
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/prepare_targets.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/modeling/prepare_targets.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import torch
+
+from detectron2.layers import cat
+
+from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
+
+# Sentinel "area" written over (location, ground-truth box) pairs that must not be
+# matched; a location whose smallest area is still INF is labelled as background.
+INF = 100000000.
+
+
+class DD3DTargetPreparer():
+    """Assign ground-truth targets to feature-map locations for DD3D training.
+
+    Locations that are matched to no ground-truth box receive the label
+    `num_classes`, which therefore acts as the background class.
+    """
+    def __init__(self,
+                 num_classes,
+                 input_shape,
+                 box3d_on=True,
+                 center_sample=True,
+                 pos_radius=1.5,
+                 sizes_of_interest=None):
+        """
+        Args:
+            num_classes: number of foreground classes; also the background label.
+            input_shape: iterable of per-level objects exposing a `stride` attribute.
+            box3d_on: when True, 3D box targets are gathered in addition to 2D ones.
+            center_sample: when True, a location is eligible for a box only if it
+                falls inside a window around the box centre (see `get_sample_region`);
+                otherwise any location strictly inside the box is eligible.
+            pos_radius: half-size of that window, in units of the level stride.
+            sizes_of_interest: per-level `[min, max]` regression range. It is
+                indexed by level in `__call__`, so it must be set before calling.
+        """
+        self.num_classes = num_classes
+        self.center_sample = center_sample
+        self.strides = [shape.stride for shape in input_shape]
+        self.radius = pos_radius
+        self.dd3d_enabled = box3d_on
+
+        # NOTE: the ranges are stored as given; they are not derived from a list
+        # of thresholds here.
+        self.sizes_of_interest = sizes_of_interest
+
+    def __call__(self, locations, gt_instances, feature_shapes):
+        """Build the flattened, level-first training targets.
+
+        Args:
+            locations: list with one tensor of (x, y) locations per level.
+            gt_instances: list with one ground-truth container per image.
+            feature_shapes: per-level (h, w); only used to reshape "box2d" targets.
+
+        Returns:
+            dict with "labels", "box2d_reg_targets", "locations", "target_inds",
+            "im_inds", "fpn_levels" and "pos_inds"; plus "box3d_targets" (and
+            "batched_box2d" when available) if 3D targets are enabled.
+        """
+        num_loc_list = [len(loc) for loc in locations]
+
+        # Give every location the [min, max] size range of the level it belongs to.
+        loc_to_size_range = []
+        for lvl, loc_per_level in enumerate(locations):
+            size_range = loc_per_level.new_tensor(self.sizes_of_interest[lvl])
+            loc_to_size_range.append(size_range[None].expand(num_loc_list[lvl], -1))
+
+        loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
+        locations = torch.cat(locations, dim=0)
+
+        training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)
+
+        training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
+        training_targets["im_inds"] = [
+            locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
+        ]
+
+        # "box2d" keeps its image-first layout, so take it out before transposing.
+        # NOTE(review): `compute_targets_for_locations` of this class never emits
+        # "box2d"; presumably an overriding implementation does -- confirm.
+        box2d = training_targets.pop("box2d", None)
+
+        # transpose im first training_targets to level first ones
+        training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items()}
+
+        training_targets["fpn_levels"] = [
+            loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
+        ]
+
+        # Flatten targets: (L x B x H x W, TARGET_SIZE)
+        labels = cat([x.reshape(-1) for x in training_targets["labels"]])
+        box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
+
+        target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
+        locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
+        im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
+        fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])
+
+        # Foreground = every location whose label is not the background label.
+        pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
+
+        targets = {
+            "labels": labels,
+            "box2d_reg_targets": box2d_reg_targets,
+            "locations": locations,
+            "target_inds": target_inds,
+            "im_inds": im_inds,
+            "fpn_levels": fpn_levels,
+            "pos_inds": pos_inds
+        }
+
+        if self.dd3d_enabled:
+            box3d_targets = Boxes3D.cat(training_targets["box3d"])
+            targets.update({"box3d_targets": box3d_targets})
+
+            if box2d is not None:
+                # Original format is B x L x (H x W, 4)
+                # Need to be in L x (B, 4, H, W).
+                batched_box2d = []
+                for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
+                    # B x (H x W, 4)
+                    h, w = feature_shapes[lvl]
+                    batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
+                    batched_box2d.append(batched_box2d_lvl)
+                targets.update({"batched_box2d": batched_box2d})
+
+        return targets
+
+    def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
+        """Match every location to at most one ground-truth box, image by image.
+
+        Returns:
+            dict of image-first lists: "labels", "box2d_reg" (left/top/right/bottom
+            distances), "target_inds" (index of the matched box, offset by the number
+            of boxes in the preceding images; -1 for images without boxes) and,
+            when 3D targets are enabled, "box3d".
+        """
+        labels = []
+        box2d_reg = []
+
+        if self.dd3d_enabled:
+            box3d = []
+
+        target_inds = []
+        xs, ys = locations[:, 0], locations[:, 1]
+
+        num_targets = 0
+        for im_i in range(len(targets)):
+            targets_per_im = targets[im_i]
+            bboxes = targets_per_im.gt_boxes.tensor
+            labels_per_im = targets_per_im.gt_classes
+
+            # no gt: everything is background, with all-zero placeholder targets.
+            if bboxes.numel() == 0:
+                labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
+                box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
+                target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
+
+                if self.dd3d_enabled:
+                    box3d.append(
+                        Boxes3D(
+                            locations.new_zeros(locations.size(0), 4),
+                            locations.new_zeros(locations.size(0), 2),
+                            locations.new_zeros(locations.size(0), 1),
+                            locations.new_zeros(locations.size(0), 3),
+                            locations.new_zeros(locations.size(0), 3, 3),
+                        ).to(torch.float32)
+                    )
+                continue
+
+            area = targets_per_im.gt_boxes.area()
+
+            # Distances from each location to the four sides of each box.
+            l = xs[:, None] - bboxes[:, 0][None]
+            t = ys[:, None] - bboxes[:, 1][None]
+            r = bboxes[:, 2][None] - xs[:, None]
+            b = bboxes[:, 3][None] - ys[:, None]
+            box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)
+
+            if self.center_sample:
+                is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
+            else:
+                is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0
+
+            max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
+            # limit the regression range for each location
+            is_cared_in_the_level = \
+                (max_reg_targets_per_im >= size_ranges[:, [0]]) & \
+                (max_reg_targets_per_im <= size_ranges[:, [1]])
+
+            # Ineligible (location, box) pairs get the INF sentinel as their area.
+            locations_to_gt_area = area[None].repeat(len(locations), 1)
+            locations_to_gt_area[is_in_boxes == 0] = INF
+            locations_to_gt_area[is_cared_in_the_level == 0] = INF
+
+            # if there are still more than one objects for a location,
+            # we choose the one with minimal area
+            locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
+
+            box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
+            target_inds_per_im = locations_to_gt_inds + num_targets
+            num_targets += len(targets_per_im)
+
+            labels_per_im = labels_per_im[locations_to_gt_inds]
+            # A minimum that is still INF means no box was eligible: background.
+            labels_per_im[locations_to_min_area == INF] = self.num_classes
+
+            labels.append(labels_per_im)
+            box2d_reg.append(box2d_reg_per_im)
+            target_inds.append(target_inds_per_im)
+
+            if self.dd3d_enabled:
+                # 3D box targets
+                box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
+                box3d.append(box3d_per_im)
+
+        ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
+        if self.dd3d_enabled:
+            ret.update({"box3d": box3d})
+
+        return ret
+
+    def get_sample_region(self, boxes, num_loc_list, loc_xs, loc_ys):
+        """Return a (num_locations, num_boxes) mask of locations near each box centre.
+
+        For each level the window is the box centre +/- `stride * radius`, clipped
+        to the box itself; a location is selected when it lies strictly inside.
+        """
+        center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5
+        center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5
+
+        num_gts = boxes.shape[0]
+        K = len(loc_xs)
+        boxes = boxes[None].expand(K, num_gts, 4)
+        center_x = center_x[None].expand(K, num_gts)
+        center_y = center_y[None].expand(K, num_gts)
+        center_gt = boxes.new_zeros(boxes.shape)
+        # no gt
+        # NOTE(review): the second test also fires when the first box's x-centre is
+        # exactly 0, and this early return is a 1-D uint8 mask rather than a
+        # (K, num_gts) boolean one -- confirm this is intended.
+        if center_x.numel() == 0 or center_x[..., 0].sum() == 0:
+            return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8)
+        beg = 0
+        for level, num_loc in enumerate(num_loc_list):
+            end = beg + num_loc
+            stride = self.strides[level] * self.radius
+            xmin = center_x[beg:end] - stride
+            ymin = center_y[beg:end] - stride
+            xmax = center_x[beg:end] + stride
+            ymax = center_y[beg:end] + stride
+            # limit sample region in gt
+            center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0])
+            center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1])
+            center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax)
+            center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax)
+            beg = end
+        left = loc_xs[:, None] - center_gt[..., 0]
+        right = center_gt[..., 2] - loc_xs[:, None]
+        top = loc_ys[:, None] - center_gt[..., 1]
+        bottom = center_gt[..., 3] - loc_ys[:, None]
+        center_bbox = torch.stack((left, top, right, bottom), -1)
+        inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0
+        return inside_gt_bbox_mask
+
+    def _transpose(self, training_targets, num_loc_list):
+        '''
+        This function is used to transpose image first training targets to level first ones.
+        The input list is left untouched.
+        :return: level first training targets
+        '''
+        if isinstance(training_targets[0], Boxes3D):
+            per_image = [boxes.split(num_loc_list, dim=0) for boxes in training_targets]
+            return [Boxes3D.cat(per_level, dim=0) for per_level in zip(*per_image)]
+
+        per_image = [torch.split(target, num_loc_list, dim=0) for target in training_targets]
+        return [torch.cat(per_level, dim=0) for per_level in zip(*per_image)]
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/structures/__init__.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/structures/__init__.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+from .image_list import ImageList
--- a/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/structures/boxes3d.py
+++ b/docker-hub/BEVFormer/BEVFormer/projects/mmdet3d_plugin/dd3d/structures/boxes3d.py
+# Copyright 2021 Toyota Research Institute.  All rights reserved.
+import numpy as np
+import torch
+from pyquaternion import Quaternion
+from torch.cuda import amp
+
+from projects.mmdet3d_plugin.dd3d.utils.geometry import unproject_points2d
+import projects.mmdet3d_plugin.dd3d.structures.transform3d as t3d
+# Sign (+1 / -1) of each of the 8 box corners along the three object-frame axes:
+# one row per axis, one column per corner. `GenericBoxes3D.corners` transposes the
+# table, halves it and scales it by the box extent to get corner offsets.
+# yapf: disable
+BOX3D_CORNER_MAPPING = [
+    [1, 1, 1, 1, -1, -1, -1, -1],
+    [1, -1, -1, 1, 1, -1, -1, 1],
+    [1, 1, -1, -1, 1, 1, -1, -1]
+]
+# yapf: enable
+
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Build rotation matrices from quaternions.
+
+    The quaternions need not be unit length: every off-identity term is scaled
+    by 2 / |q|^2.
+
+    Args:
+        quaternions: tensor of shape (..., 4), real part first.
+
+    Returns:
+        Tensor of shape (..., 3, 3) holding the rotation matrices.
+    """
+    w, x, y, z = quaternions.unbind(-1)
+    scale = 2.0 / (quaternions * quaternions).sum(-1)
+
+    # Assemble the matrix one row at a time, then stack the rows.
+    row0 = torch.stack(
+        (1 - scale * (y * y + z * z), scale * (x * y - z * w), scale * (x * z + y * w)), -1
+    )
+    row1 = torch.stack(
+        (scale * (x * y + z * w), 1 - scale * (x * x + z * z), scale * (y * z - x * w)), -1
+    )
+    row2 = torch.stack(
+        (scale * (x * z - y * w), scale * (y * z + x * w), 1 - scale * (x * x + y * y)), -1
+    )
+    return torch.stack((row0, row1, row2), -2)
+
+def _to_tensor(x, dim):
+    """Convert `x` to a float32 tensor with at most two dimensions.
+
+    Tensors, numpy arrays, lists, tuples and `Quaternion` objects are accepted.
+    A 1-D result is reshaped to (-1, dim).
+
+    Raises:
+        ValueError: for any other input type, or when the result has more than
+            two dimensions.
+    """
+    if isinstance(x, torch.Tensor):
+        x = x.to(torch.float32)
+    elif isinstance(x, (np.ndarray, list, tuple)):
+        x = torch.tensor(x, dtype=torch.float32)
+    elif isinstance(x, Quaternion):
+        x = torch.tensor(x.elements, dtype=torch.float32)
+    else:
+        raise ValueError(f"Unsupported type: {type(x).__name__}")
+
+    if x.ndim > 2:
+        raise ValueError(f"Invalid shape of input: {x.shape.__str__()}")
+    return x.reshape(-1, dim) if x.ndim == 1 else x
+
+
+class GenericBoxes3D():
+    """Container for N 3D boxes stored as three float32 tensors.
+
+    `quat` is (N, 4), `tvec` is (N, 3) and `size` is (N, 3); 1-D inputs are
+    reshaped to a single row by `_to_tensor`.
+    """
+    def __init__(self, quat, tvec, size):
+        self.quat = _to_tensor(quat, dim=4)
+        self._tvec = _to_tensor(tvec, dim=3)
+        self.size = _to_tensor(size, dim=3)
+
+    @property
+    def tvec(self):
+        """Translation vectors; a property so that subclasses can compute them."""
+        return self._tvec
+
+    @property
+    @amp.autocast(enabled=False)
+    def corners(self):
+        """The 8 corners of every box after rotation and translation.
+
+        Autocast and TF32 are disabled while the corners are computed. Both TF32
+        flags are restored to their own previous values afterwards, even if the
+        computation raises.
+        """
+        prev_matmul_tf32 = torch.backends.cuda.matmul.allow_tf32
+        prev_cudnn_tf32 = torch.backends.cudnn.allow_tf32
+        torch.backends.cuda.matmul.allow_tf32 = False
+        torch.backends.cudnn.allow_tf32 = False
+        try:
+            translation = t3d.Translate(self.tvec, device=self.device)
+
+            R = quaternion_to_matrix(self.quat)
+            rotation = t3d.Rotate(R=R.transpose(1, 2), device=self.device)  # Need to transpose to make it work.
+
+            tfm = rotation.compose(translation)
+
+            _corners = 0.5 * self.quat.new_tensor(BOX3D_CORNER_MAPPING).T
+            lwh = self.size[:, [1, 0, 2]]  # wlh -> lwh
+            corners_in_obj_frame = lwh.unsqueeze(1) * _corners.unsqueeze(0)
+
+            return tfm.transform_points(corners_in_obj_frame)
+        finally:
+            # The two flags are independent settings; restore each from its own
+            # saved value instead of copying the matmul flag into the cuDNN one.
+            torch.backends.cuda.matmul.allow_tf32 = prev_matmul_tf32
+            torch.backends.cudnn.allow_tf32 = prev_cudnn_tf32
+
+    @classmethod
+    def from_vectors(cls, vecs, device="cpu"):
+        """
+        Parameters
+        ----------
+        vecs: Iterable[np.ndarray]
+            Iterable of 10D pose representation: quaternion (4), translation (3)
+            and size (3), in that order.
+
+        device:
+            Device the tensors are created on.
+        """
+        quats, tvecs, sizes = [], [], []
+        for vec in vecs:
+            quat = vec[:4]
+            tvec = vec[4:7]
+            size = vec[7:]
+
+            quats.append(quat)
+            tvecs.append(tvec)
+            sizes.append(size)
+
+        quats = torch.as_tensor(quats, dtype=torch.float32, device=device)
+        tvecs = torch.as_tensor(tvecs, dtype=torch.float32, device=device)
+        # No dtype here; the constructor converts every field to float32 anyway.
+        sizes = torch.as_tensor(sizes, device=device)
+
+        return cls(quats, tvecs, sizes)
+
+    @classmethod
+    def cat(cls, boxes_list, dim=0):
+        """Concatenate a list/tuple of boxes; an empty list yields an empty container."""
+
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0), torch.empty(0), torch.empty(0))
+        assert all([isinstance(box, GenericBoxes3D) for box in boxes_list])
+
+        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+        quat = torch.cat([b.quat for b in boxes_list], dim=dim)
+        tvec = torch.cat([b.tvec for b in boxes_list], dim=dim)
+        size = torch.cat([b.size for b in boxes_list], dim=dim)
+
+        cat_boxes = cls(quat, tvec, size)
+        return cat_boxes
+
+    def split(self, split_sizes, dim=0):
+        """Split into consecutive chunks; `split_sizes` must sum to `len(self)`."""
+        assert sum(split_sizes) == len(self)
+        quat_list = torch.split(self.quat, split_sizes, dim=dim)
+        tvec_list = torch.split(self.tvec, split_sizes, dim=dim)
+        size_list = torch.split(self.size, split_sizes, dim=dim)
+
+        return [GenericBoxes3D(*x) for x in zip(quat_list, tvec_list, size_list)]
+
+    def __getitem__(self, item):
+        """Index the boxes; an `int` index returns a container holding one box."""
+        if isinstance(item, int):
+            return GenericBoxes3D(self.quat[item].view(1, -1), self.tvec[item].view(1, -1), self.size[item].view(1, -1))
+
+        quat = self.quat[item]
+        tvec = self.tvec[item]
+        size = self.size[item]
+
+        assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert tvec.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+
+        return GenericBoxes3D(quat, tvec, size)
+
+    def __len__(self):
+        assert len(self.quat) == len(self.tvec) == len(self.size)
+        return self.quat.shape[0]
+
+    def clone(self):
+        """Return a copy backed by cloned tensors."""
+        return GenericBoxes3D(self.quat.clone(), self.tvec.clone(), self.size.clone())
+
+    def vectorize(self):
+        """Return an (N, 10) tensor: quaternion, translation and size side by side."""
+        xyz = self.tvec
+        return torch.cat([self.quat, xyz, self.size], dim=1)
+
+    @property
+    def device(self):
+        return self.quat.device
+
+    def to(self, *args, **kwargs):
+        """Apply `Tensor.to(*args, **kwargs)` to every field and wrap the results."""
+        quat = self.quat.to(*args, **kwargs)
+        tvec = self.tvec.to(*args, **kwargs)
+        size = self.size.to(*args, **kwargs)
+        return GenericBoxes3D(quat, tvec, size)
+
+
+class Boxes3D(GenericBoxes3D):
+    """Vision-based 3D box container.
+
+    The tvec is computed from projected center, depth, and intrinsics.
+    """
+    def __init__(self, quat, proj_ctr, depth, size, inv_intrinsics):
+        self.quat = quat
+        self.proj_ctr = proj_ctr
+        self.depth = depth
+        self.size = size
+        self.inv_intrinsics = inv_intrinsics
+
+    @property
+    def tvec(self):
+        ray = unproject_points2d(self.proj_ctr, self.inv_intrinsics)
+        xyz = ray * self.depth
+        return xyz
+
+    @classmethod
+    def from_vectors(cls, vecs, intrinsics, device="cpu"):
+        """
+        Parameters
+        ----------
+        vecs: Iterable[np.ndarray]
+            Iterable of 10D pose representation.
+
+        intrinsics: np.ndarray
+            (3, 3) intrinsics matrix.
+        """
+        if len(vecs) == 0:
+            quats = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 4)
+            proj_ctrs = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 2)
+            depths = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 1)
+            sizes = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3)
+            inv_intrinsics = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3, 3)
+            return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics)
+
+        quats, proj_ctrs, depths, sizes = [], [], [], []
+        for vec in vecs:
+            quat = vec[:4]
+
+            proj_ctr = intrinsics.dot(vec[4:7])
+            proj_ctr = proj_ctr[:2] / proj_ctr[-1]
+
+            depth = vec[6:7]
+
+            size = vec[7:]
+
+            quats.append(quat)
+            proj_ctrs.append(proj_ctr)
+            depths.append(depth)
+            sizes.append(size)
+
+        quats = torch.as_tensor(np.array(quats), dtype=torch.float32, device=device)
+        proj_ctrs = torch.as_tensor(np.array(proj_ctrs), dtype=torch.float32, device=device)
+        depths = torch.as_tensor(np.array(depths), dtype=torch.float32, device=device)
+        sizes = torch.as_tensor(np.array(sizes), dtype=torch.float32, device=device)
+
+        inv_intrinsics = np.linalg.inv(intrinsics)
+        inv_intrinsics = torch.as_tensor(inv_intrinsics[None, ...], device=device).expand(len(vecs), 3, 3)
+
+        return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics)
+
+    @classmethod
+    def cat(cls, boxes_list, dim=0):
+
+        assert isinstance(boxes_list, (list, tuple))
+        if len(boxes_list) == 0:
+            return cls(torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0))
+        assert all([isinstance(box, Boxes3D) for box in boxes_list])
+
+        # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input
+        quat = torch.cat([b.quat for b in boxes_list], dim=dim)
+        proj_ctr = torch.cat([b.proj_ctr for b in boxes_list], dim=dim)
+        depth = torch.cat([b.depth for b in boxes_list], dim=dim)
+        size = torch.cat([b.size for b in boxes_list], dim=dim)
+        inv_intrinsics = torch.cat([b.inv_intrinsics for b in boxes_list], dim=dim)
+
+        cat_boxes = cls(quat, proj_ctr, depth, size, inv_intrinsics)
+        return cat_boxes
+
+    def split(self, split_sizes, dim=0):
+        assert sum(split_sizes) == len(self)
+        quat_list = torch.split(self.quat, split_sizes, dim=dim)
+        proj_ctr_list = torch.split(self.proj_ctr, split_sizes, dim=dim)
+        depth_list = torch.split(self.depth, split_sizes, dim=dim)
+        size_list = torch.split(self.size, split_sizes, dim=dim)
+        inv_K_list = torch.split(self.inv_intrinsics, split_sizes, dim=dim)
+
+        return [Boxes3D(*x) for x in zip(quat_list, proj_ctr_list, depth_list, size_list, inv_K_list)]
+
+    def __getitem__(self, item):
+        """
+        """
+        if isinstance(item, int):
+            return Boxes3D(
+                self.quat[item].view(1, -1), self.proj_ctr[item].view(1, -1), self.depth[item].view(1, -1),
+                self.size[item].view(1, -1), self.inv_intrinsics[item].view(1, 3, 3)
+            )
+
+        quat = self.quat[item]
+        ctr = self.proj_ctr[item]
+        depth = self.depth[item]
+        size = self.size[item]
+        inv_K = self.inv_intrinsics[item]
+
+        assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert ctr.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert depth.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert inv_K.dim() == 3, "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+        assert inv_K.shape[1:] == (3, 3), "Indexing on Boxes3D with {} failed to return a matrix!".format(item)
+
+        return Boxes3D(quat, ctr, depth, size, inv_K)
+
+    def __len__(self):
+        assert len(self.quat) == len(self.proj_ctr) == len(self.depth) == len(self.size) == len(self.inv_intrinsics)
+        return self.quat.shape[0]
+
+    def clone(self):
+        """
+        """
+        return Boxes3D(
+            self.quat.clone(), self.proj_ctr.clone(), self.depth.clone(), self.size.clone(), self.inv_intrinsics.clone()
+        )
+
+    def to(self, *args, **kwargs):
+        quat = self.quat.to(*args, **kwargs)
+        proj_ctr = self.proj_ctr.to(*args, **kwargs)
+        depth = self.depth.to(*args, **kwargs)
+        size = self.size.to(*args, **kwargs)
+        inv_K = self.inv_intrinsics.to(*args, **kwargs)
+        return Boxes3D(quat, proj_ctr, depth, size, inv_K)