# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
from mmcv.transforms import BaseTransform, Compose
from PIL import Image

from mmdet3d.registry import TRANSFORMS


def get_dtu_raydir(pixelcoords, intrinsic, rot, dir_norm=None):
    """Compute per-pixel ray directions rotated by ``rot``.

    Args:
        pixelcoords (np.ndarray): Pixel coordinates whose last axis holds
            ``(x, y)``, e.g. an ``H x W x 2`` array.
        intrinsic (np.ndarray): Camera intrinsic matrix. Only the entries
            ``[0, 0]``, ``[1, 1]``, ``[0, 2]`` and ``[1, 2]`` are read.
        rot (np.ndarray): Camera-to-world rotation matrix (c2w).
        dir_norm (bool, optional): If truthy, the returned directions are
            scaled to (approximately) unit length. Defaults to None.

    Returns:
        np.ndarray: Ray directions with the same leading shape as
        ``pixelcoords[..., 0]`` and a trailing axis of size 3.
    """
    # Back-project pixel centres (hence the +0.5) onto the z = 1 plane.
    x = (pixelcoords[..., 0] + 0.5 - intrinsic[0, 2]) / intrinsic[0, 0]
    y = (pixelcoords[..., 1] + 0.5 - intrinsic[1, 2]) / intrinsic[1, 1]
    z = np.ones_like(x)
    dirs = np.stack([x, y, z], axis=-1)
    # Row-vector form of ``rot @ d`` applied to every pixel at once.
    dirs = dirs @ rot.T
    if dir_norm:
        # The epsilon guards against division by zero.
        dirs = dirs / (np.linalg.norm(dirs, axis=-1, keepdims=True) + 1e-5)
    return dirs


def _load_depth(filename, size):
    """Load one depth map from ``filename``.

    ``.npy`` files are returned exactly as stored. Any other file is opened
    as an image, divided by 1000 and resized to ``size`` (``(w, h)``).
    """
    if '.npy' in filename:
        return np.load(filename)
    # Close the file handle as soon as the pixel data has been read.
    with Image.open(filename) as depth_image:
        depth = np.asarray(depth_image) / 1000
    return mmcv.imresize(depth, size)


@TRANSFORMS.register_module()
class MultiViewPipeline(BaseTransform):
    """MultiViewPipeline used in nerfdet.

    Required Keys:

    - depth_info
    - img_prefix
    - img_info
    - lidar2img
    - c2w
    - camrotc2w
    - lightpos
    - ray_info

    Modified Keys:

    - lidar2img

    Added Keys:

    - img
    - denorm_images
    - depth
    - c2w
    - camrotc2w
    - lightpos
    - pixels
    - raydirs
    - gt_images
    - gt_depths
    - nerf_sizes
    - depth_range

    Args:
        transforms (list[dict]): The transform pipeline
            used to process the imgs.
        n_images (int): The number of sampled views.
        mean (array): The mean values used in normalization.
        std (array): The standard deviation values used in normalization.
        margin (int): The margin value. Defaults to 10.
        depth_range (array): The range of the depth.
            Defaults to (0.5, 5.5).
        loading (str): The mode of loading. Defaults to 'random'.
        nerf_target_views (int): The number of novel views.
        sample_freq (int): The frequency of sampling.
    """

    def __init__(self,
                 transforms: list,
                 n_images: int,
                 mean: tuple = (123.675, 116.28, 103.53),
                 std: tuple = (58.395, 57.12, 57.375),
                 margin: int = 10,
                 depth_range: tuple = (0.5, 5.5),
                 loading: str = 'random',
                 nerf_target_views: int = 0,
                 sample_freq: int = 3):
        self.transforms = Compose(transforms)
        # Not used by ``transform``; kept so the attribute stays available.
        self.depth_transforms = Compose(transforms[1])
        self.n_images = n_images
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.margin = margin
        self.depth_range = depth_range
        self.loading = loading
        self.sample_freq = sample_freq
        self.nerf_target_views = nerf_target_views

    def _sample_view_ids(self, num_views):
        """Pick the source view ids and, if requested, the target view ids.

        Returns:
            tuple: ``(ids, target_ids)``. ``target_ids`` is None when
            ``nerf_target_views`` is 0.
        """
        target_ids = None
        if self.loading == 'random':
            candidates = np.arange(num_views)
            # Sample with replacement only when there are too few views.
            replace = self.n_images > len(candidates)
            ids = np.random.choice(candidates, self.n_images, replace=replace)
            if self.nerf_target_views != 0:
                target_ids = np.random.choice(
                    ids, self.nerf_target_views, replace=False)
                # Target views are held out from the source views.
                ids = np.setdiff1d(ids, target_ids).tolist()
                target_ids = target_ids.tolist()
        else:
            # Deterministic sampling: every ``sample_freq``-th view from 0.
            ids = np.arange(0, self.n_images * self.sample_freq,
                            self.sample_freq)
            if self.nerf_target_views != 0:
                target_ids = ids
        return ids, target_ids

    def _normalize_and_pad(self, img_results, size):
        """Normalize and zero-pad the image fields of ``img_results``.

        The dict is updated in place and also returned.
        """
        img_fields = img_results.get('img_fields', ['img'])
        for key in img_fields:
            img_results[key] = mmcv.imnormalize(img_results[key], self.mean,
                                                self.std, True)
        img_results['img_norm_cfg'] = dict(
            mean=self.mean, std=self.std, to_rgb=True)
        for key in img_fields:
            padded_img = mmcv.impad(img_results[key], shape=size, pad_val=0)
            img_results[key] = padded_img
        img_results['pad_shape'] = padded_img.shape
        img_results['pad_fixed_size'] = size
        return img_results

    def transform(self, results: dict) -> dict:
        """Nerfdet transform function.

        Args:
            results (dict): Result dict from loading pipeline

        Returns:
            dict: The result dict containing the processed results.
            Updated key and value are described below.

                - img (list): The loaded origin image.
                - denorm_images (list): The denormalized image.
                - depth (list): The origin depth image.
                - c2w (list): The c2w matrixes.
                - camrotc2w (list): The rotation matrixes.
                - lightpos (list): The transform parameters of the camera.
                - pixels (list): Some pixel information.
                - raydirs (list): The ray-directions.
                - gt_images (list): The groundtruth images.
                - gt_depths (list): The groundtruth depth images.
                - nerf_sizes (array): The size of the groundtruth images.
                - depth_range (array): The range of the depth.

        Here we give a detailed explanation of some keys mentioned above.
        Let P_c be the coordinate of camera, P_w be the coordinate of world.
        There is such a conversion relationship: P_c = R @ P_w + T.
        The 'camrotc2w' mentioned above corresponds to the R matrix here.
        The 'lightpos' corresponds to the T matrix here. And if you put
        R and T together, you can get the camera extrinsics matrix. It
        corresponds to the 'c2w' mentioned above.
        """
        imgs = []
        depths = []
        extrinsics = []
        c2ws = []
        camrotc2ws = []
        lightposes = []
        pixels = []
        raydirs = []
        gt_images = []
        gt_depths = []
        denorm_imgs_list = []
        nerf_sizes = []

        ids, target_ids = self._sample_view_ids(len(results['img_info']))
        has_depth = 'depth_info' in results

        ratio = 0
        size = (240, 320)
        for i in ids:
            _results = self.transforms(
                dict(img_path=results['img_info'][i]['filename']))
            # ``img`` keeps the image as loaded, before normalization.
            imgs.append(_results['img'])
            self._normalize_and_pad(_results, size)
            ori_shape = _results['ori_shape']
            aft_shape = _results['img_shape']
            # Used below to rescale the intrinsics to the processed size.
            ratio = ori_shape[0] / aft_shape[0]
            # prepare the depth information
            if has_depth:
                depths.append(
                    _load_depth(results['depth_info'][i]['filename'],
                                (aft_shape[1], aft_shape[0])))

            denorm_img = mmcv.imdenormalize(
                _results['img'], self.mean, self.std, to_bgr=True).astype(
                    np.uint8) / 255.0
            denorm_imgs_list.append(denorm_img)
            height, width = _results['pad_shape'][:2]
            extrinsics.append(results['lidar2img']['extrinsic'][i])

        # prepare the nerf information
        if 'ray_info' in results:
            intrinsics_nerf = results['lidar2img']['intrinsic'].copy()
            intrinsics_nerf[:2] = intrinsics_nerf[:2] / ratio
            assert self.nerf_target_views > 0, (
                'nerf_target_views must be positive when ray_info is given')

            # The sampled pixel grid is the same for every target view: all
            # pixels of the padded image except a border of ``margin``.
            px, py = np.meshgrid(
                np.arange(self.margin,
                          width - self.margin).astype(np.float32),
                np.arange(self.margin,
                          height - self.margin).astype(np.float32))
            pixelcoords = np.stack((px, py),
                                   axis=-1).astype(np.float32)  # H x W x 2
            rows = py.astype(np.int32)
            cols = px.astype(np.int32)

            for i in target_ids:
                c2ws.append(results['c2w'][i])
                camrotc2ws.append(results['camrotc2w'][i])
                lightposes.append(results['lightpos'][i])
                # Each view gets its own array so that in-place edits made
                # downstream cannot leak between views.
                pixels.append(pixelcoords.copy())
                raydir = get_dtu_raydir(pixelcoords, intrinsics_nerf,
                                        results['camrotc2w'][i])
                raydirs.append(np.reshape(raydir.astype(np.float32), (-1, 3)))

                # read target images
                target = self.transforms(
                    dict(img_path=results['img_info'][i]['filename']))
                self._normalize_and_pad(target, size)
                # denormalize target_images.
                denorm_imgs = mmcv.imdenormalize(
                    target['img'], self.mean, self.std,
                    to_bgr=True).astype(np.uint8)
                gt_rgb_shape = denorm_imgs.shape

                gt_image = denorm_imgs[rows, cols, :]
                nerf_sizes.append(np.array(gt_image.shape))
                gt_image = np.reshape(gt_image, (-1, 3))
                gt_images.append(gt_image / 255.0)
                if has_depth:
                    target_depth = _load_depth(
                        results['depth_info'][i]['filename'],
                        (gt_rgb_shape[1], gt_rgb_shape[0]))
                    gt_depths.append(target_depth[rows, cols])

        # Propagate the meta keys of the last processed source view.
        for key in _results.keys():
            if key not in ['img', 'img_info']:
                results[key] = _results[key]
        results['img'] = imgs

        if 'ray_info' in results:
            results['c2w'] = c2ws
            results['camrotc2w'] = camrotc2ws
            results['lightpos'] = lightposes
            results['pixels'] = pixels
            results['raydirs'] = raydirs
            results['gt_images'] = gt_images
            results['gt_depths'] = gt_depths
            results['nerf_sizes'] = nerf_sizes
            results['denorm_images'] = denorm_imgs_list
            results['depth_range'] = np.array([self.depth_range])

        if depths:
            results['depth'] = depths
        results['lidar2img']['extrinsic'] = extrinsics
        return results


@TRANSFORMS.register_module()
class RandomShiftOrigin(BaseTransform):
    """Add Gaussian noise to ``results['lidar2img']['origin']``.

    Args:
        std (float): Standard deviation of the zero-mean normal
            distribution the 3-element shift is drawn from.
    """

    def __init__(self, std):
        self.std = std

    def transform(self, results):
        """Shift the origin in place by a random 3-vector.

        Args:
            results (dict): Result dict holding ``lidar2img['origin']``.

        Returns:
            dict: The same dict with the shifted origin.
        """
        shift = np.random.normal(.0, self.std, 3)
        # NOTE(review): ``+=`` assumes ``origin`` is a float ndarray of
        # length 3 - confirm against the dataset that produces it.
        results['lidar2img']['origin'] += shift
        return results