# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import math
import warnings
from typing import Optional, Sequence, Tuple

import numpy as np
import torch
import torch.nn.functional as F
from pytorch3d.common.types import Device
from pytorch3d.transforms import Rotate, Transform3d, Translate

from .utils import TensorProperties, convert_to_tensors_and_broadcast


# Default values for rotation and translation matrices.
_R = torch.eye(3)[None]  # (1, 3, 3)
_T = torch.zeros(1, 3)  # (1, 3)


class CamerasBase(TensorProperties):
    """
    `CamerasBase` implements a base class for all cameras.

    For cameras, there are four different coordinate systems (or spaces)
    - World coordinate system: This is the system the object lives - the world.
    - Camera view coordinate system: This is the system that has its origin on the image plane
        and the Z-axis perpendicular to the image plane.
        In PyTorch3D, we assume that +X points left, +Y points up and
        +Z points out from the image plane.
        The transformation from world -> view happens after applying a rotation (R)
        and a translation (T).
    - NDC coordinate system: This is the normalized coordinate system that confines
        the rendered part of the object or scene within a volume. Also known as the view volume.
        Given the PyTorch3D convention, (+1, +1, znear) is the top left near corner,
        and (-1, -1, zfar) is the bottom right far corner of the volume.
        The transformation from view -> NDC happens after applying the camera
        projection matrix (P).
    - Screen coordinate system: This is another representation of the view volume with
        the XY coordinates defined in pixel space instead of a normalized space.

    A better illustration of the coordinate systems can be found in
    pytorch3d/docs/notes/cameras.md.

    It defines methods that are common to all camera models:
        - `get_camera_center` that returns the optical center of the camera in
            world coordinates
        - `get_world_to_view_transform` which returns a 3D transform from
            world coordinates to the camera view coordinates (R, T)
        - `get_full_projection_transform` which composes the projection
            transform (P) with the world-to-view transform (R, T)
        - `transform_points` which takes a set of input points in world coordinates and
            projects to NDC coordinates ranging from [-1, -1, znear] to [+1, +1, zfar].
        - `transform_points_screen` which takes a set of input points in world coordinates and
            projects them to the screen coordinates ranging from
            [0, 0, znear] to [W-1, H-1, zfar]

    For each new camera, one should implement the `get_projection_transform`
    routine that returns the mapping from camera view coordinates to NDC coordinates.

    Another useful function that is specific to each camera model is
    `unproject_points` which sends points from NDC coordinates back to
    camera view or world coordinates depending on the `world_coordinates`
    boolean argument of the function.
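
    As an example, a minimal usage sketch (`MyCameras` is a placeholder for any
    concrete camera subclass defined below, and `verts` for a `(N, P, 3)` tensor
    of points in world coordinates):

    .. code-block:: python

        cameras = MyCameras(R=R, T=T)  # R: (N, 3, 3), T: (N, 3)
        # optical centers of the cameras in world coordinates, shape (N, 3)
        centers = cameras.get_camera_center()
        # full world -> NDC projection applied to the input points
        verts_ndc = cameras.transform_points(verts)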
    """

    def get_projection_transform(self):
        """
        Calculate the projective transformation matrix.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in `__init__`.

        Return:
            a `Transform3d` object which represents a batch of projection
            matrices of shape (N, 4, 4)
        """
        raise NotImplementedError()

    def unproject_points(self):
        """
        Transform input points from NDC coordinates
        to the world / camera coordinates.

        Each of the input points `xy_depth` of shape (..., 3) is
        a concatenation of the x, y location and its depth.

        For instance, for an input 2D tensor of shape `(num_points, 3)`
        `xy_depth` takes the following form:
            `xy_depth[i] = [x[i], y[i], depth[i]]`,
        for each point at index `i`.

        The following example demonstrates the relationship between
        `transform_points` and `unproject_points`:

        .. code-block:: python

            cameras = # camera object derived from CamerasBase
            xyz = # 3D points of shape (batch_size, num_points, 3)
            # transform xyz to the camera view coordinates
            xyz_cam = cameras.get_world_to_view_transform().transform_points(xyz)
            # extract the depth of each point as the 3rd coord of xyz_cam
            depth = xyz_cam[:, :, 2:]
            # project the points xyz to the camera
            xy = cameras.transform_points(xyz)[:, :, :2]
            # append depth to xy
            xy_depth = torch.cat((xy, depth), dim=2)
            # unproject to the world coordinates
            xyz_unproj_world = cameras.unproject_points(xy_depth, world_coordinates=True)
            print(torch.allclose(xyz, xyz_unproj_world)) # True
            # unproject to the camera coordinates
            xyz_unproj = cameras.unproject_points(xy_depth, world_coordinates=False)
            print(torch.allclose(xyz_cam, xyz_unproj)) # True

        Args:
            xy_depth: torch tensor of shape (..., 3).
            world_coordinates: If `True`, unprojects the points back to world
                coordinates using the camera extrinsics `R` and `T`.
                `False` ignores `R` and `T` and unprojects to
                the camera view coordinates.

        Returns:
            new_points: unprojected points with the same shape as `xy_depth`.
        """
        raise NotImplementedError()

    def get_camera_center(self, **kwargs) -> torch.Tensor:
        """
        Return the 3D location of the camera optical center
        in the world coordinates.

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting T here will update the values set in init as this
        value may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            C: a batch of 3D locations of shape (N, 3) denoting
            the locations of the center of each camera in the batch.
        """
        w2v_trans = self.get_world_to_view_transform(**kwargs)
        P = w2v_trans.inverse().get_matrix()
        # the camera center is the translation component (the first 3 elements
        # of the last row) of the inverted world-to-view
        # transform (4x4 RT matrix)
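        # (equivalently C = -T @ R^T, since with PyTorch3D's row-vector
        # convention a world point X maps to X @ R + T in view space)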
        C = P[:, 3, :3]
        return C

    def get_world_to_view_transform(self, **kwargs) -> Transform3d:
        """
        Return the world-to-view transform.

        Args:
            **kwargs: parameters for the camera extrinsics can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting R and T here will update the values set in init as these
        values may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            A Transform3d object which represents a batch of transforms
            of shape (N, 4, 4)
        """
        R: torch.Tensor = kwargs.get("R", self.R)
        T: torch.Tensor = kwargs.get("T", self.T)
        self.R = R  # pyre-ignore[16]
        self.T = T  # pyre-ignore[16]
        world_to_view_transform = get_world_to_view_transform(R=R, T=T)
        return world_to_view_transform

    def get_full_projection_transform(self, **kwargs) -> Transform3d:
        """
        Return the full world-to-NDC transform composing the
        world-to-view and view-to-NDC transforms.

        Args:
            **kwargs: parameters for the projection transforms can be passed in
                as keyword arguments to override the default values
                set in __init__.

        Setting R and T here will update the values set in init as these
        values may be needed later on in the rendering pipeline e.g. for
        lighting calculations.

        Returns:
            a Transform3d object which represents a batch of transforms
            of shape (N, 4, 4)
        """
        self.R: torch.Tensor = kwargs.get("R", self.R)  # pyre-ignore[16]
        self.T: torch.Tensor = kwargs.get("T", self.T)  # pyre-ignore[16]
        world_to_view_transform = self.get_world_to_view_transform(R=self.R, T=self.T)
        view_to_ndc_transform = self.get_projection_transform(**kwargs)
        return world_to_view_transform.compose(view_to_ndc_transform)

    def transform_points(
        self, points, eps: Optional[float] = None, **kwargs
    ) -> torch.Tensor:
        """
        Transform input points from world to NDC space.

        Args:
            points: torch tensor of shape (..., 3).
            eps: If eps!=None, the argument is used to clamp the
                divisor in the homogeneous normalization of the points
                transformed to the ndc space. Please see
                `transforms.Transform3D.transform_points` for details.

                For `CamerasBase.transform_points`, setting `eps > 0`
                stabilizes gradients since it leads to avoiding division
                by excessively low numbers for points close to the
                camera plane.

        Returns:
            new_points: transformed points with the same shape as the input.
        """
        world_to_ndc_transform = self.get_full_projection_transform(**kwargs)
        return world_to_ndc_transform.transform_points(points, eps=eps)

    def transform_points_screen(
        self, points, image_size, eps: Optional[float] = None, **kwargs
    ) -> torch.Tensor:
        """
        Transform input points from world to screen space.

        Args:
            points: torch tensor of shape (N, V, 3).
            image_size: torch tensor of shape (N, 2) specifying the (width, height)
                of each image in pixels.
            eps: If eps!=None, the argument is used to clamp the
                divisor in the homogeneous normalization of the points
                transformed to the ndc space. Please see
                `transforms.Transform3D.transform_points` for details.

                For `CamerasBase.transform_points`, setting `eps > 0`
                stabilizes gradients since it leads to avoiding division
                by excessively low numbers for points close to the
                camera plane.

        Returns:
            new_points: transformed points with the same shape as the input.
        """

        ndc_points = self.transform_points(points, eps=eps, **kwargs)

        if not torch.is_tensor(image_size):
            image_size = torch.tensor(
                image_size, dtype=torch.int64, device=points.device
            )
        if (image_size < 1).any():
            raise ValueError("Provided image size is invalid.")

        image_width, image_height = image_size.unbind(1)
        image_width = image_width.view(-1, 1)  # (N, 1)
        image_height = image_height.view(-1, 1)  # (N, 1)

        ndc_z = ndc_points[..., 2]
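        # In NDC, +X points left and +Y points up, so the (+1, +1) corner maps
        # to pixel (0, 0): screen_x = (W - 1) / 2 * (1 - ndc_x), and similarly
        # for y. The NDC depth is passed through unchanged.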
        screen_x = (image_width - 1.0) / 2.0 * (1.0 - ndc_points[..., 0])
        screen_y = (image_height - 1.0) / 2.0 * (1.0 - ndc_points[..., 1])

        return torch.stack((screen_x, screen_y, ndc_z), dim=2)

    def clone(self):
        """
        Returns a copy of `self`.
        """
        cam_type = type(self)
        other = cam_type(device=self.device)
        return super().clone(other)

    def is_perspective(self):
        raise NotImplementedError()

    def get_znear(self):
        return self.znear if hasattr(self, "znear") else None


############################################################
#             Field of View Camera Classes                 #
############################################################


def OpenGLPerspectiveCameras(
    znear=1.0,
    zfar=100.0,
    aspect_ratio=1.0,
    fov=60.0,
    degrees: bool = True,
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "FoVPerspectiveCameras":
    """
    OpenGLPerspectiveCameras has been DEPRECATED. Use FoVPerspectiveCameras instead.
    Preserving OpenGLPerspectiveCameras for backward compatibility.
    """

    warnings.warn(
        """OpenGLPerspectiveCameras is deprecated,
        Use FoVPerspectiveCameras instead.
        OpenGLPerspectiveCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return FoVPerspectiveCameras(
        znear=znear,
        zfar=zfar,
        aspect_ratio=aspect_ratio,
        fov=fov,
        degrees=degrees,
        R=R,
        T=T,
        device=device,
    )


class FoVPerspectiveCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    projection matrices by specifying the field of view.
    The definition of the parameters follows the OpenGL perspective camera.

    The extrinsics of the camera (R and T matrices) can also be set in the
    initializer or passed in to `get_full_projection_transform` to get
    the full transformation from world -> ndc.

    The `transform_points` method calculates the full world -> ndc transform
    and then applies it to the input points.

    The transforms can also be returned separately as Transform3d objects.

    * Setting the Aspect Ratio for Non Square Images *

    If the desired output image size is non square (i.e. a tuple of (H, W) where H != W)
    the aspect ratio needs special consideration: There are two aspect ratios
    to be aware of:
        - the aspect ratio of each pixel
        - the aspect ratio of the output image
    The `aspect_ratio` setting in the FoVPerspectiveCameras sets the
    pixel aspect ratio. When using this camera with the differentiable rasterizer,
    be aware that in the rasterizer we assume square pixels, but allow
    variable image aspect ratios (i.e. rectangular images).

    In most cases you will want to set the camera `aspect_ratio=1.0`
    (i.e. square pixels) and only vary the output image dimensions in pixels
    for rasterization.
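
    For example, a minimal sketch of this recommended setup (the output image
    resolution itself is configured on the rasterizer, not on the camera):

    .. code-block:: python

        # square pixels; a rectangular output image is handled by the
        # rasterizer's image size setting
        cameras = FoVPerspectiveCameras(znear=0.1, zfar=100.0, fov=60.0, aspect_ratio=1.0)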
    """

    def __init__(
        self,
        znear=1.0,
        zfar=100.0,
        aspect_ratio=1.0,
        fov=60.0,
        degrees: bool = True,
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
    ) -> None:
        """

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            aspect_ratio: aspect ratio of the image pixels.
                1.0 indicates square pixels.
            fov: field of view angle of the camera.
            degrees: bool, set to True if fov is specified in degrees.
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need znear, zfar, fov, aspect_ratio, degrees
            device: Device (as str or torch.device)
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            znear=znear,
            zfar=zfar,
            aspect_ratio=aspect_ratio,
            fov=fov,
            R=R,
            T=T,
            K=K,
        )

        # No need to convert to tensor or broadcast.
        self.degrees = degrees

    def compute_projection_matrix(
        self, znear, zfar, fov, aspect_ratio, degrees: bool
    ) -> torch.Tensor:
        """
        Compute the calibration matrix K of shape (N, 4, 4)

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            fov: field of view angle of the camera.
            aspect_ratio: aspect ratio of the image pixels.
                1.0 indicates square pixels.
            degrees: bool, set to True if fov is specified in degrees.

        Returns:
            torch.FloatTensor of the calibration matrix with shape (N, 4, 4)
        """
        K = torch.zeros((self._N, 4, 4), device=self.device, dtype=torch.float32)
        ones = torch.ones((self._N), dtype=torch.float32, device=self.device)
        if degrees:
            fov = (np.pi / 180) * fov

        if not torch.is_tensor(fov):
            fov = torch.tensor(fov, device=self.device)
        tanHalfFov = torch.tan((fov / 2))
        max_y = tanHalfFov * znear
        min_y = -max_y
        max_x = max_y * aspect_ratio
        min_x = -max_x

        # NOTE: In OpenGL the projection matrix changes the handedness of the
        # coordinate frame. i.e. the NDC space positive z direction is the
        # camera space negative z direction. This is because the sign of the z
        # in the projection matrix is set to -1.0.
        # In pytorch3d we maintain a right handed coordinate system throughout
        # so the z sign is 1.0.
        z_sign = 1.0

        K[:, 0, 0] = 2.0 * znear / (max_x - min_x)
        K[:, 1, 1] = 2.0 * znear / (max_y - min_y)
        K[:, 0, 2] = (max_x + min_x) / (max_x - min_x)
        K[:, 1, 2] = (max_y + min_y) / (max_y - min_y)
        K[:, 3, 2] = z_sign * ones

        # NOTE: This maps the z coordinate to [0, 1] where z = 0 if the point
        # is at the near clipping plane and z = 1 when the point is at the far
        # clipping plane.
        K[:, 2, 2] = z_sign * zfar / (zfar - znear)
        K[:, 2, 3] = -(zfar * znear) / (zfar - znear)
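        # After the perspective divide by w = z, this yields
        # z_ndc = zfar * (z - znear) / ((zfar - znear) * z),
        # i.e. 0 at z = znear and 1 at z = zfar.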

        return K

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the perspective projection matrix with a symmetric
        viewing frustum. Use column major order.
        The viewing frustum will be projected into ndc, s.t.
        (max_x, max_y) -> (+1, +1)
        (min_x, min_y) -> (-1, -1)

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in `__init__`.

        Return:
            a Transform3d object which represents a batch of projection
            matrices of shape (N, 4, 4)

        .. code-block:: python

            h1 = (max_y + min_y)/(max_y - min_y)
            w1 = (max_x + min_x)/(max_x - min_x)
            tanhalffov = tan((fov/2))
            s1 = 1/(tanhalffov * aspect_ratio)
            s2 = 1/tanhalffov

            # To map z to the range [0, 1] use:
            f1 =  far / (far - near)
            f2 = -(far * near) / (far - near)

            # Projection matrix
            K = [
                    [s1,   0,   w1,   0],
                    [0,   s2,   h1,   0],
                    [0,    0,   f1,  f2],
                    [0,    0,    1,   0],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = self.compute_projection_matrix(
                kwargs.get("znear", self.znear),
                kwargs.get("zfar", self.zfar),
                kwargs.get("fov", self.fov),
                kwargs.get("aspect_ratio", self.aspect_ratio),
                kwargs.get("degrees", self.degrees),
            )

        # Transpose the projection matrix as PyTorch3D transforms use row vectors.
        transform = Transform3d(device=self.device)
        transform._matrix = K.transpose(1, 2).contiguous()
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        scaled_depth_input: bool = False,
        **kwargs
    ) -> torch.Tensor:
        """>!
        FoV cameras further allow for passing depth in world units
        (`scaled_depth_input=False`) or in the [0, 1]-normalized units
        (`scaled_depth_input=True`)

        Args:
            scaled_depth_input: If `True`, assumes the input depth is in
                the [0, 1]-normalized units. If `False` the input depth is in
                the world units.
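
        A sketch of the round trip when the depths are already NDC-scaled
        (`cameras` and `xyz` are placeholder names for an instance of this
        class and a `(N, P, 3)` tensor of world coordinates):

        .. code-block:: python

            xyz_ndc = cameras.transform_points(xyz)  # z is already scaled
            xyz_unproj = cameras.unproject_points(
                xyz_ndc, world_coordinates=True, scaled_depth_input=True
            )
            # xyz_unproj matches xyz up to numerical precision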
        """

        # obtain the relevant transformation to ndc
        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform()
        else:
            to_ndc_transform = self.get_projection_transform()

        if scaled_depth_input:
            # the input is scaled depth, so we don't have to do anything
            xy_sdepth = xy_depth
        else:
            # parse out important values from the projection matrix
            K_matrix = self.get_projection_transform(**kwargs.copy()).get_matrix()
            # parse out f1, f2 from K_matrix
            unsqueeze_shape = [1] * xy_depth.dim()
            unsqueeze_shape[0] = K_matrix.shape[0]
            f1 = K_matrix[:, 2, 2].reshape(unsqueeze_shape)
            f2 = K_matrix[:, 3, 2].reshape(unsqueeze_shape)
            # get the scaled depth
            sdepth = (f1 * xy_depth[..., 2:3] + f2) / xy_depth[..., 2:3]
            # concatenate xy + scaled depth
            xy_sdepth = torch.cat((xy_depth[..., 0:2], sdepth), dim=-1)

        # unproject with inverse of the projection
        unprojection_transform = to_ndc_transform.inverse()
        return unprojection_transform.transform_points(xy_sdepth)

    def is_perspective(self):
        return True


def OpenGLOrthographicCameras(
    znear=1.0,
    zfar=100.0,
    top=1.0,
    bottom=-1.0,
    left=-1.0,
    right=1.0,
    scale_xyz=((1.0, 1.0, 1.0),),  # (1, 3)
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "FoVOrthographicCameras":
    """
    OpenGLOrthographicCameras has been DEPRECATED. Use FoVOrthographicCameras instead.
    Preserving OpenGLOrthographicCameras for backward compatibility.
    """

    warnings.warn(
        """OpenGLOrthographicCameras is deprecated,
        Use FoVOrthographicCameras instead.
        OpenGLOrthographicCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return FoVOrthographicCameras(
        znear=znear,
        zfar=zfar,
        max_y=top,
        min_y=bottom,
        max_x=right,
        min_x=left,
        scale_xyz=scale_xyz,
        R=R,
        T=T,
        device=device,
    )


class FoVOrthographicCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    projection matrices by specifying the field of view.
    The definition of the parameters follows the OpenGL orthographic camera.
    """

    def __init__(
        self,
        znear=1.0,
        zfar=100.0,
        max_y=1.0,
        min_y=-1.0,
        max_x=1.0,
        min_x=-1.0,
        scale_xyz=((1.0, 1.0, 1.0),),  # (1, 3)
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
    ):
        """

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            max_y: maximum y coordinate of the frustum.
            min_y: minimum y coordinate of the frustum.
            max_x: maximum x coordinate of the frustum.
            min_x: minimum x coordinate of the frustum.
            scale_xyz: scale factors for each axis of shape (N, 3).
            R: Rotation matrix of shape (N, 3, 3).
            T: Translation of shape (N, 3).
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need znear, zfar, max_y, min_y, max_x, min_x, scale_xyz
            device: torch.device or string.

        Only need to set min_x, max_x, min_y, max_y for viewing frustums
        which are not symmetric about the origin.
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            znear=znear,
            zfar=zfar,
            max_y=max_y,
            min_y=min_y,
            max_x=max_x,
            min_x=min_x,
            scale_xyz=scale_xyz,
            R=R,
            T=T,
            K=K,
        )

    def compute_projection_matrix(
        self, znear, zfar, max_x, min_x, max_y, min_y, scale_xyz
    ) -> torch.Tensor:
        """
        Compute the calibration matrix K of shape (N, 4, 4)

        Args:
            znear: near clipping plane of the view frustum.
            zfar: far clipping plane of the view frustum.
            max_x: maximum x coordinate of the frustum.
            min_x: minimum x coordinate of the frustum.
            max_y: maximum y coordinate of the frustum.
            min_y: minimum y coordinate of the frustum.
            scale_xyz: scale factors for each axis of shape (N, 3).
        """
        K = torch.zeros((self._N, 4, 4), dtype=torch.float32, device=self.device)
        ones = torch.ones((self._N), dtype=torch.float32, device=self.device)
        # NOTE: OpenGL flips handedness of coordinate system between camera
        # space and NDC space so z sign is -ve. In PyTorch3D we maintain a
        # right handed coordinate system throughout.
        z_sign = +1.0

        K[:, 0, 0] = (2.0 / (max_x - min_x)) * scale_xyz[:, 0]
        K[:, 1, 1] = (2.0 / (max_y - min_y)) * scale_xyz[:, 1]
        K[:, 0, 3] = -(max_x + min_x) / (max_x - min_x)
        K[:, 1, 3] = -(max_y + min_y) / (max_y - min_y)
        K[:, 3, 3] = ones

        # NOTE: This maps the z coordinate to the range [0, 1] and replaces
        # the OpenGL z normalization to [-1, 1].
        K[:, 2, 2] = z_sign * (1.0 / (zfar - znear)) * scale_xyz[:, 2]
        K[:, 2, 3] = -znear / (zfar - znear)
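        # With scale_xyz = 1 this yields z_ndc = (z - znear) / (zfar - znear),
        # i.e. 0 at z = znear and 1 at z = zfar.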

        return K

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the orthographic projection matrix.
        Use column major order.

        Args:
            **kwargs: parameters for the projection can be passed in to
                      override the default values set in __init__.
        Return:
            a Transform3d object which represents a batch of projection
            matrices of shape (N, 4, 4)

        .. code-block:: python

            scale_x = 2 / (max_x - min_x)
            scale_y = 2 / (max_y - min_y)
            scale_z = 1 / (far - near)
            mid_x = (max_x + min_x) / (max_x - min_x)
            mid_y = (max_y + min_y) / (max_y - min_y)
            mid_z = near / (far - near)

            K = [
                    [scale_x,        0,         0,  -mid_x],
                    [0,        scale_y,         0,  -mid_y],
                    [0,              0,   scale_z,  -mid_z],
                    [0,              0,         0,       1],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            K = self.compute_projection_matrix(
                kwargs.get("znear", self.znear),
                kwargs.get("zfar", self.zfar),
                kwargs.get("max_x", self.max_x),
                kwargs.get("min_x", self.min_x),
                kwargs.get("max_y", self.max_y),
                kwargs.get("min_y", self.min_y),
                kwargs.get("scale_xyz", self.scale_xyz),
            )

        transform = Transform3d(device=self.device)
        transform._matrix = K.transpose(1, 2).contiguous()
        return transform

    def unproject_points(
        self,
        xy_depth: torch.Tensor,
        world_coordinates: bool = True,
        scaled_depth_input: bool = False,
        **kwargs
    ) -> torch.Tensor:
        """>!
        FoV cameras further allow for passing depth in world units
        (`scaled_depth_input=False`) or in the [0, 1]-normalized units
        (`scaled_depth_input=True`)

        Args:
            scaled_depth_input: If `True`, assumes the input depth is in
                the [0, 1]-normalized units. If `False` the input depth is in
                the world units.
        """

        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform(**kwargs.copy())
        else:
            to_ndc_transform = self.get_projection_transform(**kwargs.copy())

        if scaled_depth_input:
            # the input depth is already scaled
            xy_sdepth = xy_depth
        else:
            # we have to obtain the scaled depth first
            K = self.get_projection_transform(**kwargs).get_matrix()
            unsqueeze_shape = [1] * K.dim()
            unsqueeze_shape[0] = K.shape[0]
            mid_z = K[:, 3, 2].reshape(unsqueeze_shape)
            scale_z = K[:, 2, 2].reshape(unsqueeze_shape)
            scaled_depth = scale_z * xy_depth[..., 2:3] + mid_z
            # cat xy and scaled depth
            xy_sdepth = torch.cat((xy_depth[..., :2], scaled_depth), dim=-1)
        # finally invert the transform
        unprojection_transform = to_ndc_transform.inverse()
        return unprojection_transform.transform_points(xy_sdepth)

    def is_perspective(self):
        return False


############################################################
#             MultiView Camera Classes                     #
############################################################
"""
Note that the MultiView Cameras accept parameters in both
screen and NDC space.
If the user specifies `image_size` at construction time then
we assume the parameters are in screen space.
"""


def SfMPerspectiveCameras(
    focal_length=1.0,
    principal_point=((0.0, 0.0),),
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "PerspectiveCameras":
    """
    SfMPerspectiveCameras has been DEPRECATED. Use PerspectiveCameras instead.
    Preserving SfMPerspectiveCameras for backward compatibility.
    """

    warnings.warn(
        """SfMPerspectiveCameras is deprecated,
        Use PerspectiveCameras instead.
        SfMPerspectiveCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        R=R,
        T=T,
        device=device,
    )


class PerspectiveCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    transformation matrices using the multi-view geometry convention for
    a perspective camera.

    Parameters for this camera can be specified in NDC or in screen space.
    If you wish to provide parameters in screen space, you NEED to provide
    the image_size = (imwidth, imheight).
    If you wish to provide parameters in NDC space, you should NOT provide
    image_size. Providing valid image_size will trigger a screen space to
    NDC space transformation in the camera.

    For example, here is how to define cameras on the two spaces.

    .. code-block:: python

        # camera defined in screen space
        cameras = PerspectiveCameras(
            focal_length=((22.0, 15.0),),  # (fx_screen, fy_screen)
            principal_point=((192.0, 128.0),),  # (px_screen, py_screen)
            image_size=((256, 256),),  # (imwidth, imheight)
        )

        # the equivalent camera defined in NDC space
        cameras = PerspectiveCameras(
            focal_length=((0.171875, 0.1171875),),  # fx = fx_screen / half_imwidth,
                                                    # fy = fy_screen / half_imheight
            principal_point=((-0.5, 0),),  # px = - (px_screen - half_imwidth) / half_imwidth,
                                           # py = - (py_screen - half_imheight) / half_imheight
        )
    """

    def __init__(
        self,
        focal_length=1.0,
        principal_point=((0.0, 0.0),),
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
        image_size=((-1, -1),),
    ) -> None:
        """

        Args:
            focal_length: Focal length of the camera in world units.
                A tensor of shape (N, 1) or (N, 2) for
                square and non-square pixels respectively.
            principal_point: xy coordinates of the principal point
                of the camera in pixels.
                A tensor of shape (N, 2).
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need focal_length, principal_point, image_size

            device: torch.device or string
            image_size: If image_size = (imwidth, imheight) with imwidth, imheight > 0
                is provided, the camera parameters are assumed to be in screen
                space. They will be converted to NDC space.
                If image_size is not provided, the parameters are assumed to
                be in NDC space.
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            focal_length=focal_length,
            principal_point=principal_point,
            R=R,
            T=T,
            K=K,
            image_size=image_size,
        )

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the projection matrix using the
        multi-view geometry convention.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in __init__.

        Returns:
            A `Transform3d` object with a batch of `N` projection transforms.

        .. code-block:: python

            fx = focal_length[:, 0]
            fy = focal_length[:, 1]
            px = principal_point[:, 0]
            py = principal_point[:, 1]

            K = [
                    [fx,   0,   px,   0],
                    [0,   fy,   py,   0],
                    [0,    0,    0,   1],
                    [0,    0,    1,   0],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            image_size = kwargs.get("image_size", self.image_size)
            # if imwidth > 0, parameters are in screen space
            image_size = image_size if image_size[0][0] > 0 else None

            K = _get_sfm_calibration_matrix(
                self._N,
                self.device,
                kwargs.get("focal_length", self.focal_length),
                kwargs.get("principal_point", self.principal_point),
                orthographic=False,
                image_size=image_size,
            )

        transform = Transform3d(device=self.device)
        transform._matrix = K.transpose(1, 2).contiguous()
        return transform

    def unproject_points(
        self, xy_depth: torch.Tensor, world_coordinates: bool = True, **kwargs
    ) -> torch.Tensor:
        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform(**kwargs)
        else:
            to_ndc_transform = self.get_projection_transform(**kwargs)

        unprojection_transform = to_ndc_transform.inverse()
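        # The perspective K used above stores 1 / z as the third projected
        # coordinate (see get_projection_transform), so the input depth is
        # replaced by its reciprocal before applying the inverse transform.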
        xy_inv_depth = torch.cat(
            (xy_depth[..., :2], 1.0 / xy_depth[..., 2:3]), dim=-1  # type: ignore
        )
        return unprojection_transform.transform_points(xy_inv_depth)

    def is_perspective(self):
        return True


def SfMOrthographicCameras(
    focal_length=1.0,
    principal_point=((0.0, 0.0),),
    R: torch.Tensor = _R,
    T: torch.Tensor = _T,
    device: Device = "cpu",
) -> "OrthographicCameras":
    """
    SfMOrthographicCameras has been DEPRECATED. Use OrthographicCameras instead.
    Preserving SfMOrthographicCameras for backward compatibility.
    """

    warnings.warn(
        """SfMOrthographicCameras is deprecated,
        Use OrthographicCameras instead.
        SfMOrthographicCameras will be removed in future releases.""",
        PendingDeprecationWarning,
    )

    return OrthographicCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        R=R,
        T=T,
        device=device,
    )


class OrthographicCameras(CamerasBase):
    """
    A class which stores a batch of parameters to generate a batch of
    transformation matrices using the multi-view geometry convention for
    an orthographic camera.

    Parameters for this camera can be specified in NDC or in screen space.
    If you wish to provide parameters in screen space, you NEED to provide
    the image_size = (imwidth, imheight).
    If you wish to provide parameters in NDC space, you should NOT provide
    image_size. Providing valid image_size will trigger a screen space to
    NDC space transformation in the camera.

    For example, here is how to define cameras on the two spaces.

    .. code-block:: python

        # camera defined in screen space
        cameras = OrthographicCameras(
            focal_length=((22.0, 15.0),),  # (fx, fy)
            principal_point=((192.0, 128.0),),  # (px, py)
            image_size=((256, 256),),  # (imwidth, imheight)
        )

        # the equivalent camera defined in NDC space
        cameras = OrthographicCameras(
            focal_length=((0.171875, 0.1171875),),  # := (fx / half_imwidth, fy / half_imheight)
            principal_point=((-0.5, 0),),  # := (- (px - half_imwidth) / half_imwidth,
                                                 - (py - half_imheight) / half_imheight)
        )
    """

    def __init__(
        self,
        focal_length=1.0,
        principal_point=((0.0, 0.0),),
        R: torch.Tensor = _R,
        T: torch.Tensor = _T,
        K: Optional[torch.Tensor] = None,
        device: Device = "cpu",
        image_size=((-1, -1),),
    ) -> None:
        """

        Args:
            focal_length: Focal length of the camera in world units.
                A tensor of shape (N, 1) or (N, 2) for
                square and non-square pixels respectively.
            principal_point: xy coordinates of the principal point
                of the camera in pixels.
                A tensor of shape (N, 2).
            R: Rotation matrix of shape (N, 3, 3)
            T: Translation matrix of shape (N, 3)
            K: (optional) A calibration matrix of shape (N, 4, 4)
                If provided, don't need focal_length, principal_point, image_size
            device: torch.device or string
            image_size: If image_size = (imwidth, imheight) with imwidth, imheight > 0
                is provided, the camera parameters are assumed to be in screen
                space. They will be converted to NDC space.
                If image_size is not provided, the parameters are assumed to
                be in NDC space.
        """
        # The initializer formats all inputs to torch tensors and broadcasts
        # all the inputs to have the same batch dimension where necessary.
        super().__init__(
            device=device,
            focal_length=focal_length,
            principal_point=principal_point,
            R=R,
            T=T,
            K=K,
            image_size=image_size,
        )

    def get_projection_transform(self, **kwargs) -> Transform3d:
        """
        Calculate the projection matrix using
        the multi-view geometry convention.

        Args:
            **kwargs: parameters for the projection can be passed in as keyword
                arguments to override the default values set in __init__.

        Returns:
            A `Transform3d` object with a batch of `N` projection transforms.

        .. code-block:: python

            fx = focal_length[:,0]
            fy = focal_length[:,1]
            px = principal_point[:,0]
            py = principal_point[:,1]

            K = [
                    [fx,   0,    0,  px],
                    [0,   fy,    0,  py],
                    [0,    0,    1,   0],
                    [0,    0,    0,   1],
            ]
        """
        K = kwargs.get("K", self.K)
        if K is not None:
            if K.shape != (self._N, 4, 4):
                msg = "Expected K to have shape of (%r, 4, 4)"
                raise ValueError(msg % (self._N))
        else:
            image_size = kwargs.get("image_size", self.image_size)
            # if imwidth > 0, parameters are in screen space
            image_size = image_size if image_size[0][0] > 0 else None

            K = _get_sfm_calibration_matrix(
                self._N,
                self.device,
                kwargs.get("focal_length", self.focal_length),
                kwargs.get("principal_point", self.principal_point),
                orthographic=True,
                image_size=image_size,
            )

        transform = Transform3d(device=self.device)
        transform._matrix = K.transpose(1, 2).contiguous()
        return transform

    def unproject_points(
        self, xy_depth: torch.Tensor, world_coordinates: bool = True, **kwargs
    ) -> torch.Tensor:
        if world_coordinates:
            to_ndc_transform = self.get_full_projection_transform(**kwargs)
        else:
            to_ndc_transform = self.get_projection_transform(**kwargs)

        unprojection_transform = to_ndc_transform.inverse()
        return unprojection_transform.transform_points(xy_depth)

    def is_perspective(self):
        return False


################################################
#       Helper functions for cameras           #
################################################


def _get_sfm_calibration_matrix(
    N: int,
    device: Device,
    focal_length,
    principal_point,
    orthographic: bool = False,
    image_size=None,
) -> torch.Tensor:
    """
    Returns a calibration matrix of a perspective/orthographic camera.

    Args:
        N: Number of cameras.
        focal_length: Focal length of the camera in world units.
        principal_point: xy coordinates of the principal point of
            the camera in pixels.
        orthographic: Boolean specifying if the camera is orthographic or not
        image_size: (Optional) Specifying the image_size = (imwidth, imheight).
            If not None, the camera parameters are assumed to be in screen space
            and are transformed to NDC space.

        The calibration matrix `K` is set up as follows:

        .. code-block:: python

            fx = focal_length[:,0]
            fy = focal_length[:,1]
            px = principal_point[:,0]
            py = principal_point[:,1]

            for orthographic==True:
                K = [
                        [fx,   0,    0,  px],
                        [0,   fy,    0,  py],
                        [0,    0,    1,   0],
                        [0,    0,    0,   1],
                ]
            else:
                K = [
                        [fx,   0,   px,   0],
                        [0,   fy,   py,   0],
                        [0,    0,    0,   1],
                        [0,    0,    1,   0],
                ]

    Returns:
        A calibration matrix `K` of the SfM-conventioned camera
        of shape (N, 4, 4).
    """

    if not torch.is_tensor(focal_length):
        focal_length = torch.tensor(focal_length, device=device)

    if focal_length.ndim in (0, 1) or focal_length.shape[1] == 1:
        fx = fy = focal_length
    else:
        fx, fy = focal_length.unbind(1)

    if not torch.is_tensor(principal_point):
        principal_point = torch.tensor(principal_point, device=device)

    px, py = principal_point.unbind(1)

    if image_size is not None:
        if not torch.is_tensor(image_size):
            image_size = torch.tensor(image_size, device=device)
        imwidth, imheight = image_size.unbind(1)
        # make sure imwidth, imheight are valid (>0)
        if (imwidth < 1).any() or (imheight < 1).any():
            raise ValueError(
                "Camera parameters provided in screen space. Image width or height invalid."
            )
        half_imwidth = imwidth / 2.0
        half_imheight = imheight / 2.0
        fx = fx / half_imwidth
        fy = fy / half_imheight
        px = -(px - half_imwidth) / half_imwidth
        py = -(py - half_imheight) / half_imheight

    K = fx.new_zeros(N, 4, 4)
    K[:, 0, 0] = fx
    K[:, 1, 1] = fy
    if orthographic:
        K[:, 0, 3] = px
        K[:, 1, 3] = py
        K[:, 2, 2] = 1.0
        K[:, 3, 3] = 1.0
    else:
        K[:, 0, 2] = px
        K[:, 1, 2] = py
        K[:, 3, 2] = 1.0
        K[:, 2, 3] = 1.0

    return K
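

# Illustrative sketch (not part of the original module): when `image_size` is
# passed, screen-space intrinsics are rescaled to NDC as above. For a 128 x 128
# image with a focal length of 64 pixels and the principal point at the image
# center, the NDC parameters become fx = fy = 1.0 and px = py = 0.0. All
# numbers are arbitrary assumptions for illustration.
def _example_screen_to_ndc_calibration() -> None:
    K = _get_sfm_calibration_matrix(
        1,
        "cpu",
        focal_length=((64.0, 64.0),),
        principal_point=((64.0, 64.0),),
        orthographic=False,
        image_size=((128, 128),),
    )
    assert torch.allclose(K[0, 0, 0], torch.tensor(1.0))  # fx in NDC
    assert torch.allclose(K[0, 0, 2], torch.tensor(0.0))  # px in NDC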


################################################
# Helper functions for world to view transforms
################################################


def get_world_to_view_transform(
    R: torch.Tensor = _R, T: torch.Tensor = _T
) -> Transform3d:
    """
    This function returns a Transform3d representing the transformation
    matrix to go from world space to view space by applying a rotation and
    a translation.

    PyTorch3D uses the same convention as Hartley & Zisserman.
    I.e., for camera extrinsic parameters R (rotation) and T (translation),
    we map a 3D point `X_world` in world coordinates to
    a point `X_cam` in camera coordinates with:
    `X_cam = X_world R + T`

    Args:
        R: (N, 3, 3) matrix representing the rotation.
        T: (N, 3) matrix representing the translation.

    Returns:
        a Transform3d object which represents the composed RT transformation.

    """
    # TODO: also support the case where RT is specified as one matrix
    # of shape (N, 4, 4).

    if T.shape[0] != R.shape[0]:
        msg = "Expected R, T to have the same batch dimension; got %r, %r"
        raise ValueError(msg % (R.shape[0], T.shape[0]))
    if T.dim() != 2 or T.shape[1:] != (3,):
        msg = "Expected T to have shape (N, 3); got %r"
        raise ValueError(msg % repr(T.shape))
    if R.dim() != 3 or R.shape[1:] != (3, 3):
        msg = "Expected R to have shape (N, 3, 3); got %r"
        raise ValueError(msg % repr(R.shape))

    # Create a Transform3d object
    T_ = Translate(T, device=T.device)
    R_ = Rotate(R, device=R.device)
    return R_.compose(T_)
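

# Illustrative sketch (not part of the original module): numerically checks the
# documented convention `X_cam = X_world R + T` for an arbitrary rotation about
# the +Z axis and an arbitrary translation.
def _example_world_to_view_convention() -> None:
    R = torch.tensor([[[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]])
    T = torch.tensor([[0.0, 0.0, 3.0]])
    world_to_view = get_world_to_view_transform(R=R, T=T)
    X_world = torch.tensor([[[1.0, 2.0, 3.0]]])  # (N=1, P=1, 3)
    X_cam = world_to_view.transform_points(X_world)
    assert torch.allclose(X_cam, X_world @ R + T[:, None])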


def camera_position_from_spherical_angles(
    distance: float,
    elevation: float,
    azimuth: float,
    degrees: bool = True,
    device: Device = "cpu",
) -> torch.Tensor:
    """
    Calculate the location of the camera based on the distance away from
    the target point, the elevation and azimuth angles.

    Args:
        distance: distance of the camera from the object.
        elevation, azimuth: angles.
            The inputs distance, elevation and azimuth can be one of the following
                - Python scalar
                - Torch scalar
                - Torch tensor of shape (N) or (1)
        degrees: bool, whether the angles are specified in degrees or radians.
        device: str or torch.device, device for new tensors to be placed on.

    The vectors are broadcast against each other so they all have shape (N, 1).

    Returns:
        camera_position: (N, 3) xyz location of the camera.
    """
    broadcasted_args = convert_to_tensors_and_broadcast(
        distance, elevation, azimuth, device=device
    )
    dist, elev, azim = broadcasted_args
    if degrees:
        elev = math.pi / 180.0 * elev
        azim = math.pi / 180.0 * azim
    x = dist * torch.cos(elev) * torch.sin(azim)
    y = dist * torch.sin(elev)
    z = dist * torch.cos(elev) * torch.cos(azim)
    camera_position = torch.stack([x, y, z], dim=1)
    if camera_position.dim() == 0:
        camera_position = camera_position.view(1, -1)  # add batch dim.
    return camera_position.view(-1, 3)
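

# Illustrative sketch (not part of the original module): with the convention
# above, elev = 0, azim = 0 places the camera on the +Z axis and elev = 90
# places it directly above the object. Distances and angles are arbitrary
# assumptions for illustration.
def _example_spherical_camera_positions() -> None:
    on_z_axis = camera_position_from_spherical_angles(2.0, 0.0, 0.0)
    overhead = camera_position_from_spherical_angles(2.0, 90.0, 0.0)
    assert torch.allclose(on_z_axis, torch.tensor([[0.0, 0.0, 2.0]]), atol=1e-6)
    assert torch.allclose(overhead, torch.tensor([[0.0, 2.0, 0.0]]), atol=1e-6)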


def look_at_rotation(
    camera_position, at=((0, 0, 0),), up=((0, 1, 0),), device: Device = "cpu"
) -> torch.Tensor:
    """
    This function takes a vector 'camera_position' which specifies the location
    of the camera in world coordinates and two vectors `at` and `up` which
    indicate the position of the object and the up direction of the world
    coordinate system, respectively. The object is assumed to be centered at
    the origin.

    The output is a rotation matrix representing the transformation
    from world coordinates -> view coordinates.

    Args:
        camera_position: position of the camera in world coordinates
        at: position of the object in world coordinates
        up: vector specifying the up direction in the world coordinate frame.

    The inputs camera_position, at and up can each be a
        - 3 element tuple/list
        - torch tensor of shape (1, 3)
        - torch tensor of shape (N, 3)

    The vectors are broadcast against each other so they all have shape (N, 3).

    Returns:
        R: (N, 3, 3) batched rotation matrices
    """
    # Format input and broadcast
    broadcasted_args = convert_to_tensors_and_broadcast(
        camera_position, at, up, device=device
    )
    camera_position, at, up = broadcasted_args
    for t, n in zip([camera_position, at, up], ["camera_position", "at", "up"]):
        if t.shape[-1] != 3:
            msg = "Expected arg %s to have shape (N, 3); got %r"
            raise ValueError(msg % (n, t.shape))
    z_axis = F.normalize(at - camera_position, eps=1e-5)
    x_axis = F.normalize(torch.cross(up, z_axis, dim=1), eps=1e-5)
    y_axis = F.normalize(torch.cross(z_axis, x_axis, dim=1), eps=1e-5)
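    # If `up` is (nearly) parallel to the viewing direction, the cross product
    # defining x_axis collapses to ~0; fall back to a vector orthogonal to
    # y_axis and z_axis.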
    is_close = torch.isclose(x_axis, torch.tensor(0.0), atol=5e-3).all(
        dim=1, keepdim=True
    )
    if is_close.any():
        replacement = F.normalize(torch.cross(y_axis, z_axis, dim=1), eps=1e-5)
        x_axis = torch.where(is_close, replacement, x_axis)
    R = torch.cat((x_axis[:, None, :], y_axis[:, None, :], z_axis[:, None, :]), dim=1)
    return R.transpose(1, 2)
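

# Illustrative sketch (not part of the original module): a camera on the +Z
# axis looking at the origin with the default up vector yields
# R = diag(-1, 1, -1) under the conventions above. The camera position is an
# arbitrary assumption for illustration.
def _example_look_at_rotation() -> None:
    R = look_at_rotation(camera_position=((0.0, 0.0, 5.0),))
    expected = torch.tensor([[[-1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -1.0]]])
    assert torch.allclose(R, expected, atol=1e-5)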


def look_at_view_transform(
    dist=1.0,
    elev=0.0,
    azim=0.0,
    degrees: bool = True,
    eye: Optional[Sequence] = None,
    at=((0, 0, 0),),  # (1, 3)
    up=((0, 1, 0),),  # (1, 3)
    device: Device = "cpu",
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    This function returns a rotation and translation matrix
    to apply the 'Look At' transformation from world -> view coordinates [0].

    Args:
        dist: distance of the camera from the object
        elev: angle in degrees or radians. This is the angle between the
            vector from the object to the camera, and the horizontal plane y = 0 (xz-plane).
        azim: angle in degrees or radians. The vector from the object to
            the camera is projected onto a horizontal plane y = 0.
            azim is the angle between the projected vector and a
            reference vector at (0, 0, 1) on the reference plane (the horizontal plane).
        dist, elev and azim can be of shape (1) or (N).
        degrees: boolean flag to indicate if the elevation and azimuth
            angles are specified in degrees or radians.
        eye: the position of the camera(s) in world coordinates. If eye is not
            None, it will override the camera position derived from dist, elev, azim.
        up: the up direction of the world coordinate system.
        at: the position of the object(s) in world coordinates.
        eye, up and at can be of shape (1, 3) or (N, 3).

    Returns:
        2-element tuple containing

        - **R**: the rotation to apply to the points to align with the camera.
        - **T**: the translation to apply to the points to align with the camera.

    References:
    [0] https://www.scratchapixel.com
    """

    if eye is not None:
        broadcasted_args = convert_to_tensors_and_broadcast(eye, at, up, device=device)
        eye, at, up = broadcasted_args
        C = eye
    else:
        broadcasted_args = convert_to_tensors_and_broadcast(
            dist, elev, azim, at, up, device=device
        )
        dist, elev, azim, at, up = broadcasted_args
        C = (
            camera_position_from_spherical_angles(
                dist, elev, azim, degrees=degrees, device=device
            )
            + at
        )

    R = look_at_rotation(C, at, up, device=device)
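    # T is chosen so that the camera center C maps to the origin in view
    # coordinates: X_cam = X_world R + T = 0 for X_world = C, i.e. T = -C R,
    # computed below as -R^T C with C treated as a column vector.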
    T = -torch.bmm(R.transpose(1, 2), C[:, :, None])[:, :, 0]
    return R, T
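

# Illustrative sketch (not part of the original module): when `eye` is given,
# the returned (R, T) place the camera at `eye`, so the camera center can be
# recovered as C = -R T (treating T as a column vector). The position used is
# an arbitrary assumption for illustration.
def _example_look_at_view_transform() -> None:
    eye = ((1.0, 2.0, 3.0),)
    R, T = look_at_view_transform(eye=eye)
    recovered_center = -torch.bmm(R, T[:, :, None])[:, :, 0]
    assert torch.allclose(recovered_center, torch.tensor(eye), atol=1e-5)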