Commit dbf06b50 authored by facebook-github-bot's avatar facebook-github-bot

Initial commit

fbshipit-source-id: ad58e416e3ceeca85fae0583308968d04e78fe0d
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
:: Set env vars that tell distutils to use the compiler that we put on path
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
SET "VS_VERSION=15.0"
SET "VS_MAJOR=15"
SET "VS_YEAR=2017"
set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
set "MSYS2_ENV_CONV_EXCL=CL"
:: For Python 3.5+, ensure that we link with the dynamic runtime. See
:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VSINSTALLDIR=%%i\"
goto :vswhere
)
)
:vswhere
:: Shorten PATH to avoid the `input line too long` error.
SET MyPath=%PATH%
setlocal EnableDelayedExpansion
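:: The substitution below turns PATH from `a;b;c` into `"a";"b";"c"` so the
:: FOR loop can iterate entries that contain spaces; %%~sa then rewrites each
:: surviving entry to its short (8.3) form.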
SET TempPath="%MyPath:;=";"%"
SET var=
FOR %%a IN (%TempPath%) DO (
    IF EXIST %%~sa (
        SET "var=!var!;%%~sa"
    )
)
set "TempPath=!var:~1!"
endlocal & set "PATH=%TempPath%"
:: Shorten current directory too
FOR %%A IN (.) DO CD "%%~sA"
:: other things added by install_activate.bat at package build time
blas_impl:
  - mkl  # [x86_64]
c_compiler:
  - vs2017  # [win]
cxx_compiler:
  - vs2017  # [win]
python:
  - 3.5
  - 3.6
# This differs from target_platform in that it determines what subdir the compiler
# will target, not what subdir the compiler package will be itself.
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
# code on win-64 miniconda.
cross_compiler_target_platform:
  - win-64  # [win]
target_platform:
  - win-64  # [win]
vc:
  - 14
zip_keys:
  -  # [win]
    - vc  # [win]
    - c_compiler  # [win]
    - cxx_compiler  # [win]
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set YEAR=2017
set VER=15
mkdir "%PREFIX%\etc\conda\activate.d"
COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%cross_compiler_target_platform%" == "win-64" (
set "target_platform=amd64"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%VSDEVCMD_ARGS%" == "" (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) ELSE (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
)
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) else (
set "target_platform=x86"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd
)
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set VC_PATH=x86
if "%ARCH%"=="64" (
set VC_PATH=x64
)
set MSC_VER=2017
rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015
rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
rem set SP=%%A
rem )
rem if not "%SP%" == "%PKG_VERSION%" (
rem echo "Version detected from registry: %SP%"
rem echo "does not match version of package being built (%PKG_VERSION%)"
rem echo "Do you have current updates for VS 2015 installed?"
rem exit 1
rem )
REM ========== REQUIRES the Windows 10 SDK to be installed, or the files otherwise copied to the location below!
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E
if %ERRORLEVEL% GEQ 8 exit 1
REM ========== This one comes from visual studio 2017
set "VC_VER=141"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
goto :eof
)
)
@setlocal
call "%VS15VARSALL%" x64
set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
@endlocal
{% set vcver="14.1" %}
{% set vcfeature="14" %}
{% set vsyear="2017" %}
{% set fullver="15.4.27004.2010" %}
package:
  name: vs{{ vsyear }}
  version: {{ fullver }}

build:
  skip: True  # [not win]
  script_env:
    - VSDEVCMD_ARGS  # [win]

outputs:
  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
    script: install_activate.bat
    track_features:
      # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141".
      strong:
        - vc{{ vcfeature }}
    run_exports:
      - vc {{ vcver }}
    about:
      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
      license: BSD 3-clause
  - name: vs{{ vsyear }}_runtime
    script: install_runtime.bat
  - name: vc
    version: {{ vcver }}
    track_features:
      - vc{{ vcfeature }}
    requirements:
      run:
        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
    about:
      home: https://github.com/conda/conda/wiki/VC-features
      license: Modified BSD License (3-clause)
      license_family: BSD
      summary: A meta-package to track VC features.
      description: |
        This metapackage is used to activate vc features without
        depending on Python.
      doc_url: https://github.com/conda/conda/wiki/VC-features
      dev_url: https://github.com/conda/conda/wiki/VC-features
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
:: Set env vars that tell distutils to use the compiler that we put on path
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
SET "VS_VERSION=16.0"
SET "VS_MAJOR=16"
SET "VS_YEAR=2019"
set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
set "MSYS2_ENV_CONV_EXCL=CL"
:: For Python 3.5+, ensure that we link with the dynamic runtime. See
:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VSINSTALLDIR=%%i\"
goto :vswhere
)
)
:vswhere
:: Shorten PATH to avoid the `input line too long` error.
SET MyPath=%PATH%
setlocal EnableDelayedExpansion
SET TempPath="%MyPath:;=";"%"
SET var=
FOR %%a IN (%TempPath%) DO (
    IF EXIST %%~sa (
        SET "var=!var!;%%~sa"
    )
)
set "TempPath=!var:~1!"
endlocal & set "PATH=%TempPath%"
:: Shorten current directory too
FOR %%A IN (.) DO CD "%%~sA"
:: other things added by install_activate.bat at package build time
blas_impl:
  - mkl  # [x86_64]
c_compiler:
  - vs2019  # [win]
cxx_compiler:
  - vs2019  # [win]
python:
  - 3.5
  - 3.6
# This differs from target_platform in that it determines what subdir the compiler
# will target, not what subdir the compiler package will be itself.
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
# code on win-64 miniconda.
cross_compiler_target_platform:
  - win-64  # [win]
target_platform:
  - win-64  # [win]
vc:
  - 14
zip_keys:
  -  # [win]
    - vc  # [win]
    - c_compiler  # [win]
    - cxx_compiler  # [win]
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set YEAR=2019
set VER=16
mkdir "%PREFIX%\etc\conda\activate.d"
COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%cross_compiler_target_platform%" == "win-64" (
set "target_platform=amd64"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%VSDEVCMD_ARGS%" == "" (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) ELSE (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
)
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) else (
set "target_platform=x86"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd
)
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set VC_PATH=x86
if "%ARCH%"=="64" (
set VC_PATH=x64
)
set MSC_VER=2019
rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015
rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
rem set SP=%%A
rem )
rem if not "%SP%" == "%PKG_VERSION%" (
rem echo "Version detected from registry: %SP%"
rem echo "does not match version of package being built (%PKG_VERSION%)"
rem echo "Do you have current updates for VS 2015 installed?"
rem exit 1
rem )
REM ========== REQUIRES the Windows 10 SDK to be installed, or the files otherwise copied to the location below!
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E
if %ERRORLEVEL% GEQ 8 exit 1
REM ========== This one comes from visual studio 2019
set "VC_VER=142"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
goto :eof
)
)
@setlocal
call "%VS15VARSALL%" x64
set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
@endlocal
{% set vcver="14.2" %}
{% set vcfeature="14" %}
{% set vsyear="2019" %}
{% set fullver="15.4.27004.2010" %}
package:
  name: vs{{ vsyear }}
  version: {{ fullver }}

build:
  skip: True  # [not win]
  script_env:
    - VSDEVCMD_ARGS  # [win]

outputs:
  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
    script: install_activate.bat
    track_features:
      # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142".
      strong:
        - vc{{ vcfeature }}
    run_exports:
      - vc {{ vcver }}
    about:
      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
      license: BSD 3-clause
  - name: vs{{ vsyear }}_runtime
    script: install_runtime.bat
  - name: vc
    version: {{ vcver }}
    track_features:
      - vc{{ vcfeature }}
    requirements:
      run:
        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
    about:
      home: https://github.com/conda/conda/wiki/VC-features
      license: Modified BSD License (3-clause)
      license_family: BSD
      summary: A meta-package to track VC features.
      description: |
        This metapackage is used to activate vc features without
        depending on Python.
      doc_url: https://github.com/conda/conda/wiki/VC-features
      dev_url: https://github.com/conda/conda/wiki/VC-features
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
__version__ = "0.1"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "face_areas_normals/face_areas_normals.h"
#include "gather_scatter/gather_scatter.h"
#include "nearest_neighbor_points/nearest_neighbor_points.h"
#include "packed_to_padded_tensor/packed_to_padded_tensor.h"
#include "rasterize_meshes/rasterize_meshes.h"
#include "rasterize_points/rasterize_points.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("face_areas_normals", &face_areas_normals);
  m.def("packed_to_padded_tensor", &packed_to_padded_tensor);
  m.def("nn_points_idx", &nn_points_idx);
  m.def("gather_scatter", &gather_scatter);
  m.def("rasterize_points", &RasterizePoints);
  m.def("rasterize_points_backward", &RasterizePointsBackward);
  m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
  m.def("rasterize_meshes", &RasterizeMeshes);
  // These are only visible for testing; users should not call them directly.
  m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
  m.def("_rasterize_points_naive", &RasterizePointsNaive);
  m.def("_rasterize_meshes_naive", &RasterizeMeshesNaive);
  m.def("_rasterize_meshes_coarse", &RasterizeMeshesCoarse);
  m.def("_rasterize_meshes_fine", &RasterizeMeshesFine);
}
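For orientation, here is a minimal sketch (not the commit's actual setup.py) of how a TORCH_EXTENSION_NAME module like the one above is typically compiled with PyTorch's cpp_extension helpers. The package and module names are assumptions; the WITH_CUDA macro matches the #ifdef guards in the wrapper headers below.

# Hypothetical build script for the extension defined in ext.cpp.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="pytorch3d_c_ext_sketch",  # hypothetical package name
    ext_modules=[
        CUDAExtension(
            name="pytorch3d._C",  # assumed value of TORCH_EXTENSION_NAME
            sources=[
                "ext.cpp",
                # ... the .cu/.h sources listed in the includes above
            ],
            define_macros=[("WITH_CUDA", None)],  # enables the CUDA branches
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)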
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
#include <tuple>
template <typename scalar_t>
__global__ void face_areas_kernel(
    const scalar_t* __restrict__ verts,
    const long* __restrict__ faces,
    scalar_t* __restrict__ face_areas,
    scalar_t* __restrict__ face_normals,
    const size_t V,
    const size_t F) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = gridDim.x * blockDim.x;
  // Faces are split evenly over the threads in the grid. Each thread computes
  // the area and normal of its assigned faces and writes them to the global
  // face_areas and face_normals tensors.
  for (size_t f = tid; f < F; f += stride) {
    const long i0 = faces[3 * f + 0];
    const long i1 = faces[3 * f + 1];
    const long i2 = faces[3 * f + 2];
    const scalar_t v0_x = verts[3 * i0 + 0];
    const scalar_t v0_y = verts[3 * i0 + 1];
    const scalar_t v0_z = verts[3 * i0 + 2];
    const scalar_t v1_x = verts[3 * i1 + 0];
    const scalar_t v1_y = verts[3 * i1 + 1];
    const scalar_t v1_z = verts[3 * i1 + 2];
    const scalar_t v2_x = verts[3 * i2 + 0];
    const scalar_t v2_y = verts[3 * i2 + 1];
    const scalar_t v2_z = verts[3 * i2 + 2];
    // Edge vectors a = v1 - v0 and b = v2 - v0.
    const scalar_t ax = v1_x - v0_x;
    const scalar_t ay = v1_y - v0_y;
    const scalar_t az = v1_z - v0_z;
    const scalar_t bx = v2_x - v0_x;
    const scalar_t by = v2_y - v0_y;
    const scalar_t bz = v2_z - v0_z;
    // Cross product c = a x b; its length is twice the triangle's area.
    const scalar_t cx = ay * bz - az * by;
    const scalar_t cy = az * bx - ax * bz;
    const scalar_t cz = ax * by - ay * bx;
    scalar_t norm = sqrt(cx * cx + cy * cy + cz * cz);
    face_areas[f] = norm / 2.0;
    norm = (norm < 1e-6) ? 1e-6 : norm; // max(norm, 1e-6)
    face_normals[3 * f + 0] = cx / norm;
    face_normals[3 * f + 1] = cy / norm;
    face_normals[3 * f + 2] = cz / norm;
  }
}
std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
    at::Tensor verts,
    at::Tensor faces) {
  const auto V = verts.size(0);
  const auto F = faces.size(0);
  at::Tensor areas = at::empty({F}, verts.options());
  at::Tensor normals = at::empty({F, 3}, verts.options());
  const int blocks = 64;
  const int threads = 512;
  AT_DISPATCH_FLOATING_TYPES(verts.type(), "face_areas_kernel", ([&] {
    face_areas_kernel<scalar_t><<<blocks, threads>>>(
        verts.data_ptr<scalar_t>(),
        faces.data_ptr<long>(),
        areas.data_ptr<scalar_t>(),
        normals.data_ptr<scalar_t>(),
        V,
        F);
  }));
  return std::make_tuple(areas, normals);
}
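The kernel's math can be checked against plain PyTorch on a single triangle. A minimal sketch (illustrative names, not part of the commit): area = ||(v1 - v0) x (v2 - v0)|| / 2, and the normal is the cross product scaled by its clamped length.

import torch

v0 = torch.tensor([0., 0., 0.])
v1 = torch.tensor([1., 0., 0.])
v2 = torch.tensor([0., 1., 0.])
cross = torch.cross(v1 - v0, v2 - v0, dim=0)  # the kernel's c = a x b
area = cross.norm() / 2  # 0.5 for this right triangle
normal = cross / cross.norm().clamp(min=1e-6)  # mirrors the kernel's 1e-6 floor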
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include <tuple>
// Compute areas of mesh faces using packed representation.
//
// Inputs:
// verts: FloatTensor of shape (V, 3) giving vertex positions.
// faces: LongTensor of shape (F, 3) giving faces.
//
// Returns:
// areas: FloatTensor of shape (F,) where areas[f] is the area of faces[f].
// normals: FloatTensor of shape (F, 3) where normals[f] is the normal of
// faces[f]
//
// Cuda implementation.
std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
    at::Tensor verts,
    at::Tensor faces);
// Implementation which is exposed.
std::tuple<at::Tensor, at::Tensor> face_areas_normals(
    at::Tensor verts,
    at::Tensor faces) {
  if (verts.type().is_cuda() && faces.type().is_cuda()) {
#ifdef WITH_CUDA
    return face_areas_cuda(verts, faces);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
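A hedged usage sketch of this binding from Python, assuming the built extension is importable as `pytorch3d._C` (the import path is an assumption); shapes follow the documentation above.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path for the built extension

verts = torch.rand(100, 3, device="cuda")  # (V, 3) packed vertex positions
faces = torch.randint(0, 100, (50, 3), device="cuda")  # (F, 3) int64 faces
areas, normals = _C.face_areas_normals(verts, faces)
# areas: (50,) per-face areas; normals: (50, 3) unit face normals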
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
// TODO(T47953967): make this CUDA kernel support all data types.
__global__ void gather_scatter_kernel(
    const float* __restrict__ input,
    const long* __restrict__ edges,
    float* __restrict__ output,
    bool directed,
    bool backward,
    const size_t V,
    const size_t D,
    const size_t E) {
  const int tid = threadIdx.x;
  // Reverse the vertex order if backward.
  const int v0_idx = backward ? 1 : 0;
  const int v1_idx = backward ? 0 : 1;
  // Edges are split evenly across the blocks.
  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    // Get the indices of the vertices which form the edge.
    const long v0 = edges[2 * e + v0_idx];
    const long v1 = edges[2 * e + v1_idx];
    // Split vertex features evenly across threads. This implementation is
    // quite wasteful when D < 128 since many threads will do nothing.
    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
      if (!directed) {
        const float val = input[v0 * D + d];
        float* address = output + v1 * D + d;
        atomicAdd(address, val);
      }
    }
    __syncthreads();
  }
}
at::Tensor gather_scatter_cuda(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward) {
  const auto num_vertices = input.size(0);
  const auto input_feature_dim = input.size(1);
  const auto num_edges = edges.size(0);
  auto output = at::zeros({num_vertices, input_feature_dim}, input.options());
  const size_t threads = 128;
  const size_t max_blocks = 1920;
  const size_t blocks = num_edges < max_blocks ? num_edges : max_blocks;
  gather_scatter_kernel<<<blocks, threads>>>(
      input.data_ptr<float>(),
      edges.data_ptr<long>(),
      output.data_ptr<float>(),
      directed,
      backward,
      num_vertices,
      input_feature_dim,
      num_edges);
  return output;
}
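For reference, a minimal pure-PyTorch sketch of the aggregation the kernel performs (output[v0] += input[v1] per edge, plus the symmetric update when the graph is undirected). Names here are illustrative only, not part of the commit.

import torch

V, D = 5, 4
inp = torch.rand(V, D)  # (V, D) vertex features
edges = torch.tensor([[0, 1], [1, 2]])  # (E, 2) int64 edge list
out = torch.zeros_like(inp)
out.index_add_(0, edges[:, 0], inp[edges[:, 1]])  # directed update
out.index_add_(0, edges[:, 1], inp[edges[:, 0]])  # undirected: reverse too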
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
// Fused gather scatter operation for aggregating features of neighbor nodes
// in a graph. This gather scatter operation is specific to graphs as edge
// indices are used as input.
//
// Args:
// input: float32 Tensor of shape (V, D) where V is the number of vertices
// and D is the feature dimension.
// edges: int64 Tensor of shape (E, 2) giving the indices of the vertices that
// make up the edge. E is the number of edges.
// directed: Bool indicating if edges in the graph are directed. For a
// directed graph v0 -> v1 the updated feature for v0 depends on v1.
// backward: Bool indicating if the operation is the backward pass.
//
// Returns:
// output: float32 Tensor of same shape as input.
// Cuda implementation.
at::Tensor gather_scatter_cuda(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward);
// Exposed implementation.
at::Tensor gather_scatter(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward) {
  if (input.type().is_cuda() && edges.type().is_cuda()) {
#ifdef WITH_CUDA
    return gather_scatter_cuda(input, edges, directed, backward);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
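A hedged usage sketch, assuming the extension is importable as `pytorch3d._C`; the positional arguments follow the signature documented above (input, edges, directed, backward).

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

x = torch.rand(5, 16, device="cuda")  # (V, D) vertex features
edges = torch.tensor([[0, 1], [1, 2]], device="cuda")  # (E, 2)
out = _C.gather_scatter(x, edges, False, False)  # directed=False, backward=False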
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
#include <float.h>
template <typename scalar_t>
__device__ void warp_reduce(
    volatile scalar_t* min_dists,
    volatile long* min_idxs,
    const size_t tid) {
  // s = 32
  if (min_dists[tid] > min_dists[tid + 32]) {
    min_idxs[tid] = min_idxs[tid + 32];
    min_dists[tid] = min_dists[tid + 32];
  }
  // s = 16
  if (min_dists[tid] > min_dists[tid + 16]) {
    min_idxs[tid] = min_idxs[tid + 16];
    min_dists[tid] = min_dists[tid + 16];
  }
  // s = 8
  if (min_dists[tid] > min_dists[tid + 8]) {
    min_idxs[tid] = min_idxs[tid + 8];
    min_dists[tid] = min_dists[tid + 8];
  }
  // s = 4
  if (min_dists[tid] > min_dists[tid + 4]) {
    min_idxs[tid] = min_idxs[tid + 4];
    min_dists[tid] = min_dists[tid + 4];
  }
  // s = 2
  if (min_dists[tid] > min_dists[tid + 2]) {
    min_idxs[tid] = min_idxs[tid + 2];
    min_dists[tid] = min_dists[tid + 2];
  }
  // s = 1
  if (min_dists[tid] > min_dists[tid + 1]) {
    min_idxs[tid] = min_idxs[tid + 1];
    min_dists[tid] = min_dists[tid + 1];
  }
}
// CUDA kernel to compute nearest neighbors between two batches of pointclouds
// where each point is of dimension D.
//
// Args:
// points1: First set of points, of shape (N, P1, D).
// points2: Second set of points, of shape (N, P2, D).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
// D_2: Slice size for the shared buffer; this is D rounded up to an even
// number so that shared memory access is aligned.
//
template <typename scalar_t>
__global__ void nearest_neighbor_kernel(
    const scalar_t* __restrict__ points1,
    const scalar_t* __restrict__ points2,
    long* __restrict__ idx,
    const size_t N,
    const size_t P1,
    const size_t P2,
    const size_t D,
    const size_t D_2) {
  // Each block will compute one element of the output idx[n, i]. Within the
  // block we will use threads to compute the distances between points1[n, i]
  // and points2[n, j] for all 0 <= j < P2, then use a block reduction to
  // take an argmin of the distances.
  // Shared buffers for the threads in the block. CUDA only allows declaration
  // of a single shared buffer, so it needs to be manually sliced and cast to
  // build several logical shared buffers of different types.
  extern __shared__ char shared_buf[];
  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
  scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
  const size_t n = blockIdx.y; // index of batch element.
  const size_t i = blockIdx.x; // index of point within batch element.
  const size_t tid = threadIdx.x;
  // Thread 0 copies points1[n, i, :] into x.
  if (tid == 0) {
    for (size_t d = 0; d < D; d++) {
      x[d] = points1[n * (P1 * D) + i * D + d];
    }
  }
  __syncthreads();
  // Compute the distances between points1[n, i] and points2[n, j] for
  // all 0 <= j < P2. Each thread reduces over P2 / blockDim.x points in
  // serial and stores its result to shared memory.
  scalar_t min_dist = FLT_MAX;
  size_t min_idx = 0;
  for (size_t j = tid; j < P2; j += blockDim.x) {
    scalar_t dist = 0;
    for (size_t d = 0; d < D; d++) {
      scalar_t x_d = x[d];
      scalar_t y_d = points2[n * (P2 * D) + j * D + d];
      scalar_t diff = x_d - y_d;
      dist += diff * diff;
    }
    min_dist = (j == tid) ? dist : min_dist;
    min_idx = (dist <= min_dist) ? j : min_idx;
    min_dist = (dist <= min_dist) ? dist : min_dist;
  }
  min_dists[tid] = min_dist;
  min_idxs[tid] = min_idx;
  __syncthreads();
  // Perform reduction in shared memory.
  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      if (min_dists[tid] > min_dists[tid + s]) {
        min_dists[tid] = min_dists[tid + s];
        min_idxs[tid] = min_idxs[tid + s];
      }
    }
    __syncthreads();
  }
  // Unroll the last 6 iterations of the loop since they will happen
  // synchronized within a single warp.
  if (tid < 32)
    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
  // Finally thread 0 writes the result to the output buffer.
  if (tid == 0) {
    idx[n * P1 + i] = min_idxs[0];
  }
}
// CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
// pointclouds. This is a specialization of the nearest_neighbor_kernel
// to the case D=3.
//
// Args:
// points1: First set of pointclouds, of shape (N, P1, 3).
// points2: Second set of pointclouds, of shape (N, P2, 3).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
//
template <typename scalar_t>
__global__ void nearest_neighbor_kernel_D3(
    const scalar_t* __restrict__ points1,
    const scalar_t* __restrict__ points2,
    long* __restrict__ idx,
    const size_t N,
    const size_t P1,
    const size_t P2) {
  // Single shared memory buffer which is split and cast to different types.
  extern __shared__ char shared_buf[];
  scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
  const size_t D = 3;
  const size_t n = blockIdx.y; // index of batch element.
  const size_t i = blockIdx.x; // index of point within batch element.
  const size_t tid = threadIdx.x;
  // Retrieve the coordinates of points1[n, i] from global memory; these
  // will be stored in registers for fast access.
  const scalar_t x = points1[n * (P1 * D) + i * D + 0];
  const scalar_t y = points1[n * (P1 * D) + i * D + 1];
  const scalar_t z = points1[n * (P1 * D) + i * D + 2];
  // Compute distances between points1[n, i] and all points2[n, j]
  // for 0 <= j < P2.
  scalar_t min_dist = FLT_MAX;
  size_t min_idx = 0;
  // Distance computation for points in p2 is spread across the threads in
  // the block.
  for (size_t j = tid; j < P2; j += blockDim.x) {
    scalar_t dx = x - points2[n * (P2 * D) + j * D + 0];
    scalar_t dy = y - points2[n * (P2 * D) + j * D + 1];
    scalar_t dz = z - points2[n * (P2 * D) + j * D + 2];
    scalar_t dist = dx * dx + dy * dy + dz * dz;
    min_dist = (j == tid) ? dist : min_dist;
    min_idx = (dist <= min_dist) ? j : min_idx;
    min_dist = (dist <= min_dist) ? dist : min_dist;
  }
  min_dists[tid] = min_dist;
  min_idxs[tid] = min_idx;
  // Synchronize local threads writing to the shared memory buffer.
  __syncthreads();
  // Perform reduction in shared memory.
  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      if (min_dists[tid] > min_dists[tid + s]) {
        min_dists[tid] = min_dists[tid + s];
        min_idxs[tid] = min_idxs[tid + s];
      }
    }
    // Synchronize local threads so that min_dists is correct.
    __syncthreads();
  }
  // Unroll the last 6 iterations of the loop since they will happen
  // synchronized within a single warp.
  if (tid < 32)
    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
  // Finally thread 0 writes the result to the output buffer.
  if (tid == 0) {
    idx[n * P1 + i] = min_idxs[0];
  }
}
at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2) {
  const auto N = p1.size(0);
  const auto P1 = p1.size(1);
  const auto P2 = p2.size(1);
  const auto D = p1.size(2);
  AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
  auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
  // On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
  // gives best results.
  const int threads = 128;
  const dim3 blocks(P1, N);
  if (D == 3) {
    // Use the specialized kernel for D = 3.
    AT_DISPATCH_FLOATING_TYPES(p1.type(), "nearest_neighbor_v3_cuda", ([&] {
      // sizeof(size_t) >= sizeof(scalar_t), so this is a safe (slightly
      // generous) upper bound on the shared memory the kernel uses.
      size_t shared_size = threads * sizeof(size_t) + threads * sizeof(long);
      nearest_neighbor_kernel_D3<scalar_t>
          <<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
              idx.data_ptr<long>(),
              N,
              P1,
              P2);
    }));
  } else {
    // Use the general kernel for all other D.
    AT_DISPATCH_FLOATING_TYPES(
        p1.type(), "nearest_neighbor_v3_cuda", ([&] {
          // To avoid misaligned memory access, the size of the shared buffer
          // needs to be rounded up to the next even number of elements.
          size_t D_2 = D + (D % 2);
          size_t shared_size = (D_2 + threads) * sizeof(size_t);
          shared_size += threads * sizeof(long);
          nearest_neighbor_kernel<scalar_t><<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
              idx.data_ptr<long>(),
              N,
              P1,
              P2,
              D,
              D_2);
        }));
  }
  return idx;
}
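The kernels above are an optimized block-parallel argmin. A brute-force PyTorch sketch that computes the same output on small inputs (useful as a mental model or a test oracle; names are illustrative):

import torch

p1 = torch.rand(2, 8, 3)  # (N, P1, D)
p2 = torch.rand(2, 6, 3)  # (N, P2, D)
d2 = ((p1[:, :, None, :] - p2[:, None, :, :]) ** 2).sum(-1)  # (N, P1, P2)
idx = d2.argmin(dim=2)  # (N, P1): index in p2 of each point's nearest neighbor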
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
// Compute indices of nearest neighbors in pointcloud p2 to points
// in pointcloud p1.
//
// Args:
// p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
// containing P1 points of dimension D.
// p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
// containing P2 points of dimension D.
//
// Returns:
// p1_neighbor_idx: LongTensor of shape (N, P1), where
// p1_neighbor_idx[n, i] = j means that the nearest neighbor
// to p1[n, i] in the cloud p2[n] is p2[n, j].
//
// Cuda implementation.
at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2);
// Implementation which is exposed.
at::Tensor nn_points_idx(at::Tensor p1, at::Tensor p2) {
  if (p1.type().is_cuda() && p2.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(p1);
    CHECK_CONTIGUOUS_CUDA(p2);
    return nn_points_idx_cuda(p1, p2);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
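A hedged usage sketch, assuming the extension is importable as `pytorch3d._C`; per the checks above, both inputs must be contiguous CUDA tensors.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

p1 = torch.rand(2, 8, 3, device="cuda")
p2 = torch.rand(2, 6, 3, device="cuda")
idx = _C.nn_points_idx(p1, p2)  # (2, 8) LongTensor of indices into p2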
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
template <typename scalar_t>
__global__ void packed_to_padded_tensor_kernel(
const scalar_t* __restrict__ inputs,
const long* __restrict__ first_idxs,
scalar_t* __restrict__ inputs_padded,
const size_t batch_size,
const size_t max_size,
const size_t num_inputs) {
// Batch elements split evenly across blocks (num blocks = batch_size) and
// values for each element split across threads in the block. Each thread adds
// the values of its respective input elements to the global inputs_padded
// tensor.
const size_t tid = threadIdx.x;
const size_t batch_idx = blockIdx.x;
const long start = first_idxs[batch_idx];
const long end =
batch_idx + 1 < batch_size ? first_idxs[batch_idx + 1] : num_inputs;
const int num_faces = end - start;
for (size_t f = tid; f < num_faces; f += blockDim.x) {
inputs_padded[batch_idx * max_size + f] = inputs[start + f];
}
}
at::Tensor packed_to_padded_tensor_cuda(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size) {
  const auto num_inputs = inputs.size(0);
  const auto batch_size = first_idxs.size(0);
  at::Tensor inputs_padded =
      at::zeros({batch_size, max_size}, inputs.options());
  const int threads = 512;
  const int blocks = batch_size;
  AT_DISPATCH_FLOATING_TYPES(
      inputs.type(), "packed_to_padded_tensor_kernel", ([&] {
        packed_to_padded_tensor_kernel<scalar_t><<<blocks, threads>>>(
            inputs.data_ptr<scalar_t>(),
            first_idxs.data_ptr<long>(),
            inputs_padded.data_ptr<scalar_t>(),
            batch_size,
            max_size,
            num_inputs);
      }));
  return inputs_padded;
}
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
// Converts a packed tensor into a padded tensor, restoring the batch dimension.
// Refer to pytorch3d/structures/meshes.py for details on packed/padded tensors.
//
// Inputs:
// inputs: FloatTensor of shape (F,), representing the packed batch tensor.
// e.g. areas for faces in a batch of meshes.
// first_idxs: LongTensor of shape (N,) where N is the number of
// elements in the batch and `first_idxs[i] = f`
// means that the inputs for batch element i begin at
// `inputs[f]`.
// max_size: Max length of an element in the batch.
// Returns:
// inputs_padded: FloatTensor of shape (N, max_size). The values for batch
// element i, which start at `inputs[first_idxs[i]]`, are
// copied to `inputs_padded[i, :]`, with zeros padding out the
// extra entries.
//
// Cuda implementation.
at::Tensor packed_to_padded_tensor_cuda(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size);
// Implementation which is exposed.
at::Tensor packed_to_padded_tensor(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size) {
  if (inputs.type().is_cuda()) {
#ifdef WITH_CUDA
    return packed_to_padded_tensor_cuda(inputs, first_idxs, max_size);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
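A small worked example of the semantics, as a hedged sketch (the `pytorch3d._C` import path is an assumption): with two batch elements owning 3 and 2 values respectively, first_idxs is [0, 3], and max_size=3 yields one zero-padded row per element.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

inputs = torch.tensor([1., 2., 3., 4., 5.], device="cuda")  # packed (F,)
first_idxs = torch.tensor([0, 3], device="cuda")  # start index per element
padded = _C.packed_to_padded_tensor(inputs, first_idxs, 3)
# padded == [[1., 2., 3.],
#            [4., 5., 0.]]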