Commit dbf06b50 authored by facebook-github-bot's avatar facebook-github-bot

Initial commit

fbshipit-source-id: ad58e416e3ceeca85fae0583308968d04e78fe0d
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
:: Set env vars that tell distutils to use the compiler that we put on path
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
SET "VS_VERSION=15.0"
SET "VS_MAJOR=15"
SET "VS_YEAR=2017"
set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
set "MSYS2_ENV_CONV_EXCL=CL"
:: For Python 3.5+, ensure that we link with the dynamic runtime. See
:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VSINSTALLDIR=%%i\"
goto :vswhere
)
)
:vswhere
:: Shorten PATH to avoid the `input line too long` error.
SET MyPath=%PATH%
setlocal EnableDelayedExpansion
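:: The substitution below turns PATH from `a;b;c` into `"a";"b";"c"` so the
:: FOR loop can iterate entries that contain spaces; %%~sa then rewrites each
:: surviving entry to its short (8.3) form.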
SET TempPath="%MyPath:;=";"%"
SET var=
FOR %%a IN (%TempPath%) DO (
    IF EXIST %%~sa (
        SET "var=!var!;%%~sa"
    )
)
set "TempPath=!var:~1!"
endlocal & set "PATH=%TempPath%"
:: Shorten current directory too
FOR %%A IN (.) DO CD "%%~sA"
:: other things added by install_activate.bat at package build time
blas_impl:
  - mkl  # [x86_64]
c_compiler:
  - vs2017  # [win]
cxx_compiler:
  - vs2017  # [win]
python:
  - 3.5
  - 3.6
# This differs from target_platform in that it determines what subdir the compiler
# will target, not what subdir the compiler package will be itself.
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
# code on win-64 miniconda.
cross_compiler_target_platform:
  - win-64  # [win]
target_platform:
  - win-64  # [win]
vc:
  - 14
zip_keys:
  -  # [win]
    - vc  # [win]
    - c_compiler  # [win]
    - cxx_compiler  # [win]
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set YEAR=2017
set VER=15
mkdir "%PREFIX%\etc\conda\activate.d"
COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%cross_compiler_target_platform%" == "win-64" (
set "target_platform=amd64"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%VSDEVCMD_ARGS%" == "" (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) ELSE (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
)
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) else (
set "target_platform=x86"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd
)
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set VC_PATH=x86
if "%ARCH%"=="64" (
set VC_PATH=x64
)
set MSC_VER=2017
rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015
rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
rem set SP=%%A
rem )
rem if not "%SP%" == "%PKG_VERSION%" (
rem echo "Version detected from registry: %SP%"
rem echo "does not match version of package being built (%PKG_VERSION%)"
rem echo "Do you have current updates for VS 2015 installed?"
rem exit 1
rem )
REM ========== REQUIRES the Windows 10 SDK to be installed, or the files otherwise copied to the location below!
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E
if %ERRORLEVEL% GEQ 8 exit 1
REM ========== This one comes from visual studio 2017
set "VC_VER=141"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
goto :eof
)
)
@setlocal
call "%VS15VARSALL%" x64
set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
@endlocal
{% set vcver="14.1" %}
{% set vcfeature="14" %}
{% set vsyear="2017" %}
{% set fullver="15.4.27004.2010" %}
package:
  name: vs{{ vsyear }}
  version: {{ fullver }}

build:
  skip: True  # [not win]
  script_env:
    - VSDEVCMD_ARGS  # [win]

outputs:
  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
    script: install_activate.bat
    track_features:
      # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141".
      strong:
        - vc{{ vcfeature }}
    run_exports:
      - vc {{ vcver }}
    about:
      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
      license: BSD 3-clause
  - name: vs{{ vsyear }}_runtime
    script: install_runtime.bat
  - name: vc
    version: {{ vcver }}
    track_features:
      - vc{{ vcfeature }}
    requirements:
      run:
        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
    about:
      home: https://github.com/conda/conda/wiki/VC-features
      license: Modified BSD License (3-clause)
      license_family: BSD
      summary: A meta-package to track VC features.
      description: |
        This metapackage is used to activate vc features without
        depending on Python.
      doc_url: https://github.com/conda/conda/wiki/VC-features
      dev_url: https://github.com/conda/conda/wiki/VC-features
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
:: Set env vars that tell distutils to use the compiler that we put on path
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
SET "VS_VERSION=16.0"
SET "VS_MAJOR=16"
SET "VS_YEAR=2019"
set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out"
set "MSYS2_ENV_CONV_EXCL=CL"
:: For Python 3.5+, ensure that we link with the dynamic runtime. See
:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info
set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VSINSTALLDIR=%%i\"
goto :vswhere
)
)
:vswhere
:: Shorten PATH to avoid the `input line too long` error.
SET MyPath=%PATH%
setlocal EnableDelayedExpansion
SET TempPath="%MyPath:;=";"%"
SET var=
FOR %%a IN (%TempPath%) DO (
    IF EXIST %%~sa (
        SET "var=!var!;%%~sa"
    )
)
set "TempPath=!var:~1!"
endlocal & set "PATH=%TempPath%"
:: Shorten current directory too
FOR %%A IN (.) DO CD "%%~sA"
:: other things added by install_activate.bat at package build time
blas_impl:
  - mkl  # [x86_64]
c_compiler:
  - vs2019  # [win]
cxx_compiler:
  - vs2019  # [win]
python:
  - 3.5
  - 3.6
# This differs from target_platform in that it determines what subdir the compiler
# will target, not what subdir the compiler package will be itself.
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32
# code on win-64 miniconda.
cross_compiler_target_platform:
  - win-64  # [win]
target_platform:
  - win-64  # [win]
vc:
  - 14
zip_keys:
  -  # [win]
    - vc  # [win]
    - c_compiler  # [win]
    - cxx_compiler  # [win]
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set YEAR=2019
set VER=16
mkdir "%PREFIX%\etc\conda\activate.d"
COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%cross_compiler_target_platform%" == "win-64" (
set "target_platform=amd64"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
IF "%VSDEVCMD_ARGS%" == "" (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) ELSE (
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
)
echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
) else (
set "target_platform=x86"
echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat"
echo popd
)
:: Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
set VC_PATH=x86
if "%ARCH%"=="64" (
set VC_PATH=x64
)
set MSC_VER=2019
rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015
rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO (
rem set SP=%%A
rem )
rem if not "%SP%" == "%PKG_VERSION%" (
rem echo "Version detected from registry: %SP%"
rem echo "does not match version of package being built (%PKG_VERSION%)"
rem echo "Do you have current updates for VS 2015 installed?"
rem exit 1
rem )
REM ========== REQUIRES the Windows 10 SDK to be installed, or the files otherwise copied to the location below!
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E
robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E
if %ERRORLEVEL% GEQ 8 exit 1
REM ========== This one comes from visual studio 2019
set "VC_VER=142"
for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do (
if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
goto :eof
)
)
@setlocal
call "%VS15VARSALL%" x64
set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%"
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E
if %ERRORLEVEL% LSS 8 exit 0
@endlocal
{% set vcver="14.2" %}
{% set vcfeature="14" %}
{% set vsyear="2019" %}
{% set fullver="15.4.27004.2010" %}
package:
  name: vs{{ vsyear }}
  version: {{ fullver }}

build:
  skip: True  # [not win]
  script_env:
    - VSDEVCMD_ARGS  # [win]

outputs:
  - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }}
    script: install_activate.bat
    track_features:
      # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142".
      strong:
        - vc{{ vcfeature }}
    run_exports:
      - vc {{ vcver }}
    about:
      summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler
      license: BSD 3-clause
  - name: vs{{ vsyear }}_runtime
    script: install_runtime.bat
  - name: vc
    version: {{ vcver }}
    track_features:
      - vc{{ vcfeature }}
    requirements:
      run:
        - {{ pin_subpackage('vs' ~ vsyear ~ '_runtime') }}
    about:
      home: https://github.com/conda/conda/wiki/VC-features
      license: Modified BSD License (3-clause)
      license_family: BSD
      summary: A meta-package to track VC features.
      description: |
        This metapackage is used to activate vc features without
        depending on Python.
      doc_url: https://github.com/conda/conda/wiki/VC-features
      dev_url: https://github.com/conda/conda/wiki/VC-features
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
__version__ = "0.1"
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <torch/extension.h>
#include "face_areas_normals/face_areas_normals.h"
#include "gather_scatter/gather_scatter.h"
#include "nearest_neighbor_points/nearest_neighbor_points.h"
#include "packed_to_padded_tensor/packed_to_padded_tensor.h"
#include "rasterize_meshes/rasterize_meshes.h"
#include "rasterize_points/rasterize_points.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("face_areas_normals", &face_areas_normals);
  m.def("packed_to_padded_tensor", &packed_to_padded_tensor);
  m.def("nn_points_idx", &nn_points_idx);
  m.def("gather_scatter", &gather_scatter);
  m.def("rasterize_points", &RasterizePoints);
  m.def("rasterize_points_backward", &RasterizePointsBackward);
  m.def("rasterize_meshes_backward", &RasterizeMeshesBackward);
  m.def("rasterize_meshes", &RasterizeMeshes);
  // These are only visible for testing; users should not call them directly.
  m.def("_rasterize_points_coarse", &RasterizePointsCoarse);
  m.def("_rasterize_points_naive", &RasterizePointsNaive);
  m.def("_rasterize_meshes_naive", &RasterizeMeshesNaive);
  m.def("_rasterize_meshes_coarse", &RasterizeMeshesCoarse);
  m.def("_rasterize_meshes_fine", &RasterizeMeshesFine);
}
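For orientation, here is a minimal sketch (not the commit's actual setup.py) of how a TORCH_EXTENSION_NAME module like the one above is typically compiled with PyTorch's cpp_extension helpers. The package and module names are assumptions; the WITH_CUDA macro matches the #ifdef guards in the wrapper headers below.

# Hypothetical build script for the extension defined in ext.cpp.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="pytorch3d_c_ext_sketch",  # hypothetical package name
    ext_modules=[
        CUDAExtension(
            name="pytorch3d._C",  # assumed value of TORCH_EXTENSION_NAME
            sources=[
                "ext.cpp",
                # ... the .cu/.h sources listed in the includes above
            ],
            define_macros=[("WITH_CUDA", None)],  # enables the CUDA branches
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)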
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
#include <tuple>
template <typename scalar_t>
__global__ void face_areas_kernel(
    const scalar_t* __restrict__ verts,
    const long* __restrict__ faces,
    scalar_t* __restrict__ face_areas,
    scalar_t* __restrict__ face_normals,
    const size_t V,
    const size_t F) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = gridDim.x * blockDim.x;
  // Faces are split evenly over the threads in the grid. Each thread computes
  // the area and normal of its assigned faces and writes them to the global
  // face_areas and face_normals tensors.
  for (size_t f = tid; f < F; f += stride) {
    const long i0 = faces[3 * f + 0];
    const long i1 = faces[3 * f + 1];
    const long i2 = faces[3 * f + 2];
    const scalar_t v0_x = verts[3 * i0 + 0];
    const scalar_t v0_y = verts[3 * i0 + 1];
    const scalar_t v0_z = verts[3 * i0 + 2];
    const scalar_t v1_x = verts[3 * i1 + 0];
    const scalar_t v1_y = verts[3 * i1 + 1];
    const scalar_t v1_z = verts[3 * i1 + 2];
    const scalar_t v2_x = verts[3 * i2 + 0];
    const scalar_t v2_y = verts[3 * i2 + 1];
    const scalar_t v2_z = verts[3 * i2 + 2];
    // Edge vectors a = v1 - v0 and b = v2 - v0.
    const scalar_t ax = v1_x - v0_x;
    const scalar_t ay = v1_y - v0_y;
    const scalar_t az = v1_z - v0_z;
    const scalar_t bx = v2_x - v0_x;
    const scalar_t by = v2_y - v0_y;
    const scalar_t bz = v2_z - v0_z;
    // Cross product c = a x b; its length is twice the triangle's area.
    const scalar_t cx = ay * bz - az * by;
    const scalar_t cy = az * bx - ax * bz;
    const scalar_t cz = ax * by - ay * bx;
    scalar_t norm = sqrt(cx * cx + cy * cy + cz * cz);
    face_areas[f] = norm / 2.0;
    norm = (norm < 1e-6) ? 1e-6 : norm; // max(norm, 1e-6)
    face_normals[3 * f + 0] = cx / norm;
    face_normals[3 * f + 1] = cy / norm;
    face_normals[3 * f + 2] = cz / norm;
  }
}
std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
    at::Tensor verts,
    at::Tensor faces) {
  const auto V = verts.size(0);
  const auto F = faces.size(0);
  at::Tensor areas = at::empty({F}, verts.options());
  at::Tensor normals = at::empty({F, 3}, verts.options());
  const int blocks = 64;
  const int threads = 512;
  AT_DISPATCH_FLOATING_TYPES(verts.type(), "face_areas_kernel", ([&] {
    face_areas_kernel<scalar_t><<<blocks, threads>>>(
        verts.data_ptr<scalar_t>(),
        faces.data_ptr<long>(),
        areas.data_ptr<scalar_t>(),
        normals.data_ptr<scalar_t>(),
        V,
        F);
  }));
  return std::make_tuple(areas, normals);
}
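The kernel's math can be checked against plain PyTorch on a single triangle. A minimal sketch (illustrative names, not part of the commit): area = ||(v1 - v0) x (v2 - v0)|| / 2, and the normal is the cross product scaled by its clamped length.

import torch

v0 = torch.tensor([0., 0., 0.])
v1 = torch.tensor([1., 0., 0.])
v2 = torch.tensor([0., 1., 0.])
cross = torch.cross(v1 - v0, v2 - v0, dim=0)  # the kernel's c = a x b
area = cross.norm() / 2  # 0.5 for this right triangle
normal = cross / cross.norm().clamp(min=1e-6)  # mirrors the kernel's 1e-6 floor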
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include <tuple>
// Compute areas of mesh faces using packed representation.
//
// Inputs:
// verts: FloatTensor of shape (V, 3) giving vertex positions.
// faces: LongTensor of shape (F, 3) giving faces.
//
// Returns:
// areas: FloatTensor of shape (F,) where areas[f] is the area of faces[f].
// normals: FloatTensor of shape (F, 3) where normals[f] is the normal of
// faces[f]
//
// Cuda implementation.
std::tuple<at::Tensor, at::Tensor> face_areas_cuda(
    at::Tensor verts,
    at::Tensor faces);
// Implementation which is exposed.
std::tuple<at::Tensor, at::Tensor> face_areas_normals(
    at::Tensor verts,
    at::Tensor faces) {
  if (verts.type().is_cuda() && faces.type().is_cuda()) {
#ifdef WITH_CUDA
    return face_areas_cuda(verts, faces);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
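A hedged usage sketch of this binding from Python, assuming the built extension is importable as `pytorch3d._C` (the import path is an assumption); shapes follow the documentation above.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path for the built extension

verts = torch.rand(100, 3, device="cuda")  # (V, 3) packed vertex positions
faces = torch.randint(0, 100, (50, 3), device="cuda")  # (F, 3) int64 faces
areas, normals = _C.face_areas_normals(verts, faces)
# areas: (50,) per-face areas; normals: (50, 3) unit face normals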
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
// TODO(T47953967): make this CUDA kernel support all data types.
__global__ void gather_scatter_kernel(
    const float* __restrict__ input,
    const long* __restrict__ edges,
    float* __restrict__ output,
    bool directed,
    bool backward,
    const size_t V,
    const size_t D,
    const size_t E) {
  const int tid = threadIdx.x;
  // Reverse the vertex order if backward.
  const int v0_idx = backward ? 1 : 0;
  const int v1_idx = backward ? 0 : 1;
  // Edges are split evenly across the blocks.
  for (int e = blockIdx.x; e < E; e += gridDim.x) {
    // Get the indices of the vertices which form the edge.
    const long v0 = edges[2 * e + v0_idx];
    const long v1 = edges[2 * e + v1_idx];
    // Split vertex features evenly across threads. This implementation is
    // quite wasteful when D < 128 since many threads will do nothing.
    for (int d = tid; d < D; d += blockDim.x) {
      const float val = input[v1 * D + d];
      float* address = output + v0 * D + d;
      atomicAdd(address, val);
      if (!directed) {
        const float val = input[v0 * D + d];
        float* address = output + v1 * D + d;
        atomicAdd(address, val);
      }
    }
    __syncthreads();
  }
}
at::Tensor gather_scatter_cuda(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward) {
  const auto num_vertices = input.size(0);
  const auto input_feature_dim = input.size(1);
  const auto num_edges = edges.size(0);
  auto output = at::zeros({num_vertices, input_feature_dim}, input.options());
  const size_t threads = 128;
  const size_t max_blocks = 1920;
  const size_t blocks = num_edges < max_blocks ? num_edges : max_blocks;
  gather_scatter_kernel<<<blocks, threads>>>(
      input.data_ptr<float>(),
      edges.data_ptr<long>(),
      output.data_ptr<float>(),
      directed,
      backward,
      num_vertices,
      input_feature_dim,
      num_edges);
  return output;
}
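For reference, a minimal pure-PyTorch sketch of the aggregation the kernel performs (output[v0] += input[v1] per edge, plus the symmetric update when the graph is undirected). Names here are illustrative only, not part of the commit.

import torch

V, D = 5, 4
inp = torch.rand(V, D)  # (V, D) vertex features
edges = torch.tensor([[0, 1], [1, 2]])  # (E, 2) int64 edge list
out = torch.zeros_like(inp)
out.index_add_(0, edges[:, 0], inp[edges[:, 1]])  # directed update
out.index_add_(0, edges[:, 1], inp[edges[:, 0]])  # undirected: reverse too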
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
// Fused gather scatter operation for aggregating features of neighbor nodes
// in a graph. This gather scatter operation is specific to graphs as edge
// indices are used as input.
//
// Args:
// input: float32 Tensor of shape (V, D) where V is the number of vertices
// and D is the feature dimension.
// edges: int64 Tensor of shape (E, 2) giving the indices of the vertices that
// make up the edge. E is the number of edges.
// directed: Bool indicating if edges in the graph are directed. For a
// directed graph v0 -> v1 the updated feature for v0 depends on v1.
// backward: Bool indicating if the operation is the backward pass.
//
// Returns:
// output: float32 Tensor of same shape as input.
// Cuda implementation.
at::Tensor gather_scatter_cuda(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward);
// Exposed implementation.
at::Tensor gather_scatter(
    const at::Tensor input,
    const at::Tensor edges,
    bool directed,
    bool backward) {
  if (input.type().is_cuda() && edges.type().is_cuda()) {
#ifdef WITH_CUDA
    return gather_scatter_cuda(input, edges, directed, backward);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
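A hedged usage sketch, assuming the extension is importable as `pytorch3d._C`; the positional arguments follow the signature documented above (input, edges, directed, backward).

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

x = torch.rand(5, 16, device="cuda")  # (V, D) vertex features
edges = torch.tensor([[0, 1], [1, 2]], device="cuda")  # (E, 2)
out = _C.gather_scatter(x, edges, False, False)  # directed=False, backward=False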
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
#include <float.h>
template <typename scalar_t>
__device__ void warp_reduce(
    volatile scalar_t* min_dists,
    volatile long* min_idxs,
    const size_t tid) {
  // s = 32
  if (min_dists[tid] > min_dists[tid + 32]) {
    min_idxs[tid] = min_idxs[tid + 32];
    min_dists[tid] = min_dists[tid + 32];
  }
  // s = 16
  if (min_dists[tid] > min_dists[tid + 16]) {
    min_idxs[tid] = min_idxs[tid + 16];
    min_dists[tid] = min_dists[tid + 16];
  }
  // s = 8
  if (min_dists[tid] > min_dists[tid + 8]) {
    min_idxs[tid] = min_idxs[tid + 8];
    min_dists[tid] = min_dists[tid + 8];
  }
  // s = 4
  if (min_dists[tid] > min_dists[tid + 4]) {
    min_idxs[tid] = min_idxs[tid + 4];
    min_dists[tid] = min_dists[tid + 4];
  }
  // s = 2
  if (min_dists[tid] > min_dists[tid + 2]) {
    min_idxs[tid] = min_idxs[tid + 2];
    min_dists[tid] = min_dists[tid + 2];
  }
  // s = 1
  if (min_dists[tid] > min_dists[tid + 1]) {
    min_idxs[tid] = min_idxs[tid + 1];
    min_dists[tid] = min_dists[tid + 1];
  }
}
// CUDA kernel to compute nearest neighbors between two batches of pointclouds
// where each point is of dimension D.
//
// Args:
// points1: First set of points, of shape (N, P1, D).
// points2: Second set of points, of shape (N, P2, D).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
// D_2: Slice size for the shared buffer; this is D rounded up to an even
// number so that shared memory access is aligned.
//
template <typename scalar_t>
__global__ void nearest_neighbor_kernel(
    const scalar_t* __restrict__ points1,
    const scalar_t* __restrict__ points2,
    long* __restrict__ idx,
    const size_t N,
    const size_t P1,
    const size_t P2,
    const size_t D,
    const size_t D_2) {
  // Each block will compute one element of the output idx[n, i]. Within the
  // block we will use threads to compute the distances between points1[n, i]
  // and points2[n, j] for all 0 <= j < P2, then use a block reduction to
  // take an argmin of the distances.
  // Shared buffers for the threads in the block. CUDA only allows declaration
  // of a single shared buffer, so it needs to be manually sliced and cast to
  // build several logical shared buffers of different types.
  extern __shared__ char shared_buf[];
  scalar_t* x = (scalar_t*)shared_buf; // scalar_t[D_2]
  scalar_t* min_dists = &x[D_2]; // scalar_t[NUM_THREADS]
  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
  const size_t n = blockIdx.y; // index of batch element.
  const size_t i = blockIdx.x; // index of point within batch element.
  const size_t tid = threadIdx.x;
  // Thread 0 copies points1[n, i, :] into x.
  if (tid == 0) {
    for (size_t d = 0; d < D; d++) {
      x[d] = points1[n * (P1 * D) + i * D + d];
    }
  }
  __syncthreads();
  // Compute the distances between points1[n, i] and points2[n, j] for
  // all 0 <= j < P2. Each thread reduces over P2 / blockDim.x points in
  // serial and stores its result to shared memory.
  scalar_t min_dist = FLT_MAX;
  size_t min_idx = 0;
  for (size_t j = tid; j < P2; j += blockDim.x) {
    scalar_t dist = 0;
    for (size_t d = 0; d < D; d++) {
      scalar_t x_d = x[d];
      scalar_t y_d = points2[n * (P2 * D) + j * D + d];
      scalar_t diff = x_d - y_d;
      dist += diff * diff;
    }
    min_dist = (j == tid) ? dist : min_dist;
    min_idx = (dist <= min_dist) ? j : min_idx;
    min_dist = (dist <= min_dist) ? dist : min_dist;
  }
  min_dists[tid] = min_dist;
  min_idxs[tid] = min_idx;
  __syncthreads();
  // Perform reduction in shared memory.
  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      if (min_dists[tid] > min_dists[tid + s]) {
        min_dists[tid] = min_dists[tid + s];
        min_idxs[tid] = min_idxs[tid + s];
      }
    }
    __syncthreads();
  }
  // Unroll the last 6 iterations of the loop since they will happen
  // synchronized within a single warp.
  if (tid < 32)
    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
  // Finally thread 0 writes the result to the output buffer.
  if (tid == 0) {
    idx[n * P1 + i] = min_idxs[0];
  }
}
// CUDA kernel to compute nearest neighbors between two sets of 3-dimensional
// pointclouds. This is a specialization of the nearest_neighbor_kernel
// to the case D=3.
//
// Args:
// points1: First set of pointclouds, of shape (N, P1, 3).
// points2: Second set of pointclouds, of shape (N, P2, 3).
// idx: Output memory buffer of shape (N, P1).
// N: Batch size.
// P1: Number of points in points1.
// P2: Number of points in points2.
//
template <typename scalar_t>
__global__ void nearest_neighbor_kernel_D3(
    const scalar_t* __restrict__ points1,
    const scalar_t* __restrict__ points2,
    long* __restrict__ idx,
    const size_t N,
    const size_t P1,
    const size_t P2) {
  // Single shared memory buffer which is split and cast to different types.
  extern __shared__ char shared_buf[];
  scalar_t* min_dists = (scalar_t*)shared_buf; // scalar_t[NUM_THREADS]
  long* min_idxs = (long*)&min_dists[blockDim.x]; // long[NUM_THREADS]
  const size_t D = 3;
  const size_t n = blockIdx.y; // index of batch element.
  const size_t i = blockIdx.x; // index of point within batch element.
  const size_t tid = threadIdx.x;
  // Retrieve the coordinates of points1[n, i] from global memory; these
  // will be stored in registers for fast access.
  const scalar_t x = points1[n * (P1 * D) + i * D + 0];
  const scalar_t y = points1[n * (P1 * D) + i * D + 1];
  const scalar_t z = points1[n * (P1 * D) + i * D + 2];
  // Compute distances between points1[n, i] and all points2[n, j]
  // for 0 <= j < P2.
  scalar_t min_dist = FLT_MAX;
  size_t min_idx = 0;
  // Distance computation for points in p2 is spread across the threads in
  // the block.
  for (size_t j = tid; j < P2; j += blockDim.x) {
    scalar_t dx = x - points2[n * (P2 * D) + j * D + 0];
    scalar_t dy = y - points2[n * (P2 * D) + j * D + 1];
    scalar_t dz = z - points2[n * (P2 * D) + j * D + 2];
    scalar_t dist = dx * dx + dy * dy + dz * dz;
    min_dist = (j == tid) ? dist : min_dist;
    min_idx = (dist <= min_dist) ? j : min_idx;
    min_dist = (dist <= min_dist) ? dist : min_dist;
  }
  min_dists[tid] = min_dist;
  min_idxs[tid] = min_idx;
  // Synchronize local threads writing to the shared memory buffer.
  __syncthreads();
  // Perform reduction in shared memory.
  for (int s = blockDim.x / 2; s > 32; s >>= 1) {
    if (tid < s) {
      if (min_dists[tid] > min_dists[tid + s]) {
        min_dists[tid] = min_dists[tid + s];
        min_idxs[tid] = min_idxs[tid + s];
      }
    }
    // Synchronize local threads so that min_dists is correct.
    __syncthreads();
  }
  // Unroll the last 6 iterations of the loop since they will happen
  // synchronized within a single warp.
  if (tid < 32)
    warp_reduce<scalar_t>(min_dists, min_idxs, tid);
  // Finally thread 0 writes the result to the output buffer.
  if (tid == 0) {
    idx[n * P1 + i] = min_idxs[0];
  }
}
at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2) {
  const auto N = p1.size(0);
  const auto P1 = p1.size(1);
  const auto P2 = p2.size(1);
  const auto D = p1.size(2);
  AT_ASSERTM(p2.size(2) == D, "Point sets must have same last dimension.");
  auto idx = at::empty({N, P1}, p1.options().dtype(at::kLong));
  // On P100 with pointclouds of size (16, 5000, 3), 128 threads per block
  // gives best results.
  const int threads = 128;
  const dim3 blocks(P1, N);
  if (D == 3) {
    // Use the specialized kernel for D = 3.
    AT_DISPATCH_FLOATING_TYPES(p1.type(), "nearest_neighbor_v3_cuda", ([&] {
      // sizeof(size_t) >= sizeof(scalar_t), so this is a safe (slightly
      // generous) upper bound on the shared memory the kernel uses.
      size_t shared_size = threads * sizeof(size_t) + threads * sizeof(long);
      nearest_neighbor_kernel_D3<scalar_t>
          <<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
              idx.data_ptr<long>(),
              N,
              P1,
              P2);
    }));
  } else {
    // Use the general kernel for all other D.
    AT_DISPATCH_FLOATING_TYPES(
        p1.type(), "nearest_neighbor_v3_cuda", ([&] {
          // To avoid misaligned memory access, the size of the shared buffer
          // needs to be rounded up to the next even number of elements.
          size_t D_2 = D + (D % 2);
          size_t shared_size = (D_2 + threads) * sizeof(size_t);
          shared_size += threads * sizeof(long);
          nearest_neighbor_kernel<scalar_t><<<blocks, threads, shared_size>>>(
              p1.data_ptr<scalar_t>(),
              p2.data_ptr<scalar_t>(),
              idx.data_ptr<long>(),
              N,
              P1,
              P2,
              D,
              D_2);
        }));
  }
  return idx;
}
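The kernels above are an optimized block-parallel argmin. A brute-force PyTorch sketch that computes the same output on small inputs (useful as a mental model or a test oracle; names are illustrative):

import torch

p1 = torch.rand(2, 8, 3)  # (N, P1, D)
p2 = torch.rand(2, 6, 3)  # (N, P2, D)
d2 = ((p1[:, :, None, :] - p2[:, None, :, :]) ** 2).sum(-1)  # (N, P1, P2)
idx = d2.argmin(dim=2)  # (N, P1): index in p2 of each point's nearest neighbor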
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
#include "pytorch3d_cutils.h"
// Compute indices of nearest neighbors in pointcloud p2 to points
// in pointcloud p1.
//
// Args:
// p1: FloatTensor of shape (N, P1, D) giving a batch of pointclouds each
// containing P1 points of dimension D.
// p2: FloatTensor of shape (N, P2, D) giving a batch of pointclouds each
// containing P2 points of dimension D.
//
// Returns:
// p1_neighbor_idx: LongTensor of shape (N, P1), where
// p1_neighbor_idx[n, i] = j means that the nearest neighbor
// to p1[n, i] in the cloud p2[n] is p2[n, j].
//
// Cuda implementation.
at::Tensor nn_points_idx_cuda(at::Tensor p1, at::Tensor p2);
// Implementation which is exposed.
at::Tensor nn_points_idx(at::Tensor p1, at::Tensor p2) {
  if (p1.type().is_cuda() && p2.type().is_cuda()) {
#ifdef WITH_CUDA
    CHECK_CONTIGUOUS_CUDA(p1);
    CHECK_CONTIGUOUS_CUDA(p2);
    return nn_points_idx_cuda(p1, p2);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
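A hedged usage sketch, assuming the extension is importable as `pytorch3d._C`; per the checks above, both inputs must be contiguous CUDA tensors.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

p1 = torch.rand(2, 8, 3, device="cuda")
p2 = torch.rand(2, 6, 3, device="cuda")
idx = _C.nn_points_idx(p1, p2)  # (2, 8) LongTensor of indices into p2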
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#include <ATen/ATen.h>
template <typename scalar_t>
__global__ void packed_to_padded_tensor_kernel(
const scalar_t* __restrict__ inputs,
const long* __restrict__ first_idxs,
scalar_t* __restrict__ inputs_padded,
const size_t batch_size,
const size_t max_size,
const size_t num_inputs) {
// Batch elements split evenly across blocks (num blocks = batch_size) and
// values for each element split across threads in the block. Each thread adds
// the values of its respective input elements to the global inputs_padded
// tensor.
const size_t tid = threadIdx.x;
const size_t batch_idx = blockIdx.x;
const long start = first_idxs[batch_idx];
const long end =
batch_idx + 1 < batch_size ? first_idxs[batch_idx + 1] : num_inputs;
const int num_faces = end - start;
for (size_t f = tid; f < num_faces; f += blockDim.x) {
inputs_padded[batch_idx * max_size + f] = inputs[start + f];
}
}
at::Tensor packed_to_padded_tensor_cuda(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size) {
  const auto num_inputs = inputs.size(0);
  const auto batch_size = first_idxs.size(0);
  at::Tensor inputs_padded =
      at::zeros({batch_size, max_size}, inputs.options());
  const int threads = 512;
  const int blocks = batch_size;
  AT_DISPATCH_FLOATING_TYPES(
      inputs.type(), "packed_to_padded_tensor_kernel", ([&] {
        packed_to_padded_tensor_kernel<scalar_t><<<blocks, threads>>>(
            inputs.data_ptr<scalar_t>(),
            first_idxs.data_ptr<long>(),
            inputs_padded.data_ptr<scalar_t>(),
            batch_size,
            max_size,
            num_inputs);
      }));
  return inputs_padded;
}
// Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#pragma once
#include <torch/extension.h>
// Converts a packed tensor into a padded tensor, restoring the batch dimension.
// Refer to pytorch3d/structures/meshes.py for details on packed/padded tensors.
//
// Inputs:
// inputs: FloatTensor of shape (F,), representing the packed batch tensor.
// e.g. areas for faces in a batch of meshes.
// first_idxs: LongTensor of shape (N,) where N is the number of
// elements in the batch and `first_idxs[i] = f`
// means that the inputs for batch element i begin at
// `inputs[f]`.
// max_size: Max length of an element in the batch.
// Returns:
// inputs_padded: FloatTensor of shape (N, max_size). The values for batch
// element i, which start at `inputs[first_idxs[i]]`, are
// copied to `inputs_padded[i, :]`, with zeros padding out the
// extra entries.
//
// Cuda implementation.
at::Tensor packed_to_padded_tensor_cuda(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size);
// Implementation which is exposed.
at::Tensor packed_to_padded_tensor(
    at::Tensor inputs,
    at::Tensor first_idxs,
    const long max_size) {
  if (inputs.type().is_cuda()) {
#ifdef WITH_CUDA
    return packed_to_padded_tensor_cuda(inputs, first_idxs, max_size);
#else
    AT_ERROR("Not compiled with GPU support.");
#endif
  }
  AT_ERROR("Not implemented on the CPU.");
}
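A small worked example of the semantics, as a hedged sketch (the `pytorch3d._C` import path is an assumption): with two batch elements owning 3 and 2 values respectively, first_idxs is [0, 3], and max_size=3 yields one zero-padded row per element.

# Hypothetical call through the compiled extension.
import torch
from pytorch3d import _C  # assumed import path

inputs = torch.tensor([1., 2., 3., 4., 5.], device="cuda")  # packed (F,)
first_idxs = torch.tensor([0, 3], device="cuda")  # start index per element
padded = _C.packed_to_padded_tensor(inputs, first_idxs, 3)
# padded == [[1., 2., 3.],
#            [4., 5., 0.]]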