Unverified Commit 80b99adb authored by Matthias Fey, committed by GitHub

Merge pull request #57 from rusty1s/wheel

[WIP] Python wheels
parents 0194ebb6 bc476876
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/knn_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__knn(void) { return NULL; }
#endif
torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, int64_t k, bool cosine) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return knn_cuda(x, y, ptr_x, ptr_y, k, cosine);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::knn", &knn);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/nearest_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__nearest(void) { return NULL; }
#endif
torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return nearest_cuda(x, y, ptr_x, ptr_y);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::nearest", &nearest);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include "cuda/radius_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__radius(void) { return NULL; }
#endif
torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, double r, int64_t max_num_neighbors) {
if (x.device().is_cuda()) {
#ifdef WITH_CUDA
return radius_cuda(x, y, ptr_x, ptr_y, r, max_num_neighbors);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
AT_ERROR("No CPU version supported");
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::radius", &radius);
#include <Python.h>
#include <torch/script.h>
#include "cpu/rw_cpu.h"
#ifdef WITH_CUDA
#include "cuda/rw_cuda.h"
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__rw(void) { return NULL; }
#endif
torch::Tensor random_walk(torch::Tensor rowptr, torch::Tensor col,
torch::Tensor start, int64_t walk_length, double p,
double q) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_CUDA
return random_walk_cuda(rowptr, col, start, walk_length, p, q);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return random_walk_cpu(rowptr, col, start, walk_length, p, q);
}
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::random_walk", &random_walk);
#include <Python.h>
#include <torch/script.h>
#include "cpu/sampler_cpu.h"
#ifdef _WIN32
PyMODINIT_FUNC PyInit__sampler(void) { return NULL; }
#endif
torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor) {
if (rowptr.device().is_cuda()) {
#ifdef WITH_CUDA
AT_ERROR("No CUDA version supported");
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return neighbor_sampler_cpu(start, rowptr, count, factor);
}
}
static auto registry = torch::RegisterOperators().op(
"torch_cluster::neighbor_sampler", &neighbor_sampler);
#include <Python.h>
#include <torch/script.h>
#ifdef WITH_CUDA
#include <cuda.h>
#endif
#ifdef _WIN32
PyMODINIT_FUNC PyInit__version(void) { return NULL; }
#endif
int64_t cuda_version() {
#ifdef WITH_CUDA
return CUDA_VERSION;
#else
return -1;
#endif
}
static auto registry =
torch::RegisterOperators().op("torch_cluster::cuda_version", &cuda_version);
#pragma once
#include <ATen/ATen.h>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
#define BLUE_PROB 0.53406
__device__ int64_t done;
__global__ void init_done_kernel() { done = 1; }
__global__ void colorize_kernel(int64_t *cluster, float *__restrict__ bernoulli,
size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] < 0) {
// Color the unmatched node: blue (-1) with probability BLUE_PROB, red (-2) otherwise.
cluster[u] = (int64_t)bernoulli[u] - 2;
done = 0;
}
}
}
int64_t colorize(at::Tensor cluster) {
init_done_kernel<<<1, 1>>>();
auto numel = cluster.size(0);
auto props = at::full(numel, BLUE_PROB, cluster.options().dtype(at::kFloat));
auto bernoulli = props.bernoulli();
colorize_kernel<<<BLOCKS(numel), THREADS>>>(
cluster.DATA_PTR<int64_t>(), bernoulli.DATA_PTR<float>(), numel);
int64_t out;
cudaMemcpyFromSymbol(&out, done, sizeof(out), 0, cudaMemcpyDeviceToHost);
return out;
}
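// colorize() re-colors every node that is still unmatched (cluster[u] < 0) and
// returns the device flag `done`, which stays 1 only if no unmatched node was
// left. Callers therefore loop until it returns non-zero, as graclus_cuda does
// later in this diff:
//
//   while (!colorize(cluster)) {
//     propose(cluster, proposal, row, col);
//     respond(cluster, proposal, row, col);
//   }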
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
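// Compatibility shim: PyTorch >= 1.3 renamed Tensor::data<T>() to
// Tensor::data_ptr<T>(), so `tensor.DATA_PTR<float>()` expands to whichever
// accessor the installed version provides (VERSION_GE_1_3 is assumed to be
// defined by the build script). Usage as seen throughout these kernels:
//
//   auto ptr = cluster.DATA_PTR<int64_t>();  // data_ptr<int64_t>() on >= 1.3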
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor fps_cuda(at::Tensor x, at::Tensor batch, float ratio, bool random);
at::Tensor fps(at::Tensor x, at::Tensor batch, float ratio, bool random) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(batch);
return fps_cuda(x, batch, ratio, random);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("fps", &fps, "Farthest Point Sampling (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t, int64_t Dim> struct Dist;
template <typename scalar_t> struct Dist<scalar_t, 1> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t d = x[old] - x[n];
dist[n] = min(dist[n], d * d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, 2> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t a = x[2 * old + 0] - x[2 * n + 0];
scalar_t b = x[2 * old + 1] - x[2 * n + 1];
scalar_t d = a * a + b * b;
dist[n] = min(dist[n], d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, 3> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
scalar_t a = x[3 * old + 0] - x[3 * n + 0];
scalar_t b = x[3 * old + 1] - x[3 * n + 1];
scalar_t c = x[3 * old + 2] - x[3 * n + 2];
scalar_t d = a * a + b * b + c * c;
dist[n] = min(dist[n], d);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t> struct Dist<scalar_t, -1> {
static __device__ void
compute(ptrdiff_t idx, ptrdiff_t start_idx, ptrdiff_t end_idx, ptrdiff_t old,
scalar_t *__restrict__ best, ptrdiff_t *__restrict__ best_idx,
const scalar_t *__restrict__ x, scalar_t *__restrict__ dist,
scalar_t *__restrict__ tmp_dist, size_t dim) {
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
tmp_dist[n] = 0;
}
__syncthreads();
for (ptrdiff_t i = start_idx * dim + idx; i < end_idx * dim; i += THREADS) {
scalar_t d = x[(old * dim) + (i % dim)] - x[i];
atomicAdd(&tmp_dist[i / dim], d * d);
}
__syncthreads();
for (ptrdiff_t n = start_idx + idx; n < end_idx; n += THREADS) {
dist[n] = min(dist[n], tmp_dist[n]);
if (dist[n] > *best) {
*best = dist[n];
*best_idx = n;
}
}
}
};
template <typename scalar_t, int64_t Dim>
__global__ void
fps_kernel(const scalar_t *__restrict__ x, const int64_t *__restrict__ cum_deg,
const int64_t *__restrict__ cum_k, const int64_t *__restrict__ start,
scalar_t *__restrict__ dist, scalar_t *__restrict__ tmp_dist,
int64_t *__restrict__ out, size_t dim) {
const ptrdiff_t batch_idx = blockIdx.x;
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx = cum_deg[batch_idx];
const ptrdiff_t end_idx = cum_deg[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
if (idx == 0) {
out[cum_k[batch_idx]] = start_idx + start[batch_idx];
}
for (ptrdiff_t m = cum_k[batch_idx] + 1; m < cum_k[batch_idx + 1]; m++) {
scalar_t best = -1;
ptrdiff_t best_idx = 0;
__syncthreads();
Dist<scalar_t, Dim>::compute(idx, start_idx, end_idx, out[m - 1], &best,
&best_idx, x, dist, tmp_dist, dim);
best_dist[idx] = best;
best_dist_idx[idx] = best_idx;
for (int64_t u = 0; (1 << u) < THREADS; u++) {
__syncthreads();
if (idx < (THREADS >> (u + 1))) {
int64_t idx_1 = (idx * 2) << u;
int64_t idx_2 = (idx * 2 + 1) << u;
if (best_dist[idx_1] < best_dist[idx_2]) {
best_dist[idx_1] = best_dist[idx_2];
best_dist_idx[idx_1] = best_dist_idx[idx_2];
}
}
}
__syncthreads();
if (idx == 0) {
out[m] = best_dist_idx[0];
}
}
}
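// The loop over `u` above is a block-wide argmax reduction in shared memory:
// after the thread-local scan, THREADS candidate (distance, index) pairs are
// halved log2(THREADS) times until best_dist[0] / best_dist_idx[0] hold the
// farthest point of this round. A hedged sketch of the first halving step,
// assuming THREADS = 8 and u = 0: threads 0..3 compare the pairs (0,1), (2,3),
// (4,5), (6,7) and keep the larger distance in the left slot.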
#define FPS_KERNEL(DIM, ...) \
[&] { \
switch (DIM) { \
case 1: \
fps_kernel<scalar_t, 1><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
case 2: \
fps_kernel<scalar_t, 2><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
case 3: \
fps_kernel<scalar_t, 3><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
break; \
default: \
fps_kernel<scalar_t, -1><<<batch_size, THREADS>>>(__VA_ARGS__, DIM); \
} \
}()
at::Tensor fps_cuda(at::Tensor x, at::Tensor batch, float ratio, bool random) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
auto deg = degree(batch, batch_size);
auto cum_deg = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0);
auto k = (deg.toType(at::kFloat) * ratio).ceil().toType(at::kLong);
auto cum_k = at::cat({at::zeros(1, k.options()), k.cumsum(0)}, 0);
at::Tensor start;
if (random) {
start = at::rand(batch_size, x.options());
start = (start * deg.toType(at::kFloat)).toType(at::kLong);
} else {
start = at::zeros(batch_size, k.options());
}
auto dist = at::full(x.size(0), 1e38, x.options());
auto tmp_dist = at::empty(x.size(0), x.options());
auto k_sum = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(k_sum, cum_k[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto out = at::empty(k_sum[0], k.options());
free(k_sum); // Host staging buffer is no longer needed.
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "fps_kernel", [&] {
FPS_KERNEL(x.size(1), x.DATA_PTR<scalar_t>(), cum_deg.DATA_PTR<int64_t>(),
cum_k.DATA_PTR<int64_t>(), start.DATA_PTR<int64_t>(),
dist.DATA_PTR<scalar_t>(), tmp_dist.DATA_PTR<scalar_t>(),
out.DATA_PTR<int64_t>());
});
return out;
}
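// Shape bookkeeping in fps_cuda above, as a small worked example (illustrative
// only): for two examples with deg = [4, 6] and ratio = 0.5,
//
//   cum_deg = [0, 4, 10]          // row ranges per example in x
//   k       = ceil(deg * ratio)   // = [2, 3] sampled points per example
//   cum_k   = [0, 2, 5]           // output ranges per example
//
// so `out` holds 5 point indices and each CUDA block handles one example.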
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
at::Tensor graclus_cuda(at::Tensor row, at::Tensor col, int64_t num_nodes);
at::Tensor weighted_graclus_cuda(at::Tensor row, at::Tensor col,
at::Tensor weight, int64_t num_nodes);
at::Tensor graclus(at::Tensor row, at::Tensor col, int64_t num_nodes) {
CHECK_CUDA(row);
CHECK_CUDA(col);
return graclus_cuda(row, col, num_nodes);
}
at::Tensor weighted_graclus(at::Tensor row, at::Tensor col, at::Tensor weight,
int64_t num_nodes) {
CHECK_CUDA(row);
CHECK_CUDA(col);
CHECK_CUDA(weight);
return weighted_graclus_cuda(row, col, weight, num_nodes);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("graclus", &graclus, "Graclus (CUDA)");
m.def("weighted_graclus", &weighted_graclus, "Weighted Graclus (CUDA)");
}
#include <ATen/ATen.h>
#include "coloring.cuh"
#include "proposal.cuh"
#include "response.cuh"
#include "utils.cuh"
at::Tensor graclus_cuda(at::Tensor row, at::Tensor col, int64_t num_nodes) {
cudaSetDevice(row.get_device());
std::tie(row, col) = remove_self_loops(row, col);
std::tie(row, col) = rand(row, col);
std::tie(row, col) = to_csr(row, col, num_nodes);
auto cluster = at::full(num_nodes, -1, row.options());
auto proposal = at::full(num_nodes, -1, row.options());
while (!colorize(cluster)) {
propose(cluster, proposal, row, col);
respond(cluster, proposal, row, col);
}
return cluster;
}
at::Tensor weighted_graclus_cuda(at::Tensor row, at::Tensor col,
at::Tensor weight, int64_t num_nodes) {
cudaSetDevice(row.get_device());
std::tie(row, col, weight) = remove_self_loops(row, col, weight);
std::tie(row, col, weight) = to_csr(row, col, weight, num_nodes);
auto cluster = at::full(num_nodes, -1, row.options());
auto proposal = at::full(num_nodes, -1, row.options());
while (!colorize(cluster)) {
propose(cluster, proposal, row, col, weight);
respond(cluster, proposal, row, col, weight);
}
return cluster;
}
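// Overall scheme of graclus_cuda / weighted_graclus_cuda (a summary added for
// readability, not part of the original sources): the edge list is cleaned up
// and converted to CSR, then a parallel "blue proposes, red responds" matching
// is run. Unmatched nodes are randomly colored blue (-1) or red (-2), every
// blue node proposes to a red neighbor (the heaviest one in the weighted
// variant), and the response step (respond.cuh, not shown here) pairs them up
// into clusters with non-negative ids. The loop repeats until colorize() finds
// no unmatched node.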
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
at::Tensor grid_cuda(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end);
at::Tensor grid(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end) {
CHECK_CUDA(pos);
CHECK_CUDA(size);
CHECK_CUDA(start);
CHECK_CUDA(end);
return grid_cuda(pos, size, start, end);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("grid", &grid, "Grid (CUDA)");
}
#include <ATen/ATen.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
template <typename scalar_t>
__global__ void grid_kernel(int64_t *cluster,
at::cuda::detail::TensorInfo<scalar_t, int64_t> pos,
scalar_t *__restrict__ size,
scalar_t *__restrict__ start,
scalar_t *__restrict__ end, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (ptrdiff_t i = index; i < numel; i += stride) {
int64_t c = 0, k = 1;
for (ptrdiff_t d = 0; d < pos.sizes[1]; d++) {
scalar_t p = pos.data[i * pos.strides[0] + d * pos.strides[1]] - start[d];
c += (int64_t)(p / size[d]) * k;
k *= (int64_t)((end[d] - start[d]) / size[d]) + 1;
}
cluster[i] = c;
}
}
at::Tensor grid_cuda(at::Tensor pos, at::Tensor size, at::Tensor start,
at::Tensor end) {
cudaSetDevice(pos.get_device());
auto cluster = at::empty(pos.size(0), pos.options().dtype(at::kLong));
AT_DISPATCH_ALL_TYPES(pos.scalar_type(), "grid_kernel", [&] {
grid_kernel<scalar_t><<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(),
at::cuda::detail::getTensorInfo<scalar_t, int64_t>(pos),
size.DATA_PTR<scalar_t>(), start.DATA_PTR<scalar_t>(),
end.DATA_PTR<scalar_t>(), cluster.numel());
});
return cluster;
}
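// The kernel above flattens per-dimension cell coordinates into a single id,
// essentially c = sum_d floor((pos[d] - start[d]) / size[d]) * prod_{d' < d} K_{d'},
// with K_d = floor((end[d] - start[d]) / size[d]) + 1 cells per dimension.
// Worked example (illustrative): pos = (2.5, 1.0), start = (0, 0), end = (4, 4),
// size = (1, 1) gives K = (5, 5) and c = 2 + 1 * 5 = 7.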
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor knn_cuda(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine);
at::Tensor knn(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return knn_cuda(x, y, k, batch_x, batch_y, cosine);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("knn", &knn, "k-Nearest Neighbor (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t> struct Cosine {
static inline __device__ scalar_t dot(const scalar_t *a, const scalar_t *b,
size_t size) {
scalar_t result = 0;
for (ptrdiff_t i = 0; i < size; i++) {
result += a[i] * b[i];
}
return result;
}
static inline __device__ scalar_t norm(const scalar_t *a, size_t size) {
scalar_t result = 0;
for (ptrdiff_t i = 0; i < size; i++) {
result += a[i] * a[i];
}
return sqrt(result);
}
};
template <typename scalar_t>
__global__ void
knn_kernel(const scalar_t *__restrict__ x, const scalar_t *__restrict__ y,
const int64_t *__restrict__ batch_x,
const int64_t *__restrict__ batch_y, scalar_t *__restrict__ dist,
int64_t *__restrict__ row, int64_t *__restrict__ col, size_t k,
size_t dim, bool cosine) {
const ptrdiff_t batch_idx = blockIdx.x;
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx_x = batch_x[batch_idx];
const ptrdiff_t end_idx_x = batch_x[batch_idx + 1];
const ptrdiff_t start_idx_y = batch_y[batch_idx];
const ptrdiff_t end_idx_y = batch_y[batch_idx + 1];
for (ptrdiff_t n_y = start_idx_y + idx; n_y < end_idx_y; n_y += THREADS) {
for (ptrdiff_t k_idx = 0; k_idx < k; k_idx++) {
row[n_y * k + k_idx] = n_y;
}
for (ptrdiff_t n_x = start_idx_x; n_x < end_idx_x; n_x++) {
scalar_t tmp_dist = 0;
if (cosine) {
// Measure the current pair (x[n_x], y[n_y]), not the tensor base pointers.
tmp_dist = Cosine<scalar_t>::norm(x + n_x * dim, dim) *
Cosine<scalar_t>::norm(y + n_y * dim, dim) -
Cosine<scalar_t>::dot(x + n_x * dim, y + n_y * dim, dim);
} else {
for (ptrdiff_t d = 0; d < dim; d++) {
tmp_dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
}
for (ptrdiff_t k_idx_1 = 0; k_idx_1 < k; k_idx_1++) {
if (dist[n_y * k + k_idx_1] > tmp_dist) {
for (ptrdiff_t k_idx_2 = k - 1; k_idx_2 > k_idx_1; k_idx_2--) {
dist[n_y * k + k_idx_2] = dist[n_y * k + k_idx_2 - 1];
col[n_y * k + k_idx_2] = col[n_y * k + k_idx_2 - 1];
}
dist[n_y * k + k_idx_1] = tmp_dist;
col[n_y * k + k_idx_1] = n_x;
break;
}
}
}
}
}
at::Tensor knn_cuda(at::Tensor x, at::Tensor y, size_t k, at::Tensor batch_x,
at::Tensor batch_y, bool cosine) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
batch_x = degree(batch_x, batch_size);
batch_x = at::cat({at::zeros(1, batch_x.options()), batch_x.cumsum(0)}, 0);
batch_y = degree(batch_y, batch_size);
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
auto dist = at::full(y.size(0) * k, 1e38, y.options());
auto row = at::empty(y.size(0) * k, batch_y.options());
auto col = at::full(y.size(0) * k, -1, batch_y.options());
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "knn_kernel", [&] {
knn_kernel<scalar_t><<<batch_size, THREADS>>>(
x.DATA_PTR<scalar_t>(), y.DATA_PTR<scalar_t>(),
batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
dist.DATA_PTR<scalar_t>(), row.DATA_PTR<int64_t>(),
col.DATA_PTR<int64_t>(), k, x.size(1), cosine);
});
auto mask = col != -1;
return at::stack({row.masked_select(mask), col.masked_select(mask)}, 0);
}
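// Output format of knn_cuda above: `row` holds the query index into y (repeated
// k times per query) and `col` the matched index into x, kept sorted by
// distance by the insertion loop in the kernel. Slots that never found a
// neighbor (e.g. when an example in x has fewer than k points) keep col = -1
// and are masked out, so the result is a 2 x E edge index with
// E <= y.size(0) * k.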
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor nearest_cuda(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y);
at::Tensor nearest(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return nearest_cuda(x, y, batch_x, batch_y);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nearest", &nearest, "Nearest Neighbor (CUDA)");
}
#include <ATen/ATen.h>
#include "compat.cuh"
#include "utils.cuh"
#define THREADS 1024
template <typename scalar_t>
__global__ void nearest_kernel(const scalar_t *__restrict__ x,
const scalar_t *__restrict__ y,
const int64_t *__restrict__ batch_x,
const int64_t *__restrict__ batch_y,
int64_t *__restrict__ out, const size_t dim) {
const ptrdiff_t n_x = blockIdx.x;
const ptrdiff_t batch_idx = batch_x[n_x];
const ptrdiff_t idx = threadIdx.x;
const ptrdiff_t start_idx = batch_y[batch_idx];
const ptrdiff_t end_idx = batch_y[batch_idx + 1];
__shared__ scalar_t best_dist[THREADS];
__shared__ int64_t best_dist_idx[THREADS];
scalar_t best = 1e38;
ptrdiff_t best_idx = 0;
for (ptrdiff_t n_y = start_idx + idx; n_y < end_idx; n_y += THREADS) {
scalar_t dist = 0;
for (ptrdiff_t d = 0; d < dim; d++) {
dist += (x[n_x * dim + d] - y[n_y * dim + d]) *
(x[n_x * dim + d] - y[n_y * dim + d]);
}
if (dist < best) {
best = dist;
best_idx = n_y;
}
}
best_dist[idx] = best;
best_dist_idx[idx] = best_idx;
for (int64_t u = 0; (1 << u) < THREADS; u++) {
__syncthreads();
if (idx < (THREADS >> (u + 1))) {
int64_t idx_1 = (idx * 2) << u;
int64_t idx_2 = (idx * 2 + 1) << u;
if (best_dist[idx_1] > best_dist[idx_2]) {
best_dist[idx_1] = best_dist[idx_2];
best_dist_idx[idx_1] = best_dist_idx[idx_2];
}
}
}
__syncthreads();
if (idx == 0) {
out[n_x] = best_dist_idx[0];
}
}
at::Tensor nearest_cuda(at::Tensor x, at::Tensor y, at::Tensor batch_x,
at::Tensor batch_y) {
cudaSetDevice(x.get_device());
auto batch_sizes = (int64_t *)malloc(sizeof(int64_t));
cudaMemcpy(batch_sizes, batch_x[-1].DATA_PTR<int64_t>(), sizeof(int64_t),
cudaMemcpyDeviceToHost);
auto batch_size = batch_sizes[0] + 1;
free(batch_sizes); // Host staging buffer is no longer needed.
batch_y = degree(batch_y, batch_size);
batch_y = at::cat({at::zeros(1, batch_y.options()), batch_y.cumsum(0)}, 0);
auto out = at::empty_like(batch_x);
AT_DISPATCH_FLOATING_TYPES(x.scalar_type(), "nearest_kernel", [&] {
nearest_kernel<scalar_t><<<x.size(0), THREADS>>>(
x.DATA_PTR<scalar_t>(), y.DATA_PTR<scalar_t>(),
batch_x.DATA_PTR<int64_t>(), batch_y.DATA_PTR<int64_t>(),
out.DATA_PTR<int64_t>(), x.size(1));
});
return out;
}
#pragma once
#include <ATen/ATen.h>
#include "compat.cuh"
#define THREADS 1024
#define BLOCKS(N) (((N) + THREADS - 1) / THREADS)
__global__ void propose_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
int64_t *__restrict__ row,
int64_t *__restrict__ col, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -1)
continue; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
if (cluster[v] == -2) {
proposal[u] = v; // Propose to first red neighbor.
break;
}
}
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void propose(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col) {
propose_kernel<<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(), cluster.numel());
}
template <typename scalar_t>
__global__ void propose_kernel(int64_t *__restrict__ cluster, int64_t *proposal,
int64_t *__restrict__ row,
int64_t *__restrict__ col,
scalar_t *__restrict__ weight, size_t numel) {
const size_t index = blockIdx.x * blockDim.x + threadIdx.x;
const size_t stride = blockDim.x * gridDim.x;
for (int64_t u = index; u < numel; u += stride) {
if (cluster[u] != -1)
continue; // Only visit blue nodes.
bool has_unmatched_neighbor = false;
int64_t v_max = -1;
scalar_t w_max = 0;
for (int64_t i = row[u]; i < row[u + 1]; i++) {
auto v = col[i];
if (cluster[v] < 0)
has_unmatched_neighbor = true; // Unmatched neighbor found.
// Find maximum weighted red neighbor.
if (cluster[v] == -2 && weight[i] >= w_max) {
v_max = v;
w_max = weight[i];
}
}
proposal[u] = v_max; // Propose.
if (!has_unmatched_neighbor)
cluster[u] = u;
}
}
void propose(at::Tensor cluster, at::Tensor proposal, at::Tensor row,
at::Tensor col, at::Tensor weight) {
AT_DISPATCH_ALL_TYPES(weight.scalar_type(), "propose_kernel", [&] {
propose_kernel<scalar_t><<<BLOCKS(cluster.numel()), THREADS>>>(
cluster.DATA_PTR<int64_t>(), proposal.DATA_PTR<int64_t>(),
row.DATA_PTR<int64_t>(), col.DATA_PTR<int64_t>(),
weight.DATA_PTR<scalar_t>(), cluster.numel());
});
}
#include <torch/extension.h>
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define IS_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " is not contiguous");
at::Tensor radius_cuda(at::Tensor x, at::Tensor y, float radius,
at::Tensor batch_x, at::Tensor batch_y,
size_t max_num_neighbors);
at::Tensor radius(at::Tensor x, at::Tensor y, float radius, at::Tensor batch_x,
at::Tensor batch_y, size_t max_num_neighbors) {
CHECK_CUDA(x);
IS_CONTIGUOUS(x);
CHECK_CUDA(y);
IS_CONTIGUOUS(y);
CHECK_CUDA(batch_x);
CHECK_CUDA(batch_y);
return radius_cuda(x, y, radius, batch_x, batch_y, max_num_neighbors);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("radius", &radius, "Radius (CUDA)");
}