#include #include "atomics.cuh" #include "utils.cuh" #define THREADS 1024 template __global__ void fps_kernel(scalar_t *__restrict__ x, int64_t *__restrict__ cum_deg, int64_t *__restrict__ cum_k, int64_t *__restrict__ start, scalar_t *__restrict__ dist, scalar_t *__restrict__ tmp_dist, int64_t *__restrict__ out, size_t dim) { const size_t batch_idx = blockIdx.x; const size_t idx = threadIdx.x; const size_t stride = blockDim.x; // == THREADS const size_t start_idx = cum_deg[batch_idx]; const size_t end_idx = cum_deg[batch_idx + 1]; int64_t old = start_idx + start[batch_idx]; if (idx == 0) { out[cum_k[batch_idx]] = old; } for (ptrdiff_t m = cum_k[batch_idx] + 1; m < cum_k[batch_idx + 1]; m++) { for (ptrdiff_t n = start_idx + idx; n < end_idx; n += stride) { tmp_dist[n] = 0; } __syncthreads(); for (ptrdiff_t i = start_idx * dim + idx; i < end_idx * dim; i += stride) { scalar_t d = x[(old * dim) + (i % dim)] - x[i]; atomicAdd(&tmp_dist[i / dim], d * d); } __syncthreads(); for (ptrdiff_t n = start_idx + idx; n < end_idx; n += stride) { dist[n] = min(dist[n], tmp_dist[n]); } } } at::Tensor fps_cuda(at::Tensor x, at::Tensor batch, float ratio, bool random) { auto batch_sizes = (int64_t *)malloc(sizeof(int64_t)); cudaMemcpy(batch_sizes, batch[-1].data(), sizeof(int64_t), cudaMemcpyDeviceToHost); auto batch_size = batch_sizes[0] + 1; auto deg = degree(batch, batch_size); auto cum_deg = at::cat({at::zeros(1, deg.options()), deg.cumsum(0)}, 0); auto k = (deg.toType(at::kFloat) * ratio).round().toType(at::kLong); auto cum_k = at::cat({at::zeros(1, k.options()), k.cumsum(0)}, 0); at::Tensor start; if (random) { start = at::rand(batch_size, x.options()); start = (start * deg.toType(at::kFloat)).toType(at::kLong); } else { start = at::zeros(batch_size, k.options()); } auto dist = at::full(x.size(0), 1e38, x.options()); auto tmp_dist = at::empty(x.size(0), x.options()); auto k_sum = (int64_t *)malloc(sizeof(int64_t)); cudaMemcpy(k_sum, cum_k[-1].data(), sizeof(int64_t), cudaMemcpyDeviceToHost); auto out = at::empty(k_sum[0], k.options()); AT_DISPATCH_FLOATING_TYPES(x.type(), "fps_kernel", [&] { fps_kernel<<>>( x.data(), cum_deg.data(), cum_k.data(), start.data(), dist.data(), tmp_dist.data(), out.data(), x.size(1)); }); return dist; } // at::Tensor ifp_cuda(at::Tensor x, at::Tensor batch, float ratio) { // AT_DISPATCH_FLOATING_TYPES(x.type(), "ifp_kernel", [&] { // ifp_kernel<<>>( // x.data(), batch.data(), ratio, x.numel()); // }); // return x; // } // __global__ void ifps_kernel() {} // // x: [N, F] // // count: [B] // // batch: [N] // // tmp min distances: [N] // // start node idx // // we parallelize over n times f // // parallelization over n times f: We can compute distances over atomicAdd // // each block corresponds to a batch // __global__ void farthestpointsamplingKernel(int b, int n, int m, // const float *__restrict__ // dataset, float *__restrict__ // temp, int *__restrict__ idxs) { // // dataset: [N*3] entries // // b: batch-size // // n: number of nodes // // m: number of sample points // if (m <= 0) // return; // const int BlockSize = 512; // __shared__ float dists[BlockSize]; // __shared__ int dists_i[BlockSize]; // const int BufferSize = 3072; // __shared__ float buf[BufferSize * 3]; // for (int i = blockIdx.x; i < b; i += gridDim.x) { // iterate over all // batches? 
//     int old = 0;
//     if (threadIdx.x == 0)
//       idxs[i * m + 0] = old;
//     for (int j = threadIdx.x; j < n; j += blockDim.x) { // iterate over all n
//       temp[blockIdx.x * n + j] = 1e38;
//     }
//     for (int j = threadIdx.x; j < min(BufferSize, n) * 3; j += blockDim.x) {
//       buf[j] = dataset[i * n * 3 + j];
//     }
//     __syncthreads();
//     for (int j = 1; j < m; j++) {
//       int besti = 0;
//       float best = -1;
//       float x1 = dataset[i * n * 3 + old * 3 + 0];
//       float y1 = dataset[i * n * 3 + old * 3 + 1];
//       float z1 = dataset[i * n * 3 + old * 3 + 2];
//       for (int k = threadIdx.x; k < n; k += blockDim.x) {
//         float td = temp[blockIdx.x * n + k];
//         float x2, y2, z2;
//         if (k < BufferSize) {
//           x2 = buf[k * 3 + 0];
//           y2 = buf[k * 3 + 1];
//           z2 = buf[k * 3 + 2];
//         } else {
//           x2 = dataset[i * n * 3 + k * 3 + 0];
//           y2 = dataset[i * n * 3 + k * 3 + 1];
//           z2 = dataset[i * n * 3 + k * 3 + 2];
//         }
//         float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) +
//                   (z2 - z1) * (z2 - z1);
//         float d2 = min(d, td);
//         if (d2 != td)
//           temp[blockIdx.x * n + k] = d2;
//         if (d2 > best) {
//           best = d2;
//           besti = k;
//         }
//       }
//       dists[threadIdx.x] = best;
//       dists_i[threadIdx.x] = besti;
//       for (int u = 0; (1 << u) < blockDim.x; u++) {
//         __syncthreads();
//         if (threadIdx.x < (blockDim.x >> (u + 1))) {
//           int i1 = (threadIdx.x * 2) << u;
//           int i2 = (threadIdx.x * 2 + 1) << u;
//           if (dists[i1] < dists[i2]) {
//             dists[i1] = dists[i2];
//             dists_i[i1] = dists_i[i2];
//           }
//         }
//       }
//       __syncthreads();
//       old = dists_i[0];
//       if (threadIdx.x == 0)
//         idxs[i * m + j] = old;
//     }
//   }
// }
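// Note: fps_kernel above accumulates minimum squared distances in `dist` but
// does not yet select the next farthest point per batch, which the reference
// kernel above does with a shared-memory reduction. A minimal sketch of that
// missing step is given below; the helper name `argmax_block` and its exact
// layout are illustrative only and not part of this file.
//
// template <typename scalar_t>
// __device__ int64_t argmax_block(const scalar_t *dist, size_t start_idx,
//                                 size_t end_idx) {
//   __shared__ scalar_t best_dist[THREADS];
//   __shared__ int64_t best_idx[THREADS];
//
//   // Each thread scans a strided slice of its batch and keeps its local best.
//   scalar_t best = (scalar_t)-1;
//   int64_t best_i = start_idx;
//   for (size_t n = start_idx + threadIdx.x; n < end_idx; n += blockDim.x) {
//     if (dist[n] > best) {
//       best = dist[n];
//       best_i = n;
//     }
//   }
//   best_dist[threadIdx.x] = best;
//   best_idx[threadIdx.x] = best_i;
//
//   // Tree reduction in shared memory (assumes blockDim.x is a power of two,
//   // which holds for THREADS == 1024).
//   for (int u = blockDim.x / 2; u > 0; u /= 2) {
//     __syncthreads();
//     if (threadIdx.x < u && best_dist[threadIdx.x + u] > best_dist[threadIdx.x]) {
//       best_dist[threadIdx.x] = best_dist[threadIdx.x + u];
//       best_idx[threadIdx.x] = best_idx[threadIdx.x + u];
//     }
//   }
//   __syncthreads();
//   return best_idx[0]; // global node index of the farthest remaining point
// }
//
// Inside the per-sample loop of fps_kernel one could then write
//   old = argmax_block<scalar_t>(dist, start_idx, end_idx);
//   if (idx == 0) out[m] = old;
// mirroring the reduction in the commented-out reference kernel.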