Commit 9436b36c authored by rusty1s

all cuda fixed

parent de431201
@@ -2,25 +2,25 @@ language: shell
 os:
 - linux
-# - osx
-# - windows
+- osx
+- windows
 env:
 global:
 - CUDA_HOME=/usr/local/cuda
 jobs:
 - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
 - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
 jobs:
 exclude: # Exclude *all* macOS CUDA jobs and Windows CUDA 9.2/10.0 jobs.
......
#pragma once

// Atomic add wrappers. Single-precision atomicAdd exists on all supported
// architectures, so it is forwarded directly.
static inline __device__ void atomAdd(float *address, float val) {
  atomicAdd(address, val);
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// Native double-precision atomicAdd requires compute capability >= 6.0 and
// CUDA >= 8.0. On older targets, emulate it with an atomicCAS loop over the
// 64-bit integer representation of the double.
static inline __device__ void atomAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
  atomicAdd(address, val);
}
#endif
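For context, the wrapper above can be exercised with a tiny standalone kernel. The following is a hedged usage sketch, not code from this repository; the kernel name, file layout, and use of unified memory are assumptions made purely for illustration.

// Hypothetical usage sketch (not part of this commit): sum n doubles into a
// single accumulator with the atomAdd wrapper shown above.
#include <cstdio>
#include "atomics.cuh"  // the header shown above (path is an assumption)

__global__ void sum_kernel(const double *in, double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomAdd(out, in[i]);  // safe concurrent accumulation across all threads
}

int main() {
  const int n = 1 << 20;
  double *in, *out;
  cudaMallocManaged(&in, n * sizeof(double));
  cudaMallocManaged(&out, sizeof(double));
  for (int i = 0; i < n; i++) in[i] = 1.0;
  *out = 0.0;
  sum_kernel<<<(n + 255) / 256, 256>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *out);  // expected: 1048576.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}

On sm_60+ devices built with CUDA 8 or newer, atomAdd(double *, double) lowers to the native double-precision atomicAdd; on older targets it falls back to the atomicCAS loop above.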
@@ -2,47 +2,17 @@
 #include <ATen/cuda/CUDAContext.h>

-#include "atomics.cuh"
 #include "utils.cuh"

 #define THREADS 1024

-template <typename scalar_t> struct Dist {
-  static inline __device__ void compute(int64_t idx, int64_t start_idx,
-                                        int64_t end_idx, int64_t old,
-                                        scalar_t *best, int64_t *best_idx,
-                                        const scalar_t *src, scalar_t *dist,
-                                        scalar_t *tmp_dist, int64_t dim) {
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      tmp_dist[n] = 0;
-    }
-    __syncthreads();
-
-    for (int64_t i = start_idx * dim + idx; i < end_idx * dim; i += THREADS) {
-      scalar_t d = src[(old * dim) + (i % dim)] - src[i];
-      atomAdd(&tmp_dist[i / dim], d * d);
-    }
-    __syncthreads();
-
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      dist[n] = min(dist[n], tmp_dist[n]);
-      if (dist[n] > *best) {
-        *best = dist[n];
-        *best_idx = n;
-      }
-    }
-  }
-};
-
 template <typename scalar_t>
 __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
                            const int64_t *out_ptr, const int64_t *start,
-                           scalar_t *dist, scalar_t *tmp_dist, int64_t *out,
-                           int64_t dim) {
+                           scalar_t *dist, int64_t *out, int64_t dim) {

-  const int64_t batch_idx = blockIdx.x;
   const int64_t thread_idx = threadIdx.x;
+  const int64_t batch_idx = blockIdx.x;

   const int64_t start_idx = ptr[batch_idx];
   const int64_t end_idx = ptr[batch_idx + 1];

@@ -50,30 +20,39 @@ __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
   __shared__ scalar_t best_dist[THREADS];
   __shared__ int64_t best_dist_idx[THREADS];

-  if (threadIdx.x == 0) {
+  if (thread_idx == 0) {
     out[out_ptr[batch_idx]] = start_idx + start[batch_idx];
   }

   for (int64_t m = out_ptr[batch_idx] + 1; m < out_ptr[batch_idx + 1]; m++) {
-    scalar_t best = -1;
+    int64_t old = out[m - 1];
+    scalar_t best = (scalar_t)-1.;
     int64_t best_idx = 0;

-    __syncthreads();
-    Dist<scalar_t>::compute(thread_idx, start_idx, end_idx, out[m - 1], &best,
-                            &best_idx, src, dist, tmp_dist, dim);
+    for (int64_t n = start_idx + thread_idx; n < end_idx; n += THREADS) {
+      scalar_t tmp;
+      scalar_t dd = (scalar_t)0.;
+      for (int64_t d = 0; d < dim; d++) {
+        tmp = src[dim * old + d] - src[dim * n + d];
+        dd += tmp * tmp;
+      }
+      dist[n] = min(dist[n], dd);
+      if (dist[n] > best) {
+        best = dist[n];
+        best_idx = n;
+      }
+    }

     best_dist[thread_idx] = best;
     best_dist_idx[thread_idx] = best_idx;

-    for (int64_t u = 0; (1 << u) < THREADS; u++) {
+    for (int64_t i = 1; i < THREADS; i *= 2) {
       __syncthreads();
-      if (thread_idx < (THREADS >> (u + 1))) {
-        int64_t idx1 = (thread_idx * 2) << u;
-        int64_t idx2 = (thread_idx * 2 + 1) << u;
-        if (best_dist[idx1] < best_dist[idx2]) {
-          best_dist[idx1] = best_dist[idx2];
-          best_dist_idx[idx1] = best_dist_idx[idx2];
-        }
+      if ((thread_idx + i) < THREADS &&
+          best_dist[thread_idx] < best_dist[thread_idx + i]) {
+        best_dist[thread_idx] = best_dist[thread_idx + i];
+        best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
       }
     }

@@ -111,7 +90,6 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
   }

   auto dist = torch::full(src.size(0), 1e38, src.options());
-  auto tmp_dist = torch::empty(src.size(0), src.options());

   auto out_size = (int64_t *)malloc(sizeof(int64_t));
   cudaMemcpy(out_size, out_ptr[-1].data_ptr<int64_t>(), sizeof(int64_t),

@@ -123,8 +101,7 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
     fps_kernel<scalar_t><<<batch_size, THREADS, 0, stream>>>(
         src.data_ptr<scalar_t>(), ptr.data_ptr<int64_t>(),
         out_ptr.data_ptr<int64_t>(), start.data_ptr<int64_t>(),
-        dist.data_ptr<scalar_t>(), tmp_dist.data_ptr<scalar_t>(),
-        out.data_ptr<int64_t>(), src.size(1));
+        dist.data_ptr<scalar_t>(), out.data_ptr<int64_t>(), src.size(1));
   });

   return out;
......
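For orientation, the per-batch loop in fps_kernel implements iterative farthest point sampling: every point keeps its squared distance to the nearest already-selected point, and the point that maximizes this distance becomes the next sample. Below is a hedged host-side reference of that loop for a single example; the function name fps_reference and its signature are hypothetical and not part of the repository.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Hypothetical host-side reference (not from this commit): farthest point
// sampling over one example with n points of dimensionality dim, mirroring
// the per-batch logic of fps_kernel. Returns num_samples point indices.
std::vector<int64_t> fps_reference(const std::vector<float> &src, int64_t n,
                                   int64_t dim, int64_t num_samples,
                                   int64_t start = 0) {
  std::vector<float> dist(n, std::numeric_limits<float>::max());
  std::vector<int64_t> out;
  out.push_back(start);

  for (int64_t m = 1; m < num_samples; m++) {
    int64_t old = out.back();
    float best = -1.f;
    int64_t best_idx = 0;

    for (int64_t i = 0; i < n; i++) {
      // Squared distance from the most recently selected point.
      float dd = 0.f;
      for (int64_t d = 0; d < dim; d++) {
        float tmp = src[dim * old + d] - src[dim * i + d];
        dd += tmp * tmp;
      }
      // Track the distance to the *closest* already-selected point ...
      dist[i] = std::min(dist[i], dd);
      // ... and pick the point that is farthest from the selected set.
      if (dist[i] > best) {
        best = dist[i];
        best_idx = i;
      }
    }
    out.push_back(best_idx);
  }
  return out;
}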
@@ -47,15 +47,12 @@ __global__ void nearest_kernel(const scalar_t *x, const scalar_t *y,
   best_dist[thread_idx] = best;
   best_dist_idx[thread_idx] = best_idx;

-  for (int64_t u = 0; (1 << u) < THREADS; u++) {
+  for (int64_t i = 1; i < THREADS; i *= 2) {
     __syncthreads();
-    if (thread_idx < (THREADS >> (u + 1))) {
-      int64_t idx_1 = (thread_idx * 2) << u;
-      int64_t idx_2 = (thread_idx * 2 + 1) << u;
-      if (best_dist[idx_1] > best_dist[idx_2]) {
-        best_dist[idx_1] = best_dist[idx_2];
-        best_dist_idx[idx_1] = best_dist_idx[idx_2];
-      }
+    if ((thread_idx + i) < THREADS &&
+        best_dist[thread_idx] > best_dist[thread_idx + i]) {
+      best_dist[thread_idx] = best_dist[thread_idx + i];
+      best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
     }
   }
......
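Both fps_kernel and nearest_kernel finish with a block-wide reduction over shared memory: each thread deposits its local best (value, index) pair, and the block collapses them so that thread 0 ends up holding the winner (argmax of dist in fps_kernel, argmin in nearest_kernel). As a point of comparison, here is a self-contained sketch of the textbook halving variant of that argmin reduction; the kernel name block_argmin is hypothetical, and this is the standard pattern rather than the exact strided loop used in the diffs above.

#include <cstdint>

#define THREADS 1024

// Hypothetical standalone sketch (not from the repository): block-wide argmin
// over THREADS candidates. One block of THREADS threads is assumed; `values`
// must hold at least THREADS entries. After log2(THREADS) halving steps,
// slot 0 of shared memory holds the smallest value and its index.
__global__ void block_argmin(const float *values, int64_t *out_idx) {
  const int64_t thread_idx = threadIdx.x;

  __shared__ float best_dist[THREADS];
  __shared__ int64_t best_dist_idx[THREADS];

  // Each thread contributes one (value, index) candidate pair.
  best_dist[thread_idx] = values[thread_idx];
  best_dist_idx[thread_idx] = thread_idx;

  // Halve the number of active threads each step; writers (indices < i) never
  // touch the slots being read (indices >= i), so no extra sync is needed
  // inside a step.
  for (int64_t i = THREADS / 2; i > 0; i /= 2) {
    __syncthreads();
    if (thread_idx < i && best_dist[thread_idx + i] < best_dist[thread_idx]) {
      best_dist[thread_idx] = best_dist[thread_idx + i];
      best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
    }
  }
  __syncthreads();

  if (thread_idx == 0)
    *out_idx = best_dist_idx[0];  // index of the minimum of values[0..THREADS)
}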