Commit 9436b36c authored by rusty1s

all cuda fixed

parent de431201
@@ -2,25 +2,25 @@ language: shell
 os:
 - linux
-# - osx
-# - windows
+- osx
+- windows
 env:
 global:
 - CUDA_HOME=/usr/local/cuda
 jobs:
 - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
 - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
-# - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+- TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
 jobs:
 exclude: # Exclude *all* macOS CUDA jobs and Windows CUDA 9.2/10.0 jobs.
......
#pragma once

// Atomic add wrappers. Single-precision atomicAdd exists on all supported
// architectures, so it is forwarded directly.
static inline __device__ void atomAdd(float *address, float val) {
  atomicAdd(address, val);
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// Native double-precision atomicAdd requires compute capability >= 6.0 and
// CUDA >= 8.0. On older targets, emulate it with an atomicCAS loop over the
// 64-bit integer representation of the double.
static inline __device__ void atomAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
  atomicAdd(address, val);
}
#endif
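For context, the wrapper above can be exercised with a tiny standalone kernel. The following is a hedged usage sketch, not code from this repository; the kernel name, file layout, and use of unified memory are assumptions made purely for illustration.

// Hypothetical usage sketch (not part of this commit): sum n doubles into a
// single accumulator with the atomAdd wrapper shown above.
#include <cstdio>
#include "atomics.cuh"  // the header shown above (path is an assumption)

__global__ void sum_kernel(const double *in, double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomAdd(out, in[i]);  // safe concurrent accumulation across all threads
}

int main() {
  const int n = 1 << 20;
  double *in, *out;
  cudaMallocManaged(&in, n * sizeof(double));
  cudaMallocManaged(&out, sizeof(double));
  for (int i = 0; i < n; i++) in[i] = 1.0;
  *out = 0.0;
  sum_kernel<<<(n + 255) / 256, 256>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("sum = %f\n", *out);  // expected: 1048576.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}

On sm_60+ devices built with CUDA 8 or newer, atomAdd(double *, double) lowers to the native double-precision atomicAdd; on older targets it falls back to the atomicCAS loop above.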
@@ -2,47 +2,17 @@
 #include <ATen/cuda/CUDAContext.h>

-#include "atomics.cuh"
 #include "utils.cuh"

 #define THREADS 1024

-template <typename scalar_t> struct Dist {
-  static inline __device__ void compute(int64_t idx, int64_t start_idx,
-                                        int64_t end_idx, int64_t old,
-                                        scalar_t *best, int64_t *best_idx,
-                                        const scalar_t *src, scalar_t *dist,
-                                        scalar_t *tmp_dist, int64_t dim) {
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      tmp_dist[n] = 0;
-    }
-    __syncthreads();
-
-    for (int64_t i = start_idx * dim + idx; i < end_idx * dim; i += THREADS) {
-      scalar_t d = src[(old * dim) + (i % dim)] - src[i];
-      atomAdd(&tmp_dist[i / dim], d * d);
-    }
-    __syncthreads();
-
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      dist[n] = min(dist[n], tmp_dist[n]);
-      if (dist[n] > *best) {
-        *best = dist[n];
-        *best_idx = n;
-      }
-    }
-  }
-};
-
 template <typename scalar_t>
 __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
                            const int64_t *out_ptr, const int64_t *start,
-                           scalar_t *dist, scalar_t *tmp_dist, int64_t *out,
-                           int64_t dim) {
+                           scalar_t *dist, int64_t *out, int64_t dim) {

-  const int64_t batch_idx = blockIdx.x;
   const int64_t thread_idx = threadIdx.x;
+  const int64_t batch_idx = blockIdx.x;

   const int64_t start_idx = ptr[batch_idx];
   const int64_t end_idx = ptr[batch_idx + 1];

@@ -50,30 +20,39 @@ __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
   __shared__ scalar_t best_dist[THREADS];
   __shared__ int64_t best_dist_idx[THREADS];

-  if (threadIdx.x == 0) {
+  if (thread_idx == 0) {
     out[out_ptr[batch_idx]] = start_idx + start[batch_idx];
   }

   for (int64_t m = out_ptr[batch_idx] + 1; m < out_ptr[batch_idx + 1]; m++) {
-    scalar_t best = -1;
+    int64_t old = out[m - 1];
+    scalar_t best = (scalar_t)-1.;
     int64_t best_idx = 0;

-    __syncthreads();
-    Dist<scalar_t>::compute(thread_idx, start_idx, end_idx, out[m - 1], &best,
-                            &best_idx, src, dist, tmp_dist, dim);
+    for (int64_t n = start_idx + thread_idx; n < end_idx; n += THREADS) {
+      scalar_t tmp;
+      scalar_t dd = (scalar_t)0.;
+      for (int64_t d = 0; d < dim; d++) {
+        tmp = src[dim * old + d] - src[dim * n + d];
+        dd += tmp * tmp;
+      }
+      dist[n] = min(dist[n], dd);
+      if (dist[n] > best) {
+        best = dist[n];
+        best_idx = n;
+      }
+    }

     best_dist[thread_idx] = best;
     best_dist_idx[thread_idx] = best_idx;

-    for (int64_t u = 0; (1 << u) < THREADS; u++) {
+    for (int64_t i = 1; i < THREADS; i *= 2) {
       __syncthreads();
-      if (thread_idx < (THREADS >> (u + 1))) {
-        int64_t idx1 = (thread_idx * 2) << u;
-        int64_t idx2 = (thread_idx * 2 + 1) << u;
-        if (best_dist[idx1] < best_dist[idx2]) {
-          best_dist[idx1] = best_dist[idx2];
-          best_dist_idx[idx1] = best_dist_idx[idx2];
-        }
+      if ((thread_idx + i) < THREADS &&
+          best_dist[thread_idx] < best_dist[thread_idx + i]) {
+        best_dist[thread_idx] = best_dist[thread_idx + i];
+        best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
       }
     }

@@ -111,7 +90,6 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
   }

   auto dist = torch::full(src.size(0), 1e38, src.options());
-  auto tmp_dist = torch::empty(src.size(0), src.options());

   auto out_size = (int64_t *)malloc(sizeof(int64_t));
   cudaMemcpy(out_size, out_ptr[-1].data_ptr<int64_t>(), sizeof(int64_t),

@@ -123,8 +101,7 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
     fps_kernel<scalar_t><<<batch_size, THREADS, 0, stream>>>(
         src.data_ptr<scalar_t>(), ptr.data_ptr<int64_t>(),
         out_ptr.data_ptr<int64_t>(), start.data_ptr<int64_t>(),
-        dist.data_ptr<scalar_t>(), tmp_dist.data_ptr<scalar_t>(),
-        out.data_ptr<int64_t>(), src.size(1));
+        dist.data_ptr<scalar_t>(), out.data_ptr<int64_t>(), src.size(1));
   });

   return out;
......
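For orientation, the per-batch loop in fps_kernel implements iterative farthest point sampling: every point keeps its squared distance to the nearest already-selected point, and the point that maximizes this distance becomes the next sample. Below is a hedged host-side reference of that loop for a single example; the function name fps_reference and its signature are hypothetical and not part of the repository.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Hypothetical host-side reference (not from this commit): farthest point
// sampling over one example with n points of dimensionality dim, mirroring
// the per-batch logic of fps_kernel. Returns num_samples point indices.
std::vector<int64_t> fps_reference(const std::vector<float> &src, int64_t n,
                                   int64_t dim, int64_t num_samples,
                                   int64_t start = 0) {
  std::vector<float> dist(n, std::numeric_limits<float>::max());
  std::vector<int64_t> out;
  out.push_back(start);

  for (int64_t m = 1; m < num_samples; m++) {
    int64_t old = out.back();
    float best = -1.f;
    int64_t best_idx = 0;

    for (int64_t i = 0; i < n; i++) {
      // Squared distance from the most recently selected point.
      float dd = 0.f;
      for (int64_t d = 0; d < dim; d++) {
        float tmp = src[dim * old + d] - src[dim * i + d];
        dd += tmp * tmp;
      }
      // Track the distance to the *closest* already-selected point ...
      dist[i] = std::min(dist[i], dd);
      // ... and pick the point that is farthest from the selected set.
      if (dist[i] > best) {
        best = dist[i];
        best_idx = i;
      }
    }
    out.push_back(best_idx);
  }
  return out;
}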
@@ -47,15 +47,12 @@ __global__ void nearest_kernel(const scalar_t *x, const scalar_t *y,
   best_dist[thread_idx] = best;
   best_dist_idx[thread_idx] = best_idx;

-  for (int64_t u = 0; (1 << u) < THREADS; u++) {
+  for (int64_t i = 1; i < THREADS; i *= 2) {
     __syncthreads();
-    if (thread_idx < (THREADS >> (u + 1))) {
-      int64_t idx_1 = (thread_idx * 2) << u;
-      int64_t idx_2 = (thread_idx * 2 + 1) << u;
-      if (best_dist[idx_1] > best_dist[idx_2]) {
-        best_dist[idx_1] = best_dist[idx_2];
-        best_dist_idx[idx_1] = best_dist_idx[idx_2];
-      }
+    if ((thread_idx + i) < THREADS &&
+        best_dist[thread_idx] > best_dist[thread_idx + i]) {
+      best_dist[thread_idx] = best_dist[thread_idx + i];
+      best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
     }
   }
......
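Both fps_kernel and nearest_kernel finish with a block-wide reduction over shared memory: each thread deposits its local best (value, index) pair, and the block collapses them so that thread 0 ends up holding the winner (argmax of dist in fps_kernel, argmin in nearest_kernel). As a point of comparison, here is a self-contained sketch of the textbook halving variant of that argmin reduction; the kernel name block_argmin is hypothetical, and this is the standard pattern rather than the exact strided loop used in the diffs above.

#include <cstdint>

#define THREADS 1024

// Hypothetical standalone sketch (not from the repository): block-wide argmin
// over THREADS candidates. One block of THREADS threads is assumed; `values`
// must hold at least THREADS entries. After log2(THREADS) halving steps,
// slot 0 of shared memory holds the smallest value and its index.
__global__ void block_argmin(const float *values, int64_t *out_idx) {
  const int64_t thread_idx = threadIdx.x;

  __shared__ float best_dist[THREADS];
  __shared__ int64_t best_dist_idx[THREADS];

  // Each thread contributes one (value, index) candidate pair.
  best_dist[thread_idx] = values[thread_idx];
  best_dist_idx[thread_idx] = thread_idx;

  // Halve the number of active threads each step; writers (indices < i) never
  // touch the slots being read (indices >= i), so no extra sync is needed
  // inside a step.
  for (int64_t i = THREADS / 2; i > 0; i /= 2) {
    __syncthreads();
    if (thread_idx < i && best_dist[thread_idx + i] < best_dist[thread_idx]) {
      best_dist[thread_idx] = best_dist[thread_idx + i];
      best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
    }
  }
  __syncthreads();

  if (thread_idx == 0)
    *out_idx = best_dist_idx[0];  // index of the minimum of values[0..THREADS)
}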