OpenDAS / torch-cluster

Commit 9436b36c, authored Mar 14, 2020 by rusty1s
Parent: de431201

    all cuda fixed

Showing 4 changed files with 42 additions and 90 deletions:

- .travis.yml (+12 −12)
- csrc/cuda/atomics.cuh (+0 −22, file deleted)
- csrc/cuda/fps_cuda.cu (+25 −48)
- csrc/cuda/nearest_cuda.cu (+5 −8)
.travis.yml

Previously only the Python 3.8 `cpu` and `cu101` jobs were active; the commit enables the macOS and Windows targets and the remaining Python 3.6/3.7/3.8 CUDA build matrix:

```diff
@@ -2,25 +2,25 @@ language: shell
 os:
   - linux
-#  - osx
-#  - windows
+  - osx
+  - windows
 env:
   global:
     - CUDA_HOME=/usr/local/cuda
   jobs:
     - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cpu
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
     - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
-#    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
 
 jobs:
   exclude:
     # Exclude *all* macOS CUDA jobs and Windows CUDA 9.2/10.0 jobs.
```
csrc/cuda/atomics.cuh (deleted, 100644 → 0)

The header is removed in full. It provided an `atomAdd` overload set with a compare-and-swap fallback for double-precision atomics on older architectures:

```cuda
#pragma once

static inline __device__ void atomAdd(float *address, float val) {
  atomicAdd(address, val);
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
static inline __device__ void atomAdd(double *address, double val) {
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull;
  unsigned long long int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
  atomicAdd(address, val);
}
#endif
```
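For context: a native `atomicAdd(double *, double)` only exists on compute capability 6.0+ with CUDA 8+, so the `#if` branch emulates it by retrying `atomicCAS` on the value's 64-bit pattern until no other thread intervenes between the read and the swap. A minimal, hypothetical sketch of how such a helper is exercised (not part of this repository; assumes the header above is in scope):

```cuda
// Hypothetical example: accumulate n doubles into *out with the deleted
// header's atomAdd. On sm < 60 each call expands to the CAS retry loop.
__global__ void sum_kernel(const double *in, double *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomAdd(out, in[i]); // contended but correct read-modify-write
}
```

The header can be dropped because the rewritten `fps_kernel` below no longer accumulates distances across threads, which removes its only `atomAdd` call site.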
csrc/cuda/fps_cuda.cu

```diff
@@ -2,47 +2,17 @@
 #include <ATen/cuda/CUDAContext.h>
 
-#include "atomics.cuh"
 #include "utils.cuh"
 
 #define THREADS 1024
 
-template <typename scalar_t> struct Dist {
-  static inline __device__ void
-  compute(int64_t idx, int64_t start_idx, int64_t end_idx, int64_t old,
-          scalar_t *best, int64_t *best_idx, const scalar_t *src,
-          scalar_t *dist, scalar_t *tmp_dist, int64_t dim) {
-
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      tmp_dist[n] = 0;
-    }
-    __syncthreads();
-
-    for (int64_t i = start_idx * dim + idx; i < end_idx * dim; i += THREADS) {
-      scalar_t d = src[(old * dim) + (i % dim)] - src[i];
-      atomAdd(&tmp_dist[i / dim], d * d);
-    }
-    __syncthreads();
-
-    for (int64_t n = start_idx + idx; n < end_idx; n += THREADS) {
-      dist[n] = min(dist[n], tmp_dist[n]);
-      if (dist[n] > *best) {
-        *best = dist[n];
-        *best_idx = n;
-      }
-    }
-  }
-};
-
 template <typename scalar_t>
 __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
                            const int64_t *out_ptr, const int64_t *start,
-                           scalar_t *dist, scalar_t *tmp_dist, int64_t *out,
-                           int64_t dim) {
+                           scalar_t *dist, int64_t *out, int64_t dim) {
 
-  const int64_t batch_idx = blockIdx.x;
-  const int64_t thread_idx = threadIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t batch_idx = blockIdx.x;
 
   const int64_t start_idx = ptr[batch_idx];
   const int64_t end_idx = ptr[batch_idx + 1];
```
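As an aside, `ptr` holds CSR-style batch offsets: block `batch_idx` owns the contiguous point range `[ptr[batch_idx], ptr[batch_idx + 1])` of `src`, and `out_ptr` plays the same role for the sampled output indices. A tiny hypothetical illustration (values invented, not from the repo):

```cuda
#include <cstdio>

// Hypothetical illustration of the CSR-style offsets used by fps_kernel:
// two point clouds with 3 and 5 points are stored contiguously in src,
// and block b scans the half-open point range [ptr[b], ptr[b + 1]).
int main() {
  const int ptr[] = {0, 3, 8};
  for (int b = 0; b < 2; b++)
    printf("block %d scans points [%d, %d)\n", b, ptr[b], ptr[b + 1]);
  return 0;
}
```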
In the kernel body, the `Dist::compute` call is replaced by a direct per-thread distance loop (each thread owns whole points and iterates over the feature dimension itself, so no cross-thread accumulation is needed), and the block-level reduction is rewritten:

```diff
@@ -50,30 +20,39 @@ __global__ void fps_kernel(const scalar_t *src, const int64_t *ptr,
   __shared__ scalar_t best_dist[THREADS];
   __shared__ int64_t best_dist_idx[THREADS];
 
-  if (threadIdx.x == 0) {
+  if (thread_idx == 0) {
     out[out_ptr[batch_idx]] = start_idx + start[batch_idx];
   }
 
   for (int64_t m = out_ptr[batch_idx] + 1; m < out_ptr[batch_idx + 1]; m++) {
-    scalar_t best = -1;
+    int64_t old = out[m - 1];
+    scalar_t best = (scalar_t)-1.;
     int64_t best_idx = 0;
 
-    __syncthreads();
-    Dist<scalar_t>::compute(thread_idx, start_idx, end_idx, out[m - 1], &best,
-                            &best_idx, src, dist, tmp_dist, dim);
+    for (int64_t n = start_idx + thread_idx; n < end_idx; n += THREADS) {
+      scalar_t tmp;
+      scalar_t dd = (scalar_t)0.;
+      for (int64_t d = 0; d < dim; d++) {
+        tmp = src[dim * old + d] - src[dim * n + d];
+        dd += tmp * tmp;
+      }
+      dist[n] = min(dist[n], dd);
+      if (dist[n] > best) {
+        best = dist[n];
+        best_idx = n;
+      }
+    }
 
     best_dist[thread_idx] = best;
     best_dist_idx[thread_idx] = best_idx;
 
-    for (int64_t u = 0; (1 << u) < THREADS; u++) {
+    for (int64_t i = 1; i < THREADS; i *= 2) {
       __syncthreads();
-      if (thread_idx < (THREADS >> (u + 1))) {
-        int64_t idx1 = (thread_idx * 2) << u;
-        int64_t idx2 = (thread_idx * 2 + 1) << u;
-        if (best_dist[idx1] < best_dist[idx2]) {
-          best_dist[idx1] = best_dist[idx2];
-          best_dist_idx[idx1] = best_dist_idx[idx2];
-        }
-      }
-    }
+      if ((thread_idx + i) < THREADS &&
+          best_dist[thread_idx] < best_dist[thread_idx + i]) {
+        best_dist[thread_idx] = best_dist[thread_idx + i];
+        best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
+      }
+    }
```
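The rewritten loop is a block-wide argmax reduction: at each doubling stride `i`, thread `thread_idx` folds in the candidate of its partner `thread_idx + i`, until `best_dist[0]` / `best_dist_idx[0]` hold the block's farthest point. For reference, here is a self-contained sketch of the same idea — written as the classic power-of-two halving variant, so that each step's reads and writes touch disjoint index ranges (hypothetical code, not from this repository):

```cuda
#define N_THREADS 256 // power of two, like THREADS = 1024 in fps_cuda.cu

// Hypothetical standalone kernel: one block finds the index of the largest
// value in vals[0..n) via a shared-memory argmax reduction.
__global__ void block_argmax(const float *vals, int n, int *out_idx) {
  __shared__ float best[N_THREADS];
  __shared__ int best_idx[N_THREADS];

  int tid = threadIdx.x;

  // Each thread scans a strided slice, keeping its local maximum.
  float b = -1e38f;
  int b_idx = 0;
  for (int i = tid; i < n; i += N_THREADS) {
    if (vals[i] > b) {
      b = vals[i];
      b_idx = i;
    }
  }
  best[tid] = b;
  best_idx[tid] = b_idx;

  // Halving tree reduction: writes hit [0, stride), reads hit
  // [stride, 2 * stride), so each step is race-free.
  for (int stride = N_THREADS / 2; stride > 0; stride >>= 1) {
    __syncthreads();
    if (tid < stride && best[tid] < best[tid + stride]) {
      best[tid] = best[tid + stride];
      best_idx[tid] = best_idx[tid + stride];
    }
  }

  if (tid == 0)
    *out_idx = best_idx[0];
}
```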
With the `Dist` helper gone, the host wrapper no longer allocates the temporary per-point distance buffer:

```diff
@@ -111,7 +90,6 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
   }
 
   auto dist = torch::full(src.size(0), 1e38, src.options());
-  auto tmp_dist = torch::empty(src.size(0), src.options());
 
   auto out_size = (int64_t *)malloc(sizeof(int64_t));
   cudaMemcpy(out_size, out_ptr[-1].data_ptr<int64_t>(), sizeof(int64_t),
```
...and the kernel launch drops the corresponding argument:

```diff
@@ -123,8 +101,7 @@ torch::Tensor fps_cuda(torch::Tensor src, torch::Tensor ptr, double ratio,
     fps_kernel<scalar_t><<<batch_size, THREADS, 0, stream>>>(
         src.data_ptr<scalar_t>(), ptr.data_ptr<int64_t>(),
         out_ptr.data_ptr<int64_t>(), start.data_ptr<int64_t>(),
-        dist.data_ptr<scalar_t>(), tmp_dist.data_ptr<scalar_t>(),
-        out.data_ptr<int64_t>(), src.size(1));
+        dist.data_ptr<scalar_t>(), out.data_ptr<int64_t>(), src.size(1));
   });
 
   return out;
```
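Taken together, the rewritten kernel implements plain farthest point sampling per batch element: repeatedly pick the point that maximizes the minimum squared distance to everything already selected. A single-threaded host-side sketch of that rule (hypothetical and simplified to one point cloud; the CUDA kernel additionally parallelizes the distance pass and the argmax across THREADS threads):

```cuda
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical reference of the sampling rule in fps_kernel: greedily pick
// the point farthest (by min squared distance) from the selected set.
// src is n x dim, row-major; start is the seed index.
std::vector<int64_t> fps_reference(const std::vector<float> &src, int64_t n,
                                   int64_t dim, int64_t n_samples,
                                   int64_t start) {
  std::vector<float> dist(n, 1e38f); // min sq. distance to selected set
  std::vector<int64_t> out = {start};

  while ((int64_t)out.size() < n_samples) {
    int64_t old = out.back(), best_idx = 0;
    float best = -1.f;
    for (int64_t i = 0; i < n; i++) {
      float dd = 0.f;
      for (int64_t d = 0; d < dim; d++) {
        float tmp = src[dim * old + d] - src[dim * i + d];
        dd += tmp * tmp;
      }
      dist[i] = std::min(dist[i], dd); // fold in the newest pick
      if (dist[i] > best) {
        best = dist[i];
        best_idx = i;
      }
    }
    out.push_back(best_idx);
  }
  return out;
}
```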
csrc/cuda/nearest_cuda.cu

The same block-level reduction rewrite is applied to `nearest_kernel`, which keeps the *smallest* distance, hence the flipped comparison:

```diff
@@ -47,15 +47,12 @@ __global__ void nearest_kernel(const scalar_t *x, const scalar_t *y,
     best_dist[thread_idx] = best;
     best_dist_idx[thread_idx] = best_idx;
 
-    for (int64_t u = 0; (1 << u) < THREADS; u++) {
+    for (int64_t i = 1; i < THREADS; i *= 2) {
       __syncthreads();
-      if (thread_idx < (THREADS >> (u + 1))) {
-        int64_t idx_1 = (thread_idx * 2) << u;
-        int64_t idx_2 = (thread_idx * 2 + 1) << u;
-        if (best_dist[idx_1] > best_dist[idx_2]) {
-          best_dist[idx_1] = best_dist[idx_2];
-          best_dist_idx[idx_1] = best_dist_idx[idx_2];
-        }
-      }
-    }
+      if ((thread_idx + i) < THREADS &&
+          best_dist[thread_idx] > best_dist[thread_idx + i]) {
+        best_dist[thread_idx] = best_dist[thread_idx + i];
+        best_dist_idx[thread_idx] = best_dist_idx[thread_idx + i];
+      }
+    }
```
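For completeness: judging from the kernel's name and arguments (`x`, `y`) and the argmin reduction above, `nearest_kernel` reduces, for each query point, the index of its closest point in `y`. A hypothetical host-side reference of that per-query rule (the full kernel, not shown in this hunk, also handles batching):

```cuda
#include <cstdint>
#include <vector>

// Hypothetical reference for the rule nearest_kernel reduces: for one
// query point (dim floats), the index of its nearest neighbour in y
// (m x dim, row-major).
int64_t nearest_reference(const std::vector<float> &query,
                          const std::vector<float> &y, int64_t m,
                          int64_t dim) {
  float best = 1e38f;
  int64_t best_idx = 0;
  for (int64_t i = 0; i < m; i++) {
    float dd = 0.f;
    for (int64_t d = 0; d < dim; d++) {
      float tmp = query[d] - y[dim * i + d];
      dd += tmp * tmp;
    }
    if (dd < best) { // keep the minimum: comparison flipped vs. fps
      best = dd;
      best_idx = i;
    }
  }
  return best_idx;
}
```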
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment