"docs/vscode:/vscode.git/clone" did not exist on "039e02601d447c127cc3a56c3d16716ef7d97a16"
Commit ad63397e authored by rusty1s's avatar rusty1s
Browse files

merge fix

parents 149b2f85 3f97baae
...@@ -12,7 +12,7 @@ void THGrid_(THLongTensor *cluster, THTensor *pos, THTensor *size, THLongTensor ...@@ -12,7 +12,7 @@ void THGrid_(THLongTensor *cluster, THTensor *pos, THTensor *size, THLongTensor
for (n = 0; n < THTensor_(size)(pos, 0); n++) { for (n = 0; n < THTensor_(size)(pos, 0); n++) {
coef = 1; value = 0; coef = 1; value = 0;
for (d = 0; d < THTensor_(size)(pos, 1); d++) { for (d = 0; d < THTensor_(size)(pos, 1); d++) {
value += coef * (int64_t) (*(posData + d * pos->stride[1]) / sizeData[d]); value += coef * (int64_t) (posData[d * pos->stride[1]] / sizeData[d]);
coef *= countData[d]; coef *= countData[d];
} }
posData += pos->stride[0]; posData += pos->stride[0];
......
#include "THCGrid.h" #include "THCGrid.h"
// Assigns each of the nNodes points a flattened grid-cell id.
// `posInfo` describes the (nNodes, dims) position tensor, `size` the cell
// extent per dimension, `count` the number of cells per dimension.
// Expects a 1D launch; KERNEL_LOOP strides the grid over [0, nNodes).
// NOTE(review): the dims == -1 fallback instantiation (see
// FIXED_DIM_KERNEL_RUN) makes the inner loop a no-op — confirm the
// variable-dimension path is dispatched elsewhere.
template<typename real, int dims>
__global__ void gridKernel(int64_t *cluster, TensorInfo<real> posInfo, real *size,
                           int64_t *count, const int nNodes) {
  KERNEL_LOOP(i, nNodes) {
    // FIX: posInfo is passed by value, so member access must use `.`,
    // not `->` (the original `posInfo->data` does not compile).
    real *pos = posInfo.data + i * posInfo.stride[0];
    int64_t coef = 1, value = 0;
    for (ptrdiff_t d = 0; d < dims; d++) {
      // Cell index along dimension d, folded row-major into `value`.
      value += coef * (int64_t) (pos[d * posInfo.stride[1]] / size[d]);
      coef *= count[d];
    }
    cluster[i] = value;
  }
}
#include "generic/THCGrid.cu" #include "generic/THCGrid.cu"
#include "THC/THCGenerateAllTypes.h" #include "THC/THCGenerateAllTypes.h"
#ifndef THC_COMMON_INC
#define THC_COMMON_INC

// Grid-stride loop over [0, N); correct for any launch configuration.
// FIX: the bound used to read `I < I`, which is always false, so every
// kernel body wrapped in this macro silently never executed.
#define KERNEL_LOOP(I, N) \
  for (ptrdiff_t I = blockIdx.x * blockDim.x + threadIdx.x; I < (N); I += blockDim.x * gridDim.x)

#define THC_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
  "Some of the input tensors are located on different GPUs. Please move them to a single one.")

// Threads per block used by every kernel launched through the macros below.
const int CUDA_NUM_THREADS = 1024;

// Ceil-division: number of blocks needed to cover N elements.
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Launches NAME<real> over N elements on the current THC stream and checks
// for launch-configuration errors.
// FIX: both launch macros referenced an undefined `NUM_THREADS`; they now
// use the `CUDA_NUM_THREADS` constant declared above.
#define KERNEL_RUN(NAME, N, ...) \
  int grid = GET_BLOCKS(N); \
  cudaStream_t stream = THCState_getCurrentStream(state); \
  NAME<real><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
  THCudaCheck(cudaGetLastError())

// Like KERNEL_RUN, but maps a small runtime dimension count onto a
// compile-time template parameter (1-4), falling back to -1 otherwise.
#define FIXED_DIM_KERNEL_RUN(NAME, N, DIMS, ...) \
  int grid = GET_BLOCKS(N); \
  cudaStream_t stream = THCState_getCurrentStream(state); \
  switch (DIMS) { \
    case 1: NAME<real, 1><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 2: NAME<real, 2><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 3: NAME<real, 3><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    case 4: NAME<real, 4><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); break; \
    default: NAME<real, -1><<<grid, CUDA_NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
  } \
  THCudaCheck(cudaGetLastError())
#endif // THC_COMMON_INC
...@@ -4,7 +4,16 @@ ...@@ -4,7 +4,16 @@
// Computes a flattened grid-cell id for every point in `pos` and writes it
// into `cluster`. `size` holds the per-dimension cell extent and `count`
// the number of cells per dimension; all four tensors must live on the
// same GPU. (Body reconstructed from side-by-side diff residue: each line
// was duplicated left/right in the scrape.)
void THCGrid_(THCState *state, THCudaLongTensor *cluster, THCTensor *pos, THCTensor *size,
              THCudaLongTensor *count) {
  THC_assertSameGPU(state, 4, cluster, pos, size, count);

  int64_t *clusterData = THCudaLongTensor_data(state, cluster);
  TensorInfo<real> posInfo = THC_(getTensorInfo)(state, pos);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);
  const int nNodes = THCudaLongTensor_nElement(state, cluster);
  // FIX: THC accessors take the THCState handle as their first argument;
  // the original call omitted `state`.
  const int dims = THCTensor_(nElement)(state, size);

  // One cluster id per node; dims is dispatched to a fixed template arg.
  FIXED_DIM_KERNEL_RUN(gridKernel, nNodes, dims, clusterData, posInfo, sizeData, countData);
}
#endif // THC_GENERIC_FILE
import time

import torch
from torch_cluster import sparse_grid_cluster

# Benchmark sparse_grid_cluster on random 3D points, GPU first, then CPU.
n = 90000000
s = 1 / 64

print('GPU ===================')

start = time.perf_counter()
pos = torch.cuda.FloatTensor(n, 3).uniform_(0, 1)
size = torch.cuda.FloatTensor([s, s, s])
torch.cuda.synchronize()
print('Init:', time.perf_counter() - start)

# Time the full clustering call; synchronize so the kernel is included.
start = time.perf_counter()
sparse_grid_cluster(pos, size)
torch.cuda.synchronize()
print('All:', time.perf_counter() - start)

print('CPU ===================')
pos, size = pos.cpu(), size.cpu()

start = time.perf_counter()
sparse_grid_cluster(pos, size)
print('All:', time.perf_counter() - start)
import time

import torch
from torch_cluster.functions.utils.ffi import _get_func

# Timing scratch: compare a torch.bernoulli launch against the custom
# 'serial' clustering kernel on a large CUDA tensor.
output = torch.cuda.FloatTensor(500000000).fill_(0.5)
torch.cuda.synchronize()

tic = time.perf_counter()
torch.bernoulli(output)
torch.cuda.synchronize()
print(time.perf_counter() - tic)

# Reuse the buffer as int64 input for the ffi kernel.
output = output.long().fill_(-1)
func = _get_func('serial', output)
torch.cuda.synchronize()

tic = time.perf_counter()
func(output, output, output, output)
torch.cuda.synchronize()
print(time.perf_counter() - tic)
...@@ -10,3 +10,4 @@ func = ffi.THCCFloatGrid ...@@ -10,3 +10,4 @@ func = ffi.THCCFloatGrid
print(func) print(func)
func(cluster, pos, size, count) func(cluster, pos, size, count)
print(cluster)
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
extern "C" { extern "C" {
#endif #endif
int assignColor(THCState *state, THCudaLongTensor *color);
void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree); void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree);
void cluster_serial_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaTensor *weight); void cluster_serial_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaTensor *weight);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment