Commit fb4632ba authored by rusty1s's avatar rusty1s
Browse files

added cuda kernel

parent 6aad98c2
...@@ -10,14 +10,16 @@ def test_grid_cluster_cpu(tensor): ...@@ -10,14 +10,16 @@ def test_grid_cluster_cpu(tensor):
position = Tensor(tensor, [0, 9, 2, 8, 3]) position = Tensor(tensor, [0, 9, 2, 8, 3])
size = torch.LongTensor([5]) size = torch.LongTensor([5])
expected = torch.LongTensor([0, 1, 0, 1, 0]) expected = torch.LongTensor([0, 1, 0, 1, 0])
output = grid_cluster(position, size) output = grid_cluster(position, size)
assert output.tolist() == expected.tolist() assert output.tolist() == expected.tolist()
position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]]) position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]])
size = torch.LongTensor([5, 5]) size = torch.LongTensor([5, 5])
expected = torch.LongTensor([0, 3, 1, 0, 2]) expected = torch.LongTensor([0, 3, 1, 0, 2])
output = grid_cluster(position, size)
assert output.tolist() == expected.tolist()
position = Tensor(tensor, [[0, 9, 2, 2, 8], [0, 9, 8, 2, 3]]).t()
output = grid_cluster(position, size) output = grid_cluster(position, size)
assert output.tolist() == expected.tolist() assert output.tolist() == expected.tolist()
...@@ -36,21 +38,31 @@ def test_grid_cluster_cpu(tensor): ...@@ -36,21 +38,31 @@ def test_grid_cluster_cpu(tensor):
@pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA')
@pytest.mark.parametrize('tensor', tensors)
def test_grid_cluster_gpu(tensor):  # pragma: no cover
    """GPU grid clustering: 1D positions, 2D positions, a transposed
    (non-contiguous) input, a batched (expanded) input, and an explicit
    batch vector.

    NOTE(review): reconstructed from a diff view whose old/new columns were
    fused onto single lines; verify against the committed file.
    """
    # 1D positions: cell size 5 splits [0, 9, 2, 8, 3] into two clusters.
    position = Tensor(tensor, [0, 9, 2, 8, 3]).cuda()
    size = torch.cuda.LongTensor([5])
    expected = torch.cuda.LongTensor([0, 1, 0, 1, 0])
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # 2D positions.
    position = Tensor(tensor, [[0, 0], [9, 9], [2, 8], [2, 2], [8, 3]])
    position = position.cuda()
    size = torch.cuda.LongTensor([5, 5])
    expected = torch.cuda.LongTensor([0, 3, 1, 0, 2])
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # Same points given transposed, i.e. via a non-contiguous tensor.
    position = Tensor(tensor, [[0, 9, 2, 2, 8], [0, 9, 8, 2, 3]])
    position = position.cuda().t()
    output = grid_cluster(position, size)
    assert output.cpu().tolist() == expected.tolist()

    # Batched input via expand: every batch entry yields the same clusters.
    output = grid_cluster(position.expand(2, 5, 2), size)
    assert output.tolist() == expected.expand(2, 5).tolist()

    # Explicit batch vector separates points into per-example grids.
    expected = torch.LongTensor([0, 1, 3, 2, 4])
    batch = torch.cuda.LongTensor([0, 0, 1, 1, 1])
    output = grid_cluster(position, size, batch)
    assert output.cpu().tolist() == expected.tolist()

    output = grid_cluster(position.expand(2, 5, 2), size, batch.expand(2, 5))
    assert output.cpu().tolist() == expected.expand(2, 5).tolist()
...@@ -43,7 +43,6 @@ def grid_cluster(position, size, batch=None): ...@@ -43,7 +43,6 @@ def grid_cluster(position, size, batch=None):
func = get_func('grid', position) func = get_func('grid', position)
func(C, cluster, position, size, c_max) func(C, cluster, position, size, c_max)
cluster = cluster.squeeze(dim=-1) cluster = cluster.squeeze(dim=-1)
if not cluster.is_cuda:
cluster = consecutive(cluster) cluster = consecutive(cluster)
return cluster return cluster
// Converts a flat element index `i` into a data offset into tensor `t1` using
// its sizes and strides. The innermost dimension (Dims - 1) is deliberately
// skipped — callers index it directly on top of the returned offset.
// `Dims` is the compile-time rank, allowing the loop to be unrolled.
template <typename a, int Dims>
struct IndexToOffset {
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    for (int d = Dims - 2; d >= 0; d--) {
      const int idxAtDim = i % t1.size[d];
      *t1Offset += idxAtDim * t1.stride[d];
      i /= t1.size[d];
    }
  }
};
// Runtime-rank fallback (Dims == -1): same index-to-offset computation as the
// generic template, but the number of dimensions is read from t1.dims at
// runtime instead of being a compile-time constant.
template <typename a>
struct IndexToOffset<a, -1> {
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    for (int d = t1.dims - 2; d >= 0; d--) {
      const int idxAtDim = i % t1.size[d];
      *t1Offset += idxAtDim * t1.stride[d];
      i /= t1.size[d];
    }
  }
};
// Upper bound on tensor rank handled by these kernels.
const int MAX_DIMS = 25;

// Threads launched per CUDA block.
const int NUM_THREADS = 1024;

// Number of blocks needed so that NUM_THREADS-sized blocks cover n elements
// (ceiling division); assumes n >= 0.
inline int GET_BLOCKS(const int n) {
  const int blocks = (n + NUM_THREADS - 1) / NUM_THREADS;
  return blocks;
}
// Lightweight, by-value-passable tensor descriptor: raw data pointer plus
// per-dimension sizes and strides. Suitable for handing to kernels, since it
// contains no host-only members.
template<typename T>
struct TensorInfo {
  // Copies the first `d` entries of `sz`/`st`; `t` is stored, not owned.
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) : data(t), dims(d) {
    for (int k = 0; k < dims; k++) {
      size[k] = sz[k];
      stride[k] = st[k];
    }
  }

  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};
// Grid-stride loop over [0, N): each thread starts at its global index and
// advances by the total number of launched threads, so any grid size covers N.
// BUG FIX: the increment previously hard-coded `i` instead of the macro
// parameter, so the macro only compiled when invoked as KERNEL_LOOP(i, ...).
#define KERNEL_LOOP(I, N) \
  for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
// THC generic-file trampoline: on the first (direct) include, register this
// file as the generic source; THCGenerate*Type.h then re-includes it once per
// scalar type with `real`/`THCTensor_` bound to that type.
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cu"
#else
// Snapshots `tensor` into a TensorInfo<real> (data pointer, rank, sizes,
// strides). No bounds check here — callers must ensure the rank fits in
// MAX_DIMS (the launcher checks the output tensor's rank).
TensorInfo<real> thc_(getTensorInfo)(THCState *state, THCTensor *tensor) {
real *data = THCTensor_(data)(state, tensor);
int dims = THCTensor_(nDimension)(state, tensor);
int size[MAX_DIMS]; int stride[MAX_DIMS];
for (int i = 0; i < dims; i++) {
size[i] = THCTensor_(size)(state, tensor, i);
stride[i] = THCTensor_(stride)(state, tensor, i);
}
return TensorInfo<real>(data, dims, size, stride);
}
#endif
...@@ -3,7 +3,28 @@ ...@@ -3,7 +3,28 @@
#else

// Launches gridKernel: for every element of `output`, computes the flattened
// grid-cell id of the corresponding point row in `position`, using the
// per-dimension cell extents in `size` and (presumably) per-dimension cell
// counts in `count`; `C` is the total number of cells. Dispatches on the rank
// of `position` so common ranks get a compile-time IndexToOffset.
// NOTE(review): reconstructed from a diff view whose old/new columns were
// fused onto single lines; verify against the committed file.
void cluster_(grid)(THCState *state, int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
  THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, position, size));
  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 2, output, count));
  THArgCheck(THCudaLongTensor_nDimension(state, output) <= MAX_DIMS, 1, "Tensor too large or too many dimensions");

  int64_t *outputData = THCudaLongTensor_data(state, output);
  TensorInfo<real> positionInfo = thc_(getTensorInfo)(state, position);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);

  // One thread per output element; the kernel's grid-stride loop covers any
  // remainder.
  const int N = THCudaLongTensor_nElement(state, output);
  int grid = GET_BLOCKS(N);
  cudaStream_t stream = THCState_getCurrentStream(state);

  switch (positionInfo.dims) {
    case 1: gridKernel<real, 1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 2: gridKernel<real, 2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 3: gridKernel<real, 3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 4: gridKernel<real, 4><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    default: gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
  }
  // Surface launch-configuration errors immediately.
  THCudaCheck(cudaGetLastError());
}

#endif
...@@ -2,7 +2,41 @@ ...@@ -2,7 +2,41 @@
// Reconstructed post-commit include/define header for kernel.cu (the diff view
// had fused old/new columns) — verify against the committed file.
#include "kernel.h"
#include "common.cuh"
#include "THCIndex.cuh"

// Per-type name mangling for the THC generic-file mechanism.
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)

// Instantiate generic/common.cu once per scalar type.
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
// For each flat output index i, writes the flattened grid-cell id of point i:
// each coordinate is divided by its cell extent `size[d]` (truncating to a
// cell index) and combined into a single id via a mixed-radix expansion over
// the per-dimension counts in `count`, starting from the total cell count C.
// `position` is Dims-dimensional with the coordinate axis last; IndexToOffset
// resolves all but that last axis, which is indexed directly via `+ d`.
// N is the number of output elements; excess threads are handled by the
// grid-stride KERNEL_LOOP.
template<typename Real, int Dims>
__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size, int64_t *count, const int C, const int N) {
KERNEL_LOOP(i, N) {
// Offset of point i's first coordinate within position.data.
int positionOffset = 0;
IndexToOffset<Real, Dims>::compute(i, position, &positionOffset);
// tmp shrinks from C by each dimension's count, giving that digit's weight.
int tmp = C; int64_t c = 0;
for (int d = 0; d < position.size[position.dims - 1]; d++) {
tmp = tmp / count[d];
c += tmp * (int64_t) (position.data[positionOffset + d] / size[d]);
}
output[i] = c;
}
}
#include "generic/kernel.cu"
#include "THCGenerateFloatType.h"
#include "generic/kernel.cu"
#include "THCGenerateDoubleType.h"
#include "generic/kernel.cu"
#include "THCGenerateByteType.h"
#include "generic/kernel.cu"
#include "THCGenerateCharType.h"
#include "generic/kernel.cu"
#include "THCGenerateShortType.h"
#include "generic/kernel.cu"
#include "THCGenerateIntType.h"
#include "generic/kernel.cu"
#include "THCGenerateLongType.h"
...@@ -12,7 +12,7 @@ void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *s ...@@ -12,7 +12,7 @@ void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *s
tmp = C; c = 0; tmp = C; c = 0;
for (i = 0; i < d; i++) { for (i = 0; i < d; i++) {
tmp = tmp / *(count_data + i); tmp = tmp / *(count_data + i);
c += tmp * (int64_t)floor(*(position_data + i * position_stride) / *(size_data + i)); c += tmp * (int64_t) (*(position_data + i * position_stride) / *(size_data + i));
} }
output_data[0] = c; output_data[0] = c;
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment