Commit 9a1f7817 authored by rusty1s

clean up

parent e6638edd
@@ -3,4 +3,4 @@ cpu_tensors = [
     'FloatTensor', 'DoubleTensor'
 ]
-cuda_tensors = ['cuda.{}'.format(t) for t in cpu_tensors + ['HalfTensor']]
+gpu_tensors = ['cuda.{}'.format(t) for t in cpu_tensors + ['HalfTensor']]
@@ -5,7 +5,7 @@ import torch
 import numpy as np
 from torch_cluster import graclus_cluster
-from .tensor import cpu_tensors
+from .tensor import cpu_tensors, gpu_tensors

 tests = [{
     'row': [0, 0, 1, 1, 1, 2, 2, 2, 3, 3],
@@ -20,6 +20,7 @@ tests = [{
 def assert_correct_graclus(row, col, cluster):
     row, col, cluster = row.numpy(), col.numpy(), cluster.numpy()
+    n_nodes = cluster.shape[0]

     # Every node was assigned a cluster.
     assert cluster.min() >= 0
@@ -28,6 +29,9 @@ def assert_correct_graclus(row, col, cluster):
     _, count = np.unique(cluster, return_counts=True)
     assert (count > 2).max() == 0

+    # Cluster value is minimal.
+    assert (cluster <= np.arange(n_nodes, dtype=row.dtype)).sum() == n_nodes
+
     # Corresponding clusters must be adjacent.
     for n in range(cluster.shape[0]):
         x = cluster[col[row == n]] == cluster[n]  # Neighbors with same cluster
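The new "minimal" assertion encodes the convention that each cluster is labeled by the smallest node index it contains. A toy NumPy illustration (not part of the commit):

```python
import numpy as np

n_nodes = 4

# Labeling that satisfies the convention: every cluster id is the
# smallest node index inside that cluster.
cluster = np.array([0, 0, 2, 2])
assert (cluster <= np.arange(n_nodes)).sum() == n_nodes

# Labeling that violates it: cluster {0, 1} is labeled 1, not 0.
bad = np.array([1, 1, 2, 2])
assert (bad <= np.arange(n_nodes)).sum() < n_nodes
```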
@@ -50,9 +54,16 @@ def test_graclus_cluster_cpu(tensor, i):
     assert_correct_graclus(row, col, cluster)

-def test_graclus_cluster_gpu():
-    row = torch.cuda.LongTensor([0, 0, 1, 1, 1, 2, 2, 2, 3, 3])
-    col = torch.cuda.LongTensor([1, 2, 0, 2, 3, 0, 1, 3, 1, 2])
-    cluster = graclus_cluster(row, col)
-    print(cluster.cpu().tolist())
+@pytest.mark.skipif(not torch.cuda.is_available(), reason='no CUDA')
+@pytest.mark.parametrize('tensor,i', product(gpu_tensors, range(len(tests))))
+def test_graclus_cluster_gpu(tensor, i):
+    data = tests[i]
+    row = torch.cuda.LongTensor(data['row'])
+    col = torch.cuda.LongTensor(data['col'])
+    weight = data['weight']
+    weight = weight if weight is None else getattr(torch.cuda, tensor)(weight)
+    cluster = graclus_cluster(row, col, weight)
+    assert_correct_graclus(row, col, cluster)
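The parametrized test exercises the same call pattern as plain usage of the package; a minimal CPU sketch (the weight values are made up for illustration):

```python
import torch
from torch_cluster import graclus_cluster

row = torch.LongTensor([0, 0, 1, 1, 1, 2, 2, 2, 3, 3])
col = torch.LongTensor([1, 2, 0, 2, 3, 0, 1, 3, 1, 2])
weight = torch.FloatTensor([1, 2, 1, 3, 2, 2, 3, 1, 2, 1])  # hypothetical edge weights

cluster = graclus_cluster(row, col, weight)  # `weight` may also be None
```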
template <typename a, int Dims>
struct IndexToOffset {
  // Maps linear element index `i` to a storage offset in `t1`, treating `i`
  // as an index over every dimension except the innermost one (the caller
  // advances over the last dimension itself).
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    int curDimIndex;
    for (int d = Dims - 2; d >= 0; d--) {
      curDimIndex = i % t1.size[d];
      *t1Offset += curDimIndex * t1.stride[d];
      i /= t1.size[d];
    }
  }
};

// Specialization for dimensionality only known at runtime (Dims == -1).
template <typename a>
struct IndexToOffset<a, -1> {
  static __device__ void compute(int i, const TensorInfo<a>& t1, int* t1Offset) {
    int curDimIndex;
    for (int d = t1.dims - 2; d >= 0; d--) {
      curDimIndex = i % t1.size[d];
      *t1Offset += curDimIndex * t1.stride[d];
      i /= t1.size[d];
    }
  }
};
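A Python mirror of the offset computation may make the stride arithmetic easier to follow; this is a sketch, not part of the commit:

```python
def index_to_offset(i, size, stride):
    """Map linear index `i` over all dims except the last to a storage
    offset, mirroring IndexToOffset::compute."""
    offset = 0
    for d in range(len(size) - 2, -1, -1):
        offset += (i % size[d]) * stride[d]
        i //= size[d]
    return offset

# For a contiguous [N, D] tensor (stride = [D, 1]), point i starts at i * D:
assert index_to_offset(3, size=[5, 2], stride=[2, 1]) == 6
```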
const int MAX_DIMS = 25;
const int NUM_THREADS = 1024;
inline int GET_BLOCKS(const int n) {
  return (n + NUM_THREADS - 1) / NUM_THREADS;  // ceil(n / NUM_THREADS)
}
template<typename T>
struct TensorInfo {
  // Shallow view of a THC tensor: raw data pointer plus size/stride metadata
  // copied into fixed-size arrays so the struct can be passed to kernels by value.
  TensorInfo(T *t, int d, int sz[MAX_DIMS], int st[MAX_DIMS]) {
    data = t; dims = d;
    for (int i = 0; i < dims; i++) {
      size[i] = sz[i];
      stride[i] = st[i];
    }
  }

  T *data;
  int dims;
  int size[MAX_DIMS];
  int stride[MAX_DIMS];
};
#define KERNEL_LOOP(I, N) \
  for (int I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)
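GET_BLOCKS and KERNEL_LOOP together implement a standard grid-stride loop: the launch covers N elements with ceil(N / 1024) blocks, and each thread strides over any leftover work. A Python emulation of the indices one thread visits (illustrative only):

```python
def kernel_loop(block_idx, thread_idx, block_dim, grid_dim, n):
    """Indices visited by one CUDA thread in KERNEL_LOOP(i, n)."""
    i = block_idx * block_dim + thread_idx
    while i < n:
        yield i
        i += block_dim * grid_dim

# Thread 0 of block 0, launched with 2 blocks of 4 threads over n = 10:
assert list(kernel_loop(0, 0, 4, 2, 10)) == [0, 8]
```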
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/common.cu"
#else
TensorInfo<real> thc_(getTensorInfo)(THCState *state, THCTensor *tensor) {
  real *data = THCTensor_(data)(state, tensor);
  int dims = THCTensor_(nDimension)(state, tensor);
  int size[MAX_DIMS]; int stride[MAX_DIMS];
  for (int i = 0; i < dims; i++) {
    size[i] = THCTensor_(size)(state, tensor, i);
    stride[i] = THCTensor_(stride)(state, tensor, i);
  }
  return TensorInfo<real>(data, dims, size, stride);
}
#endif
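On the Python side the same metadata is just the tensor's sizes and strides; a hedged sketch of what thc_(getTensorInfo) gathers:

```python
import torch

def tensor_info(t):
    """Collect what the TensorInfo struct stores: dimensionality plus
    per-dimension sizes and element strides."""
    return {'dims': t.dim(), 'size': list(t.size()), 'stride': list(t.stride())}

assert tensor_info(torch.zeros(4, 3)) == {'dims': 2, 'size': [4, 3], 'stride': [3, 1]}
```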
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/grid.cu"
#else
void cluster_(grid)(THCState *state, int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
  THCAssertSameGPU(THCTensor_(checkGPU)(state, 2, position, size));
  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 2, output, count));
  THArgCheck(THCTensor_(nDimension)(state, position) <= MAX_DIMS, 1, "Tensor too large or too many dimensions");

  int64_t *outputData = THCudaLongTensor_data(state, output);
  TensorInfo<real> positionInfo = thc_(getTensorInfo)(state, position);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);

  const int N = THCudaLongTensor_nElement(state, output);
  int grid = GET_BLOCKS(N);
  cudaStream_t stream = THCState_getCurrentStream(state);

  // Dispatch on dimensionality so the common cases get unrolled offset math.
  switch (positionInfo.dims) {
    case 1: gridKernel<real, 1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 2: gridKernel<real, 2><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 3: gridKernel<real, 3><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    case 4: gridKernel<real, 4><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
    default: gridKernel<real, -1><<<grid, NUM_THREADS, 0, stream>>>(outputData, positionInfo, sizeData, countData, C, N); break;
  }
  THCudaCheck(cudaGetLastError());
}
#endif
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/serial.cu"
#else
void cluster_(serial)(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCTensor *weight) {
  // Not yet implemented: the GPU matching path is stubbed out in this commit.
}
#endif
#include <THC.h>
#include "grid.h"
#include "common.cuh"
#include "THCIndex.cuh"
/* With Real = Float these expand to e.g. cluster_grid_kernel_Float and
   thc_getTensorInfo_Float, matching the extern "C" declarations below. */
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)
#include "generic/common.cu"
#include "THCGenerateAllTypes.h"
template<typename Real, int Dims>
__global__ void gridKernel(int64_t *output, TensorInfo<Real> position, Real *size, int64_t *count, const int C, const int N) {
  KERNEL_LOOP(i, N) {
    int positionOffset = 0; int tmp = C; int64_t c = 0;
    IndexToOffset<Real, Dims>::compute(i, position, &positionOffset);
    // Encode the D-dimensional cell coordinate of point i into a single id:
    // tmp walks down the mixed-radix place values C / (count[0] * ... * count[d]).
    for (int d = 0; d < position.size[position.dims - 1]; d++) {
      tmp = tmp / count[d];
      c += tmp * (int64_t) (position.data[positionOffset + d] / size[d]);
    }
    output[i] = c;
  }
}
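A NumPy sketch of the same cell-id math, with C = prod(count); the function and parameter names here are mine, not the commit's:

```python
import numpy as np

def grid_cluster_ids(position, cell_size, count):
    """position: [N, D] array; count[d]: cells along dim d; returns [N] ids."""
    C = int(np.prod(count))
    ids = np.empty(len(position), dtype=np.int64)
    for n, p in enumerate(position):
        tmp, c = C, 0
        for d in range(len(count)):
            tmp //= count[d]                      # cells per step along dim d
            c += tmp * int(p[d] // cell_size[d])  # cell coordinate along dim d
        ids[n] = c
    return ids

# Two points in a 2x2 grid of unit cells:
pos = np.array([[0.5, 0.5], [1.5, 0.5]])
assert grid_cluster_ids(pos, [1.0, 1.0], [2, 2]).tolist() == [0, 2]
```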
#include "generic/grid.cu"
#include "THCGenerateFloatType.h"
#include "generic/grid.cu"
#include "THCGenerateDoubleType.h"
#include "generic/grid.cu"
#include "THCGenerateByteType.h"
#include "generic/grid.cu"
#include "THCGenerateCharType.h"
#include "generic/grid.cu"
#include "THCGenerateShortType.h"
#include "generic/grid.cu"
#include "THCGenerateIntType.h"
#include "generic/grid.cu"
#include "THCGenerateLongType.h"
#ifdef __cplusplus
extern "C" {
#endif
void cluster_grid_kernel_Float (THCState *state, int C, THCudaLongTensor *output, THCudaTensor *position, THCudaTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Double(THCState *state, int C, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Byte (THCState *state, int C, THCudaLongTensor *output, THCudaByteTensor *position, THCudaByteTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Char (THCState *state, int C, THCudaLongTensor *output, THCudaCharTensor *position, THCudaCharTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Short (THCState *state, int C, THCudaLongTensor *output, THCudaShortTensor *position, THCudaShortTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Int (THCState *state, int C, THCudaLongTensor *output, THCudaIntTensor *position, THCudaIntTensor *size, THCudaLongTensor *count);
void cluster_grid_kernel_Long (THCState *state, int C, THCudaLongTensor *output, THCudaLongTensor *position, THCudaLongTensor *size, THCudaLongTensor *count);
#ifdef __cplusplus
}
#endif
#include <THC.h>
#include "THCTensorRandom.h"
#include "serial.h"
#include <curand.h>
#include <curand_kernel.h>
#include "common.cuh"
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _kernel_, Real)
#define thc_(NAME) TH_CONCAT_4(thc_, NAME, _, Real)
__global__ void assignColorKernel(curandStateMtgp32 *state, int64_t *color, const int n, uint8_t *done) {
  KERNEL_LOOP(i, n) {
    if (color[i] < 0) {  // Node not yet matched.
      color[i] = 0; //(int64_t) (curand_uniform(&state[blockIdx.x]) <= 0.53406) - 2;
      *done = 0;
    }
  }
}

int assignColor(THCState *state, THCudaLongTensor *color) {
  curandStateMtgp32 *gen_states = THCRandom_generatorStates(state);
  int64_t *colorVec = THCudaLongTensor_data(state, color);
  const int n = THCudaLongTensor_nElement(state, color);

  // Device-side flag: set to 1, cleared by the kernel if any node still
  // needed a color this round.
  uint8_t done;
  uint8_t *d_done;
  cudaMalloc(&d_done, sizeof(uint8_t));
  cudaMemset(d_done, 1, sizeof(uint8_t));

  assignColorKernel<<<GET_BLOCKS(n), NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(gen_states, colorVec, n, d_done);

  cudaMemcpy(&done, d_done, sizeof(uint8_t), cudaMemcpyDeviceToHost);
  cudaFree(d_done);
  return done;
}
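The commented-out line hints at the intended randomized step: each unmatched node flips a coin with bias p = 0.53406 and becomes blue (-1) or red (-2). A PyTorch sketch of that proposal as I read it (the color conventions are inferred from the kernel, not stated in the commit):

```python
import torch

def assign_color(color, p=0.53406):
    """One proposal round: nodes with color < 0 draw blue (-1) with
    probability p, else red (-2). Returns True once no node needed a color,
    mirroring assignColor's `done` flag."""
    unmatched = color < 0
    if not bool(unmatched.any()):
        return True
    draw = (torch.rand(int(unmatched.sum())) <= p).long() - 2  # -1 or -2
    color[unmatched] = draw
    return False
```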
/* GENERATE_KERNEL1(generate_bernoulli, double, double p, double, curand_uniform_double, x <= p) */
/* #define GENERATE_KERNEL1(NAME, T, ARG1, CURAND_T, CURAND_FUNC, TRANSFORM) \ */
/* __global__ void NAME(curandStateMtgp32 *state, int size, T *result, ARG1) \ */
/* { \ */
/* int idx = blockIdx.x * BLOCK_SIZE + threadIdx.x; \ */
/* int rounded_size = THCCeilDiv(size, BLOCK_SIZE) * BLOCK_SIZE; \ */
/* for (int i = idx; i < rounded_size; i += BLOCK_SIZE * MAX_NUM_BLOCKS) { \ */
/* CURAND_T x = CURAND_FUNC(&state[blockIdx.x]); \ */
/* if (i < size) { \ */
/* T y = TRANSFORM; \ */
/* result[i] = y; \ */
/* } \ */
/* } \ */
/* } */
void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree) {
  // Not yet implemented: placeholder for the type-independent matching driver.
}
#include "generic/serial.cu"
#include "THCGenerateFloatType.h"
#include "generic/serial.cu"
#include "THCGenerateDoubleType.h"
#include "generic/serial.cu"
#include "THCGenerateByteType.h"
#include "generic/serial.cu"
#include "THCGenerateCharType.h"
#include "generic/serial.cu"
#include "THCGenerateShortType.h"
#include "generic/serial.cu"
#include "THCGenerateIntType.h"
#include "generic/serial.cu"
#include "THCGenerateLongType.h"
#ifdef __cplusplus
extern "C" {
#endif
void cluster_serial_kernel(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree);
void cluster_serial_kernel_Float (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaTensor *weight);
void cluster_serial_kernel_Double(THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaDoubleTensor *weight);
void cluster_serial_kernel_Byte (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaByteTensor *weight);
void cluster_serial_kernel_Char (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaCharTensor *weight);
void cluster_serial_kernel_Short (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaShortTensor *weight);
void cluster_serial_kernel_Int (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaIntTensor *weight);
void cluster_serial_kernel_Long (THCState *state, THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCudaLongTensor *weight);
#ifdef __cplusplus
}
#endif
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/grid_cpu.c"
#else
void cluster_(grid)(int C, THLongTensor *output, THTensor *position, THTensor *size, THLongTensor *count) {
  real *size_data = size->storage->data + size->storageOffset;
  int64_t *count_data = count->storage->data + count->storageOffset;
  int64_t D = THLongTensor_nElement(count), d, c, tmp;
  // Walk output and the last dimension of position in lockstep, encoding each
  // point's grid cell coordinates into one mixed-radix id (CPU twin of gridKernel).
  TH_TENSOR_DIM_APPLY2(int64_t, output, real, position, THTensor_(nDimension)(position) - 1,
    tmp = C; c = 0;
    for (d = 0; d < D; d++) {
      tmp = tmp / *(count_data + d);
      c += tmp * (int64_t) (*(position_data + d * position_stride) / *(size_data + d));
    }
    output_data[0] = c;
  )
}
#endif
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/grid_cuda.c"
#else
void cluster_(grid)(int C, THCudaLongTensor *output, THCTensor *position, THCTensor *size, THCudaLongTensor *count) {
  cluster_kernel_(grid)(state, C, output, position, size, count);
}
#endif
#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/serial_cpu.c"
#else
void cluster_(serial)(THLongTensor *output, THLongTensor *row, THLongTensor *col, THLongTensor *degree, THTensor *weight) {
  real *weight_data = weight->storage->data + weight->storageOffset;
  real weight_value, w;
  int64_t d, c;
  SERIAL(output, row, col, degree,
    weight_value = 0;
    for (d = 0; d < degree_data[row_value]; d++) { // Iterate over neighbors.
      c = col_data[e + d];
      w = weight_data[e + d];
      if (output_data[c] < 0 && w >= weight_value) {
        // Neighbor is unmatched and edge has a higher weight.
        col_value = c;
        weight_value = w;
      }
    }
  )
}
#endif
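The inner loop above scans a node's adjacency slice (edges e .. e + degree[u] - 1 in the CSR-like layout the SERIAL macro walks) for the unmatched neighbor with the heaviest edge. The same logic extracted into standalone Python for readability (`e` and the other names are assumptions from context, since SERIAL is defined elsewhere):

```python
def pick_heaviest_unmatched(u, e, degree, col, weight, output):
    """Return the unmatched neighbor of `u` with maximum edge weight,
    or -1 if every neighbor is already matched (output[c] >= 0)."""
    best, best_w = -1, 0.0
    for d in range(degree[u]):
        c = col[e + d]
        w = weight[e + d]
        if output[c] < 0 and w >= best_w:
            best, best_w = c, w
    return best
```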
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/serial_cuda.c"
#else
void cluster_(serial)(THCudaLongTensor *output, THCudaLongTensor *row, THCudaLongTensor *col, THCudaLongTensor *degree, THCTensor *weight) {
  int done = assignColor(state, output);  // WIP: only the coloring step is wired up so far.
}
#endif
#include <TH/TH.h>
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _, Real)
#include "generic/grid_cpu.c"
#include "THGenerateAllTypes.h"
void cluster_grid_Float (int C, THLongTensor *output, THFloatTensor *position, THFloatTensor *size, THLongTensor *count);
void cluster_grid_Double(int C, THLongTensor *output, THDoubleTensor *position, THDoubleTensor *size, THLongTensor *count);
void cluster_grid_Byte (int C, THLongTensor *output, THByteTensor *position, THByteTensor *size, THLongTensor *count);
void cluster_grid_Char (int C, THLongTensor *output, THCharTensor *position, THCharTensor *size, THLongTensor *count);
void cluster_grid_Short (int C, THLongTensor *output, THShortTensor *position, THShortTensor *size, THLongTensor *count);
void cluster_grid_Int (int C, THLongTensor *output, THIntTensor *position, THIntTensor *size, THLongTensor *count);
void cluster_grid_Long (int C, THLongTensor *output, THLongTensor *position, THLongTensor *size, THLongTensor *count);
#include <THC/THC.h>
#include "grid.h"
#define cluster_(NAME) TH_CONCAT_4(cluster_, NAME, _cuda_, Real)
#define cluster_kernel_(NAME) TH_CONCAT_4(cluster_, NAME, _kernel_, Real)
extern THCState *state;
#include "generic/grid_cuda.c"
#include "THCGenerateFloatType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateDoubleType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateByteType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateCharType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateShortType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateIntType.h"
#include "generic/grid_cuda.c"
#include "THCGenerateLongType.h"
void cluster_grid_cuda_Float (int C, THCudaLongTensor *output, THCudaTensor *position, THCudaTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Double(int C, THCudaLongTensor *output, THCudaDoubleTensor *position, THCudaDoubleTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Byte (int C, THCudaLongTensor *output, THCudaByteTensor *position, THCudaByteTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Char (int C, THCudaLongTensor *output, THCudaCharTensor *position, THCudaCharTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Short (int C, THCudaLongTensor *output, THCudaShortTensor *position, THCudaShortTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Int (int C, THCudaLongTensor *output, THCudaIntTensor *position, THCudaIntTensor *size, THCudaLongTensor *count);
void cluster_grid_cuda_Long (int C, THCudaLongTensor *output, THCudaLongTensor *position, THCudaLongTensor *size, THCudaLongTensor *count);