bugfixes

4c4478a7 · rusty1s · 16eb9e1d · 4c4478a7 · 4c4478a7 · 4c4478a7
Commit 4c4478a7 authored Apr 07, 2018 by rusty1s
Show whitespace changes
Inline Side-by-side

Showing with 20 additions and 20 deletions

aten/THC/THCColor.cuh aten/THC/THCColor.cuh +17 -16

aten/THC/THCGraclus.cu aten/THC/THCGraclus.cu +2 -2

aten/THC/common.cuh aten/THC/common.cuh +1 -2

No files found.
--- a/aten/THC/THCColor.cuh
+++ b/aten/THC/THCColor.cuh
 #ifndef THC_COLOR_INC
 #define THC_COLOR_INC

+#include <curand_kernel.h>
+
 #include "common.cuh"

-#define BLUE_PROBABILITY 0.53406
+#define BLUE_PROB 0.53406
+
+__device__ int d_done;  // device-global flag: initDoneKernel writes 1, colorKernel writes 0
+__global__ void initDoneKernel() { d_done = 1; }  // resets d_done to 1; the host launches this as <<<1, 1>>> before colorKernel

-__global__ void colorKernel(int64_t *self, int64_t *bernoulli, uint8_t *done, ptrdiff_t nNodes) {
+__global__ void colorKernel(int64_t *self, uint8_t *bernoulli, ptrdiff_t nNodes) {  // rewrites every negative self[i] from bernoulli[i] and clears d_done when it does so
  KERNEL_LOOP(i, nNodes) {  // macro whose header is not visible here — presumably iterates i over [0, nNodes); TODO confirm against common.cuh
    if (self[i] < 0) {  // only negative entries are touched; non-negative entries are left as-is
      self[i] = bernoulli[i] - 2;  // assumes bernoulli[i] is 0 or 1, giving -2 or -1 (still negative) — TODO confirm
-      *done = 0;
+      d_done = 0;  // plain (non-atomic) store; several threads may write the same 0
    }
  }
 }

-int THCTensor_color(THCState *state, THCudaLongTensor *self) {
-  ptrdiff_t nNodes = THCudaLongTensor_nElement(state, self);
-  THCudaLongTensor *bernoulli = THCudaLongTensor_newWithSize1d(state, nNodes);
-  THCudaLongTensor_bernoulli(state, bernoulli, BLUE_PROBABILITY);
+int THCudaLongTensor_color(THCState *state, THCudaLongTensor *self) {  // runs one colorKernel pass over self and returns d_done (stays 1 if the kernel found no negative entry, 0 otherwise)
+  initDoneKernel<<<1, 1>>>();  // reset d_done to 1 before the pass

-  int64_t *selfData = THCudaLongTensor_data(state, self);
-  int64_t *bernoulliData = THCudaLongTensor_data(state, bernoulli);
+  ptrdiff_t nNodes = THCudaLongTensor_nElement(state, self);  // element count of self, used as the kernel's loop bound

-  uint8_t* d_done;
-  cudaMalloc(&d_done, sizeof(uint8_t));
-  cudaMemset(d_done, 1, sizeof(uint8_t));
+  THCudaByteTensor *bernoulli = THCudaByteTensor_newWithSize1d(state, nNodes);  // NOTE(review): no matching free is visible in this function — confirm the tensor is released
+  THCudaByteTensor_bernoulli(state, bernoulli, BLUE_PROB);  // presumably one Bernoulli sample per element with p = BLUE_PROB (0.53406) — TODO confirm

-  KERNEL_RUN(colorKernel, nNodes, selfData, bernoulliData, d_done);
+  int64_t *selfData = THCudaLongTensor_data(state, self);  // raw pointers handed to the kernel
+  uint8_t *bernoulliData = THCudaByteTensor_data(state, bernoulli);

-  uint8_t done;
-  cudaMemcpy(&done, d_done, sizeof(uint8_t), cudaMemcpyDeviceToHost);
-  cudaFree(d_done);
+  KERNEL_RUN(colorKernel, nNodes, selfData, bernoulliData);  // macro not shown in this diff — presumably launches colorKernel sized for nNodes; TODO confirm

+  int done; cudaMemcpyFromSymbol(&done, d_done, sizeof(done), 0, cudaMemcpyDeviceToHost);  // copy the device flag to the host; NOTE(review): return codes of the launches and of this copy are not checked
  return done;
 }


--- a/aten/THC/THCGraclus.cu
+++ b/aten/THC/THCGraclus.cu
@@ -20,9 +20,9 @@ void THCTensor_graclus(THCState *state, THCudaLongTensor *self, THCudaLongTensor
  THCudaLongTensor_degree(state, degree, row);

  THCudaLongTensor *cumDegree = THCudaLongTensor_newWithSize1d(state, nNodes);
-  THCudaLongTensor_cumDegree(state, self, row);
+  THCudaLongTensor_cumDegree(state, cumDegree, row);

-  while(!THCTensor_color(state, self)) {
+  while(!THCudaLongTensor_color(state, self)) {
    THCTensor_propose(state, self, prop, row, col, degree, cumDegree);
    THCTensor_response(state, self, prop, row, col, degree, cumDegree);
  }

--- a/aten/THC/common.cuh
+++ b/aten/THC/common.cuh
@@ -5,8 +5,7 @@
  for (ptrdiff_t I = blockIdx.x * blockDim.x + threadIdx.x; I < N; I += blockDim.x * gridDim.x)

 const int MAX_DIMS = 25;
-/* const int NUM_THREADS = 1024; */
-const int NUM_THREADS = 256;
+const int NUM_THREADS = 1024;

 inline int GET_BLOCKS(int N) {
  return (N + NUM_THREADS - 1) / NUM_THREADS;