assignColorKernel

ef9677c5 · rusty1s · 2f0248cf · ef9677c5 · ef9677c5 · ef9677c5
Commit ef9677c5 authored Mar 30, 2018 by rusty1s
10 changed files
--- a/aten/THC/THC.cu
+++ b/aten/THC/THC.cu
+#include "THCGreedy.cu"
+#include "THCGrid.cu"
--- a/aten/THC/THC.h
+++ b/aten/THC/THC.h
+#ifndef THC_INC
+#define THC_INC
+
+#include "THCGreedy.h"
+#include "THCGrid.h"
+
+#endif
--- a/aten/THC/THCColor.cu
+++ b/aten/THC/THCColor.cu
+#include <curand.h>
+#include <curand_kernel.h>
+
+#include "common.cuh"
+
+// Randomly colors every node whose entry in `color` is still negative:
+// blue = -1, red = -2. Clears `*done` (sets it to 0) if at least one node was
+// (re)colored; `*done` is otherwise left untouched.
+//
+// Launch requirements (imposed by the MTGP32 generator):
+//   * blockDim.x <= 256,
+//   * gridDim.x  <= number of generator states in `state` (one state per block),
+//   * every thread of a block must call curand_uniform together, which is why
+//     the loop bound is rounded up to a multiple of blockDim.x and the draw
+//     happens before the per-node test.
+__global__ void assignColorKernel(int64_t *color, curandStateMtgp32 *state, uint8_t *done,
+                                  ptrdiff_t nNodes) {
+  const ptrdiff_t blockSize = (ptrdiff_t)blockDim.x;
+  const ptrdiff_t stride = blockSize * gridDim.x;
+  // Round up so that all threads of a block run the same number of iterations.
+  const ptrdiff_t roundedNodes = ((nNodes + blockSize - 1) / blockSize) * blockSize;
+
+  for (ptrdiff_t i = (ptrdiff_t)blockIdx.x * blockSize + threadIdx.x; i < roundedNodes;
+       i += stride) {
+    // Block-uniform draw from the state owned by this block.
+    const float r = curand_uniform(&state[blockIdx.x]);
+    if (i < nNodes && color[i] < 0) {
+      color[i] = (r < 0.53406f) - 2;  // blue = -1, red = -2
+      *done = 0;  // every writer stores the same value, so the race is benign
+    }
+  }
+}
+
+// Assigns a random color to all nodes of `color` that are still negative.
+// Returns 1 if no node needed a color (i.e. nothing was changed), 0 otherwise.
+int THCGreedy_assignColor(THCState *state, THCudaLongTensor *color) {
+  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 1, color));
+
+  // Each block needs its own MTGP32 state, so the grid is capped.
+  // NOTE(review): 64 assumes THC allocates at least 64 generator states behind
+  // THCRandom_generatorStates() - confirm against THCTensorRandom before raising.
+  const int maxBlocks = 64;
+
+  int64_t *colorData = THCudaLongTensor_data(state, color);
+  ptrdiff_t nNodes = THCudaLongTensor_nElement(state, color);
+  if (nNodes <= 0) {
+    return 1;  // nothing to color; a zero-block launch would be an invalid configuration
+  }
+
+  cudaStream_t stream = THCState_getCurrentStream(state);
+  const ptrdiff_t blocksNeeded = (nNodes + NUM_THREADS - 1) / NUM_THREADS;
+  const int grid = blocksNeeded < maxBlocks ? (int)blocksNeeded : maxBlocks;
+
+  uint8_t *d_done = NULL;
+  THCudaCheck(cudaMalloc(&d_done, sizeof(uint8_t)));
+
+  // Keep the flag initialisation, the kernel and the read-back on the same
+  // stream so they are ordered with respect to each other.
+  cudaError_t err = cudaMemsetAsync(d_done, 1, sizeof(uint8_t), stream);
+  if (err == cudaSuccess) {
+    assignColorKernel<<<grid, NUM_THREADS, 0, stream>>>(
+        colorData, THCRandom_generatorStates(state), d_done, nNodes);
+    err = cudaGetLastError();
+  }
+
+  uint8_t done = 0;
+  if (err == cudaSuccess) {
+    err = cudaMemcpyAsync(&done, d_done, sizeof(uint8_t), cudaMemcpyDeviceToHost, stream);
+  }
+  if (err == cudaSuccess) {
+    err = cudaStreamSynchronize(stream);
+  }
+
+  // Release the flag before reporting any error so it is not leaked.
+  cudaFree(d_done);
+  THCudaCheck(err);
+  return done;
+}
--- a/aten/THC/THCGreedy.cu
+++ b/aten/THC/THCGreedy.cu
 #include "THCGreedy.h"

+#include "THCColor.cu"
+
 void THCGreedy(THCState *state, THCudaLongTensor *cluster, THCudaLongTensor *row,
               THCudaLongTensor *col, THCudaLongTensor *deg) {
-  printf("THCGreedy drin");
+  THCAssertSameGPU(THCudaLongTensor_checkGPU(state, 4, cluster, row, col, deg));
+
+  THCudaLongTensor_fill(state, cluster, -1);
+  THCGreedy_assignColor(state, cluster);
+  /* while(!THCGreedy_assignColor(state, cluster)) { */
+  /*   printf("DRIN"); */
+  /* }; */
+
  // Fill cluster with -1
  // assign color to clusters < 0 (return done)
  // Generate proposal vector with length of nodes (init?)

--- a/aten/THC/THCGrid.cu
+++ b/aten/THC/THCGrid.cu
 #include "THCGrid.h"

-#include "common.h"
+#include "common.cuh"
 #include "THCNumerics.cuh"

 template<typename T>
 __global__ void gridKernel(int64_t *cluster, TensorInfo<T> posInfo, T *size,
-                           int64_t *count, const int nNodes) {
+                           int64_t *count, ptrdiff_t nNodes) {
  KERNEL_LOOP(i, nNodes) {
    T *pos = posInfo.data + i * posInfo.stride[0];
    int64_t coef = 1, value = 0;

--- a/aten/THC/common.h
+++ b/aten/THC/common.h
@@ -6,17 +6,21 @@
 // Strided loop over [0, N): thread t starts at its global index and advances by
 // the total number of launched threads. Index and stride are widened to
 // ptrdiff_t before multiplying so they cannot wrap in 32-bit arithmetic.
 #define KERNEL_LOOP(I, N) \
  for (ptrdiff_t I = (ptrdiff_t)blockIdx.x * blockDim.x + threadIdx.x; I < (N); \
       I += (ptrdiff_t)blockDim.x * gridDim.x)

-#define THC_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
-  "Some of the input tensors are located on different GPUs. Please move them to a single one.")
-
 const int MAX_DIMS = 25;
-const int NUM_THREADS = 1024;
+/* const int NUM_THREADS = 1024; */
+const int NUM_THREADS = 256;

-inline int GET_BLOCKS(const int N) {
+// Number of NUM_THREADS-sized blocks needed to cover N elements (ceiling
+// division). N is taken as ptrdiff_t so that callers passing element counts
+// do not have them narrowed to int before the division, and so that
+// N + NUM_THREADS - 1 cannot overflow int.
+inline int GET_BLOCKS(ptrdiff_t N) {
   return (int)((N + NUM_THREADS - 1) / NUM_THREADS);
 }

 // Launches kernel NAME over N elements on the current THC stream, passing
 // __VA_ARGS__ followed by N, then checks for a launch error. Expects a
 // `THCState *state` in the caller's scope. Wrapped in do/while(0) so it acts
 // as a single statement and its locals do not leak into the caller; an empty
 // input (zero blocks) is skipped instead of attempting an invalid launch.
 #define KERNEL_RUN(NAME, N, ...) \
+  do { \
+    const int grid = GET_BLOCKS(N); \
+    if (grid > 0) { \
+      cudaStream_t stream = THCState_getCurrentStream(state); \
+      NAME<<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
+      THCudaCheck(cudaGetLastError()); \
+    } \
+  } while (0)
+
+#define KERNEL_REAL_RUN(NAME, N, ...) \
  int grid = GET_BLOCKS(N); \
  cudaStream_t stream = THCState_getCurrentStream(state); \
  NAME<real><<<grid, NUM_THREADS, 0, stream>>>(__VA_ARGS__, N); \
@@ -30,7 +34,7 @@ struct TensorInfo {
  int stride[MAX_DIMS];
 };

-#include "generic/common.h"
+#include "generic/common.cuh"
 #include "THC/THCGenerateAllTypes.h"

 #endif  // THC_COMMON_INC
--- a/aten/THC/generic/THCGrid.cu
+++ b/aten/THC/generic/THCGrid.cu
@@ -4,15 +4,15 @@

 void THCGrid_(THCState *state, THCudaLongTensor *cluster, THCTensor *pos, THCTensor *size,
              THCudaLongTensor *count) {
-  THC_assertSameGPU(state, 4, cluster, pos, size, count);
+  THCAssertSameGPU(THCTensor_(checkGPU)(state, 4, cluster, pos, size, count));

  int64_t *clusterData = THCudaLongTensor_data(state, cluster);
  TensorInfo<real> posInfo = THCTensor_(getTensorInfo)(state, pos);
  real *sizeData = THCTensor_(data)(state, size);
  int64_t *countData = THCudaLongTensor_data(state, count);

-  const int nNodes = THCudaLongTensor_nElement(state, cluster);
-  KERNEL_RUN(gridKernel, nNodes, clusterData, posInfo, sizeData, countData);
+  ptrdiff_t nNodes = THCudaLongTensor_nElement(state, cluster);
+  KERNEL_REAL_RUN(gridKernel, nNodes, clusterData, posInfo, sizeData, countData);
 }

 #endif  // THC_GENERIC_FILE
--- a/aten/THC/generic/common.h
+++ b/aten/THC/generic/common.h
 #ifndef THC_GENERIC_FILE
-#define THC_GENERIC_FILE "generic/common.h"
+#define THC_GENERIC_FILE "generic/common.cuh"
 #else

 TensorInfo<real> THCTensor_(getTensorInfo)(THCState *state, THCTensor *tensor) {

--- a/build.py
+++ b/build.py
@@ -24,7 +24,7 @@ if torch.cuda.is_available():
    sources += ['aten/THCC/THCC{}.c'.format(f) for f in files]
    include_dirs += ['aten/THC', 'aten/THCC']
    define_macros += [('WITH_CUDA', None)]
-    extra_objects += ['aten/build/THC{}.so'.format(f) for f in files]
+    extra_objects += ['aten/build/THC.so']
    with_cuda = True

 ffi = create_extension(

--- a/build_new.sh
+++ b/build_new.sh
@@ -7,6 +7,4 @@ SRC_DIR=aten/THC
 BUILD_DIR=aten/build

 mkdir -p $BUILD_DIR
-for i in THCGreedy THCGrid; do
-  $(which nvcc) -c -o "$BUILD_DIR/$i.so" "$SRC_DIR/$i.cu" -arch=sm_52 -Xcompiler -fPIC -shared "-I$TORCH/lib/include/TH" "-I$TORCH/lib/include" "-I$SRC_DIR"
-done
+$(which nvcc) -c -o "$BUILD_DIR/THC.so" "$SRC_DIR/THC.cu" -arch=sm_52 -Xcompiler -fPIC -shared "-I$TORCH/lib/include/TH" "-I$TORCH/lib/include" "-I$SRC_DIR"