Added experimental nvcc flag `--expt-relaxed-constexpr` and marked device helpers `__host__ __device__`

f82bfbac · rusty1s · e30538b1 · f82bfbac · f82bfbac · f82bfbac
Commit f82bfbac authored Jan 09, 2020 by rusty1s
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 8 deletions

cuda/indptr.cuh +1 -1

cuda/segment_kernel.cu +6 -6

setup.py +1 -1

No files found.
--- a/cuda/indptr.cuh
+++ b/cuda/indptr.cuh
@@ -6,7 +6,7 @@
 // We need our own `IndexToOffset` implementation since we do not want to
 // access the last element of the `indexptr`.
 template <typename scalar_t> struct IndexPtrToOffset {
-  static inline __device__ int
+  static inline __host__ __device__ int
  get(int idx, const at::cuda::detail::TensorInfo<scalar_t, int> &info) {
    int offset = idx % (info.sizes[info.dims - 1] - 1);
    offset *= info.strides[info.dims - 1];

--- a/cuda/segment_kernel.cu
+++ b/cuda/segment_kernel.cu
@@ -30,7 +30,7 @@ enum ReductionType { ADD, MEAN, MIN, MAX };
  }()

 template <typename scalar_t, ReductionType REDUCE> struct Reducer {
-  static inline __device__ scalar_t init() {
+  static inline __host__ __device__ scalar_t init() {
    if (REDUCE == MIN) {
      return std::numeric_limits<scalar_t>::max();
    } else if (REDUCE == MAX) {
@@ -40,8 +40,8 @@ template <typename scalar_t, ReductionType REDUCE> struct Reducer {
    }
  }

-  static inline __device__ void update(scalar_t *val, scalar_t new_val,
-                                       int64_t *arg, int64_t new_arg) {
+  static inline __host__ __device__ void update(scalar_t *val, scalar_t new_val,
+                                                int64_t *arg, int64_t new_arg) {
    if (REDUCE == ADD || REDUCE == MEAN) {
      *val = *val + new_val;
    } else if ((REDUCE == MIN && new_val < *val) ||
@@ -51,9 +51,9 @@ template <typename scalar_t, ReductionType REDUCE> struct Reducer {
    }
  }

-  static inline __device__ void write(scalar_t *address, scalar_t val,
-                                      int64_t *arg_address, int64_t arg,
-                                      int count) {
+  static inline __host__ __device__ void write(scalar_t *address, scalar_t val,
+                                               int64_t *arg_address,
+                                               int64_t arg, int count) {
    if (REDUCE == ADD) {
      *address = val;
    } else if (REDUCE == MEAN) {

--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@ if '--cpu' in argv:
    USE_GPU = False

 cxx_extra_compile_args = []
-nvcc_extra_compile_args = ['-arch=sm_35']
+nvcc_extra_compile_args = ['-arch=sm_35', '--expt-relaxed-constexpr']
 if platform.system() != 'Windows':
    cxx_extra_compile_args += ['-Wno-unused-variable']
 TORCH_MAJOR = int(torch.__version__.split('.')[0])