Fix cuaev build on CUDA 11.5 and latest torch (#603)

* Fix cuaev build on CUDA 11.5 and latest torch * Update aev.cu Co-authored-by: Jinze (Richard) Xue <yueyericardo@gmail.com>

Fix cuaev build on CUDA 11.5 and latest torch (#603)
* Fix cuaev build on CUDA 11.5 and latest torch * Update aev.cu Co-authored-by: Jinze (Richard) Xue <yueyericardo@gmail.com>
c94ad7a0 · Gao, Xiang · GitHub · 258e6c36 · c94ad7a0 · c94ad7a0
Unverified Commit c94ad7a0 authored Jan 20, 2022 by Gao, Xiang Committed by GitHub Jan 21, 2022
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 18 deletions

torchani/cuaev/aev.cu torchani/cuaev/aev.cu +0 -2

torchani/cuaev/cuaev_cub.cuh torchani/cuaev/cuaev_cub.cuh +8 -16

No files found.
--- a/torchani/cuaev/aev.cu
+++ b/torchani/cuaev/aev.cu
@@ -3,12 +3,10 @@
 #include <cuaev_cub.cuh>

 #include <ATen/Context.h>
-#include <THC/THC.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <THC/THCThrustAllocator.cuh>
 #include <vector>

 #define PI 3.141592653589793

--- a/torchani/cuaev/cuaev_cub.cuh
+++ b/torchani/cuaev/cuaev_cub.cuh
 #pragma once

-// include cub in a safe manner, see:
-// https://github.com/pytorch/pytorch/pull/55292
-#undef CUB_NS_POSTFIX // undef to avoid redefinition warnings
-#undef CUB_NS_PREFIX
-#define CUB_NS_PREFIX namespace cuaev {
-#define CUB_NS_POSTFIX }
 #include <cub/cub.cuh>
-#undef CUB_NS_POSTFIX
-#undef CUB_NS_PREFIX

 template <typename DataT>
 void cubScan(const DataT* d_in, DataT* d_out, int num_items, cudaStream_t stream) {
@@ -17,14 +9,14 @@ void cubScan(const DataT* d_in, DataT* d_out, int num_items, cudaStream_t stream
  // Determine temporary device storage requirements
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
-  cuaev::cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
+  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  // Allocate temporary storage
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();

  // Run exclusive prefix sum
-  cuaev::cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
+  cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
 }

 template <typename DataT, typename IndexT>
@@ -40,7 +32,7 @@ int cubEncode(
  // Determine temporary device storage requirements
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
-  cuaev::cub::DeviceRunLengthEncode::Encode(
+  cub::DeviceRunLengthEncode::Encode(
      d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);

  // Allocate temporary storage
@@ -48,7 +40,7 @@ int cubEncode(
  d_temp_storage = buffer_tmp.get();

  // Run encoding
-  cuaev::cub::DeviceRunLengthEncode::Encode(
+  cub::DeviceRunLengthEncode::Encode(
      d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);

  int num_selected = 0;
@@ -70,7 +62,7 @@ int cubDeviceSelect(
  // Determine temporary device storage requirements
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
-  cuaev::cub::DeviceSelect::If(
+  cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);

  // Allocate temporary storage
@@ -78,7 +70,7 @@ int cubDeviceSelect(
  d_temp_storage = buffer_tmp.get();

  // Run selection
-  cuaev::cub::DeviceSelect::If(
+  cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);

  int num_selected = 0;
@@ -94,14 +86,14 @@ DataT cubMax(const DataT* d_in, int num_items, DataT* d_out, cudaStream_t stream
  // Determine temporary device storage requirements
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
-  cuaev::cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
+  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  // Allocate temporary storage
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();

  // Run min-reduction
-  cuaev::cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
+  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);

  int maxVal = 0;
  cudaMemcpyAsync(&maxVal, d_out, sizeof(DataT), cudaMemcpyDefault, stream);