"vscode:/vscode.git/clone" did not exist on "37a8c1f756f94e95b996fc3c26ff39fcf4e435c5"
Unverified Commit 00548245 authored by Jinze (Richard) Xue's avatar Jinze (Richard) Xue Committed by GitHub
Browse files

Wrap cub in its own namespace and remove dependency on Thrust (#587)



* Wrap cub with CUB_NS_PREFIX and remove dependency on Thrust

* update readme

* format

* Update install_cuda.sh
Co-authored-by: default avatarGao, Xiang <qasdfgtyuiop@gmail.com>
parent 2007d181
......@@ -2,14 +2,13 @@
# command copy-pasted from https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&target_distro=Ubuntu&target_version=1804&target_type=debnetwork
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
sudo apt-get update
# from https://github.com/ptheywood/cuda-cmake-github-actions/blob/master/scripts/actions/install_cuda_ubuntu.sh
sudo apt-get -y install cuda-command-line-tools-11-0 cuda-libraries-dev-11-0
export CUDA_HOME=/usr/local/cuda-11.0
sudo apt-get -y install cuda-command-line-tools-11-2 cuda-libraries-dev-11-2
export CUDA_HOME=/usr/local/cuda-11.2
export PATH="$CUDA_HOME/bin:$PATH"
nvcc -V
......@@ -8,7 +8,6 @@ If you use conda, you could install it by
```
conda install pytorch torchvision torchaudio cudatoolkit={YOUR_CUDA_VERSION} -c pytorch-nightly
```
Note that [CUDA 11](https://github.com/aiqm/torchani/issues/549) is still not supported yet.
## Install
In most cases, if the `gcc` and `cuda` environments are well configured, running the following command in the `torchani` directory will install torchani and cuaev together.
......@@ -51,15 +50,13 @@ pip install -e . && pip install -v -e . --global-option="--cuaev"
```bash
srun -p gpu --ntasks=1 --cpus-per-task=2 --gpus=geforce:1 --time=02:00:00 --mem=10gb --pty -u bash -i
module load cuda/10.0.130 gcc/7.3.0 git
conda remove --name cuaev --all -y && conda create -n cuaev python=3.8 -y
# create env if necessary
conda create -n cuaev python=3.8
conda activate cuaev
# install compiled torch-cu100 because pytorch dropped official builds for cuda 10.0
. /home/jinzexue/pytorch/loadmodule # note that there is a space after .
. /home/jinzexue/pytorch/install_deps
pip install $(realpath /home/jinzexue/pytorch/dist/torch-nightly-cu100.whl)
# check if pytorch is working; it should print information about the available GPUs
python /home/jinzexue/pytorch/testcuda/testcuda.py
# modules
module load cuda/11.1.0 gcc/7.3.0 git
# pytorch
conda install pytorch torchvision torchaudio cudatoolkit=11.1 -c pytorch-nightly -c conda-forge
# install torchani
git clone https://github.com/aiqm/torchani.git
cd torchani
......
#include <aev.h>
#include <thrust/equal.h>
#include <torch/extension.h>
#include <cub/cub.cuh>
#include <cuaev_cub.cuh>
#include <vector>
#include <ATen/Context.h>
......@@ -728,105 +727,6 @@ __global__ void cuRadialAEVs_backward_or_doublebackward(
}
}
// Exclusive prefix sum of num_items elements from d_in into d_out, enqueued on
// `stream`. Scratch space comes from the PyTorch CUDA caching allocator and is
// returned automatically when the owning buffer leaves scope.
template <typename DataT>
void cubScan(const DataT* d_in, DataT* d_out, int num_items, cudaStream_t stream) {
  auto& caching_alloc = *c10::cuda::CUDACachingAllocator::get();
  // A null workspace pointer makes CUB report the scratch size it needs.
  void* scratch = NULL;
  size_t scratch_bytes = 0;
  cub::DeviceScan::ExclusiveSum(scratch, scratch_bytes, d_in, d_out, num_items, stream);
  // Acquire the scratch buffer, then launch the real scan.
  auto scratch_owner = caching_alloc.allocate(scratch_bytes);
  scratch = scratch_owner.get();
  cub::DeviceScan::ExclusiveSum(scratch, scratch_bytes, d_in, d_out, num_items, stream);
}
// Run-length encodes num_items elements of d_in: unique values go to
// d_unique_out, their run lengths to d_counts_out, and the number of runs to
// d_num_runs_out (device memory). Blocks on `stream` and returns the run count
// on the host.
template <typename DataT, typename IndexT>
int cubEncode(
    const DataT* d_in,
    DataT* d_unique_out,
    IndexT* d_counts_out,
    int num_items,
    int* d_num_runs_out,
    cudaStream_t stream) {
  auto& caching_alloc = *c10::cuda::CUDACachingAllocator::get();
  // Size query: CUB only reports the scratch requirement when given NULL.
  void* scratch = NULL;
  size_t scratch_bytes = 0;
  cub::DeviceRunLengthEncode::Encode(
      scratch, scratch_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
  // Acquire scratch from the caching allocator and run the encoding.
  auto scratch_owner = caching_alloc.allocate(scratch_bytes);
  scratch = scratch_owner.get();
  cub::DeviceRunLengthEncode::Encode(
      scratch, scratch_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
  // Bring the run count back to the host; the sync makes it safe to read.
  int num_selected = 0;
  cudaMemcpyAsync(&num_selected, d_num_runs_out, sizeof(int), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return num_selected;
}
// Copies the elements of d_in for which select_op returns true into d_out,
// writing the selected count to d_num_selected_out (device memory). Blocks on
// `stream` and returns the count on the host.
template <typename DataT, typename LambdaOpT>
int cubDeviceSelect(
    const DataT* d_in,
    DataT* d_out,
    int num_items,
    int* d_num_selected_out,
    LambdaOpT select_op,
    cudaStream_t stream) {
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  // Determine temporary device storage requirements.
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  // BUGFIX: pass `stream` to the sizing call as well — the original omitted it
  // here while passing it to the execution call, inconsistently targeting the
  // default stream (and inconsistent with cubScan/cubEncode/cubMax).
  cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);
  // Allocate temporary storage via the PyTorch caching allocator.
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();
  // Run selection.
  cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);
  // Copy the selected count back to the host; sync before reading it.
  int num_selected = 0;
  cudaMemcpyAsync(&num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return num_selected;
}
// Device-wide max reduction over num_items elements of d_in. The reduced value
// is written to d_out (device memory, must hold one DataT) and copied back to
// the host; the call blocks on `stream` before returning.
template <typename DataT>
DataT cubMax(const DataT* d_in, int num_items, DataT* d_out, cudaStream_t stream) {
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  // Determine temporary device storage requirements.
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  // Allocate temporary storage via the PyTorch caching allocator.
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();
  // Run max-reduction.
  cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  // BUGFIX: the result must be a DataT, not an int. The original copied
  // sizeof(DataT) bytes into an `int`, overflowing the stack slot when DataT
  // is wider than int (e.g. int64_t, double) and misreading the value for
  // other types.
  DataT maxVal = 0;
  cudaMemcpyAsync(&maxVal, d_out, sizeof(DataT), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return maxVal;
}
// NOTE: assumes size of EtaA_t = Zeta_t = EtaR_t = 1
Result cuaev_forward(const Tensor& coordinates_t, const Tensor& species_t, const AEVScalarParams& aev_params) {
TORCH_CHECK(
......@@ -850,21 +750,15 @@ Result cuaev_forward(const Tensor& coordinates_t, const Tensor& species_t, const
}
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
auto thrust_allocator = THCThrustAllocator(at::globalContext().lazyInitCUDA());
auto policy = thrust::cuda::par(thrust_allocator).on(stream);
auto& allocator = *c10::cuda::CUDACachingAllocator::get();
// buffer to store all the pairwise distance (Rij)
auto total_natom_pairs = n_molecules * max_natoms_per_mol * max_natoms_per_mol;
auto d_options = torch::dtype(torch::kUInt8).device(coordinates_t.device());
Tensor tensor_Rij = torch::empty(sizeof(PairDist<float>) * total_natom_pairs, d_options);
auto d_options = torch::dtype(torch::kFloat32).device(coordinates_t.device());
float inf = std::numeric_limits<float>::infinity();
Tensor tensor_Rij = torch::full(sizeof(PairDist<float>) / sizeof(float) * total_natom_pairs, inf, d_options);
PairDist<float>* d_Rij = (PairDist<float>*)tensor_Rij.data_ptr();
// init all Rij to inf
PairDist<float> init;
init.Rij = std::numeric_limits<float>::infinity();
thrust::fill(policy, d_Rij, d_Rij + total_natom_pairs, init);
// buffer to store all the pairwise distance that is needed for Radial AEV
// computation
Tensor tensor_radialRij = torch::empty(sizeof(PairDist<float>) * total_natom_pairs, d_options);
......@@ -986,7 +880,8 @@ Result cuaev_forward(const Tensor& coordinates_t, const Tensor& species_t, const
angular_length_aligned,
ncenter_atoms);
return {aev_t,
return {
aev_t,
tensor_Rij,
tensor_radialRij,
tensor_angularRij,
......
#pragma once
// include cub in a safe manner, see:
// https://github.com/pytorch/pytorch/pull/55292
#undef CUB_NS_POSTFIX // undef to avoid redefinition warnings
#undef CUB_NS_PREFIX
#define CUB_NS_PREFIX namespace cuaev {
#define CUB_NS_POSTFIX }
#include <cub/cub.cuh>
#undef CUB_NS_POSTFIX
#undef CUB_NS_PREFIX
// Exclusive prefix sum of num_items elements from d_in into d_out, enqueued on
// `stream`, using the project-namespaced cuaev::cub. Scratch space comes from
// the PyTorch CUDA caching allocator and is freed when the owner leaves scope.
template <typename DataT>
void cubScan(const DataT* d_in, DataT* d_out, int num_items, cudaStream_t stream) {
  auto& caching_alloc = *c10::cuda::CUDACachingAllocator::get();
  // A null workspace pointer makes CUB report the scratch size it needs.
  void* scratch = NULL;
  size_t scratch_bytes = 0;
  cuaev::cub::DeviceScan::ExclusiveSum(scratch, scratch_bytes, d_in, d_out, num_items, stream);
  // Acquire the scratch buffer, then launch the real scan.
  auto scratch_owner = caching_alloc.allocate(scratch_bytes);
  scratch = scratch_owner.get();
  cuaev::cub::DeviceScan::ExclusiveSum(scratch, scratch_bytes, d_in, d_out, num_items, stream);
}
// Run-length encodes num_items elements of d_in via cuaev::cub: unique values
// go to d_unique_out, run lengths to d_counts_out, and the number of runs to
// d_num_runs_out (device memory). Blocks on `stream` and returns the run count
// on the host.
template <typename DataT, typename IndexT>
int cubEncode(
    const DataT* d_in,
    DataT* d_unique_out,
    IndexT* d_counts_out,
    int num_items,
    int* d_num_runs_out,
    cudaStream_t stream) {
  auto& caching_alloc = *c10::cuda::CUDACachingAllocator::get();
  // Size query: CUB only reports the scratch requirement when given NULL.
  void* scratch = NULL;
  size_t scratch_bytes = 0;
  cuaev::cub::DeviceRunLengthEncode::Encode(
      scratch, scratch_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
  // Acquire scratch from the caching allocator and run the encoding.
  auto scratch_owner = caching_alloc.allocate(scratch_bytes);
  scratch = scratch_owner.get();
  cuaev::cub::DeviceRunLengthEncode::Encode(
      scratch, scratch_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items, stream);
  // Bring the run count back to the host; the sync makes it safe to read.
  int num_selected = 0;
  cudaMemcpyAsync(&num_selected, d_num_runs_out, sizeof(int), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return num_selected;
}
// Copies the elements of d_in for which select_op returns true into d_out,
// writing the selected count to d_num_selected_out (device memory). Uses the
// project-namespaced cuaev::cub. Blocks on `stream` and returns the count on
// the host.
template <typename DataT, typename LambdaOpT>
int cubDeviceSelect(
    const DataT* d_in,
    DataT* d_out,
    int num_items,
    int* d_num_selected_out,
    LambdaOpT select_op,
    cudaStream_t stream) {
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  // Determine temporary device storage requirements.
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  // BUGFIX: pass `stream` to the sizing call as well — the original omitted it
  // here while passing it to the execution call, inconsistently targeting the
  // default stream (and inconsistent with cubScan/cubEncode/cubMax).
  cuaev::cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);
  // Allocate temporary storage via the PyTorch caching allocator.
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();
  // Run selection.
  cuaev::cub::DeviceSelect::If(
      d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream);
  // Copy the selected count back to the host; sync before reading it.
  int num_selected = 0;
  cudaMemcpyAsync(&num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return num_selected;
}
// Device-wide max reduction over num_items elements of d_in via cuaev::cub.
// The reduced value is written to d_out (device memory, must hold one DataT)
// and copied back to the host; the call blocks on `stream` before returning.
template <typename DataT>
DataT cubMax(const DataT* d_in, int num_items, DataT* d_out, cudaStream_t stream) {
  auto& allocator = *c10::cuda::CUDACachingAllocator::get();
  // Determine temporary device storage requirements.
  void* d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cuaev::cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  // Allocate temporary storage via the PyTorch caching allocator.
  auto buffer_tmp = allocator.allocate(temp_storage_bytes);
  d_temp_storage = buffer_tmp.get();
  // Run max-reduction.
  cuaev::cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream);
  // BUGFIX: the result must be a DataT, not an int. The original copied
  // sizeof(DataT) bytes into an `int`, overflowing the stack slot when DataT
  // is wider than int (e.g. int64_t, double) and misreading the value for
  // other types.
  DataT maxVal = 0;
  cudaMemcpyAsync(&maxVal, d_out, sizeof(DataT), cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);
  return maxVal;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment