Commit c7c514c2 authored by yangzhong

push 2.0.9 version

parent cf967b1f
#include "hip/hip_runtime.h"
#include "scatter_hip.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
#define THREADS 1024
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, ReductionType REDUCE>
__global__ void
scatter_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int E, int K, int N, int numel) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = thread_idx / (E * K);
int k = thread_idx % K;
if (thread_idx < numel) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
thread_idx, index_info);
int64_t idx = index_info.data[offset];
Reducer<scalar_t, REDUCE>::atomic_write(out_data + b * N * K + idx * K + k,
src_data[thread_idx]);
}
}
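// Reference only: a CPU sketch (sum case) of the index arithmetic used by
// scatter_kernel above. src is viewed as [B, E, K] -- the dims before `dim`,
// the scatter dim itself, and the dims after it -- and out as [B, N, K].
// `index` is taken per src element here for simplicity; the kernel instead
// reads it through its TensorInfo strides. The function name and the
// flat-pointer layout are illustrative assumptions, not part of the library.
static void scatter_sum_reference(const float *src, const int64_t *index,
                                  float *out, // zero-initialized, B * N * K
                                  int B, int E, int K, int N) {
  for (int b = 0; b < B; b++)
    for (int e = 0; e < E; e++)
      for (int k = 0; k < K; k++) {
        const int64_t idx = index[(b * E + e) * K + k];
        // Same addressing as the atomic_write target in the kernel.
        out[b * N * K + idx * K + k] += src[(b * E + e) * K + k];
      }
}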
template <typename scalar_t>
__global__ void
scatter_arg_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
const scalar_t *out_data, int64_t *arg_out_data, int E,
int K, int N, int numel) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = thread_idx / (E * K);
int e = (thread_idx / K) % E;
int k = thread_idx % K;
if (thread_idx < numel) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
thread_idx, index_info);
int64_t idx = index_info.data[offset];
if (src_data[thread_idx] == out_data[b * N * K + idx * K + k]) {
arg_out_data[b * N * K + idx * K + k] = e;
}
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() == index.dim());
for (auto i = 0; i < index.dim() - 1; i++)
CHECK_INPUT(src.size(i) >= index.size(i));
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
auto sizes = src.sizes().vec();
if (dim_size.has_value())
sizes[dim] = dim_size.value();
else if (index.numel() == 0)
sizes[dim] = 0;
else {
sizes[dim] = 1 + index.max().cpu().data_ptr<int64_t>()[0];
}
out = torch::empty(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full_like(out, src.size(dim), index.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return std::make_tuple(out, arg_out);
}
auto B = 1;
for (auto i = 0; i < dim; i++)
B *= src.size(i);
auto E = src.size(dim);
auto K = src.numel() / (B * E);
auto N = out.size(dim);
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (!optional_out.has_value())
out.fill_(Reducer<scalar_t, REDUCE>::init());
hipLaunchKernelGGL(( scatter_kernel<scalar_t, REDUCE>)
, dim3(BLOCKS(src.numel())), dim3(THREADS), 0, stream,
src_data, index_info, out_data, E, K, N, src.numel());
if (!optional_out.has_value() && (REDUCE == MIN || REDUCE == MAX))
out.masked_fill_(out == Reducer<scalar_t, REDUCE>::init(), (scalar_t)0);
if (REDUCE == MIN || REDUCE == MAX)
hipLaunchKernelGGL(( scatter_arg_kernel<scalar_t>)
, dim3(BLOCKS(src.numel())), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, K, N,
src.numel());
});
});
return std::make_tuple(out, arg_out);
}
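// Illustrative host-side use of scatter_cuda (not part of the library; the
// shapes, dim = 0 and the "max" reduction are arbitrary assumptions). With no
// optional_out and no dim_size, out gets shape [index.max() + 1, 4]; arg_out
// records, per output slot, the source position along dim that won the max.
static void scatter_cuda_example() {
  auto opts = torch::device(torch::kCUDA);
  auto src = torch::randn({8, 4}, opts);
  auto index = torch::randint(0, 3, {8, 4}, opts.dtype(torch::kLong));
  auto result = scatter_cuda(src, index, /*dim=*/0, torch::nullopt,
                             torch::nullopt, "max");
  torch::Tensor out = std::get<0>(result);                  // reduced values
  torch::optional<torch::Tensor> arg = std::get<1>(result); // argmax, src.size(0) for empty slots
  (void)out;
  (void)arg;
}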
#include "hip/hip_runtime.h"
#include "segment_coo_hip.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
#define FULL_MASK 0xffffffff
template <typename scalar_t, ReductionType REDUCE, bool HAS_VAL>
__global__ void
segment_coo_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t N) {
// Each thread processes exactly one entry. Within a warp, we perform a
// parallel reduction across equal indices, and write the intermediate
// result via atomics.
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
int lane_idx = row_idx & (32 - 1);
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int64_t idx = index_info.data[offset], next_idx;
int out_idx = (row_idx / D) * N + idx;
scalar_t val = HAS_VAL ? src_data[row_idx] : (scalar_t)1, tmp;
#pragma unroll
for (int i = 1; i < 32; i *= 2) {
// Parallel reduction inside a single warp.
tmp = __shfl_up_sync(FULL_MASK, val, i);
next_idx = __shfl_up_sync(FULL_MASK, idx, i);
if (lane_idx >= i && row_idx / D == (row_idx - i) / D) {
assert(idx >= next_idx);
if (idx == next_idx)
Reducer<scalar_t, REDUCE>::update(&val, tmp);
}
}
next_idx = __shfl_down_sync(FULL_MASK, idx, 1);
if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D ||
idx != next_idx)
Reducer<scalar_t, REDUCE>::atomic_write(out_data + out_idx, val);
}
}
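// Reference only: what segment_coo_kernel computes, written as a sequential
// loop (sum case, K == 1). Because `index` must be sorted along its last dim,
// equal indices form contiguous runs; values are accumulated along a run and
// written once at the run boundary, which is what the shuffle-based scan plus
// the final __shfl_down_sync boundary test above implement. Names and the
// flat-pointer layout are illustrative assumptions.
static void segment_coo_sum_reference(const float *src, const int64_t *index,
                                      float *out, // pre-initialized output
                                      int64_t E) {
  float acc = 0.f;
  for (int64_t e = 0; e < E; e++) {
    acc += src[e];
    const bool run_ends = (e + 1 == E) || (index[e + 1] != index[e]);
    if (run_ends) {         // mirrors the warp-boundary / next-index check
      out[index[e]] += acc; // the kernel does this with an atomic_write
      acc = 0.f;
    }
  }
}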
template <typename scalar_t>
__global__ void segment_coo_arg_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int64_t *arg_out_data, size_t E, size_t N) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int64_t idx = index_info.data[offset];
int out_idx = (row_idx / D) * N + idx;
scalar_t val = __ldg(out_data + out_idx);
if (src_data[row_idx] == val)
arg_out_data[out_idx] = row_idx % D;
}
}
template <typename scalar_t, ReductionType REDUCE, int TB>
__global__ void segment_coo_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t K, size_t N) {
// Each thread processes a single column and `TB` index entries. Coalesced
// read and write is performed in column-major order. The intermediate
// results are written via atomics.
int D = index_info.sizes[index_info.dims - 1];
int E_1 = E / D;
int E_2 = (D - 1) + TB - ((D - 1) % TB);
int row_idx = blockIdx.x * blockDim.y + threadIdx.y;
int col_idx = blockIdx.y * blockDim.x + threadIdx.x;
int dim_start = (row_idx * TB) / E_2;
int row_start = (row_idx * TB) % E_2;
if (dim_start < E_1 && col_idx < K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
dim_start * D + row_start, index_info);
int idx1 = __ldg(index_info.data + offset), idx2;
scalar_t val = src_data[K * (dim_start * D + row_start) + col_idx];
#pragma unroll
for (int i = 1; i < TB; i++) {
if (row_start + i >= D)
break;
idx2 = __ldg(index_info.data + offset +
i * index_info.strides[index_info.dims - 1]);
assert(idx1 <= idx2);
if (idx1 == idx2) {
Reducer<scalar_t, REDUCE>::update(
&val, src_data[K * (dim_start * D + row_start + i) + col_idx]);
} else {
Reducer<scalar_t, REDUCE>::atomic_write(
out_data + (dim_start * N + idx1) * K + col_idx, val);
val = src_data[K * (dim_start * D + row_start + i) + col_idx];
}
idx1 = idx2;
}
Reducer<scalar_t, REDUCE>::atomic_write(
out_data + (dim_start * N + idx1) * K + col_idx, val);
}
}
template <typename scalar_t>
__global__ void segment_coo_arg_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int64_t *arg_out_data, size_t E, size_t K, size_t N) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int col_idx = thread_idx % K;
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E && col_idx < K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int idx = __ldg(index_info.data + offset);
int out_idx = ((row_idx / D) * N + idx) * K + col_idx;
scalar_t val = __ldg(out_data + out_idx);
if (src_data[thread_idx] == val)
arg_out_data[out_idx] = row_idx % D;
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
auto sizes = index.sizes().vec();
for (int i = 0; i < index.dim(); i++) {
sizes[i] = src.size(i);
}
index = index.expand(sizes);
auto dim = index.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (int i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
sizes = src.sizes().vec();
if (dim_size.has_value())
sizes[dim] = dim_size.value();
else if (index.numel() == 0)
sizes[dim] = 0;
else {
auto tmp = index.select(dim, index.size(dim) - 1);
tmp = tmp.numel() > 1 ? tmp.max() : tmp;
sizes[dim] = 1 + tmp.cpu().data_ptr<int64_t>()[0];
}
out = torch::zeros(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full_like(out, src.size(dim), index.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
} else if (reduce2REDUCE.at(reduce) == MEAN) {
auto sizes = index.sizes().vec();
sizes[dim] = out.size(dim);
arg_out = torch::zeros(sizes, out.options());
}
if (index.numel() == 0)
return std::make_tuple(out, arg_out);
auto E = index.numel();
auto E_2 = index.size(dim);
auto E_1 = index.numel() / E_2;
auto K = src.numel() / E;
auto N = out.size(dim);
auto avg_len = (float)E_2 / (float)N;
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (!optional_out.has_value())
out.fill_(Reducer<scalar_t, REDUCE>::init());
if (K == 1)
hipLaunchKernelGGL(( segment_coo_kernel<scalar_t, REDUCE, true>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream, src_data, index_info,
out_data, E, N);
else if (avg_len <= 8)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 4>)
, dim3(dim3((E_1 * ((E_2 + 3) / 4) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else if (avg_len <= 16)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 8>)
, dim3(dim3((E_1 * ((E_2 + 7) / 8) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else if (avg_len <= 32)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 16>)
, dim3(dim3((E_1 * ((E_2 + 15) / 16) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 32>)
, dim3(dim3((E_1 * ((E_2 + 31) / 32) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
if (!optional_out.has_value() && (REDUCE == MIN || REDUCE == MAX))
out.masked_fill_(out == Reducer<scalar_t, REDUCE>::init(), (scalar_t)0);
if (REDUCE == MIN || REDUCE == MAX) {
if (K == 1)
hipLaunchKernelGGL(( segment_coo_arg_kernel<scalar_t>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, N);
else
hipLaunchKernelGGL(( segment_coo_arg_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, E * K)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, K, N);
}
if (REDUCE == MEAN) {
auto count_data = arg_out.value().data_ptr<scalar_t>();
hipLaunchKernelGGL(( segment_coo_kernel<scalar_t, SUM, false>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream, nullptr, index_info,
count_data, E, N);
arg_out.value().masked_fill_(arg_out.value() < (scalar_t)1,
(scalar_t)1);
auto count = arg_out.value();
for (int i = dim + 1; i < out.dim(); i++)
count = count.unsqueeze(-1);
if (out.is_floating_point())
out.true_divide_(count);
else
out.div_(count, "floor");
}
});
});
return std::make_tuple(out, arg_out);
}
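// Illustrative host-side use of segment_coo_cuda with reduce = "mean" (the
// shapes and index values are arbitrary assumptions). `index` must be sorted
// along its last dim; with "mean" the second tuple element carries the
// per-segment element count (clamped to >= 1) that the branch above
// accumulated with the SUM kernel and then divided by.
static void segment_coo_mean_example() {
  auto opts = torch::device(torch::kCUDA);
  auto src = torch::randn({6, 3}, opts);
  auto index = torch::tensor({0, 0, 1, 1, 1, 3}, opts.dtype(torch::kLong));
  auto result = segment_coo_cuda(src, index, torch::nullopt,
                                 /*dim_size=*/torch::optional<int64_t>(4), "mean");
  torch::Tensor mean = std::get<0>(result);           // shape [4, 3]
  torch::Tensor count = std::get<1>(result).value();  // per-segment counts
  (void)mean;
  (void)count;
}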
template <typename scalar_t>
__global__ void
gather_coo_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t N) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int row = index_info.data[offset];
offset = (row_idx / index_info.sizes[index_info.dims - 1]) * N;
scalar_t val = __ldg(src_data + offset + row);
out_data[row_idx] = val;
}
}
template <typename scalar_t>
__global__ void gather_coo_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t K, size_t N) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int col_idx = thread_idx % K;
if (thread_idx < E * K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int row = index_info.data[offset];
offset = (row_idx / index_info.sizes[index_info.dims - 1]) * N * K;
scalar_t val = __ldg(src_data + offset + K * row + col_idx);
out_data[thread_idx] = val;
}
}
torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
auto sizes = index.sizes().vec();
for (auto i = 0; i < index.dim() - 1; i++)
sizes[i] = src.size(i);
index = index.expand(sizes);
auto dim = index.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < src.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
CHECK_INPUT(index.size(dim) == out.size(dim));
} else {
auto sizes = src.sizes().vec();
sizes[dim] = index.size(dim);
out = torch::empty(sizes, src.options());
}
if (index.numel() == 0)
return out;
auto E = index.numel();
auto K = out.numel() / E;
auto N = src.size(dim);
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
if (K == 1)
hipLaunchKernelGGL(( gather_coo_kernel<scalar_t>), dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, E, N);
else
hipLaunchKernelGGL(( gather_coo_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, E * K)), dim3(THREADS), 0, stream, src_data, index_info,
out_data, E, K, N);
});
return out;
}
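// Reference only: the CPU equivalent of gather_coo_kernel for K == 1 (names
// and flat-pointer layout are illustrative assumptions). Each output element
// copies the src entry selected by its index, i.e. the inverse access pattern
// of segment_coo.
static void gather_coo_reference(const float *src, const int64_t *index,
                                 float *out, int64_t E) {
  for (int64_t e = 0; e < E; e++)
    out[e] = src[index[e]];
}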
#include "hip/hip_runtime.h"
#include "segment_csr_hip.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "index_info.cuh"
#include "reducer.cuh"
#include "utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
#define FULL_MASK 0xffffffff
template <typename scalar_t, ReductionType REDUCE, int TB>
__global__ void
segment_csr_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, int64_t *arg_out_data, size_t N,
size_t E) {
// Each warp processes exactly `32/TB` rows and aggregates all row values
// via a parallel reduction.
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / TB;
int lane_idx = thread_idx & (TB - 1);
if (row_idx < N) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int64_t row_start = __ldg(indptr_info.data + offset);
int64_t row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = Reducer<scalar_t, REDUCE>::init();
int64_t arg, arg_tmp;
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
for (int64_t src_idx = row_start + lane_idx; src_idx < row_end;
src_idx += TB) {
Reducer<scalar_t, REDUCE>::update(&val, src_data[offset + src_idx], &arg,
src_idx);
}
#pragma unroll
for (int i = TB / 2; i > 0; i /= 2) {
// Parallel reduction inside a single warp.
if (REDUCE == MIN || REDUCE == MAX)
arg_tmp = __shfl_down_sync(FULL_MASK, arg, i);
Reducer<scalar_t, REDUCE>::update(
&val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp);
}
if (lane_idx == 0) {
Reducer<scalar_t, REDUCE>::write(out_data + row_idx, val,
arg_out_data + row_idx, arg,
row_end - row_start);
}
}
}
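// Reference only: the row reduction segment_csr_kernel performs, written
// sequentially for the sum case and a single, non-batched indptr (names and
// flat-pointer layout are illustrative assumptions). The kernel above instead
// lets TB lanes stride through each row and combine their partial results
// with __shfl_down_sync before lane 0 writes the row.
static void segment_csr_sum_reference(const float *src, const int64_t *indptr,
                                      float *out, int64_t N /* rows */) {
  for (int64_t row = 0; row < N; row++) {
    float acc = 0.f;
    for (int64_t i = indptr[row]; i < indptr[row + 1]; i++)
      acc += src[i];
    out[row] = acc;
  }
}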
template <typename scalar_t, ReductionType REDUCE>
__global__ void segment_csr_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, int64_t *arg_out_data, size_t N, size_t K, size_t E) {
// Each thread processes exactly one row. It turned out that is more
// efficient than using shared memory due to avoiding synchronization
// barriers.
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int lane_idx = thread_idx % K;
if (thread_idx < N * K) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int64_t row_start = __ldg(indptr_info.data + offset);
int64_t row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = Reducer<scalar_t, REDUCE>::init();
int64_t arg;
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;
for (int64_t src_idx = row_start; src_idx < row_end; src_idx++) {
Reducer<scalar_t, REDUCE>::update(
&val, src_data[offset + K * src_idx + lane_idx], &arg, src_idx);
}
Reducer<scalar_t, REDUCE>::write(out_data + thread_idx, val,
arg_out_data + thread_idx, arg,
row_end - row_start);
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out,
std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
auto sizes = indptr.sizes().vec();
for (auto i = 0; i < indptr.dim() - 1; i++)
sizes[i] = src.size(i);
indptr = indptr.expand(sizes);
auto dim = indptr.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (int i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
CHECK_INPUT(src.numel() == 0 || out.size(dim) == indptr.size(dim) - 1);
} else {
sizes = src.sizes().vec();
sizes[dim] = std::max<int64_t>(indptr.size(dim) - 1, 0);
out = torch::empty(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return std::make_tuple(out, arg_out);
}
auto N = out.size(dim) * (indptr.numel() / indptr.size(-1));
auto K = out.numel() / N;
auto E = src.size(dim);
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (K == 1) {
hipLaunchKernelGGL(( segment_csr_kernel<scalar_t, REDUCE, 1>)
, dim3(BLOCKS(32, N)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, arg_out_data, N, E);
} else {
hipLaunchKernelGGL(( segment_csr_broadcast_kernel<scalar_t, REDUCE>)
, dim3(BLOCKS(1, N * K)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, arg_out_data, N, K, E);
}
});
});
return std::make_tuple(out, arg_out);
}
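// Illustrative host-side use of segment_csr_cuda (the shapes and the "min"
// reduction are arbitrary assumptions). indptr has one more entry than out
// along the reduced dim; for "min"/"max" the second tuple element carries the
// winning position inside each row, or src.size(dim) for empty rows.
static void segment_csr_example() {
  auto opts = torch::device(torch::kCUDA);
  auto src = torch::randn({10, 5}, opts);
  auto indptr = torch::tensor({0, 2, 2, 7, 10}, opts.dtype(torch::kLong));
  auto result = segment_csr_cuda(src, indptr, torch::nullopt, "min");
  torch::Tensor out = std::get<0>(result);             // shape [4, 5]
  torch::Tensor arg_min = std::get<1>(result).value(); // shape [4, 5]
  (void)out;
  (void)arg_min;
}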
template <typename scalar_t, int TB>
__global__ void
gather_csr_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, size_t N, size_t E) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / TB;
int lane_idx = thread_idx % TB;
if (row_idx < N) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int row_start = __ldg(indptr_info.data + offset);
int row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = __ldg(src_data + row_idx);
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
for (int out_idx = row_start + lane_idx; out_idx < row_end; out_idx += TB) {
out_data[offset + out_idx] = val; // "Mostly" coalesced.
}
}
}
template <typename scalar_t>
__global__ void gather_csr_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, size_t N, size_t K, size_t E) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int lane_idx = thread_idx % K;
if (thread_idx < N * K) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int row_start = __ldg(indptr_info.data + offset);
int row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = src_data[thread_idx]; // Coalesced.
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;
for (int out_idx = row_start; out_idx < row_end; out_idx++) {
out_data[offset + K * out_idx + lane_idx] = val; // "Mostly" coalesced.
}
}
}
torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
CHECK_CUDA(src);
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
auto sizes = indptr.sizes().vec();
for (auto i = 0; i < indptr.dim() - 1; i++)
sizes[i] = src.size(i);
indptr = indptr.expand(sizes);
auto dim = indptr.dim() - 1;
CHECK_INPUT(src.size(dim) == 0 || src.size(dim) == indptr.size(dim) - 1);
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
auto sizes = src.sizes().vec();
if (src.numel() > 0) {
sizes[dim] = indptr.flatten()[-1].cpu().data_ptr<int64_t>()[0];
} else {
sizes[dim] = 0;
}
out = torch::empty(sizes, src.options());
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return out;
}
auto N = src.size(dim) * (indptr.numel() / indptr.size(-1));
auto K = src.numel() / N;
auto E = out.size(dim);
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
if (K == 1)
hipLaunchKernelGGL(( gather_csr_kernel<scalar_t, 4>), dim3(BLOCKS(1, 4 * N)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, N, E);
else
hipLaunchKernelGGL(( gather_csr_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, N * K)), dim3(THREADS), 0, stream, src_data, indptr_info,
out_data, N, K, E);
});
return out;
}
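// Reference only: the CPU equivalent of gather_csr_kernel for K == 1 (names
// and flat-pointer layout are illustrative assumptions). Each src row value
// is replicated across its CSR slice [indptr[row], indptr[row + 1]) of the
// output.
static void gather_csr_reference(const float *src, const int64_t *indptr,
                                 float *out, int64_t N /* rows */) {
  for (int64_t row = 0; row < N; row++)
    for (int64_t i = indptr[row]; i < indptr[row + 1]; i++)
      out[i] = src[row];
}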
@@ -4,12 +4,12 @@
 #include "cpu/scatter_cpu.h"
 #include "utils.h"
 
-#ifdef WITH_HIP
-#include "hip/scatter_hip.h"
+#ifdef WITH_CUDA
+#include "cuda/scatter_cuda.h"
 #endif
 
 #ifdef _WIN32
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__scatter_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__scatter_cpu(void) { return NULL; }
@@ -31,7 +31,7 @@ scatter_fw(torch::Tensor src, torch::Tensor index, int64_t dim,
            torch::optional<torch::Tensor> optional_out,
            torch::optional<int64_t> dim_size, std::string reduce) {
   if (src.device().is_cuda()) {
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
     return scatter_cuda(src, index, dim, optional_out, dim_size, reduce);
 #else
     AT_ERROR("Not compiled with CUDA support");
...
@@ -4,12 +4,12 @@
 #include "cpu/segment_coo_cpu.h"
 #include "utils.h"
 
-#ifdef WITH_HIP
-#include "hip/segment_coo_hip.h"
+#ifdef WITH_CUDA
+#include "cuda/segment_coo_cuda.h"
 #endif
 
 #ifdef _WIN32
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__segment_coo_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__segment_coo_cpu(void) { return NULL; }
@@ -21,7 +21,7 @@ segment_coo_fw(torch::Tensor src, torch::Tensor index,
                torch::optional<torch::Tensor> optional_out,
                torch::optional<int64_t> dim_size, std::string reduce) {
   if (src.device().is_cuda()) {
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
     return segment_coo_cuda(src, index, optional_out, dim_size, reduce);
 #else
     AT_ERROR("Not compiled with CUDA support");
@@ -34,7 +34,7 @@ segment_coo_fw(torch::Tensor src, torch::Tensor index,
 torch::Tensor gather_coo_fw(torch::Tensor src, torch::Tensor index,
                             torch::optional<torch::Tensor> optional_out) {
   if (src.device().is_cuda()) {
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
     return gather_coo_cuda(src, index, optional_out);
 #else
     AT_ERROR("Not compiled with CUDA support");
...
@@ -4,12 +4,12 @@
 #include "cpu/segment_csr_cpu.h"
 #include "utils.h"
 
-#ifdef WITH_HIP
-#include "hip/segment_csr_hip.h"
+#ifdef WITH_CUDA
+#include "cuda/segment_csr_cuda.h"
 #endif
 
 #ifdef _WIN32
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__segment_csr_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__segment_csr_cpu(void) { return NULL; }
@@ -21,7 +21,7 @@ segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
                torch::optional<torch::Tensor> optional_out,
                std::string reduce) {
   if (src.device().is_cuda()) {
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
     return segment_csr_cuda(src, indptr, optional_out, reduce);
 #else
     AT_ERROR("Not compiled with CUDA support");
@@ -34,7 +34,7 @@ segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
 torch::Tensor gather_csr_fw(torch::Tensor src, torch::Tensor indptr,
                             torch::optional<torch::Tensor> optional_out) {
   if (src.device().is_cuda()) {
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
     return gather_csr_cuda(src, indptr, optional_out);
 #else
     AT_ERROR("Not compiled with CUDA support");
...
 #include <Python.h>
 #include <torch/script.h>
 
-#ifdef WITH_HIP
-#include <hip/hip_runtime.h>
+#ifdef WITH_CUDA
+#include <cuda.h>
 #endif
 
 #ifdef _WIN32
-#ifdef WITH_HIP
+#ifdef WITH_CUDA
 PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
 #else
 PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
@@ -14,8 +14,8 @@ PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
 #endif
 
 int64_t cuda_version() {
-#ifdef WITH_HIP
-  return TORCH_HIP_VERSION;
+#ifdef WITH_CUDA
+  return CUDA_VERSION;
 #else
   return -1;
 #endif
...
SPHINXBUILD := sphinx-build
SPHINXPROJ := pytorch_scatter
SOURCEDIR := source
BUILDDIR := build
.PHONY: help Makefile
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)"
<!DOCTYPE html>
<html>
<head>
<title>Redirect</title>
<meta http-equiv="refresh" content="0; url=https://pytorch-scatter.readthedocs.io" />
</head>
</html>
numpy
https://download.pytorch.org/whl/cpu/torch-1.5.0%2Bcpu-cp37-cp37m-linux_x86_64.whl
sphinx>=3
sphinx_rtd_theme
\def\indices{{0, 0, 1, 0, 2, 2, 3, 3}}
\def\inputs{{5, 1, 7, 2, 3, 2, 1, 3}}
\def\outputs{{8, 7, 5, 4}}
\def\colors{{"cyan", "orange", "olive", "magenta"}}
\def\numberInputs{7}
\def\numberOutputs{3}
\def\operation{add}
\input{template}
#!/bin/bash
files=(add sub mul div mean max min std)
for name in "${files[@]}"; do
pdflatex "$name"
pdf2svg "$name.pdf" "$name.svg"
done
\def\indices{{0, 0, 1, 0, 2, 2, 3, 3}}
\def\inputs{{5, 1, 7, 2, 3, 2, 1, 3}}
\def\outputs{{"$\frac{1}{10}$", "$\frac{1}{7}$", "$\frac{1}{6}$", "$\frac{1}{3}$"}}
\def\colors{{"cyan", "orange", "olive", "magenta"}}
\def\numberInputs{7}
\def\numberOutputs{3}
\def\operation{div}
\input{template}
\def\indices{{0, 0, 1, 0, 2, 2, 3, 3}}
\def\inputs{{5, 1, 7, 2, 3, 2, 1, 3}}
\def\outputs{{5, 7, 3, 3}}
\def\colors{{"cyan", "orange", "olive", "magenta"}}
\def\numberInputs{7}
\def\numberOutputs{3}
\def\operation{max}
\input{template}