Commit c21cd909 authored by limm

delete hip files

parent fd09c4eb
# <div align="center"><strong>PyTorch Scatter</strong></div>
## Introduction
PyTorch Scatter is a small extension library for PyTorch that provides a set of highly optimized sparse-update (scatter and segment) operations missing from the main package. Scatter and segment operations can be roughly described as reduction operations based on a given "group index" tensor; segment operations require the "group index" tensor to be sorted, while scatter operations carry no such restriction. The PyTorch Scatter build in the DAS software stack not only guarantees that the component's core functionality is available on DCU accelerator cards, but is also deeply customized and optimized for the DCU hardware architecture, so developers can migrate applications to DCU accelerator cards and gain performance at very low cost. Currently adapted for PyTorch 1.13, PyTorch 2.1, PyTorch 2.4.1, and PyTorch 2.5.1.
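In one line, scatter-sum computes `out[index[i]] += src[i]`. A minimal illustrative sketch of these semantics using the stock libtorch `scatter_add_` (not this extension's fused kernels; assumes a working libtorch setup):
```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // out[index[i]] += src[i]: elements sharing an index are reduced together.
  auto src   = torch::tensor({1.0f, 2.0f, 3.0f, 4.0f});
  auto index = torch::tensor({0, 1, 0, 2}, torch::kLong);
  auto out   = torch::zeros(3).scatter_add_(0, index, src);
  std::cout << out << std::endl; // prints 4, 2, 4
}
```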
## Installation
### Installing via pip
pytorch-scatter whl package download directory: [光合开发者社区](https://das.sourcefind.cn:55011/portal/#/home)
```shell
pip install torch_scatter*  # the torch_scatter whl package you downloaded
```
```shell
pip install -r requirements.txt
```
- Download dtk25.04 from the [光合开发者社区](https://das.sourcefind.cn:55011/portal/#/home) homepage, extract it to /opt/, and create a symbolic link:
```shell
cd /opt && ln -s dtk-25.04 dtk
source /opt/dtk/env.sh
```
- Install pytorch. pytorch whl package download directory: [光合开发者社区](https://das.sourcefind.cn:55011/portal/#/home). Download the pytorch whl package that matches your python and dtk versions, then install it:
```shell
pip install torch*  # the torch whl package you downloaded
```
- Install fastpt. fastpt whl package download directory: [光合开发者社区](https://das.sourcefind.cn:55011/portal/#/home). Download the fastpt whl package that matches your fastpt and dtk versions, then install it:
```shell
pip install fastpt*  # the fastpt whl package you downloaded
```
#### Building and installing from source
```shell
git clone -b 2.1.0-fastpt http://developer.hpccube.com/codes/aicomponent/torch-scatter.git
export FORCE_CUDA=1
source /usr/local/bin/fastpt -C
cd torch-scatter
python setup.py bdist_wheel
pip install dist/*.whl
```
__device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
                                                const at::Half var,
                                                const unsigned int delta) {
return __shfl_down_sync(mask, var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __shfl_up(const at::Half var, const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var, const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
#endif
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#define ATOMIC(NAME) \
template <typename scalar, size_t size> struct Atomic##NAME##IntegerImpl; \
\
template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 1> { \
inline __device__ void operator()(scalar *address, scalar val) { \
uint32_t *address_as_ui = (uint32_t *)(address - ((size_t)address & 3)); \
uint32_t old = *address_as_ui; \
uint32_t shift = ((size_t)address & 3) * 8; \
uint32_t sum; \
uint32_t assumed; \
\
do { \
assumed = old; \
sum = OP(val, scalar((old >> shift) & 0xff)); \
old = (old & ~(0x000000ff << shift)) | (sum << shift); \
old = atomicCAS(address_as_ui, assumed, old); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 2> { \
inline __device__ void operator()(scalar *address, scalar val) { \
uint32_t *address_as_ui = \
(uint32_t *)((char *)address - ((size_t)address & 2)); \
uint32_t old = *address_as_ui; \
uint32_t sum; \
uint32_t newval; \
uint32_t assumed; \
\
do { \
assumed = old; \
sum = OP(val, (size_t)address & 2 ? scalar(old >> 16) \
: scalar(old & 0xffff)); \
newval = (size_t)address & 2 ? (old & 0xffff) | (sum << 16) \
: (old & 0xffff0000) | sum; \
old = atomicCAS(address_as_ui, assumed, newval); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 4> { \
inline __device__ void operator()(scalar *address, scalar val) { \
uint32_t *address_as_ui = (uint32_t *)address; \
uint32_t old = *address_as_ui; \
uint32_t assumed; \
\
do { \
assumed = old; \
old = atomicCAS(address_as_ui, assumed, OP(val, (scalar)old)); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##IntegerImpl<scalar, 8> { \
inline __device__ void operator()(scalar *address, scalar val) { \
unsigned long long *address_as_ull = (unsigned long long *)address; \
unsigned long long old = *address_as_ull; \
unsigned long long assumed; \
\
do { \
assumed = old; \
old = atomicCAS(address_as_ull, assumed, OP(val, (scalar)old)); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar, size_t size> struct Atomic##NAME##DecimalImpl; \
\
template <> struct Atomic##NAME##DecimalImpl<at::Half, 2> { \
inline __device__ void operator()(at::Half *address, at::Half val) { \
unsigned int *address_as_ui = \
(unsigned int *)((char *)address - ((size_t)address & 2)); \
unsigned int old = *address_as_ui; \
unsigned int assumed; \
\
do { \
assumed = old; \
at::Half hsum; \
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); \
hsum = OP(hsum, val); \
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) \
: (old & 0xffff0000) | hsum.x; \
old = atomicCAS(address_as_ui, assumed, old); \
} while (assumed != old); \
} \
}; \
\
template <> struct Atomic##NAME##DecimalImpl<at::BFloat16, 2> { \
inline __device__ void operator()(at::BFloat16 *address, at::BFloat16 val){\
unsigned int *address_as_ui = \
(unsigned int *)((char *)address - ((size_t)address & 2)); \
unsigned int old = *address_as_ui; \
unsigned int assumed; \
\
do { \
assumed = old; \
at::BFloat16 hsum; \
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); \
hsum = OP(hsum, val); \
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) \
: (old & 0xffff0000) | hsum.x; \
old = atomicCAS(address_as_ui, assumed, old); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 4> { \
inline __device__ void operator()(scalar *address, scalar val) { \
int *address_as_i = (int *)address; \
int old = *address_as_i; \
int assumed; \
\
do { \
assumed = old; \
old = atomicCAS(address_as_i, assumed, \
__float_as_int(OP(val, __int_as_float(assumed)))); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 8> { \
inline __device__ void operator()(scalar *address, scalar val) { \
unsigned long long int *address_as_ull = \
(unsigned long long int *)address; \
unsigned long long int old = *address_as_ull; \
unsigned long long int assumed; \
\
do { \
assumed = old; \
old = atomicCAS( \
address_as_ull, assumed, \
__double_as_longlong(OP(val, __longlong_as_double(assumed)))); \
} while (assumed != old); \
} \
};
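// Illustrative walk-through (hypothetical values) of the CAS pattern the
// macro expands to: for a 1-byte atomicAdd, the aligned 32-bit word holding
// the byte is loaded, the reduced byte is spliced back in at its shift, and
// atomicCAS re-submits the word until no other thread has modified it in
// between (i.e. until assumed == old).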
#define OP(X, Y) Y + X
ATOMIC(Add)
#undef OP
static inline __device__ void atomAdd(uint8_t *address, uint8_t val) {
AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomAdd(int8_t *address, int8_t val) {
AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomAdd(int16_t *address, int16_t val) {
AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomAdd(int32_t *address, int32_t val) {
atomicAdd(address, val);
}
static inline __device__ void atomAdd(int64_t *address, int64_t val) {
AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
#if defined(USE_ROCM) || (defined(__DTK_ARCH__) && (__DTK_ARCH__ < 700 || DTK_VERSION < 10000))
static inline __device__ void atomAdd(at::Half *address, at::Half val) {
AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
#else
static inline __device__ void atomAdd(at::Half *address, at::Half val) {
atomicAdd(reinterpret_cast<__half *>(address), val);
}
#endif
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
#if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 600 || DTK_VERSION < 8000)
static inline __device__ void atomAdd(double *address, double val) {
AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);
}
#else
static inline __device__ void atomAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif
static inline __device__ void atomAdd(at::BFloat16 *address, at::BFloat16 val) {
AtomicAddDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) Y *X
ATOMIC(Mul)
#undef OP
static inline __device__ void atomMul(uint8_t *address, uint8_t val) {
AtomicMulIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomMul(int8_t *address, int8_t val) {
AtomicMulIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomMul(int16_t *address, int16_t val) {
AtomicMulIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomMul(int32_t *address, int32_t val) {
AtomicMulIntegerImpl<int32_t, sizeof(int32_t)>()(address, val);
}
static inline __device__ void atomMul(int64_t *address, int64_t val) {
AtomicMulIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
static inline __device__ void atomMul(float *address, float val) {
AtomicMulDecimalImpl<float, sizeof(float)>()(address, val);
}
static inline __device__ void atomMul(at::Half *address, at::Half val) {
AtomicMulDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
static inline __device__ void atomMul(double *address, double val) {
AtomicMulDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMul(at::BFloat16 *address, at::BFloat16 val) {
AtomicMulDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) Y / X
ATOMIC(Div)
#undef OP
static inline __device__ void atomDiv(uint8_t *address, uint8_t val) {
AtomicDivIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomDiv(int8_t *address, int8_t val) {
AtomicDivIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomDiv(int16_t *address, int16_t val) {
AtomicDivIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomDiv(int32_t *address, int32_t val) {
AtomicDivIntegerImpl<int32_t, sizeof(int32_t)>()(address, val);
}
static inline __device__ void atomDiv(int64_t *address, int64_t val) {
AtomicDivIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
static inline __device__ void atomDiv(at::Half *address, at::Half val) {
AtomicDivDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
static inline __device__ void atomDiv(float *address, float val) {
AtomicDivDecimalImpl<float, sizeof(float)>()(address, val);
}
static inline __device__ void atomDiv(double *address, double val) {
AtomicDivDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomDiv(at::BFloat16 *address, at::BFloat16 val) {
AtomicDivDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) max(Y, X)
ATOMIC(Max)
#undef OP
static inline __device__ void atomMax(uint8_t *address, uint8_t val) {
AtomicMaxIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomMax(int8_t *address, int8_t val) {
AtomicMaxIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomMax(int16_t *address, int16_t val) {
AtomicMaxIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomMax(int32_t *address, int32_t val) {
atomicMax(address, val);
}
static inline __device__ void atomMax(int64_t *address, int64_t val) {
AtomicMaxIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
static inline __device__ void atomMax(at::Half *address, at::Half val) {
AtomicMaxDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
static inline __device__ void atomMax(float *address, float val) {
AtomicMaxDecimalImpl<float, sizeof(float)>()(address, val);
}
static inline __device__ void atomMax(double *address, double val) {
AtomicMaxDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMax(at::BFloat16 *address, at::BFloat16 val) {
AtomicMaxDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) min(Y, X)
ATOMIC(Min)
#undef OP
static inline __device__ void atomMin(uint8_t *address, uint8_t val) {
AtomicMinIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}
static inline __device__ void atomMin(int8_t *address, int8_t val) {
AtomicMinIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}
static inline __device__ void atomMin(int16_t *address, int16_t val) {
AtomicMinIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}
static inline __device__ void atomMin(int32_t *address, int32_t val) {
atomicMin(address, val);
}
static inline __device__ void atomMin(int64_t *address, int64_t val) {
AtomicMinIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
static inline __device__ void atomMin(at::Half *address, at::Half val) {
AtomicMinDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
static inline __device__ void atomMin(float *address, float val) {
AtomicMinDecimalImpl<float, sizeof(float)>()(address, val);
}
static inline __device__ void atomMin(double *address, double val) {
AtomicMinDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMin(at::BFloat16 *address, at::BFloat16 val) {
AtomicMinDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <ATen/hip/detail/TensorInfo.cuh>
// We need our own `IndexToOffset` implementation since we do not want to
// access the last element of the `indexptr`.
template <typename scalar_t> struct IndexPtrToOffset {
static inline __host__ __device__ int
get(int idx, const at::cuda::detail::TensorInfo<scalar_t, int> &info) {
int offset = idx % (info.sizes[info.dims - 1] - 1);
offset *= info.strides[info.dims - 1];
idx /= info.sizes[info.dims - 1] - 1;
for (int i = info.dims - 2; i >= 0; --i) {
offset += (idx % info.sizes[i]) * info.strides[i];
idx /= info.sizes[i];
}
return offset;
}
};
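// Example (hypothetical): for a contiguous 2 x 4 `indexptr` (two rows of
// boundaries, i.e. three segments each), indices 0..5 map to offsets
// 0, 1, 2, 4, 5, 6 -- the trailing boundary entries at offsets 3 and 7 are
// never addressed, which is exactly why `sizes - 1` is used above.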
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <limits>
#include <map>
#include "../hip/atomics.cuh"
enum ReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };
const std::map<std::string, ReductionType> reduce2REDUCE = {
{"sum", SUM}, {"mean", MEAN}, {"mul", MUL},
{"div", DIV}, {"min", MIN}, {"max", MAX},
};
#define AT_DISPATCH_REDUCTION_TYPES(reduce, ...) \
[&] { \
switch (reduce2REDUCE.at(reduce)) { \
case SUM: { \
static constexpr ReductionType REDUCE = SUM; \
return __VA_ARGS__(); \
} \
case MEAN: { \
static constexpr ReductionType REDUCE = MEAN; \
return __VA_ARGS__(); \
} \
case MUL: { \
static constexpr ReductionType REDUCE = MUL; \
return __VA_ARGS__(); \
} \
case DIV: { \
static constexpr ReductionType REDUCE = DIV; \
return __VA_ARGS__(); \
} \
case MIN: { \
static constexpr ReductionType REDUCE = MIN; \
return __VA_ARGS__(); \
} \
case MAX: { \
static constexpr ReductionType REDUCE = MAX; \
return __VA_ARGS__(); \
} \
} \
}()
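// Usage sketch (illustrative, mirroring the call sites below): the runtime
// string `reduce` becomes the compile-time constant REDUCE inside the lambda:
//   AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
//     out.fill_(Reducer<scalar_t, REDUCE>::init());
//   });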
template <typename scalar_t, ReductionType REDUCE> struct Reducer {
static inline __host__ __device__ scalar_t init() {
if (REDUCE == MUL || REDUCE == DIV)
return (scalar_t)1;
else if (REDUCE == MIN)
return std::numeric_limits<scalar_t>::max();
else if (REDUCE == MAX)
return std::numeric_limits<scalar_t>::lowest();
else
return (scalar_t)0;
}
static inline __host__ __device__ void update(scalar_t *val,
scalar_t new_val) {
if (REDUCE == SUM || REDUCE == MEAN)
*val = *val + new_val;
else if (REDUCE == MUL)
*val = *val * new_val;
else if (REDUCE == DIV)
*val = *val / new_val;
else if ((REDUCE == MIN && new_val < *val) ||
(REDUCE == MAX && new_val > *val)) {
*val = new_val;
}
}
static inline __host__ __device__ void update(scalar_t *val, scalar_t new_val,
int64_t *arg, int64_t new_arg) {
if (REDUCE == SUM || REDUCE == MEAN)
*val = *val + new_val;
else if (REDUCE == MUL)
*val = *val * new_val;
else if (REDUCE == DIV)
*val = *val / new_val;
else if ((REDUCE == MIN && new_val < *val) ||
(REDUCE == MAX && new_val > *val)) {
*val = new_val;
*arg = new_arg;
}
}
static inline __host__ __device__ void write(scalar_t *address, scalar_t val,
int64_t *arg_address,
int64_t arg, int count) {
if (REDUCE == SUM || REDUCE == MUL || REDUCE == DIV)
*address = val;
else if (REDUCE == MEAN)
*address = val / (scalar_t)(count > 0 ? count : 1);
else if (REDUCE == MIN || REDUCE == MAX) {
if (count > 0) {
*address = val;
*arg_address = arg;
} else
*address = (scalar_t)0;
}
}
static inline __device__ void atomic_write(scalar_t *address, scalar_t val) {
if (REDUCE == SUM || REDUCE == MEAN)
atomAdd(address, val);
else if (REDUCE == MUL)
atomMul(address, val);
else if (REDUCE == DIV)
atomDiv(address, val);
else if (REDUCE == MIN)
atomMin(address, val);
else if (REDUCE == MAX)
atomMax(address, val);
}
};
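// Walk-through (hypothetical): with REDUCE == MEAN over the segment
// {3, 5, 7}, init() yields 0, three update() calls accumulate 15, and
// write() stores 15 / 3 = 5; for MIN/MAX, update() additionally tracks the
// winning argument index in `arg` and write() emits it to `arg_address`.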
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "../hip/scatter_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 256
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, ReductionType REDUCE>
__global__ void
scatter_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int E, int K, int N, int numel) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = thread_idx / (E * K);
int k = thread_idx % K;
if (thread_idx < numel) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
thread_idx, index_info);
int64_t idx = index_info.data[offset];
Reducer<scalar_t, REDUCE>::atomic_write(out_data + b * N * K + idx * K + k,
src_data[thread_idx]);
}
}
template <typename scalar_t>
__global__ void
scatter_arg_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
const scalar_t *out_data, int64_t *arg_out_data, int E,
int K, int N, int numel) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int b = thread_idx / (E * K);
int e = (thread_idx / K) % E;
int k = thread_idx % K;
if (thread_idx < numel) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
thread_idx, index_info);
int64_t idx = index_info.data[offset];
if (src_data[thread_idx] == out_data[b * N * K + idx * K + k]) {
arg_out_data[b * N * K + idx * K + k] = e;
}
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() == index.dim());
for (auto i = 0; i < index.dim() - 1; i++)
CHECK_INPUT(src.size(i) >= index.size(i));
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
auto sizes = src.sizes().vec();
if (dim_size.has_value())
sizes[dim] = dim_size.value();
else if (index.numel() == 0)
sizes[dim] = 0;
else {
sizes[dim] = 1 + index.max().cpu().data_ptr<int64_t>()[0];
}
out = torch::empty(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full_like(out, src.size(dim), index.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return std::make_tuple(out, arg_out);
}
auto B = 1;
for (auto i = 0; i < dim; i++)
B *= src.size(i);
auto E = src.size(dim);
auto K = src.numel() / (B * E);
auto N = out.size(dim);
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (!optional_out.has_value())
out.fill_(Reducer<scalar_t, REDUCE>::init());
hipLaunchKernelGGL(( scatter_kernel<scalar_t, REDUCE>)
, dim3(BLOCKS(src.numel())), dim3(THREADS), 0, stream,
src_data, index_info, out_data, E, K, N, src.numel());
if (!optional_out.has_value() && (REDUCE == MIN || REDUCE == MAX))
out.masked_fill_(out == Reducer<scalar_t, REDUCE>::init(), (scalar_t)0);
if (REDUCE == MIN || REDUCE == MAX)
hipLaunchKernelGGL(( scatter_arg_kernel<scalar_t>)
, dim3(BLOCKS(src.numel())), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, K, N,
src.numel());
});
});
return std::make_tuple(out, arg_out);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce);
torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "../hip/segment_coo_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
#define FULL_MASK 0xffffffff
template <typename scalar_t, ReductionType REDUCE, bool HAS_VAL>
__global__ void
segment_coo_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t N) {
// Each thread processes exactly one entry. Within a warp, we perform a
// parallel reduction across equal indices, and write the intermediate
// result via atomics.
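// Example (hypothetical, D = 8): for the sorted warp-local run
// idx = [0,0,0,1,1,2,2,2], the log-step scan below leaves each lane holding
// the reduction over equal indices to its left, and only the last lane of
// each run (lanes 2, 4 and 7) issues an atomic write.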
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
int lane_idx = row_idx & (32 - 1);
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int64_t idx = index_info.data[offset], next_idx;
int out_idx = (row_idx / D) * N + idx;
scalar_t val = HAS_VAL ? src_data[row_idx] : (scalar_t)1, tmp;
#pragma unroll
for (int i = 1; i < 32; i *= 2) {
// Parallel reduction inside a single warp.
tmp = SHFL_UP_SYNC(FULL_MASK, val, i);
next_idx = SHFL_UP_SYNC(FULL_MASK, idx, i);
if (lane_idx >= i && row_idx / D == (row_idx - i) / D) {
assert(idx >= next_idx);
if (idx == next_idx)
Reducer<scalar_t, REDUCE>::update(&val, tmp);
}
}
next_idx = SHFL_DOWN_SYNC(FULL_MASK, idx, 1);
if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D ||
idx != next_idx)
Reducer<scalar_t, REDUCE>::atomic_write(out_data + out_idx, val);
}
}
template <typename scalar_t>
__global__ void segment_coo_arg_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int64_t *arg_out_data, size_t E, size_t N) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int64_t idx = index_info.data[offset];
int out_idx = (row_idx / D) * N + idx;
scalar_t val = __ldg(out_data + out_idx);
if (src_data[row_idx] == val)
arg_out_data[out_idx] = row_idx % D;
}
}
template <typename scalar_t, ReductionType REDUCE, int TB>
__global__ void segment_coo_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t K, size_t N) {
// Each thread processes a single column and `TB` index entries. Coalesced
// read and write is performed in column-major order. The intermediate
// results are written via atomics.
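// Example (hypothetical, TB = 4): each thread scans four consecutive index
// entries of one row; runs of equal indices are reduced in registers, and
// only run boundaries fall back to an atomic write.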
int D = index_info.sizes[index_info.dims - 1];
int E_1 = E / D;
int E_2 = (D - 1) + TB - ((D - 1) % TB);
int row_idx = blockIdx.x * blockDim.y + threadIdx.y;
int col_idx = blockIdx.y * blockDim.x + threadIdx.x;
int dim_start = (row_idx * TB) / E_2;
int row_start = (row_idx * TB) % E_2;
if (dim_start < E_1 && col_idx < K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
dim_start * D + row_start, index_info);
int idx1 = __ldg(index_info.data + offset), idx2;
scalar_t val = src_data[K * (dim_start * D + row_start) + col_idx];
#pragma unroll
for (int i = 1; i < TB; i++) {
if (row_start + i >= D)
break;
idx2 = __ldg(index_info.data + offset +
i * index_info.strides[index_info.dims - 1]);
assert(idx1 <= idx2);
if (idx1 == idx2) {
Reducer<scalar_t, REDUCE>::update(
&val, src_data[K * (dim_start * D + row_start + i) + col_idx]);
} else {
Reducer<scalar_t, REDUCE>::atomic_write(
out_data + (dim_start * N + idx1) * K + col_idx, val);
val = src_data[K * (dim_start * D + row_start + i) + col_idx];
}
idx1 = idx2;
}
Reducer<scalar_t, REDUCE>::atomic_write(
out_data + (dim_start * N + idx1) * K + col_idx, val);
}
}
template <typename scalar_t>
__global__ void segment_coo_arg_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, int64_t *arg_out_data, size_t E, size_t K, size_t N) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int col_idx = thread_idx % K;
int D = index_info.sizes[index_info.dims - 1];
if (row_idx < E && col_idx < K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int idx = __ldg(index_info.data + offset);
int out_idx = ((row_idx / D) * N + idx) * K + col_idx;
scalar_t val = __ldg(out_data + out_idx);
if (src_data[thread_idx] == val)
arg_out_data[out_idx] = row_idx % D;
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
auto sizes = index.sizes().vec();
for (int i = 0; i < index.dim(); i++) {
sizes[i] = src.size(i);
}
index = index.expand(sizes);
auto dim = index.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (int i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
sizes = src.sizes().vec();
if (dim_size.has_value())
sizes[dim] = dim_size.value();
else if (index.numel() == 0)
sizes[dim] = 0;
else {
auto tmp = index.select(dim, index.size(dim) - 1);
tmp = tmp.numel() > 1 ? tmp.max() : tmp;
sizes[dim] = 1 + tmp.cpu().data_ptr<int64_t>()[0];
}
out = torch::zeros(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full_like(out, src.size(dim), index.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
} else if (reduce2REDUCE.at(reduce) == MEAN) {
auto sizes = index.sizes().vec();
sizes[dim] = out.size(dim);
arg_out = torch::zeros(sizes, out.options());
}
if (index.numel() == 0)
return std::make_tuple(out, arg_out);
auto E = index.numel();
auto E_2 = index.size(dim);
auto E_1 = index.numel() / E_2;
auto K = src.numel() / E;
auto N = out.size(dim);
auto avg_len = (float)E_2 / (float)N;
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (!optional_out.has_value())
out.fill_(Reducer<scalar_t, REDUCE>::init());
if (K == 1)
hipLaunchKernelGGL(( segment_coo_kernel<scalar_t, REDUCE, true>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream, src_data, index_info,
out_data, E, N);
else if (avg_len <= 8)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 4>)
, dim3(dim3((E_1 * ((E_2 + 3) / 4) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else if (avg_len <= 16)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 8>)
, dim3(dim3((E_1 * ((E_2 + 7) / 8) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else if (avg_len <= 32)
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 16>)
, dim3(dim3((E_1 * ((E_2 + 15) / 16) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
else
hipLaunchKernelGGL(( segment_coo_broadcast_kernel<scalar_t, REDUCE, 32>)
, dim3(dim3((E_1 * ((E_2 + 31) / 32) + 7) / 8, (K + 31) / 32)),
dim3(dim3(32, 8)), 0, stream, src_data, index_info, out_data, E, K,
N);
if (!optional_out.has_value() && (REDUCE == MIN || REDUCE == MAX))
out.masked_fill_(out == Reducer<scalar_t, REDUCE>::init(), (scalar_t)0);
if (REDUCE == MIN || REDUCE == MAX) {
if (K == 1)
hipLaunchKernelGGL(( segment_coo_arg_kernel<scalar_t>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, N);
else
hipLaunchKernelGGL(( segment_coo_arg_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, E * K)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, arg_out_data, E, K, N);
}
if (REDUCE == MEAN) {
auto count_data = arg_out.value().data_ptr<scalar_t>();
hipLaunchKernelGGL(( segment_coo_kernel<scalar_t, SUM, false>)
, dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream, nullptr, index_info,
count_data, E, N);
arg_out.value().masked_fill_(arg_out.value() < (scalar_t)1,
(scalar_t)1);
auto count = arg_out.value();
for (int i = dim + 1; i < out.dim(); i++)
count = count.unsqueeze(-1);
if (out.is_floating_point())
out.true_divide_(count);
else
out.div_(count, "floor");
}
});
});
return std::make_tuple(out, arg_out);
}
template <typename scalar_t>
__global__ void
gather_coo_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t N) {
int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (row_idx < E) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int row = index_info.data[offset];
offset = (row_idx / index_info.sizes[index_info.dims - 1]) * N;
scalar_t val = __ldg(src_data + offset + row);
out_data[row_idx] = val;
}
}
template <typename scalar_t>
__global__ void gather_coo_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> index_info,
scalar_t *out_data, size_t E, size_t K, size_t N) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int col_idx = thread_idx % K;
if (thread_idx < E * K) {
int offset = at::cuda::detail::IndexToOffset<int64_t, int, -1>::get(
row_idx, index_info);
int row = index_info.data[offset];
offset = (row_idx / index_info.sizes[index_info.dims - 1]) * N * K;
scalar_t val = __ldg(src_data + offset + K * row + col_idx);
out_data[thread_idx] = val;
}
}
torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out) {
CHECK_CUDA(src);
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
auto sizes = index.sizes().vec();
for (auto i = 0; i < index.dim() - 1; i++)
sizes[i] = src.size(i);
index = index.expand(sizes);
auto dim = index.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < src.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
CHECK_INPUT(index.size(dim) == out.size(dim));
} else {
auto sizes = src.sizes().vec();
sizes[dim] = index.size(dim);
out = torch::empty(sizes, src.options());
}
if (index.numel() == 0)
return out;
auto E = index.numel();
auto K = out.numel() / E;
auto N = src.size(dim);
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
if (K == 1)
hipLaunchKernelGGL(( gather_coo_kernel<scalar_t>), dim3(BLOCKS(1, E)), dim3(THREADS), 0, stream,
src_data, index_info, out_data, E, N);
else
hipLaunchKernelGGL(( gather_coo_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, E * K)), dim3(THREADS), 0, stream, src_data, index_info,
out_data, E, K, N);
});
return out;
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out,
std::string reduce);
torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "../hip/segment_csr_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "../hip/index_info.cuh"
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
#define FULL_MASK 0xffffffff
template <typename scalar_t, ReductionType REDUCE, int TB>
__global__ void
segment_csr_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, int64_t *arg_out_data, size_t N,
size_t E) {
// Each warp processes exactly `32/TB` rows and aggregates all row values
// via a parallel reduction.
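// Example (hypothetical, TB = 4): lanes 0-3 cooperate on one row, each
// striding through the row's values in steps of TB; the SHFL_DOWN_SYNC tree
// below then folds the four partials into lane 0 before the single write.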
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / TB;
int lane_idx = thread_idx & (TB - 1);
if (row_idx < N) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int64_t row_start = __ldg(indptr_info.data + offset);
int64_t row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = Reducer<scalar_t, REDUCE>::init();
int64_t arg, arg_tmp;
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
for (int64_t src_idx = row_start + lane_idx; src_idx < row_end;
src_idx += TB) {
Reducer<scalar_t, REDUCE>::update(&val, src_data[offset + src_idx], &arg,
src_idx);
}
#pragma unroll
for (int i = TB / 2; i > 0; i /= 2) {
// Parallel reduction inside a single warp.
if (REDUCE == MIN || REDUCE == MAX)
arg_tmp = SHFL_DOWN_SYNC(FULL_MASK, arg, i);
Reducer<scalar_t, REDUCE>::update(
&val, SHFL_DOWN_SYNC(FULL_MASK, val, i), &arg, arg_tmp);
}
if (lane_idx == 0) {
Reducer<scalar_t, REDUCE>::write(out_data + row_idx, val,
arg_out_data + row_idx, arg,
row_end - row_start);
}
}
}
template <typename scalar_t, ReductionType REDUCE>
__global__ void segment_csr_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, int64_t *arg_out_data, size_t N, size_t K, size_t E) {
// Each thread processes exactly one row. It turned out that is more
// efficient than using shared memory due to avoiding synchronization
// barriers.
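// Example (hypothetical, K = 16): threads 0..15 each own one column of the
// same row and reduce its K-strided values serially, so no shuffles or
// shared-memory synchronization are required.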
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int lane_idx = thread_idx % K;
if (thread_idx < N * K) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int64_t row_start = __ldg(indptr_info.data + offset);
int64_t row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = Reducer<scalar_t, REDUCE>::init();
int64_t arg;
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;
for (int64_t src_idx = row_start; src_idx < row_end; src_idx++) {
Reducer<scalar_t, REDUCE>::update(
&val, src_data[offset + K * src_idx + lane_idx], &arg, src_idx);
}
Reducer<scalar_t, REDUCE>::write(out_data + thread_idx, val,
arg_out_data + thread_idx, arg,
row_end - row_start);
}
}
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out,
std::string reduce) {
CHECK_CUDA(src);
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
auto sizes = indptr.sizes().vec();
for (auto i = 0; i < indptr.dim() - 1; i++)
sizes[i] = src.size(i);
indptr = indptr.expand(sizes);
auto dim = indptr.dim() - 1;
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (int i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
CHECK_INPUT(src.numel() == 0 || out.size(dim) == indptr.size(dim) - 1);
} else {
sizes = src.sizes().vec();
sizes[dim] = std::max<int64_t>(indptr.size(dim) - 1, 0);
out = torch::empty(sizes, src.options());
}
torch::optional<torch::Tensor> arg_out = torch::nullopt;
int64_t *arg_out_data = nullptr;
if (reduce2REDUCE.at(reduce) == MIN || reduce2REDUCE.at(reduce) == MAX) {
arg_out = torch::full(out.sizes(), src.size(dim), indptr.options());
arg_out_data = arg_out.value().data_ptr<int64_t>();
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return std::make_tuple(out, arg_out);
}
auto N = out.size(dim) * (indptr.numel() / indptr.size(-1));
auto K = out.numel() / N;
auto E = src.size(dim);
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
AT_DISPATCH_REDUCTION_TYPES(reduce, [&] {
if (K == 1) {
hipLaunchKernelGGL(( segment_csr_kernel<scalar_t, REDUCE, 1>)
, dim3(BLOCKS(32, N)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, arg_out_data, N, E);
} else {
hipLaunchKernelGGL(( segment_csr_broadcast_kernel<scalar_t, REDUCE>)
, dim3(BLOCKS(1, N * K)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, arg_out_data, N, K, E);
}
});
});
return std::make_tuple(out, arg_out);
}
template <typename scalar_t, int TB>
__global__ void
gather_csr_kernel(const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, size_t N, size_t E) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / TB;
int lane_idx = thread_idx % TB;
if (row_idx < N) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int row_start = __ldg(indptr_info.data + offset);
int row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = __ldg(src_data + row_idx);
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E;
for (int out_idx = row_start + lane_idx; out_idx < row_end; out_idx += TB) {
out_data[offset + out_idx] = val; // "Mostly" coalesced.
}
}
}
template <typename scalar_t>
__global__ void gather_csr_broadcast_kernel(
const scalar_t *src_data,
const at::cuda::detail::TensorInfo<int64_t, int> indptr_info,
scalar_t *out_data, size_t N, size_t K, size_t E) {
int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
int row_idx = thread_idx / K;
int lane_idx = thread_idx % K;
if (thread_idx < N * K) {
int offset = IndexPtrToOffset<int64_t>::get(row_idx, indptr_info);
int row_start = __ldg(indptr_info.data + offset);
int row_end = __ldg(indptr_info.data + offset +
indptr_info.strides[indptr_info.dims - 1]);
scalar_t val = src_data[thread_idx]; // Coalesced.
offset = (row_idx / (indptr_info.sizes[indptr_info.dims - 1] - 1)) * E * K;
for (int out_idx = row_start; out_idx < row_end; out_idx++) {
out_data[offset + K * out_idx + lane_idx] = val; // "Mostly" coalesced.
}
}
}
torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
CHECK_CUDA(src);
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
auto sizes = indptr.sizes().vec();
for (auto i = 0; i < indptr.dim() - 1; i++)
sizes[i] = src.size(i);
indptr = indptr.expand(sizes);
auto dim = indptr.dim() - 1;
CHECK_INPUT(src.size(dim) == 0 || src.size(dim) == indptr.size(dim) - 1);
src = src.contiguous();
torch::Tensor out;
if (optional_out.has_value()) {
out = optional_out.value().contiguous();
for (auto i = 0; i < out.dim(); i++)
if (i != dim)
CHECK_INPUT(src.size(i) == out.size(i));
} else {
auto sizes = src.sizes().vec();
if (src.numel() > 0) {
sizes[dim] = indptr.flatten()[-1].cpu().data_ptr<int64_t>()[0];
} else {
sizes[dim] = 0;
}
out = torch::empty(sizes, src.options());
}
if (src.numel() == 0) {
if (!optional_out.has_value())
out.fill_(0);
return out;
}
auto N = src.size(dim) * (indptr.numel() / indptr.size(-1));
auto K = src.numel() / N;
auto E = out.size(dim);
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
if (K == 1)
hipLaunchKernelGGL(( gather_csr_kernel<scalar_t, 4>), dim3(BLOCKS(1, 4 * N)), dim3(THREADS), 0, stream,
src_data, indptr_info, out_data, N, E);
else
hipLaunchKernelGGL(( gather_csr_broadcast_kernel<scalar_t>)
, dim3(BLOCKS(1, N * K)), dim3(THREADS), 0, stream, src_data, indptr_info,
out_data, N, K, E);
});
return out;
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
__device__ __inline__ at::Half __shfl_up_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_up_sync(mask, var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_down_sync(mask, var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_up(const at::Half var,
const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var,
const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
}
#define SHFL_UP_SYNC(mask, var, delta) __shfl_up(var, delta)
#define SHFL_DOWN_SYNC(mask, var, delta) __shfl_down(var, delta)
#else
#define SHFL_UP_SYNC __shfl_up_sync
#define SHFL_DOWN_SYNC __shfl_down_sync
#endif
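// Usage sketch (illustrative): the macros above let warp reductions be
// written once for both backends, e.g.
//   for (int i = 16; i > 0; i /= 2)
//     val += SHFL_DOWN_SYNC(0xffffffff, val, i); // lane 0 ends with the sum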
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/scatter_cpu.h"
#include "macros.h"
#include "utils.h"
#ifdef WITH_CUDA
#include "hip/scatter_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__scatter_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__scatter_cpu(void) { return NULL; }
#endif
#endif
#endif
torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) {
if (src.dim() == 1)
for (auto i = 0; i < dim; i++)
src = src.unsqueeze(0);
for (auto i = src.dim(); i < other.dim(); i++)
src = src.unsqueeze(-1);
src = src.expand(other.sizes().vec());
return src;
}
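// Shape walk-through (hypothetical): src [E], other [B, E, K], dim = 1:
// unsqueeze(0) -> [1, E]; unsqueeze(-1) -> [1, E, 1]; expand -> [B, E, K].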
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_fw(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
if (src.device().is_cuda()) {
#ifdef WITH_CUDA
return scatter_cuda(src, index, dim, optional_out, dim_size, reduce);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return scatter_cpu(src, index, dim, optional_out, dim_size, reduce);
}
}
using torch::autograd::AutogradContext;
using torch::autograd::Variable;
using torch::autograd::variable_list;
class ScatterSum : public torch::autograd::Function<ScatterSum> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index, int64_t dim,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
dim = dim < 0 ? src.dim() + dim : dim;
ctx->saved_data["dim"] = dim;
ctx->saved_data["src_shape"] = src.sizes();
index = broadcast(index, src, dim);
auto result = scatter_fw(src, index, dim, optional_out, dim_size, "sum");
auto out = std::get<0>(result);
ctx->save_for_backward({index});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto dim = ctx->saved_data["dim"].toInt();
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::gather(grad_out, dim, index, false);
return {grad_in, Variable(), Variable(), Variable(), Variable()};
}
};
class ScatterMul : public torch::autograd::Function<ScatterMul> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index, int64_t dim,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
dim = dim < 0 ? src.dim() + dim : dim;
ctx->saved_data["dim"] = dim;
ctx->saved_data["src_shape"] = src.sizes();
index = broadcast(index, src, dim);
auto result = scatter_fw(src, index, dim, optional_out, dim_size, "mul");
auto out = std::get<0>(result);
ctx->save_for_backward({src, index, out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto src = saved[0];
auto index = saved[1];
auto out = saved[2];
auto dim = ctx->saved_data["dim"].toInt();
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::gather(grad_out * out, dim, index, false).div_(src);
grad_in.masked_fill_(grad_in.isnan(), 0);
return {grad_in, Variable(), Variable(), Variable(), Variable()};
}
};
class ScatterMean : public torch::autograd::Function<ScatterMean> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index, int64_t dim,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
dim = dim < 0 ? src.dim() + dim : dim;
ctx->saved_data["dim"] = dim;
ctx->saved_data["src_shape"] = src.sizes();
auto old_index = index;
index = broadcast(index, src, dim);
auto result = scatter_fw(src, index, dim, optional_out, dim_size, "sum");
auto out = std::get<0>(result);
auto ones = torch::ones(old_index.sizes(), src.options());
result = scatter_fw(ones, old_index,
old_index.dim() <= dim ? old_index.dim() - 1 : dim,
torch::nullopt, out.size(dim), "sum");
auto count = std::get<0>(result);
count.masked_fill_(count < 1, 1);
count = broadcast(count, out, dim);
if (out.is_floating_point())
out.true_divide_(count);
else
out.div_(count, "floor");
ctx->save_for_backward({index, count});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto count = saved[1];
auto dim = ctx->saved_data["dim"].toInt();
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
count = torch::gather(count, dim, index, false);
auto grad_in = torch::gather(grad_out, dim, index, false);
grad_in.true_divide_(count);
return {grad_in, Variable(), Variable(), Variable(), Variable()};
}
};
class ScatterMin : public torch::autograd::Function<ScatterMin> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index, int64_t dim,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
dim = dim < 0 ? src.dim() + dim : dim;
ctx->saved_data["dim"] = dim;
ctx->saved_data["src_shape"] = src.sizes();
index = broadcast(index, src, dim);
auto result = scatter_fw(src, index, dim, optional_out, dim_size, "min");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({index, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto arg_out = saved[1];
auto dim = ctx->saved_data["dim"].toInt();
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
src_shape[dim] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(dim, arg_out, grad_out);
grad_in = grad_in.narrow(dim, 0, src_shape[dim] - 1);
return {grad_in, Variable(), Variable(), Variable(), Variable()};
}
};
class ScatterMax : public torch::autograd::Function<ScatterMax> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index, int64_t dim,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
dim = dim < 0 ? src.dim() + dim : dim;
ctx->saved_data["dim"] = dim;
ctx->saved_data["src_shape"] = src.sizes();
index = broadcast(index, src, dim);
auto result = scatter_fw(src, index, dim, optional_out, dim_size, "max");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({index, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto arg_out = saved[1];
auto dim = ctx->saved_data["dim"].toInt();
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
src_shape[dim] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(dim, arg_out, grad_out);
grad_in = grad_in.narrow(dim, 0, src_shape[dim] - 1);
return {grad_in, Variable(), Variable(), Variable(), Variable()};
}
};
SCATTER_API torch::Tensor
scatter_sum(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
return ScatterSum::apply(src, index, dim, optional_out, dim_size)[0];
}
SCATTER_API torch::Tensor
scatter_mul(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
return ScatterMul::apply(src, index, dim, optional_out, dim_size)[0];
}
SCATTER_API torch::Tensor
scatter_mean(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
return ScatterMean::apply(src, index, dim, optional_out, dim_size)[0];
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
scatter_min(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
auto result = ScatterMin::apply(src, index, dim, optional_out, dim_size);
return std::make_tuple(result[0], result[1]);
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
scatter_max(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
auto result = ScatterMax::apply(src, index, dim, optional_out, dim_size);
return std::make_tuple(result[0], result[1]);
}
static auto registry = torch::RegisterOperators()
.op("torch_scatter::scatter_sum", &scatter_sum)
.op("torch_scatter::scatter_mul", &scatter_mul)
.op("torch_scatter::scatter_mean", &scatter_mean)
.op("torch_scatter::scatter_min", &scatter_min)
.op("torch_scatter::scatter_max", &scatter_max);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/segment_coo_cpu.h"
#include "macros.h"
#include "utils.h"
#ifdef WITH_CUDA
#include "hip/segment_coo_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__segment_coo_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__segment_coo_cpu(void) { return NULL; }
#endif
#endif
#endif
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_fw(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce) {
if (src.device().is_cuda()) {
#ifdef WITH_CUDA
return segment_coo_cuda(src, index, optional_out, dim_size, reduce);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return segment_coo_cpu(src, index, optional_out, dim_size, reduce);
}
}
torch::Tensor gather_coo_fw(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out) {
if (src.device().is_cuda()) {
#ifdef WITH_CUDA
return gather_coo_cuda(src, index, optional_out);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return gather_coo_cpu(src, index, optional_out);
}
}
using torch::autograd::AutogradContext;
using torch::autograd::Variable;
using torch::autograd::variable_list;
class SegmentSumCOO : public torch::autograd::Function<SegmentSumCOO> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_coo_fw(src, index, optional_out, dim_size, "sum");
auto out = std::get<0>(result);
ctx->save_for_backward({index});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::empty(src_shape, grad_out.options());
gather_coo_fw(grad_out, index, grad_in);
return {grad_in, Variable(), Variable(), Variable()};
}
};
class SegmentMeanCOO : public torch::autograd::Function<SegmentMeanCOO> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_coo_fw(src, index, optional_out, dim_size, "mean");
auto out = std::get<0>(result);
auto count = std::get<1>(result).value();
ctx->save_for_backward({index, count});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto count = saved[1];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::empty(src_shape, grad_out.options());
gather_coo_fw(grad_out, index, grad_in);
count = gather_coo_fw(count, index, torch::nullopt);
for (auto i = 0; i < grad_out.dim() - index.dim(); i++)
count = count.unsqueeze(-1);
grad_in.true_divide_(count);
return {grad_in, Variable(), Variable(), Variable()};
}
};
class SegmentMinCOO : public torch::autograd::Function<SegmentMinCOO> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_coo_fw(src, index, optional_out, dim_size, "min");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({index, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto arg_out = saved[1];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
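    // Pad the reduced dimension with one extra slot: empty segments report
    // arg_out == dim size, so their gradient is scattered into the padding,
    // which the narrow() below drops again.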
src_shape[index.dim() - 1] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(index.dim() - 1, arg_out, grad_out);
grad_in =
grad_in.narrow(index.dim() - 1, 0, src_shape[index.dim() - 1] - 1);
return {grad_in, Variable(), Variable(), Variable()};
}
};
class SegmentMaxCOO : public torch::autograd::Function<SegmentMaxCOO> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index,
torch::optional<Variable> optional_out,
torch::optional<int64_t> dim_size) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_coo_fw(src, index, optional_out, dim_size, "max");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({index, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto arg_out = saved[1];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
src_shape[index.dim() - 1] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(index.dim() - 1, arg_out, grad_out);
grad_in =
grad_in.narrow(index.dim() - 1, 0, src_shape[index.dim() - 1] - 1);
return {grad_in, Variable(), Variable(), Variable()};
}
};
class GatherCOO : public torch::autograd::Function<GatherCOO> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable index,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto out = gather_coo_fw(src, index, optional_out);
ctx->save_for_backward({index});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto index = saved[0];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::zeros(src_shape, grad_out.options());
segment_coo_fw(grad_out, index, grad_in, torch::nullopt, "sum");
return {grad_in, Variable(), Variable()};
}
};
SCATTER_API torch::Tensor
segment_sum_coo(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
return SegmentSumCOO::apply(src, index, optional_out, dim_size)[0];
}
SCATTER_API torch::Tensor
segment_mean_coo(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
return SegmentMeanCOO::apply(src, index, optional_out, dim_size)[0];
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
segment_min_coo(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
auto result = SegmentMinCOO::apply(src, index, optional_out, dim_size);
return std::make_tuple(result[0], result[1]);
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
segment_max_coo(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size) {
auto result = SegmentMaxCOO::apply(src, index, optional_out, dim_size);
return std::make_tuple(result[0], result[1]);
}
SCATTER_API torch::Tensor
gather_coo(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out) {
return GatherCOO::apply(src, index, optional_out)[0];
}
static auto registry =
torch::RegisterOperators()
.op("torch_scatter::segment_sum_coo", &segment_sum_coo)
.op("torch_scatter::segment_mean_coo", &segment_mean_coo)
.op("torch_scatter::segment_min_coo", &segment_min_coo)
.op("torch_scatter::segment_max_coo", &segment_max_coo)
.op("torch_scatter::gather_coo", &gather_coo);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "cpu/segment_csr_cpu.h"
#include "macros.h"
#include "utils.h"
#ifdef WITH_CUDA
#include "hip/segment_csr_cuda.h"
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__segment_csr_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__segment_csr_cpu(void) { return NULL; }
#endif
#endif
#endif
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out,
std::string reduce) {
if (src.device().is_cuda()) {
#ifdef WITH_CUDA
return segment_csr_cuda(src, indptr, optional_out, reduce);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return segment_csr_cpu(src, indptr, optional_out, reduce);
}
}
torch::Tensor gather_csr_fw(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
if (src.device().is_cuda()) {
#ifdef WITH_CUDA
return gather_csr_cuda(src, indptr, optional_out);
#else
AT_ERROR("Not compiled with CUDA support");
#endif
} else {
return gather_csr_cpu(src, indptr, optional_out);
}
}
using torch::autograd::AutogradContext;
using torch::autograd::Variable;
using torch::autograd::variable_list;
class SegmentSumCSR : public torch::autograd::Function<SegmentSumCSR> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable indptr,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "sum"));
ctx->save_for_backward({indptr});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto indptr = saved[0];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::empty(src_shape, grad_out.options());
gather_csr_fw(grad_out, indptr, grad_in);
return {grad_in, Variable(), Variable()};
}
};
class SegmentMeanCSR : public torch::autograd::Function<SegmentMeanCSR> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable indptr,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto out = std::get<0>(segment_csr_fw(src, indptr, optional_out, "mean"));
ctx->save_for_backward({indptr});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto indptr = saved[0];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::empty(src_shape, grad_out.options());
if (grad_in.numel() > 0) {
gather_csr_fw(grad_out, indptr, grad_in);
auto indptr1 = indptr.narrow(-1, 0, indptr.size(-1) - 1);
auto indptr2 = indptr.narrow(-1, 1, indptr.size(-1) - 1);
auto count = (indptr2 - indptr1).to(grad_in.options());
count = gather_csr_fw(count, indptr, torch::nullopt);
for (auto i = 0; i < grad_out.dim() - indptr.dim(); i++)
count = count.unsqueeze(-1);
grad_in.true_divide_(count);
}
return {grad_in, Variable(), Variable()};
}
};
class SegmentMinCSR : public torch::autograd::Function<SegmentMinCSR> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable indptr,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_csr_fw(src, indptr, optional_out, "min");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({indptr, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto indptr = saved[0];
auto arg_out = saved[1];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
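    // Pad by one slot so gradients for empty segments (arg_out == dim size)
    // land outside the kept range and are narrowed away below.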
src_shape[indptr.dim() - 1] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(indptr.dim() - 1, arg_out, grad_out);
grad_in =
grad_in.narrow(indptr.dim() - 1, 0, src_shape[indptr.dim() - 1] - 1);
return {grad_in, Variable(), Variable()};
}
};
class SegmentMaxCSR : public torch::autograd::Function<SegmentMaxCSR> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable indptr,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto result = segment_csr_fw(src, indptr, optional_out, "max");
auto out = std::get<0>(result);
auto arg_out = std::get<1>(result).value();
ctx->save_for_backward({indptr, arg_out});
ctx->mark_non_differentiable({arg_out});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out, arg_out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto indptr = saved[0];
auto arg_out = saved[1];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
src_shape[indptr.dim() - 1] += 1;
auto grad_in = torch::zeros(src_shape, grad_out.options());
grad_in.scatter_(indptr.dim() - 1, arg_out, grad_out);
grad_in =
grad_in.narrow(indptr.dim() - 1, 0, src_shape[indptr.dim() - 1] - 1);
return {grad_in, Variable(), Variable()};
}
};
class GatherCSR : public torch::autograd::Function<GatherCSR> {
public:
static variable_list forward(AutogradContext *ctx, Variable src,
Variable indptr,
torch::optional<Variable> optional_out) {
ctx->saved_data["src_shape"] = src.sizes();
auto out = gather_csr_fw(src, indptr, optional_out);
ctx->save_for_backward({indptr});
if (optional_out.has_value())
ctx->mark_dirty({optional_out.value()});
return {out};
}
static variable_list backward(AutogradContext *ctx, variable_list grad_outs) {
auto grad_out = grad_outs[0];
auto saved = ctx->get_saved_variables();
auto indptr = saved[0];
auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList());
auto grad_in = torch::empty(src_shape, grad_out.options());
segment_csr_fw(grad_out, indptr, grad_in, "sum");
return {grad_in, Variable(), Variable()};
}
};
SCATTER_API torch::Tensor
segment_sum_csr(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
return SegmentSumCSR::apply(src, indptr, optional_out)[0];
}
SCATTER_API torch::Tensor
segment_mean_csr(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
return SegmentMeanCSR::apply(src, indptr, optional_out)[0];
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
segment_min_csr(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
auto result = SegmentMinCSR::apply(src, indptr, optional_out);
return std::make_tuple(result[0], result[1]);
}
SCATTER_API std::tuple<torch::Tensor, torch::Tensor>
segment_max_csr(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
auto result = SegmentMaxCSR::apply(src, indptr, optional_out);
return std::make_tuple(result[0], result[1]);
}
SCATTER_API torch::Tensor
gather_csr(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out) {
return GatherCSR::apply(src, indptr, optional_out)[0];
}
static auto registry =
torch::RegisterOperators()
.op("torch_scatter::segment_sum_csr", &segment_sum_csr)
.op("torch_scatter::segment_mean_csr", &segment_mean_csr)
.op("torch_scatter::segment_min_csr", &segment_min_csr)
.op("torch_scatter::segment_max_csr", &segment_max_csr)
.op("torch_scatter::gather_csr", &gather_csr);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#ifdef WITH_PYTHON
#include <Python.h>
#endif
#include <torch/script.h>
#include "scatter.h"
#include "macros.h"
#ifdef WITH_CUDA
#ifdef USE_ROCM
#include <hip/hip_version.h>
#else
#include <hip/hip_runtime.h>
#endif
#endif
#ifdef _WIN32
#ifdef WITH_PYTHON
#ifdef WITH_CUDA
PyMODINIT_FUNC PyInit__version_cuda(void) { return NULL; }
#else
PyMODINIT_FUNC PyInit__version_cpu(void) { return NULL; }
#endif
#endif
#endif
namespace scatter {
SCATTER_API int64_t cuda_version() noexcept {
#ifdef WITH_CUDA
#ifdef USE_ROCM
return HIP_VERSION;
#else
return DTK_VERSION;
#endif
#else
return -1;
#endif
}
} // namespace scatter
static auto registry = torch::RegisterOperators().op(
"torch_scatter::cuda_version", [] { return scatter::cuda_version(); });
Metadata-Version: 2.1
Name: torch-scatter
Version: 2.1.2
Summary: PyTorch Extension Library of Optimized Scatter Operations
Home-page: https://github.com/rusty1s/pytorch_scatter
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_scatter/archive/2.1.2.tar.gz
Description: [pypi-image]: https://badge.fury.io/py/torch-scatter.svg
[pypi-url]: https://pypi.python.org/pypi/torch-scatter
[testing-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_scatter/actions/workflows/linting.yml
[docs-image]: https://readthedocs.org/projects/pytorch-scatter/badge/?version=latest
[docs-url]: https://pytorch-scatter.readthedocs.io/en/latest/?badge=latest
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_scatter/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_scatter?branch=master
# PyTorch Scatter
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Docs Status][docs-image]][docs-url]
[![Code Coverage][coverage-image]][coverage-url]
<p align="center">
<img width="50%" src="https://raw.githubusercontent.com/rusty1s/pytorch_scatter/master/docs/source/_figures/add.svg?sanitize=true" />
</p>
--------------------------------------------------------------------------------
**[Documentation](https://pytorch-scatter.readthedocs.io)**
This package consists of a small extension library of highly optimized sparse update (scatter and segment) operations for the use in [PyTorch](http://pytorch.org/), which are missing in the main package.
Scatter and segment operations can be roughly described as reduce operations based on a given "group-index" tensor.
Segment operations require the "group-index" tensor to be sorted, whereas scatter operations are not subject to these requirements.
The package consists of the following operations with reduction types `"sum"|"mean"|"min"|"max"`:
* [**scatter**](https://pytorch-scatter.readthedocs.io/en/latest/functions/scatter.html) based on arbitrary indices
* [**segment_coo**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_coo.html) based on sorted indices
* [**segment_csr**](https://pytorch-scatter.readthedocs.io/en/latest/functions/segment_csr.html) based on compressed indices via pointers
In addition, we provide the following **composite functions** which make use of `scatter_*` operations under the hood: `scatter_std`, `scatter_logsumexp`, `scatter_softmax` and `scatter_log_softmax`.
All included operations are broadcastable, work on varying data types, are implemented both for CPU and GPU with corresponding backward implementations, and are fully traceable.
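For instance, `scatter_softmax` normalizes independently within each index group; a minimal sketch (values hypothetical):
```py
import torch
from torch_scatter import scatter_softmax

src = torch.tensor([0.5, 1.0, 2.0])
index = torch.tensor([0, 0, 1])
# Softmax over group 0 = {0.5, 1.0}; group 1 = {2.0} normalizes to 1.
print(scatter_softmax(src, index, dim=0))
# tensor([0.3775, 0.6225, 1.0000])
```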
## Installation
### Anaconda
**Update:** You can now install `pytorch-scatter` via [Anaconda](https://anaconda.org/pyg/pytorch-scatter) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-scatter -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 2.2
To install the binaries for PyTorch 2.2.0, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
| | `cpu` | `cu118` | `cu121` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 2.1
To install the binaries for PyTorch 2.1.0, simply run
```
pip install torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
| | `cpu` | `cu118` | `cu121` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1, PyTorch 1.13.0/1.13.1, and PyTorch 2.0.0 (following the same procedure).
For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-scatter
```
When running in a Docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Example
```py
import torch
from torch_scatter import scatter_max
src = torch.tensor([[2, 0, 1, 4, 3], [0, 2, 1, 3, 4]])
index = torch.tensor([[4, 5, 4, 2, 3], [0, 0, 2, 2, 1]])
out, argmax = scatter_max(src, index, dim=-1)
```
```
print(out)
tensor([[0, 0, 4, 3, 2, 0],
[2, 4, 3, 0, 0, 0]])
print(argmax)
tensor([[5, 5, 3, 4, 0, 1],
        [1, 4, 3, 5, 5, 5]])
```
## Running tests
```
pytest
```
## C++ API
`torch-scatter` also offers a C++ API that contains C++ equivalents of the Python functions.
For this, we need to add `TorchLib` to the `-DCMAKE_PREFIX_PATH` (*e.g.*, it may exist in `{CONDA}/lib/python{X.X}/site-packages/torch` if installed via `conda`):
```
mkdir build
cd build
# Add -DWITH_CUDA=on for CUDA support
cmake -DCMAKE_PREFIX_PATH="..." ..
make
make install
```
Keywords: pytorch,scatter,segment,gather
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Provides-Extra: test
LICENSE
MANIFEST.in
README.md
setup.cfg
setup.py
csrc/extensions.h
csrc/macros.h
csrc/scatter.cpp
csrc/scatter.h
csrc/scatter_hip.cpp
csrc/segment_coo.cpp
csrc/segment_coo_hip.cpp
csrc/segment_csr.cpp
csrc/segment_csr_hip.cpp
csrc/utils.h
csrc/version.cpp
csrc/version_hip.cpp
csrc/cpu/index_info.h
csrc/cpu/reducer.h
csrc/cpu/scatter_cpu.cpp
csrc/cpu/scatter_cpu.h
csrc/cpu/segment_coo_cpu.cpp
csrc/cpu/segment_coo_cpu.h
csrc/cpu/segment_csr_cpu.cpp
csrc/cpu/segment_csr_cpu.h
csrc/cpu/utils.h
csrc/cuda/atomics.cuh
csrc/cuda/index_info.cuh
csrc/cuda/reducer.cuh
csrc/cuda/scatter_cuda.cu
csrc/cuda/scatter_cuda.h
csrc/cuda/segment_coo_cuda.cu
csrc/cuda/segment_coo_cuda.h
csrc/cuda/segment_csr_cuda.cu
csrc/cuda/segment_csr_cuda.h
csrc/cuda/utils.cuh
csrc/hip/atomics.cuh
csrc/hip/index_info.cuh
csrc/hip/reducer.cuh
csrc/hip/scatter_cuda.h
csrc/hip/scatter_cuda.hip
csrc/hip/segment_coo_cuda.h
csrc/hip/segment_coo_cuda.hip
csrc/hip/segment_csr_cuda.h
csrc/hip/segment_csr_cuda.hip
csrc/hip/utils.cuh
torch_scatter/__init__.py
torch_scatter/placeholder.py
torch_scatter/scatter.py
torch_scatter/segment_coo.py
torch_scatter/segment_csr.py
torch_scatter/testing.py
torch_scatter/utils.py
torch_scatter.egg-info/PKG-INFO
torch_scatter.egg-info/SOURCES.txt
torch_scatter.egg-info/dependency_links.txt
torch_scatter.egg-info/requires.txt
torch_scatter.egg-info/top_level.txt
torch_scatter/composite/__init__.py
torch_scatter/composite/logsumexp.py
torch_scatter/composite/softmax.py
torch_scatter/composite/std.py