"src/vscode:/vscode.git/clone" did not exist on "e9f51f2b9d7c117f772a2ee1b4bf23097c9b18a1"
Commit 6b634203 authored by limm's avatar limm
Browse files

support v1.6.3

parent c2dcc5fd
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
project(torchcluster) project(torchcluster)
set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD 14)
set(TORCHCLUSTER_VERSION 1.6.0) set(TORCHCLUSTER_VERSION 1.6.3)
option(WITH_CUDA "Enable CUDA support" OFF) option(WITH_CUDA "Enable CUDA support" OFF)
option(WITH_PYTHON "Link to Python when building" ON)
if(WITH_CUDA) if(WITH_CUDA)
enable_language(CUDA) enable_language(CUDA)
...@@ -12,21 +13,27 @@ if(WITH_CUDA) ...@@ -12,21 +13,27 @@ if(WITH_CUDA)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
endif() endif()
find_package(Python3 COMPONENTS Development) if (WITH_PYTHON)
add_definitions(-DWITH_PYTHON)
find_package(Python3 COMPONENTS Development)
endif()
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
file(GLOB HEADERS csrc/cluster.h) file(GLOB HEADERS csrc/*.h)
file(GLOB OPERATOR_SOURCES csrc/cpu/*.h csrc/cpu/*.cpp csrc/*.cpp) file(GLOB OPERATOR_SOURCES csrc/*.* csrc/cpu/*.*)
if(WITH_CUDA) if(WITH_CUDA)
file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu) file(GLOB OPERATOR_SOURCES ${OPERATOR_SOURCES} csrc/cuda/*.h csrc/cuda/*.cu)
endif() endif()
add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES}) add_library(${PROJECT_NAME} SHARED ${OPERATOR_SOURCES})
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} Python3::Python) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
if (WITH_PYTHON)
target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
endif()
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchCluster) set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchCluster)
target_include_directories(${PROJECT_NAME} INTERFACE target_include_directories(${PROJECT_NAME} INTERFACE
$<BUILD_INTERFACE:${HEADERS}> "$<BUILD_INTERFACE:${HEADERS}>"
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
include(GNUInstallDirs) include(GNUInstallDirs)
......
# <div align="center"><strong>PyTorch Cluster</strong></div> # <div align="center"><strong>PyTorch Cluster</strong></div>
## 简介 ## 简介
PyTorch Cluster是一个小型的扩展库,其中包含了高度优化的图聚类算法,用于在PyTorch中使用。该包包括以下聚类算法:Graclus,Voxel Grid Pooling,迭代最远点采样,k-NN和Radius图生成,基于最近点的聚类,随机游走采样等。DAS软件栈中的PyTorch Cluster版本,不仅保证了组件核心功能在DCU加速卡的可用性,还针对DCU特有的硬件架构进行了深度定制优化。这使得开发者能够以极低的成本,轻松实现应用程序在DCU加速卡上的快速迁移和性能提升。目前支持Pytorch1.13 Pyotrch2.1 Pytorch2.3 PyTorch Cluster是一个小型的扩展库,其中包含了高度优化的图聚类算法,用于在PyTorch中使用。该包包括以下聚类算法:Graclus,Voxel Grid Pooling,迭代最远点采样,k-NN和Radius图生成,基于最近点的聚类,随机游走采样等。DAS软件栈中的PyTorch Cluster版本,不仅保证了组件核心功能在DCU加速卡的可用性,还针对DCU特有的硬件架构进行了深度定制优化。这使得开发者能够以极低的成本,轻松实现应用程序在DCU加速卡上的快速迁移和性能提升。目前支持Pytorch1.13 Pyotrch2.1 Pytorch2.4.1 Pytorch2.5.1
## 安装 ## 安装
### 使用pip方式安装 ### 使用pip方式安装
pytorch-cluster whl包下载目录:[http://10.6.10.68:8000/customized/torch-cluster/dtk2310](http://10.6.10.68:8000/customized/torch-cluster/dtk2310),目前只提供有python3.8版本的whl包。 pytorch-cluster whl包下载目录:[https://das.sourcefind.cn:55011/portal/#/installation?id=2083b36e-6c1b-11ef-bb3e-005056904552&type=frame](https://das.sourcefind.cn:55011/portal/#/installation?id=2083b36e-6c1b-11ef-bb3e-005056904552&type=frame).
```shell ```shell
pip install torch_cluster* (下载的torch_cluster的whl包) pip install torch_cluster* (下载的torch_cluster的whl包)
``` ```
...@@ -19,22 +19,29 @@ pip install 'urllib3==1.26.14' ...@@ -19,22 +19,29 @@ pip install 'urllib3==1.26.14'
pip install pytest pip install pytest
pip install wheel pip install wheel
``` ```
- 在首页 | 光合开发者社区下载 dtk23.10 解压至 /opt/ 路径下,并建立软链接 - 在首页 | 光合开发者社区下载 dtk25.04 解压至 /opt/ 路径下,并建立软链接
```shell
cd /opt && ln -s dtk-23.10 dtk
source /opt/dtk/env.sh
```
- 安装pytorch,pytorch whl包下载目录:[http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify/](http://10.6.10.68:8000/debug/pytorch/dtk23.10/hipify/),根据python、dtk版本,下载对应pytorch的whl包。安装命令如下:
```shell ```shell
cd /opt && ln -s dtk-25.04 dtk
source /opt/dtk/cuda/env.sh
- 安装pytorch,pytorch whl包下载目录:[http://10.16.4.1:8000/debug/pytorch/dtk25.04/](http://10.16.4.1:8000/debug/pytorch/dtk25.04/),根据python、dtk版本,下载对应pytorch的whl包。安装命令如下:
pip install torch* (下载的torch的whl包) pip install torch* (下载的torch的whl包)
- 安装fastpt,fastpt whl包下载目录:[https://das.sourcefind.cn:55011/portal/#/installation?id=8e0d0030-6c14-11ef-bf92-005056904552&type=frame](https://das.sourcefind.cn:55011/portal/#/installation?id=8e0d0030-6c14-11ef-bf92-005056904552&type=frame)
pip install fastpt*
``` ```
#### 源码编译安装 #### 源码编译安装
```shell ```shell
git clone -b 1.6.0-release http://developer.hpccube.com/codes/aicomponent/torch-cluster.git git clone -b 1.6.3-fastpt http://developer.hpccube.com/codes/aicomponent/torch-cluster.git
export FORCE_CUDA=1 or python pymap_script.py /path/to/pytorch_cluster(二选一,该操作是把HIP(CUDA)代码编译进去) export FORCE_CUDA=1
source /usr/local/bin/fastpt -C
cd pytorch_cluster cd pytorch_cluster
python setup.py bdist_wheel python setup.py bdist_wheel
pip install dist/*.whl pip install dist/*.whl
``` ```
## 单测 ## 单测
```shell ```shell
......
[pypi-image]: https://badge.fury.io/py/torch-cluster.svg
[pypi-url]: https://pypi.python.org/pypi/torch-cluster
[testing-image]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_cluster/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_cluster/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_cluster?branch=master
# PyTorch Cluster
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of highly optimized graph cluster algorithms for the use in [PyTorch](http://pytorch.org/).
The package consists of the following clustering algorithms:
* **[Graclus](#graclus)** from Dhillon *et al.*: [Weighted Graph Cuts without Eigenvectors: A Multilevel Approach](http://www.cs.utexas.edu/users/inderjit/public_papers/multilevel_pami.pdf) (PAMI 2007)
* **[Voxel Grid Pooling](#voxelgrid)** from, *e.g.*, Simonovsky and Komodakis: [Dynamic Edge-Conditioned Filters in Convolutional Neural Networks on Graphs](https://arxiv.org/abs/1704.02901) (CVPR 2017)
* **[Iterative Farthest Point Sampling](#farthestpointsampling)** from, *e.g.* Qi *et al.*: [PointNet++: Deep Hierarchical Feature Learning on Point Sets in a Metric Space](https://arxiv.org/abs/1706.02413) (NIPS 2017)
* **[k-NN](#knn-graph)** and **[Radius](#radius-graph)** graph generation
* Clustering based on **[Nearest](#nearest)** points
* **[Random Walk Sampling](#randomwalk-sampling)** from, *e.g.*, Grover and Leskovec: [node2vec: Scalable Feature Learning for Networks](https://arxiv.org/abs/1607.00653) (KDD 2016)
All included operations work on varying data types and are implemented both for CPU and GPU.
## Installation
### Anaconda
**Update:** You can now install `pytorch-cluster` via [Anaconda](https://anaconda.org/pyg/pytorch-cluster) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-cluster -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 2.1
To install the binaries for PyTorch 2.1.0, simply run
```
pip install torch-cluster -f https://data.pyg.org/whl/torch-2.1.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu118`, or `cu121` depending on your PyTorch installation.
| | `cpu` | `cu118` | `cu121` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
#### PyTorch 2.0
To install the binaries for PyTorch 2.0.0, simply run
```
pip install torch-cluster -f https://data.pyg.org/whl/torch-2.0.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu117`, or `cu118` depending on your PyTorch installation.
| | `cpu` | `cu117` | `cu118` |
|-------------|-------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1, PyTorch 1.9.0, PyTorch 1.10.0/1.10.1/1.10.2, PyTorch 1.11.0, PyTorch 1.12.0/1.12.1 and PyTorch 1.13.0/1.13.1 (following the same procedure).
For older versions, you need to explicitly specify the latest supported version number or install via `pip install --no-index` in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.4.0
$ python -c "import torch; print(torch.version.cuda)"
>>> 10.1
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
Then run:
```
pip install torch-cluster
```
When running in a docker container without NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Graclus
A greedy clustering algorithm of picking an unmarked vertex and matching it with one of its unmarked neighbors (that maximizes its edge weight).
The GPU algorithm is adapted from Fagginger Auer and Bisseling: [A GPU Algorithm for Greedy Graph Matching](http://www.staff.science.uu.nl/~bisse101/Articles/match12.pdf) (LNCS 2012)
```python
import torch
from torch_cluster import graclus_cluster
row = torch.tensor([0, 1, 1, 2])
col = torch.tensor([1, 0, 2, 1])
weight = torch.tensor([1., 1., 1., 1.]) # Optional edge weights.
cluster = graclus_cluster(row, col, weight)
```
```
print(cluster)
tensor([0, 0, 1])
```
### VoxelGrid
A clustering algorithm, which overlays a regular grid of user-defined size over a point cloud and clusters all points within a voxel.
```python
import torch
from torch_cluster import grid_cluster
pos = torch.tensor([[0., 0.], [11., 9.], [2., 8.], [2., 2.], [8., 3.]])
size = torch.Tensor([5, 5])
cluster = grid_cluster(pos, size)
```
```
print(cluster)
tensor([0, 5, 3, 0, 1])
```
### FarthestPointSampling
A sampling algorithm, which iteratively samples the most distant point with regard to the rest points.
```python
import torch
from torch_cluster import fps
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
index = fps(x, batch, ratio=0.5, random_start=False)
```
```
print(index)
tensor([0, 3])
```
### kNN-Graph
Computes graph edges to the nearest *k* points.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **k** *(int)*: The number of neighbors.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **cosine** *(boolean, optional)*: If `True`, will use the Cosine distance instead of Euclidean distance to find nearest neighbors. (default: `False`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python
import torch
from torch_cluster import knn_graph
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
edge_index = knn_graph(x, k=2, batch=batch, loop=False)
```
```
print(edge_index)
tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]])
```
### Radius-Graph
Computes graph edges to all points within a given distance.
**Args:**
* **x** *(Tensor)*: Node feature matrix of shape `[N, F]`.
* **r** *(float)*: The radius.
* **batch** *(LongTensor, optional)*: Batch vector of shape `[N]`, which assigns each node to a specific example. `batch` needs to be sorted. (default: `None`)
* **loop** *(bool, optional)*: If `True`, the graph will contain self-loops. (default: `False`)
* **max_num_neighbors** *(int, optional)*: The maximum number of neighbors to return for each element. If the number of actual neighbors is greater than `max_num_neighbors`, returned neighbors are picked randomly. (default: `32`)
* **flow** *(string, optional)*: The flow direction when using in combination with message passing (`"source_to_target"` or `"target_to_source"`). (default: `"source_to_target"`)
* **num_workers** *(int)*: Number of workers to use for computation. Has no effect in case `batch` is not `None`, or the input lies on the GPU. (default: `1`)
```python
import torch
from torch_cluster import radius_graph
x = torch.tensor([[-1., -1.], [-1., 1.], [1., -1.], [1., 1.]])
batch = torch.tensor([0, 0, 0, 0])
edge_index = radius_graph(x, r=2.5, batch=batch, loop=False)
```
```
print(edge_index)
tensor([[1, 2, 0, 3, 0, 3, 1, 2],
[0, 0, 1, 1, 2, 2, 3, 3]])
```
### Nearest
Clusters points in *x* together which are nearest to a given query point in *y*.
`batch_{x,y}` vectors need to be sorted.
```python
import torch
from torch_cluster import nearest
x = torch.Tensor([[-1, -1], [-1, 1], [1, -1], [1, 1]])
batch_x = torch.tensor([0, 0, 0, 0])
y = torch.Tensor([[-1, 0], [1, 0]])
batch_y = torch.tensor([0, 0])
cluster = nearest(x, y, batch_x, batch_y)
```
```
print(cluster)
tensor([0, 0, 1, 1])
```
### RandomWalk-Sampling
Samples random walks of length `walk_length` from all node indices in `start` in the graph given by `(row, col)`.
```python
import torch
from torch_cluster import random_walk
row = torch.tensor([0, 1, 1, 1, 2, 2, 3, 3, 4, 4])
col = torch.tensor([1, 0, 2, 3, 1, 4, 1, 4, 2, 3])
start = torch.tensor([0, 1, 2, 3, 4])
walk = random_walk(row, col, start, walk_length=3)
```
```
print(walk)
tensor([[0, 1, 2, 4],
[1, 3, 4, 2],
[2, 4, 2, 1],
[3, 4, 2, 4],
[4, 3, 1, 0]])
```
## Running tests
```
pytest
```
## C++ API
`torch-cluster` also offers a C++ API that contains C++ equivalent of python models.
```
export Torch_DIR=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'`
mkdir build
cd build
# Add -DWITH_CUDA=on support for the CUDA if needed
cmake ..
make
make install
```
``` ```
./build_conda.sh 3.9 1.11.0 cu113 # python, pytorch and cuda version ./build_conda.sh 3.9 2.1.0 cu118 # python, pytorch and cuda version
``` ```
...@@ -10,6 +10,22 @@ if [ "${CUDA_VERSION}" = "cpu" ]; then ...@@ -10,6 +10,22 @@ if [ "${CUDA_VERSION}" = "cpu" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]" export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else else
case $CUDA_VERSION in case $CUDA_VERSION in
cu121)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==12.1.*"
;;
cu118)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.8.*"
;;
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.7.*"
;;
cu116)
if [ "${TORCH_VERSION}" = "1.12.0" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
else
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.6.*"
fi
;;
cu115) cu115)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*" export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*"
;; ;;
...@@ -36,4 +52,8 @@ echo "PyTorch $TORCH_VERSION+$CUDA_VERSION" ...@@ -36,4 +52,8 @@ echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT" echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT" echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld" if [ "${TORCH_VERSION}" = "1.12.0" ] && [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
else
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld"
fi
package: package:
name: pytorch-cluster name: pytorch-cluster
version: 1.6.0 version: 1.6.3
source: source:
path: ../.. path: ../..
......
#pragma once #pragma once
#include <torch/extension.h> #include "extensions.h"
int64_t cuda_version(); namespace cluster {
CLUSTER_API int64_t cuda_version() noexcept;
torch::Tensor fps(torch::Tensor src, torch::Tensor ptr, double ratio, namespace detail {
CLUSTER_INLINE_VARIABLE int64_t _cuda_version = cuda_version();
} // namespace detail
} // namespace cluster
CLUSTER_API torch::Tensor fps(torch::Tensor src, torch::Tensor ptr, double ratio,
bool random_start); bool random_start);
torch::Tensor graclus(torch::Tensor rowptr, torch::Tensor col, CLUSTER_API torch::Tensor graclus(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight); torch::optional<torch::Tensor> optional_weight);
torch::Tensor grid(torch::Tensor pos, torch::Tensor size, CLUSTER_API torch::Tensor grid(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start, torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end); torch::optional<torch::Tensor> optional_end);
torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, CLUSTER_API torch::Tensor knn(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, int64_t k, bool cosine); torch::Tensor ptr_y, int64_t k, bool cosine);
torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, CLUSTER_API torch::Tensor nearest(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y); torch::Tensor ptr_y);
torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x, CLUSTER_API torch::Tensor radius(torch::Tensor x, torch::Tensor y, torch::Tensor ptr_x,
torch::Tensor ptr_y, double r, int64_t max_num_neighbors); torch::Tensor ptr_y, double r, int64_t max_num_neighbors);
std::tuple<torch::Tensor, torch::Tensor> CLUSTER_API std::tuple<torch::Tensor, torch::Tensor>
random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start, random_walk(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
int64_t walk_length, double p, double q); int64_t walk_length, double p, double q);
torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr, CLUSTER_API torch::Tensor neighbor_sampler(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor); int64_t count, double factor);
...@@ -24,7 +24,7 @@ torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio, ...@@ -24,7 +24,7 @@ torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
auto out_ptr = deg.toType(torch::kFloat) * ratio; auto out_ptr = deg.toType(torch::kFloat) * ratio;
out_ptr = out_ptr.ceil().toType(torch::kLong).cumsum(0); out_ptr = out_ptr.ceil().toType(torch::kLong).cumsum(0);
auto out = torch::empty(out_ptr[-1].data_ptr<int64_t>()[0], ptr.options()); auto out = torch::empty({out_ptr[-1].data_ptr<int64_t>()[0]}, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>(); auto ptr_data = ptr.data_ptr<int64_t>();
auto out_ptr_data = out_ptr.data_ptr<int64_t>(); auto out_ptr_data = out_ptr.data_ptr<int64_t>();
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio, torch::Tensor fps_cpu(torch::Tensor src, torch::Tensor ptr, torch::Tensor ratio,
bool random_start); bool random_start);
...@@ -47,7 +47,7 @@ torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col, ...@@ -47,7 +47,7 @@ torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
} else { } else {
auto weight = optional_weight.value(); auto weight = optional_weight.value();
auto scalar_type = weight.scalar_type(); auto scalar_type = weight.scalar_type();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, scalar_type, "_", [&] { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "graclus_cpu", [&] {
auto weight_data = weight.data_ptr<scalar_t>(); auto weight_data = weight.data_ptr<scalar_t>();
for (auto n = 0; n < num_nodes; n++) { for (auto n = 0; n < num_nodes; n++) {
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor graclus_cpu(torch::Tensor rowptr, torch::Tensor col,
torch::optional<torch::Tensor> optional_weight); torch::optional<torch::Tensor> optional_weight);
...@@ -35,7 +35,7 @@ torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size, ...@@ -35,7 +35,7 @@ torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
auto num_voxels = (end - start).true_divide(size).toType(torch::kLong) + 1; auto num_voxels = (end - start).true_divide(size).toType(torch::kLong) + 1;
num_voxels = num_voxels.cumprod(0); num_voxels = num_voxels.cumprod(0);
num_voxels = num_voxels =
torch::cat({torch::ones(1, num_voxels.options()), num_voxels}, 0); torch::cat({torch::ones({1}, num_voxels.options()), num_voxels}, 0);
num_voxels = num_voxels.narrow(0, 0, size.size(0)); num_voxels = num_voxels.narrow(0, 0, size.size(0));
auto out = pos.true_divide(size.view({1, -1})).toType(torch::kLong); auto out = pos.true_divide(size.view({1, -1})).toType(torch::kLong);
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size, torch::Tensor grid_cpu(torch::Tensor pos, torch::Tensor size,
torch::optional<torch::Tensor> optional_start, torch::optional<torch::Tensor> optional_start,
torch::optional<torch::Tensor> optional_end); torch::optional<torch::Tensor> optional_end);
...@@ -25,7 +25,7 @@ torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y, ...@@ -25,7 +25,7 @@ torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
std::vector<size_t> out_vec = std::vector<size_t>(); std::vector<size_t> out_vec = std::vector<size_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, x.scalar_type(), "_", [&] { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, x.scalar_type(), "knn_cpu", [&] {
// See: nanoflann/examples/vector_of_vectors_example.cpp // See: nanoflann/examples/vector_of_vectors_example.cpp
auto x_data = x.data_ptr<scalar_t>(); auto x_data = x.data_ptr<scalar_t>();
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y, torch::Tensor knn_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x, torch::optional<torch::Tensor> ptr_x,
......
...@@ -25,7 +25,7 @@ torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y, ...@@ -25,7 +25,7 @@ torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
std::vector<size_t> out_vec = std::vector<size_t>(); std::vector<size_t> out_vec = std::vector<size_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, x.scalar_type(), "_", [&] { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, x.scalar_type(), "radius_cpu", [&] {
// See: nanoflann/examples/vector_of_vectors_example.cpp // See: nanoflann/examples/vector_of_vectors_example.cpp
auto x_data = x.data_ptr<scalar_t>(); auto x_data = x.data_ptr<scalar_t>();
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y, torch::Tensor radius_cpu(torch::Tensor x, torch::Tensor y,
torch::optional<torch::Tensor> ptr_x, torch::optional<torch::Tensor> ptr_x,
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor> std::tuple<torch::Tensor, torch::Tensor>
random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start, random_walk_cpu(torch::Tensor rowptr, torch::Tensor col, torch::Tensor start,
......
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
torch::Tensor neighbor_sampler_cpu(torch::Tensor start, torch::Tensor rowptr, torch::Tensor neighbor_sampler_cpu(torch::Tensor start, torch::Tensor rowptr,
int64_t count, double factor); int64_t count, double factor);
#pragma once #pragma once
#include <torch/extension.h> #include "../extensions.h"
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor") #define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch") #define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment