Commit 19fd8251 authored by limm

support v0.6.16

parent 9ccee9c0
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.10)
project(torchsparse)
set(CMAKE_CXX_STANDARD 14)
set(TORCHSPARSE_VERSION 0.6.15)
set(TORCHSPARSE_VERSION 0.6.16)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
option(WITH_CUDA "Enable CUDA support" OFF)
option(WITH_PYTHON "Link to Python when building" ON)
option(WITH_METIS "Enable METIS support" OFF)
if(WITH_CUDA)
enable_language(CUDA)
......@@ -19,6 +21,11 @@ if (WITH_PYTHON)
endif()
find_package(Torch REQUIRED)
if (WITH_METIS)
add_definitions(-DWITH_METIS)
find_package(METIS)
endif()
file(GLOB HEADERS csrc/*.h)
file(GLOB OPERATOR_SOURCES csrc/*.* csrc/cpu/*.*)
if(WITH_CUDA)
......@@ -30,6 +37,17 @@ target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
if (WITH_PYTHON)
target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
endif()
if (WITH_METIS)
target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} PRIVATE ${METIS_LIBRARIES})
endif()
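# Optionally compile and link with OpenMP when it is available.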
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
# set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
set_target_properties(${PROJECT_NAME} PROPERTIES EXPORT_NAME TorchSparse)
target_include_directories(${PROJECT_NAME} INTERFACE
......@@ -74,7 +92,6 @@ install(FILES
csrc/cpu/saint_cpu.h
csrc/cpu/sample_cpu.h
csrc/cpu/spmm_cpu.h
csrc/cpu/spspmm_cpu.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cpu)
if(WITH_CUDA)
install(FILES
......@@ -82,7 +99,6 @@ if(WITH_CUDA)
csrc/cuda/diag_cuda.h
csrc/cuda/rw_cuda.h
csrc/cuda/spmm_cuda.h
csrc/cuda/spspmm_cuda.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/cuda)
endif()
......
include README.md
include LICENSE
recursive-exclude test *
recursive-include csrc *
recursive-include third_party *
recursive-exclude third_party/parallel-hashmap/css *
recursive-exclude third_party/parallel-hashmap/html *
recursive-exclude third_party/parallel-hashmap/tests *
recursive-exclude third_party/parallel-hashmap/examples *
recursive-exclude third_party/parallel-hashmap/benchmark *
recursive-exclude test *
recursive-exclude benchmark *
Metadata-Version: 2.1
Name: torch_sparse
Version: 0.6.13
Summary: PyTorch Extension Library of Optimized Autograd Sparse Matrix Operations
Home-page: https://github.com/rusty1s/pytorch_sparse
Author: Matthias Fey
Author-email: matthias.fey@tu-dortmund.de
License: UNKNOWN
Download-URL: https://github.com/rusty1s/pytorch_sparse/archive/0.6.13.tar.gz
Description: [pypi-image]: https://badge.fury.io/py/torch-sparse.svg
[pypi-url]: https://pypi.python.org/pypi/torch-sparse
[testing-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml/badge.svg
[testing-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/testing.yml
[linting-image]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml/badge.svg
[linting-url]: https://github.com/rusty1s/pytorch_sparse/actions/workflows/linting.yml
[coverage-image]: https://codecov.io/gh/rusty1s/pytorch_sparse/branch/master/graph/badge.svg
[coverage-url]: https://codecov.io/github/rusty1s/pytorch_sparse?branch=master
# PyTorch Sparse
[![PyPI Version][pypi-image]][pypi-url]
[![Testing Status][testing-image]][testing-url]
[![Linting Status][linting-image]][linting-url]
[![Code Coverage][coverage-image]][coverage-url]
--------------------------------------------------------------------------------
This package consists of a small extension library of optimized sparse matrix operations with autograd support.
It currently provides the following methods:
* **[Coalesce](#coalesce)**
* **[Transpose](#transpose)**
* **[Sparse Dense Matrix Multiplication](#sparse-dense-matrix-multiplication)**
* **[Sparse Sparse Matrix Multiplication](#sparse-sparse-matrix-multiplication)**
All included operations work on varying data types and are implemented both for CPU and GPU.
To avoid the hassle of creating [`torch.sparse_coo_tensor`](https://pytorch.org/docs/stable/torch.html?highlight=sparse_coo_tensor#torch.sparse_coo_tensor), this package defines operations on sparse tensors by simply passing `index` and `value` tensors as arguments ([with the same shapes as defined in PyTorch](https://pytorch.org/docs/stable/sparse.html)).
Note that only `value` comes with autograd support, as `index` is discrete and therefore not differentiable.
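For illustration, a 3×2 sparse matrix in this `index`/`value` representation looks as follows (a minimal sketch with arbitrary entries):
```python
import torch

# COO-style representation used throughout this package:
# `index` holds the [row; col] coordinates, `value` the matching entries.
index = torch.tensor([[0, 1, 2],   # row indices
                      [1, 0, 1]])  # column indices
value = torch.tensor([1.0, 2.0, 3.0])
# Equivalent dense matrix (m=3, n=2):
# [[0., 1.],
#  [2., 0.],
#  [0., 3.]]
```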
## Installation
### Anaconda
**Update:** You can now install `pytorch-sparse` via [Anaconda](https://anaconda.org/pyg/pytorch-sparse) for all major OS/PyTorch/CUDA combinations 🤗
Given that you have [`pytorch >= 1.8.0` installed](https://pytorch.org/get-started/locally/), simply run
```
conda install pytorch-sparse -c pyg
```
### Binaries
We alternatively provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](https://data.pyg.org/whl).
#### PyTorch 1.11
To install the binaries for PyTorch 1.11.0, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu113`, or `cu115` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu113` | `cu115` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | | ✅ | ✅ |
| **macOS** | ✅ | | | |
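For example, for a PyTorch 1.11.0 installation built against CUDA 11.3 the command becomes:
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.11.0+cu113.html
```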
#### PyTorch 1.10
To install the binaries for PyTorch 1.10.0, PyTorch 1.10.1 and PyTorch 1.10.2, simply run
```
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+${CUDA}.html
```
where `${CUDA}` should be replaced by either `cpu`, `cu102`, `cu111`, or `cu113` depending on your PyTorch installation.
| | `cpu` | `cu102` | `cu111` | `cu113` |
|-------------|-------|---------|---------|---------|
| **Linux** | ✅ | ✅ | ✅ | ✅ |
| **Windows** | ✅ | ✅ | ✅ | ✅ |
| **macOS** | ✅ | | | |
**Note:** Binaries of older versions are also provided for PyTorch 1.4.0, PyTorch 1.5.0, PyTorch 1.6.0, PyTorch 1.7.0/1.7.1, PyTorch 1.8.0/1.8.1 and PyTorch 1.9.0 (following the same procedure).
For older versions, you might need to explicitly specify the latest supported version number in order to prevent a manual installation from source.
You can look up the latest supported version number [here](https://data.pyg.org/whl).
### From source
Ensure that at least PyTorch 1.7.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
```
$ python -c "import torch; print(torch.__version__)"
>>> 1.7.0
$ echo $PATH
>>> /usr/local/cuda/bin:...
$ echo $CPATH
>>> /usr/local/cuda/include:...
```
If you want to additionally build `torch-sparse` with METIS support, *e.g.* for partitioning, please download and install the [METIS library](http://glaros.dtc.umn.edu/gkhome/metis/metis/download) by following the instructions in the `Install.txt` file.
Note that METIS needs to be installed with 64 bit `IDXTYPEWIDTH` by changing `include/metis.h`.
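Concretely, the width macro in `include/metis.h` should read as follows before compiling METIS (a sketch; the header ships with a 32-bit default):
```
#define IDXTYPEWIDTH 64
```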
Afterwards, set the environment variable `WITH_METIS=1`.
Then run:
```
pip install torch-scatter torch-sparse
```
When running in a Docker container without an NVIDIA driver, PyTorch needs to evaluate the compute capabilities and may fail.
In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
```
export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
```
## Functions
### Coalesce
```
torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)
```
Row-wise sorts `index` and removes duplicate entries.
Duplicate entries are removed by scattering them together.
For scattering, any operation of [`torch_scatter`](https://github.com/rusty1s/pytorch_scatter) can be used.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **op** *(string, optional)* - The scatter operation to use. (default: `"add"`)
#### Returns
* **index** *(LongTensor)* - The coalesced index tensor of sparse matrix.
* **value** *(Tensor)* - The coalesced value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import coalesce
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = coalesce(index, value, m=3, n=2)
```
```
print(index)
tensor([[0, 1, 1, 2],
[1, 0, 1, 0]])
print(value)
tensor([[6.0, 8.0],
[7.0, 9.0],
[3.0, 4.0],
[5.0, 6.0]])
```
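Since any `torch_scatter` reduction can be used, duplicates can also be combined differently, *e.g.* by taking the element-wise maximum. A sketch, starting again from the original `index` and `value` tensors defined above (the commented result is what scatter-max should produce):
```python
index, value = coalesce(index, value, m=3, n=2, op="max")
# Expected value tensor (element-wise maximum over duplicates):
# tensor([[4., 5.],
#         [6., 7.],
#         [3., 4.],
#         [5., 6.]])
```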
### Transpose
```
torch_sparse.transpose(index, value, m, n) -> (torch.LongTensor, torch.Tensor)
```
Transposes dimensions 0 and 1 of a sparse matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **coalesced** *(bool, optional)* - If set to `False`, will not coalesce the output. (default: `True`)
#### Returns
* **index** *(LongTensor)* - The transposed index tensor of sparse matrix.
* **value** *(Tensor)* - The transposed value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import transpose
index = torch.tensor([[1, 0, 1, 0, 2, 1],
[0, 1, 1, 1, 0, 0]])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])
index, value = transpose(index, value, 3, 2)
```
```
print(index)
tensor([[0, 0, 1, 1],
[1, 2, 0, 1]])
print(value)
tensor([[7.0, 9.0],
[5.0, 6.0],
[6.0, 8.0],
[3.0, 4.0]])
```
### Sparse Dense Matrix Multiplication
```
torch_sparse.spmm(index, value, m, n, matrix) -> torch.Tensor
```
Matrix product of a sparse matrix with a dense matrix.
#### Parameters
* **index** *(LongTensor)* - The index tensor of sparse matrix.
* **value** *(Tensor)* - The value tensor of sparse matrix.
* **m** *(int)* - The first dimension of sparse matrix.
* **n** *(int)* - The second dimension of sparse matrix.
* **matrix** *(Tensor)* - The dense matrix.
#### Returns
* **out** *(Tensor)* - The dense output matrix.
#### Example
```python
import torch
from torch_sparse import spmm
index = torch.tensor([[0, 0, 1, 2, 2],
[0, 2, 1, 0, 1]])
value = torch.Tensor([1, 2, 4, 1, 3])
matrix = torch.Tensor([[1, 4], [2, 5], [3, 6]])
out = spmm(index, value, 3, 3, matrix)
```
```
print(out)
tensor([[7.0, 16.0],
[8.0, 20.0],
[7.0, 19.0]])
```
### Sparse Sparse Matrix Multiplication
```
torch_sparse.spspmm(indexA, valueA, indexB, valueB, m, k, n) -> (torch.LongTensor, torch.Tensor)
```
Matrix product of two sparse tensors.
Both input sparse matrices need to be **coalesced** (use the optional `coalesced` argument to force coalescing).
#### Parameters
* **indexA** *(LongTensor)* - The index tensor of first sparse matrix.
* **valueA** *(Tensor)* - The value tensor of first sparse matrix.
* **indexB** *(LongTensor)* - The index tensor of second sparse matrix.
* **valueB** *(Tensor)* - The value tensor of second sparse matrix.
* **m** *(int)* - The first dimension of first sparse matrix.
* **k** *(int)* - The second dimension of first sparse matrix and first dimension of second sparse matrix.
* **n** *(int)* - The second dimension of second sparse matrix.
* **coalesced** *(bool, optional)* - If set to `True`, will coalesce both input sparse matrices. (default: `False`)
#### Returns
* **index** *(LongTensor)* - The output index tensor of sparse matrix.
* **value** *(Tensor)* - The output value tensor of sparse matrix.
#### Example
```python
import torch
from torch_sparse import spspmm
indexA = torch.tensor([[0, 0, 1, 2, 2], [1, 2, 0, 0, 1]])
valueA = torch.Tensor([1, 2, 3, 4, 5])
indexB = torch.tensor([[0, 2], [1, 0]])
valueB = torch.Tensor([2, 4])
indexC, valueC = spspmm(indexA, valueA, indexB, valueB, 3, 3, 2)
```
```
print(indexC)
tensor([[0, 1, 2],
[0, 1, 1]])
print(valueC)
tensor([8.0, 6.0, 8.0])
```
## C++ API
`torch-sparse` also offers a C++ API that contains a C++ equivalent of the Python models.
```
mkdir build
cd build
# Add -DWITH_CUDA=on to enable CUDA support if needed
cmake ..
make
make install
```
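Once installed, the library can be consumed from a downstream CMake project. A hypothetical consumer `CMakeLists.txt` is sketched below; the package and target names are assumptions and should be checked against the files produced by `make install`:
```
# Hypothetical consumer CMakeLists.txt (package/target names assumed)
cmake_minimum_required(VERSION 3.10)
project(my_app)

find_package(Torch REQUIRED)
find_package(TorchSparse REQUIRED)  # assumes a TorchSparseConfig.cmake was installed

add_executable(my_app main.cpp)
target_link_libraries(my_app PRIVATE ${TORCH_LIBRARIES} TorchSparse::TorchSparse)
```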
## Running tests
```
pytest
```
Keywords: pytorch,sparse,sparse-matrices,autograd
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3 :: Only
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Provides-Extra: test
###
#
# @copyright (c) 2009-2014 The University of Tennessee and The University
# of Tennessee Research Foundation.
# All rights reserved.
# @copyright (c) 2012-2014 Inria. All rights reserved.
# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
#
###
#
# - Find METIS include dirs and libraries
# Use this module by invoking find_package with the form:
# find_package(METIS
# [REQUIRED] # Fail with error if metis is not found
# )
#
# This module finds headers and metis library.
# Results are reported in variables:
# METIS_FOUND - True if headers and requested libraries were found
# METIS_INCLUDE_DIRS - metis include directories
# METIS_LIBRARY_DIRS - Link directories for metis libraries
# METIS_LIBRARIES - metis component libraries to be linked
#
# The user can give specific paths where to find the libraries adding cmake
# options at configure (ex: cmake path/to/project -DMETIS_DIR=path/to/metis):
# METIS_DIR - Where to find the base directory of metis
# METIS_INCDIR - Where to find the header files
# METIS_LIBDIR - Where to find the library files
# The module can also look for the following environment variables if paths
# are not given as cmake variable: METIS_DIR, METIS_INCDIR, METIS_LIBDIR
#=============================================================================
# Copyright 2012-2013 Inria
# Copyright 2012-2013 Emmanuel Agullo
# Copyright 2012-2013 Mathieu Faverge
# Copyright 2012 Cedric Castagnede
# Copyright 2013 Florent Pruvost
#
# Distributed under the OSI-approved BSD License (the "License");
# see accompanying file MORSE-Copyright.txt for details.
#
# This software is distributed WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the License for more information.
#=============================================================================
# (To distribute this file outside of Morse, substitute the full
# License text for the above reference.)
if (NOT METIS_FOUND)
set(METIS_DIR "" CACHE PATH "Installation directory of METIS library")
if (NOT METIS_FIND_QUIETLY)
message(STATUS "A cache variable, namely METIS_DIR, has been set to specify the install directory of METIS")
endif()
endif()
# Looking for include
# -------------------
# Add system include paths to search include
# ------------------------------------------
unset(_inc_env)
set(ENV_METIS_DIR "$ENV{METIS_DIR}")
set(ENV_METIS_INCDIR "$ENV{METIS_INCDIR}")
if(ENV_METIS_INCDIR)
list(APPEND _inc_env "${ENV_METIS_INCDIR}")
elseif(ENV_METIS_DIR)
list(APPEND _inc_env "${ENV_METIS_DIR}")
list(APPEND _inc_env "${ENV_METIS_DIR}/include")
list(APPEND _inc_env "${ENV_METIS_DIR}/include/metis")
else()
if(WIN32)
string(REPLACE ":" ";" _inc_env "$ENV{INCLUDE}")
else()
string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
list(APPEND _inc_env "${_path_env}")
string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
list(APPEND _inc_env "${_path_env}")
endif()
endif()
list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
list(REMOVE_DUPLICATES _inc_env)
# Try to find the metis header in the given paths
# -------------------------------------------------
# call cmake macro to find the header path
if(METIS_INCDIR)
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${METIS_INCDIR})
else()
if(METIS_DIR)
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${METIS_DIR}
PATH_SUFFIXES "include" "include/metis")
else()
set(METIS_metis.h_DIRS "METIS_metis.h_DIRS-NOTFOUND")
find_path(METIS_metis.h_DIRS
NAMES metis.h
HINTS ${_inc_env})
endif()
endif()
mark_as_advanced(METIS_metis.h_DIRS)
# If found, add path to cmake variable
# ------------------------------------
if (METIS_metis.h_DIRS)
set(METIS_INCLUDE_DIRS "${METIS_metis.h_DIRS}")
else ()
set(METIS_INCLUDE_DIRS "METIS_INCLUDE_DIRS-NOTFOUND")
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for metis -- metis.h not found")
endif()
endif()
# Looking for lib
# ---------------
# Add system library paths to search lib
# --------------------------------------
unset(_lib_env)
set(ENV_METIS_LIBDIR "$ENV{METIS_LIBDIR}")
if(ENV_METIS_LIBDIR)
list(APPEND _lib_env "${ENV_METIS_LIBDIR}")
elseif(ENV_METIS_DIR)
list(APPEND _lib_env "${ENV_METIS_DIR}")
list(APPEND _lib_env "${ENV_METIS_DIR}/lib")
else()
if(WIN32)
string(REPLACE ":" ";" _lib_env "$ENV{LIB}")
else()
if(APPLE)
string(REPLACE ":" ";" _lib_env "$ENV{DYLD_LIBRARY_PATH}")
else()
string(REPLACE ":" ";" _lib_env "$ENV{LD_LIBRARY_PATH}")
endif()
list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
endif()
endif()
list(REMOVE_DUPLICATES _lib_env)
# Try to find the metis lib in the given paths
# ----------------------------------------------
# call cmake macro to find the lib path
if(METIS_LIBDIR)
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${METIS_LIBDIR})
else()
if(METIS_DIR)
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${METIS_DIR}
PATH_SUFFIXES lib lib32 lib64)
else()
set(METIS_metis_LIBRARY "METIS_metis_LIBRARY-NOTFOUND")
find_library(METIS_metis_LIBRARY
NAMES metis
HINTS ${_lib_env})
endif()
endif()
mark_as_advanced(METIS_metis_LIBRARY)
# If found, add path to cmake variable
# ------------------------------------
if (METIS_metis_LIBRARY)
get_filename_component(metis_lib_path "${METIS_metis_LIBRARY}" PATH)
# set cmake variables
set(METIS_LIBRARIES "${METIS_metis_LIBRARY}")
set(METIS_LIBRARY_DIRS "${metis_lib_path}")
else ()
set(METIS_LIBRARIES "METIS_LIBRARIES-NOTFOUND")
set(METIS_LIBRARY_DIRS "METIS_LIBRARY_DIRS-NOTFOUND")
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for metis -- lib metis not found")
endif()
endif ()
# check a function to validate the find
if(METIS_LIBRARIES)
set(REQUIRED_INCDIRS)
set(REQUIRED_LIBDIRS)
set(REQUIRED_LIBS)
# METIS
if (METIS_INCLUDE_DIRS)
set(REQUIRED_INCDIRS "${METIS_INCLUDE_DIRS}")
endif()
if (METIS_LIBRARY_DIRS)
set(REQUIRED_LIBDIRS "${METIS_LIBRARY_DIRS}")
endif()
set(REQUIRED_LIBS "${METIS_LIBRARIES}")
# m
find_library(M_LIBRARY NAMES m)
mark_as_advanced(M_LIBRARY)
if(M_LIBRARY)
list(APPEND REQUIRED_LIBS "-lm")
endif()
# set required libraries for link
set(CMAKE_REQUIRED_INCLUDES "${REQUIRED_INCDIRS}")
set(CMAKE_REQUIRED_LIBRARIES)
foreach(lib_dir ${REQUIRED_LIBDIRS})
list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
endforeach()
list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
string(REGEX REPLACE "^ -" "-" CMAKE_REQUIRED_LIBRARIES "${CMAKE_REQUIRED_LIBRARIES}")
# test link
unset(METIS_WORKS CACHE)
include(CheckFunctionExists)
check_function_exists(METIS_NodeND METIS_WORKS)
mark_as_advanced(METIS_WORKS)
if(NOT METIS_WORKS)
if(NOT METIS_FIND_QUIETLY)
message(STATUS "Looking for METIS : test of METIS_NodeND with METIS library fails")
message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
endif()
endif()
set(CMAKE_REQUIRED_INCLUDES)
set(CMAKE_REQUIRED_FLAGS)
set(CMAKE_REQUIRED_LIBRARIES)
endif()
if (METIS_LIBRARIES)
list(GET METIS_LIBRARIES 0 first_lib)
get_filename_component(first_lib_path "${first_lib}" PATH)
if (${first_lib_path} MATCHES "/lib(32|64)?$")
string(REGEX REPLACE "/lib(32|64)?$" "" not_cached_dir "${first_lib_path}")
set(METIS_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of METIS library" FORCE)
else()
set(METIS_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of METIS library" FORCE)
endif()
endif()
mark_as_advanced(METIS_DIR)
mark_as_advanced(METIS_DIR_FOUND)
# check that METIS has been found
# ---------------------------------
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(METIS DEFAULT_MSG
METIS_LIBRARIES
METIS_WORKS
METIS_INCLUDE_DIRS)
#
# TODO: Add possibility to check for specific functions in the library
#
```
./build_conda.sh 3.9 1.12.0 cu113 # python, pytorch and cuda version
./build_conda.sh 3.9 1.13.0 cu116 # python, pytorch and cuda version
```
......@@ -10,8 +10,15 @@ if [ "${CUDA_VERSION}" = "cpu" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cpuonly # [not osx]"
else
case $CUDA_VERSION in
cu117)
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.7.*"
;;
cu116)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
if [ "${TORCH_VERSION}" = "1.12.0" ]; then
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.6.*"
else
export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda==11.6.*"
fi
;;
cu115)
export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit==11.5.*"
......@@ -39,8 +46,8 @@ echo "PyTorch $TORCH_VERSION+$CUDA_VERSION"
echo "- $CONDA_PYTORCH_CONSTRAINT"
echo "- $CONDA_CUDATOOLKIT_CONSTRAINT"
if [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
if [ "${TORCH_VERSION}" = "1.12.0" ] && [ "${CUDA_VERSION}" = "cu116" ]; then
conda build . -c pytorch -c pyg -c default -c nvidia -c conda-forge --output-folder "$HOME/conda-bld"
else
conda build . -c pytorch -c default -c nvidia --output-folder "$HOME/conda-bld"
conda build . -c pytorch -c pyg -c default -c nvidia --output-folder "$HOME/conda-bld"
fi
package:
name: pytorch-sparse
version: 0.6.15
version: 0.6.16
source:
path: ../..
......
......@@ -365,11 +365,11 @@ hetero_sample(const vector<node_t> &node_types,
if (temporal) {
for (const auto &kv : temp_samples_dict) {
slice_dict[kv.first] = {0, kv.second.size()};
slice_dict[kv.first] = {slice_dict.at(kv.first).second, kv.second.size()};
}
} else {
for (const auto &kv : samples_dict)
slice_dict[kv.first] = {0, kv.second.size()};
slice_dict[kv.first] = {slice_dict.at(kv.first).second, kv.second.size()};
}
}
......@@ -504,4 +504,4 @@ hetero_temporal_neighbor_sample_cpu(
node_types, edge_types, colptr_dict, row_dict, input_node_dict,
num_neighbors_dict, node_time_dict, num_hops);
}
}
\ No newline at end of file
}
......@@ -44,7 +44,7 @@ spmm_cpu(torch::Tensor rowptr, torch::Tensor col,
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "spmm_cpu", [&] {
scalar_t *value_data = nullptr;
auto mat_data = mat.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -123,7 +123,7 @@ torch::Tensor spmm_value_bw_cpu(torch::Tensor row, torch::Tensor rowptr,
auto row_data = row.data_ptr<int64_t>();
auto rowptr_data = rowptr.data_ptr<int64_t>();
auto col_data = col.data_ptr<int64_t>();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, mat.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, mat.scalar_type(), "spmm_value_bw_cpu", [&] {
auto mat_data = mat.data_ptr<scalar_t>();
auto grad_data = grad.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
#include "spspmm_cpu.h"
#include "utils.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce) {
CHECK_CPU(rowptrA);
CHECK_CPU(colA);
if (optional_valueA.has_value())
CHECK_CPU(optional_valueA.value());
CHECK_CPU(rowptrB);
CHECK_CPU(colB);
if (optional_valueB.has_value())
CHECK_CPU(optional_valueB.value());
CHECK_INPUT(rowptrA.dim() == 1);
CHECK_INPUT(colA.dim() == 1);
if (optional_valueA.has_value()) {
CHECK_INPUT(optional_valueA.value().dim() == 1);
CHECK_INPUT(optional_valueA.value().size(0) == colA.size(0));
}
CHECK_INPUT(rowptrB.dim() == 1);
CHECK_INPUT(colB.dim() == 1);
if (optional_valueB.has_value()) {
CHECK_INPUT(optional_valueB.value().dim() == 1);
CHECK_INPUT(optional_valueB.value().size(0) == colB.size(0));
}
if (!optional_valueA.has_value() && optional_valueB.has_value())
optional_valueA =
torch::ones({colA.numel()}, optional_valueB.value().options());
if (!optional_valueB.has_value() && optional_valueA.has_value())
optional_valueB =
torch::ones({colB.numel()}, optional_valueA.value().options());
auto scalar_type = torch::ScalarType::Float;
if (optional_valueA.has_value())
scalar_type = optional_valueA.value().scalar_type();
auto rowptrA_data = rowptrA.data_ptr<int64_t>();
auto colA_data = colA.data_ptr<int64_t>();
auto rowptrB_data = rowptrB.data_ptr<int64_t>();
auto colB_data = colB.data_ptr<int64_t>();
auto rowptrC = torch::empty_like(rowptrA);
auto rowptrC_data = rowptrC.data_ptr<int64_t>();
rowptrC_data[0] = 0;
torch::Tensor colC;
torch::optional<torch::Tensor> optional_valueC = torch::nullopt;
AT_DISPATCH_ALL_TYPES(scalar_type, "spspmm", [&] {
AT_DISPATCH_HAS_VALUE(optional_valueA, [&] {
scalar_t *valA_data = nullptr, *valB_data = nullptr;
if (HAS_VALUE) {
valA_data = optional_valueA.value().data_ptr<scalar_t>();
valB_data = optional_valueB.value().data_ptr<scalar_t>();
}
int64_t nnz = 0, cA, cB;
std::vector<scalar_t> tmp_vals(K, 0);
std::vector<int64_t> cols;
std::vector<scalar_t> vals;
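// Row-wise CSR x CSR product (Gustavson's algorithm): for each row of A,
// accumulate the scaled rows of B into a dense buffer of length K, then
// compress its non-zeros into the corresponding row of C.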
for (auto rA = 0; rA < rowptrA.numel() - 1; rA++) {
for (auto eA = rowptrA_data[rA]; eA < rowptrA_data[rA + 1]; eA++) {
cA = colA_data[eA];
for (auto eB = rowptrB_data[cA]; eB < rowptrB_data[cA + 1]; eB++) {
cB = colB_data[eB];
if (HAS_VALUE)
tmp_vals[cB] += valA_data[eA] * valB_data[eB];
else
tmp_vals[cB]++;
}
}
for (auto k = 0; k < K; k++) {
if (tmp_vals[k] != 0) {
cols.push_back(k);
if (HAS_VALUE)
vals.push_back(tmp_vals[k]);
nnz++;
}
tmp_vals[k] = (scalar_t)0;
}
rowptrC_data[rA + 1] = nnz;
}
colC = torch::from_blob(cols.data(), {nnz}, colA.options()).clone();
if (HAS_VALUE) {
optional_valueC = torch::from_blob(vals.data(), {nnz},
optional_valueA.value().options());
optional_valueC = optional_valueC.value().clone();
}
});
});
return std::make_tuple(rowptrC, colC, optional_valueC);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cpu(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce);
......@@ -5,6 +5,7 @@
#define CHECK_CPU(x) AT_ASSERTM(x.device().is_cpu(), #x " must be CPU tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
#define CHECK_LT(low, high) AT_ASSERTM(low < high, "low must be smaller than high")
#define AT_DISPATCH_HAS_VALUE(optional_value, ...) \
[&] { \
......
......@@ -5,7 +5,7 @@ static inline __device__ void atomAdd(float *address, float val) {
}
static inline __device__ void atomAdd(double *address, double val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000))
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
......
......@@ -9,7 +9,7 @@
#define FULL_MASK 0xffffffff
// Paper: Design Principles for Sparse Matrix Multiplication on the GPU
// Code: https://github.com/owensgroup/merge-spmm
template <typename scalar_t, ReductionType REDUCE, bool HAS_VALUE>
__global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
const scalar_t *value_data,
......@@ -20,22 +20,22 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
// across `blockIdx.y` are treated equally.
int thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
int row = thread_idx >> 5; // thread_idx / 32
int lane_idx = thread_idx & (32 - 1); // thread_idx % 32
int row = thread_idx >> 6; // thread_idx / 64
int lane_idx = thread_idx & (64 - 1); // thread_idx % 64
int batch_idx = row / M;
// Compute the column index of `mat` in which the thread is operating.
int mat_col_idx = lane_idx + (blockIdx.y << 5);
int mat_col_idx = lane_idx + (blockIdx.y << 6);
// Compute the output index (row-major order).
int out_idx = row * K + mat_col_idx;
// Helper arrays for warp communication.
int mat_row, mat_rows[32];
scalar_t val, vals[HAS_VALUE ? 32 : 1];
int mat_row, mat_rows[64];
scalar_t val, vals[HAS_VALUE ? 64 : 1];
// Do not aggregate/write across the Y-axis (lane_idx < leftover).
int leftover = K - (blockIdx.y << 5);
int leftover = K - (blockIdx.y << 6);
if (batch_idx < B) {
int row_start = __ldg(rowptr_data + (row % M));
......@@ -46,7 +46,7 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
int64_t arg;
// Iterate over all `col` indices in parallel within a warp.
for (int c = row_start; c < row_end; c += 32) {
for (int c = row_start; c < row_end; c += 64) {
if (col_idx < row_end) {
// Coalesced memory access into `col` and `val`.
......@@ -58,18 +58,18 @@ __global__ void spmm_kernel(const int64_t *rowptr_data, const int64_t *col_data,
if (HAS_VALUE)
val = (scalar_t)0;
}
col_idx += 32;
col_idx += 64;
#pragma unroll
for (int i = 0; i < 32; i++) {
for (int i = 0; i < 64; i++) {
// Communication between all threads in a warp.
mat_rows[i] = __shfl_sync(FULL_MASK, mat_row, i);
mat_rows[i] = SHFL_SYNC(FULL_MASK, mat_row, i);
if (HAS_VALUE)
vals[i] = __shfl_sync(FULL_MASK, val, i);
vals[i] = SHFL_SYNC(FULL_MASK, val, i);
}
#pragma unroll
for (int i = 0; i < 32; i++) {
for (int i = 0; i < 64; i++) {
if (lane_idx < leftover && mat_rows[i] != -1) {
// Coalesced memory access into `mat`.
val = __ldg(mat_data + batch_idx * N * K + mat_rows[i] + mat_col_idx);
......@@ -129,7 +129,7 @@ spmm_cuda(torch::Tensor rowptr, torch::Tensor col,
auto N = mat.size(-2);
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
auto BLOCKS = dim3((32 * B * M + THREADS - 1) / THREADS, (K + 31) / 32);
auto BLOCKS = dim3((64 * B * M + THREADS - 1) / THREADS, (K + 63) / 64);
auto stream = at::cuda::getCurrentCUDAStream();
......@@ -154,6 +154,7 @@ spmm_cuda(torch::Tensor rowptr, torch::Tensor col,
return std::make_tuple(out, arg_out);
}
template <typename scalar_t, ReductionType REDUCE>
__global__ void
spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
......@@ -162,8 +163,8 @@ spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
int M, int N, int E, int K) {
int thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
int index_idx = (thread_idx >> 5); // thread_idx / 32
int lane_idx = thread_idx & (32 - 1); // thread_idx % 32
int index_idx = (thread_idx >> 6); // thread_idx / 64
int lane_idx = thread_idx & (64 - 1); // thread_idx % 64
if (index_idx < E) {
int row = __ldg(row_data + index_idx);
......@@ -171,15 +172,15 @@ spmm_value_bw_kernel(const int64_t *row_data, const int64_t *rowptr_data,
scalar_t val = (scalar_t)0;
for (int b = 0; b < B; b++) {
for (int k = lane_idx; k < K; k += 32) {
for (int k = lane_idx; k < K; k += 64) {
val += mat_data[b * N * K + col * K + k] *
grad_data[b * M * K + row * K + k];
}
}
#pragma unroll
for (int i = 32 / 2; i > 0; i /= 2) { // Parallel reduction inside a warp.
val += __shfl_down_sync(FULL_MASK, val, i);
for (int i = 64 / 2; i > 0; i /= 2) { // Parallel reduction inside a warp/wavefront.
val += SHFL_DOWN_SYNC(FULL_MASK, val, i);
}
if (lane_idx == 0) {
......@@ -211,7 +212,7 @@ torch::Tensor spmm_value_bw_cuda(torch::Tensor row, torch::Tensor rowptr,
auto E = row.numel();
auto K = mat.size(-1);
auto B = mat.numel() / (N * K);
auto BLOCKS = dim3((E * 32 + THREADS - 1) / THREADS);
auto BLOCKS = dim3((E * 64 + THREADS - 1) / THREADS);
auto out = torch::zeros({row.numel()}, grad.options());
......
#include "spspmm_cuda.h"
#include <ATen/cuda/CUDAContext.h>
#include <cusparse.h>
#include "utils.cuh"
#define AT_DISPATCH_CUSPARSE_TYPES(TYPE, ...) \
[&] { \
switch (TYPE) { \
case torch::ScalarType::Float: { \
using scalar_t = float; \
const auto &cusparsecsrgemm2_bufferSizeExt = \
cusparseScsrgemm2_bufferSizeExt; \
const auto &cusparsecsrgemm2 = cusparseScsrgemm2; \
return __VA_ARGS__(); \
} \
case torch::ScalarType::Double: { \
using scalar_t = double; \
const auto &cusparsecsrgemm2_bufferSizeExt = \
cusparseDcsrgemm2_bufferSizeExt; \
const auto &cusparsecsrgemm2 = cusparseDcsrgemm2; \
return __VA_ARGS__(); \
} \
default: \
AT_ERROR("Not implemented for '", toString(TYPE), "'"); \
} \
}()
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cuda(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce) {
CHECK_CUDA(rowptrA);
CHECK_CUDA(colA);
if (optional_valueA.has_value())
CHECK_CUDA(optional_valueA.value());
CHECK_CUDA(rowptrB);
CHECK_CUDA(colB);
if (optional_valueB.has_value())
CHECK_CUDA(optional_valueB.value());
cudaSetDevice(rowptrA.get_device());
CHECK_INPUT(rowptrA.dim() == 1);
CHECK_INPUT(colA.dim() == 1);
if (optional_valueA.has_value()) {
CHECK_INPUT(optional_valueA.value().dim() == 1);
CHECK_INPUT(optional_valueA.value().size(0) == colA.size(0));
}
CHECK_INPUT(rowptrB.dim() == 1);
CHECK_INPUT(colB.dim() == 1);
if (optional_valueB.has_value()) {
CHECK_INPUT(optional_valueB.value().dim() == 1);
CHECK_INPUT(optional_valueB.value().size(0) == colB.size(0));
}
if (!optional_valueA.has_value() && optional_valueB.has_value())
optional_valueA =
torch::ones({colA.numel()}, optional_valueB.value().options());
if (!optional_valueB.has_value() && optional_valueA.has_value())
optional_valueB =
torch::ones({colB.numel()}, optional_valueA.value().options());
auto scalar_type = torch::ScalarType::Float;
if (optional_valueA.has_value())
scalar_type = optional_valueA.value().scalar_type();
auto handle = at::cuda::getCurrentCUDASparseHandle();
cusparseMatDescr_t descr;
cusparseCreateMatDescr(&descr);
rowptrA = rowptrA.toType(torch::kInt);
colA = colA.toType(torch::kInt);
rowptrB = rowptrB.toType(torch::kInt);
colB = colB.toType(torch::kInt);
int64_t M = rowptrA.numel() - 1, N = rowptrB.numel() - 1;
auto rowptrA_data = rowptrA.data_ptr<int>();
auto colA_data = colA.data_ptr<int>();
auto rowptrB_data = rowptrB.data_ptr<int>();
auto colB_data = colB.data_ptr<int>();
torch::Tensor rowptrC, colC;
torch::optional<torch::Tensor> optional_valueC = torch::nullopt;
int nnzC;
int *nnzTotalDevHostPtr = &nnzC;
// Step 1: Create an opaque structure.
csrgemm2Info_t info = NULL;
cusparseCreateCsrgemm2Info(&info);
// Step 2: Allocate buffer for `csrgemm2Nnz` and `csrgemm2`.
size_t bufferSize;
AT_DISPATCH_CUSPARSE_TYPES(scalar_type, [&] {
scalar_t alpha = (scalar_t)1.0;
cusparsecsrgemm2_bufferSizeExt(handle, M, N, K, &alpha, descr, colA.numel(),
rowptrA_data, colA_data, descr, colB.numel(),
rowptrB_data, colB_data, NULL, descr, 0,
NULL, NULL, info, &bufferSize);
void *buffer = NULL;
cudaMalloc(&buffer, bufferSize);
// Step 3: Compute CSR row pointer.
rowptrC = torch::empty({M + 1}, rowptrA.options());
auto rowptrC_data = rowptrC.data_ptr<int>();
cusparseXcsrgemm2Nnz(handle, M, N, K, descr, colA.numel(), rowptrA_data,
colA_data, descr, colB.numel(), rowptrB_data,
colB_data, descr, 0, NULL, NULL, descr, rowptrC_data,
nnzTotalDevHostPtr, info, buffer);
// Step 4: Compute CSR entries.
colC = torch::empty({nnzC}, rowptrC.options());
auto colC_data = colC.data_ptr<int>();
if (optional_valueA.has_value())
optional_valueC = torch::empty({nnzC}, optional_valueA.value().options());
scalar_t *valA_data = NULL, *valB_data = NULL, *valC_data = NULL;
if (optional_valueA.has_value()) {
valA_data = optional_valueA.value().data_ptr<scalar_t>();
valB_data = optional_valueB.value().data_ptr<scalar_t>();
valC_data = optional_valueC.value().data_ptr<scalar_t>();
}
cusparsecsrgemm2(handle, M, N, K, &alpha, descr, colA.numel(), valA_data,
rowptrA_data, colA_data, descr, colB.numel(), valB_data,
rowptrB_data, colB_data, NULL, descr, 0, NULL, NULL, NULL,
descr, valC_data, rowptrC_data, colC_data, info, buffer);
cudaFree(buffer);
});
// Step 5: Destroy the opaque structure.
cusparseDestroyCsrgemm2Info(info);
rowptrC = rowptrC.toType(torch::kLong);
colC = colC.toType(torch::kLong);
return std::make_tuple(rowptrC, colC, optional_valueC);
}
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::Tensor, torch::optional<torch::Tensor>>
spspmm_cuda(torch::Tensor rowptrA, torch::Tensor colA,
torch::optional<torch::Tensor> optional_valueA,
torch::Tensor rowptrB, torch::Tensor colB,
torch::optional<torch::Tensor> optional_valueB, int64_t K,
std::string reduce);
......@@ -17,6 +17,20 @@ __device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
return __shfl_down_sync(mask, var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __shfl_up(const at::Half var, const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var, const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl(const at::Half var, int delta) {
return __shfl(var.operator __half(), delta);
}
#endif
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
......
#pragma once
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
static inline __device__ void atomAdd(double *address, double val) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
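// Emulate double-precision atomicAdd with a compare-and-swap loop on
// platforms without native hardware support.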
unsigned long long int *address_as_ull = (unsigned long long int *)address;
unsigned long long int old = *address_as_ull;
unsigned long long int assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
#else
atomicAdd(address, val);
#endif
}
#pragma once
#include "../extensions.h"
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M);
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E);
#include "hip/hip_runtime.h"
#include "convert_hip.h"
#include <ATen/hip/HIPContext.h>
#include "utils.cuh"
#define THREADS 256
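// Converts a sorted index vector of length `numel` into a CSR-style row
// pointer of length M + 1; each thread fills the pointer entries owned by
// its index.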
__global__ void ind2ptr_kernel(const int64_t *ind_data, int64_t *out_data,
int64_t M, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx == 0) {
for (int64_t i = 0; i <= ind_data[0]; i++)
out_data[i] = 0;
} else if (thread_idx < numel) {
for (int64_t i = ind_data[thread_idx - 1]; i < ind_data[thread_idx]; i++)
out_data[i + 1] = thread_idx;
} else if (thread_idx == numel) {
for (int64_t i = ind_data[numel - 1] + 1; i < M + 1; i++)
out_data[i] = numel;
}
}
torch::Tensor ind2ptr_cuda(torch::Tensor ind, int64_t M) {
CHECK_CUDA(ind);
hipSetDevice(ind.get_device());
auto out = torch::empty(M + 1, ind.options());
if (ind.numel() == 0)
return out.zero_();
auto ind_data = ind.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ind2ptr_kernel<<<(ind.numel() + 2 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ind_data, out_data, M, ind.numel());
return out;
}
__global__ void ptr2ind_kernel(const int64_t *ptr_data, int64_t *out_data,
int64_t E, int64_t numel) {
int64_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (thread_idx < numel) {
int64_t idx = ptr_data[thread_idx], next_idx = ptr_data[thread_idx + 1];
for (int64_t i = idx; i < next_idx; i++) {
out_data[i] = thread_idx;
}
}
}
torch::Tensor ptr2ind_cuda(torch::Tensor ptr, int64_t E) {
CHECK_CUDA(ptr);
hipSetDevice(ptr.get_device());
auto out = torch::empty(E, ptr.options());
auto ptr_data = ptr.data_ptr<int64_t>();
auto out_data = out.data_ptr<int64_t>();
auto stream = at::cuda::getCurrentCUDAStream();
ptr2ind_kernel<<<(ptr.numel() - 1 + THREADS - 1) / THREADS, THREADS, 0,
stream>>>(ptr_data, out_data, E, ptr.numel() - 1);
return out;
}