Commit 0799bc08 authored by limm

support v2.1.0

parent 50e05e1e
#pragma once
#include <ATen/cuda/detail/TensorInfo.cuh>
// We need our own `IndexToOffset` implementation since we do not want to
// access the last element of the `indexptr`.
template <typename scalar_t> struct IndexPtrToOffset {
static inline __host__ __device__ int
get(int idx, const at::cuda::detail::TensorInfo<scalar_t, int> &info) {
int offset = idx % (info.sizes[info.dims - 1] - 1);
offset *= info.strides[info.dims - 1];
idx /= info.sizes[info.dims - 1] - 1;
for (int i = info.dims - 2; i >= 0; --i) {
offset += (idx % info.sizes[i]) * info.strides[i];
idx /= info.sizes[i];
}
return offset;
}
};
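// Hedged worked example (illustrative, not part of the original commit): for a
// contiguous `indexptr` of shape [4, 6] (dims = 2, sizes = {4, 6},
// strides = {6, 1}), the last dimension is treated as having only
// sizes[1] - 1 = 5 addressable positions, so IndexPtrToOffset::get(7, info)
// yields (7 % 5) * 1 + (7 / 5) * 6 = 2 + 6 = 8; the final entry of each
// `indexptr` row is never read.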
#pragma once
#include <limits>
#include <map>
#include "atomics.cuh"
enum ReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };
const std::map<std::string, ReductionType> reduce2REDUCE = {
{"sum", SUM}, {"mean", MEAN}, {"mul", MUL},
{"div", DIV}, {"min", MIN}, {"max", MAX},
};
#define AT_DISPATCH_REDUCTION_TYPES(reduce, ...) \
[&] { \
switch (reduce2REDUCE.at(reduce)) { \
case SUM: { \
static constexpr ReductionType REDUCE = SUM; \
return __VA_ARGS__(); \
} \
case MEAN: { \
static constexpr ReductionType REDUCE = MEAN; \
return __VA_ARGS__(); \
} \
case MUL: { \
static constexpr ReductionType REDUCE = MUL; \
return __VA_ARGS__(); \
} \
case DIV: { \
static constexpr ReductionType REDUCE = DIV; \
return __VA_ARGS__(); \
} \
case MIN: { \
static constexpr ReductionType REDUCE = MIN; \
return __VA_ARGS__(); \
} \
case MAX: { \
static constexpr ReductionType REDUCE = MAX; \
return __VA_ARGS__(); \
} \
} \
}()
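// Hedged usage sketch (illustrative, not taken from the commit; scalar_t and
// new_val stand for whatever the surrounding type dispatch provides): the
// macro binds REDUCE as a compile-time constant before invoking the lambda,
// so the body can be specialized per reduction, e.g.
//
//   AT_DISPATCH_REDUCTION_TYPES("sum", [&] {
//     scalar_t acc = Reducer<scalar_t, REDUCE>::init();   // 0 for SUM
//     Reducer<scalar_t, REDUCE>::update(&acc, new_val);   // acc += new_val
//   });
//
// Because REDUCE is `static constexpr`, it can also drive template arguments
// or `if constexpr` branches inside the lambda.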
template <typename scalar_t, ReductionType REDUCE> struct Reducer {
static inline __host__ __device__ scalar_t init() {
if (REDUCE == MUL || REDUCE == DIV)
return (scalar_t)1;
else if (REDUCE == MIN)
return std::numeric_limits<scalar_t>::max();
else if (REDUCE == MAX)
return std::numeric_limits<scalar_t>::lowest();
else
return (scalar_t)0;
}
static inline __host__ __device__ void update(scalar_t *val,
scalar_t new_val) {
if (REDUCE == SUM || REDUCE == MEAN)
*val = *val + new_val;
else if (REDUCE == MUL)
*val = *val * new_val;
else if (REDUCE == DIV)
*val = *val / new_val;
else if ((REDUCE == MIN && new_val < *val) ||
(REDUCE == MAX && new_val > *val)) {
*val = new_val;
}
}
static inline __host__ __device__ void update(scalar_t *val, scalar_t new_val,
int64_t *arg, int64_t new_arg) {
if (REDUCE == SUM || REDUCE == MEAN)
*val = *val + new_val;
else if (REDUCE == MUL)
*val = *val * new_val;
else if (REDUCE == DIV)
*val = *val / new_val;
else if ((REDUCE == MIN && new_val < *val) ||
(REDUCE == MAX && new_val > *val)) {
*val = new_val;
*arg = new_arg;
}
}
static inline __host__ __device__ void write(scalar_t *address, scalar_t val,
int64_t *arg_address,
int64_t arg, int count) {
if (REDUCE == SUM || REDUCE == MUL || REDUCE == DIV)
*address = val;
else if (REDUCE == MEAN)
*address = val / (scalar_t)(count > 0 ? count : 1);
else if (REDUCE == MIN || REDUCE == MAX) {
if (count > 0) {
*address = val;
*arg_address = arg;
} else
*address = (scalar_t)0;
}
}
static inline __device__ void atomic_write(scalar_t *address, scalar_t val) {
if (REDUCE == SUM || REDUCE == MEAN)
atomAdd(address, val);
else if (REDUCE == MUL)
atomMul(address, val);
else if (REDUCE == DIV)
atomDiv(address, val);
else if (REDUCE == MIN)
atomMin(address, val);
else if (REDUCE == MAX)
atomMax(address, val);
}
};
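// Hedged sketch of the intended call pattern (assumed, not text from the
// commit); reduce_row and its parameters are hypothetical names:
//
//   template <typename scalar_t, ReductionType REDUCE>
//   __device__ void reduce_row(const scalar_t *row, int count,
//                              scalar_t *out, int64_t *arg_out) {
//     scalar_t val = Reducer<scalar_t, REDUCE>::init();
//     int64_t arg = 0;
//     for (int i = 0; i < count; i++)
//       Reducer<scalar_t, REDUCE>::update(&val, row[i], &arg, (int64_t)i);
//     Reducer<scalar_t, REDUCE>::write(out, val, arg_out, arg, count);
//   }
//
// write() divides by count for MEAN and only stores val/arg for MIN/MAX when
// at least one element was seen, matching the member functions above.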
#include "hip/hip_runtime.h"
#include "scatter_hip.h"
#include "scatter_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
#define THREADS 1024
#define THREADS 256
#define BLOCKS(N) (N + THREADS - 1) / THREADS
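// Illustrative note (not from the commit): BLOCKS(N) rounds the grid size up
// so every element gets a thread; with THREADS = 256 and N = 1000,
// BLOCKS(1000) = (1000 + 255) / 256 = 4 blocks, i.e. 1024 threads, the last
// 24 of which must be guarded by a bounds check inside the kernel.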
template <typename scalar_t, ReductionType REDUCE>
......@@ -64,7 +63,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
cudaSetDevice(src.get_device());
CHECK_INPUT(src.dim() == index.dim());
for (auto i = 0; i < index.dim() - 1; i++)
......@@ -112,7 +111,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
#pragma once
#include <torch/extension.h>
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
......
#include "hip/hip_runtime.h"
#include "segment_coo_hip.h"
#include "segment_coo_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
......@@ -37,8 +36,8 @@ segment_coo_kernel(const scalar_t *src_data,
#pragma unroll
for (int i = 1; i < 32; i *= 2) {
// Parallel reduction inside a single warp.
tmp = __shfl_up_sync(FULL_MASK, val, i);
next_idx = __shfl_up_sync(FULL_MASK, idx, i);
tmp = SHFL_UP_SYNC(FULL_MASK, val, i);
next_idx = SHFL_UP_SYNC(FULL_MASK, idx, i);
if (lane_idx >= i && row_idx / D == (row_idx - i) / D) {
assert(idx >= next_idx);
if (idx == next_idx)
......@@ -46,7 +45,7 @@ segment_coo_kernel(const scalar_t *src_data,
}
}
next_idx = __shfl_down_sync(FULL_MASK, idx, 1);
next_idx = SHFL_DOWN_SYNC(FULL_MASK, idx, 1);
if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D ||
idx != next_idx)
Reducer<scalar_t, REDUCE>::atomic_write(out_data + out_idx, val);
......@@ -158,7 +157,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
cudaSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
......@@ -215,7 +214,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -331,7 +330,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
CHECK_CUDA(index);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
cudaSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= index.dim());
......@@ -366,7 +365,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
#pragma once
#include <torch/extension.h>
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
......@@ -9,7 +9,3 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out);
template<typename T>
__device__ T __ldg(const T* ptr) {
return *ptr;
}
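// Hedged note (interpretation, not from the commit message): this generic
// overload lets `__ldg(ptr)` compile for types without a native read-only
// cache intrinsic on this back end; it degrades to a plain load, so
// __ldg(src_data + offset) behaves like src_data[offset] with no caching hint.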
#include "hip/hip_runtime.h"
#include "segment_csr_hip.h"
#include "segment_csr_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include "index_info.cuh"
#include "reducer.cuh"
......@@ -47,9 +46,9 @@ segment_csr_kernel(const scalar_t *src_data,
for (int i = TB / 2; i > 0; i /= 2) {
// Parallel reduction inside a single warp.
if (REDUCE == MIN || REDUCE == MAX)
arg_tmp = __shfl_down_sync(FULL_MASK, arg, i);
arg_tmp = SHFL_DOWN_SYNC(FULL_MASK, arg, i);
Reducer<scalar_t, REDUCE>::update(
&val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp);
&val, SHFL_DOWN_SYNC(FULL_MASK, val, i), &arg, arg_tmp);
}
if (lane_idx == 0) {
......@@ -103,7 +102,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
cudaSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
......@@ -148,7 +147,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -223,7 +222,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
CHECK_CUDA(indptr);
if (optional_out.has_value())
CHECK_CUDA(optional_out.value());
hipSetDevice(src.get_device());
cudaSetDevice(src.get_device());
CHECK_INPUT(src.dim() >= indptr.dim());
......@@ -265,7 +264,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
#pragma once
#include <torch/extension.h>
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
......@@ -9,7 +9,3 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out);
template<typename T>
__device__ T __ldg(const T* ptr) {
return *ptr;
}
#pragma once
#include "../extensions.h"
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
#define CHECK_INPUT(x) AT_ASSERTM(x, "Input mismatch")
__device__ __inline__ at::Half __shfl_up_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_up_sync(mask, var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_down_sync(mask, var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __shfl_up(const at::Half var, const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var, const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
#endif
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
}
#define SHFL_UP_SYNC(mask, var, delta) __shfl_up(var, delta)
#define SHFL_DOWN_SYNC(mask, var, delta) __shfl_down(var, delta)
#else
#define SHFL_UP_SYNC __shfl_up_sync
#define SHFL_DOWN_SYNC __shfl_down_sync
#endif
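// Hedged usage sketch (illustrative, not part of the commit): the SHFL_*_SYNC
// macros let the same warp-reduction code build on both back ends, e.g.
//
//   for (int offset = 16; offset > 0; offset /= 2)
//     val += SHFL_DOWN_SYNC(FULL_MASK, val, offset);
//
// expands to __shfl_down_sync(FULL_MASK, val, offset) on CUDA and to
// __shfl_down(val, offset) on ROCm, where the mask argument is dropped.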
#include "macros.h"
#include <torch/torch.h>
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#define ATOMIC(NAME) \
......@@ -68,8 +70,8 @@
\
template <typename scalar, size_t size> struct Atomic##NAME##DecimalImpl; \
\
template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 2> { \
inline __device__ void operator()(scalar *address, scalar val) { \
template <> struct Atomic##NAME##DecimalImpl<at::Half, 2> { \
inline __device__ void operator()(at::Half *address, at::Half val) { \
unsigned int *address_as_ui = \
(unsigned int *)((char *)address - ((size_t)address & 2)); \
unsigned int old = *address_as_ui; \
......@@ -87,6 +89,25 @@
} \
}; \
\
template <> struct Atomic##NAME##DecimalImpl<at::BFloat16, 2> { \
inline __device__ void operator()(at::BFloat16 *address, at::BFloat16 val){\
unsigned int *address_as_ui = \
(unsigned int *)((char *)address - ((size_t)address & 2)); \
unsigned int old = *address_as_ui; \
unsigned int assumed; \
\
do { \
assumed = old; \
at::BFloat16 hsum; \
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); \
hsum = OP(hsum, val); \
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) \
: (old & 0xffff0000) | hsum.x; \
old = atomicCAS(address_as_ui, assumed, old); \
} while (assumed != old); \
} \
}; \
\
template <typename scalar> struct Atomic##NAME##DecimalImpl<scalar, 4> { \
inline __device__ void operator()(scalar *address, scalar val) { \
int *address_as_i = (int *)address; \
......@@ -135,19 +156,19 @@ static inline __device__ void atomAdd(int32_t *address, int32_t val) {
static inline __device__ void atomAdd(int64_t *address, int64_t val) {
AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700 || TORCH_HIP_VERSION < 10000)
#if defined(USE_ROCM) || (defined(__DTK_ARCH__) && (__DTK_ARCH__ < 700 || DTK_VERSION < 10000))
static inline __device__ void atomAdd(at::Half *address, at::Half val) {
AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
}
#else
static inline __device__ void atomAdd(at::Half *address, at::Half val) {
AtomicAddDecimalImpl<at::Half, sizeof(at::Half)>()(address, val);
atomicAdd(reinterpret_cast<__half *>(address), val);
}
#endif
static inline __device__ void atomAdd(float *address, float val) {
atomicAdd(address, val);
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || TORCH_HIP_VERSION < 8000)
#if defined(__DTK_ARCH__) && (__DTK_ARCH__ < 600 || DTK_VERSION < 8000)
static inline __device__ void atomAdd(double *address, double val) {
AtomicAddDecimalImpl<double, sizeof(double)>()(address, val);
}
......@@ -156,6 +177,9 @@ static inline __device__ void atomAdd(double *address, double val) {
atomicAdd(address, val);
}
#endif
static inline __device__ void atomAdd(at::BFloat16 *address, at::BFloat16 val) {
AtomicAddDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
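// Illustrative note (assumption about the mechanism, not text from the
// commit): the 16-bit Half/BFloat16 specializations above emulate the atomic
// update with a 32-bit atomicCAS on the aligned word containing the value,
// patching only the relevant half and retrying until no other thread has
// modified the word in between. A hypothetical kernel using it:
//
//   __global__ void bf16_sum(at::BFloat16 *out, const at::BFloat16 *in, int n) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i < n)
//       atomAdd(out, in[i]);  // CAS-loop fallback for at::BFloat16
//   }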
#define OP(X, Y) Y *X
ATOMIC(Mul)
......@@ -184,6 +208,9 @@ static inline __device__ void atomMul(at::Half *address, at::Half val) {
static inline __device__ void atomMul(double *address, double val) {
AtomicMulDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMul(at::BFloat16 *address, at::BFloat16 val) {
AtomicMulDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) Y / X
ATOMIC(Div)
......@@ -212,6 +239,9 @@ static inline __device__ void atomDiv(float *address, float val) {
static inline __device__ void atomDiv(double *address, double val) {
AtomicDivDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomDiv(at::BFloat16 *address, at::BFloat16 val) {
AtomicDivDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) max(Y, X)
ATOMIC(Max)
......@@ -240,6 +270,9 @@ static inline __device__ void atomMax(float *address, float val) {
static inline __device__ void atomMax(double *address, double val) {
AtomicMaxDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMax(at::BFloat16 *address, at::BFloat16 val) {
AtomicMaxDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
#define OP(X, Y) min(Y, X)
ATOMIC(Min)
......@@ -268,3 +301,6 @@ static inline __device__ void atomMin(float *address, float val) {
static inline __device__ void atomMin(double *address, double val) {
AtomicMinDecimalImpl<double, sizeof(double)>()(address, val);
}
static inline __device__ void atomMin(at::BFloat16 *address, at::BFloat16 val) {
AtomicMinDecimalImpl<at::BFloat16, sizeof(at::BFloat16)>()(address, val);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <ATen/hip/detail/TensorInfo.cuh>
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <limits>
#include <map>
#include "atomics.cuh"
#include "../hip/atomics.cuh"
enum ReductionType { SUM, MEAN, MUL, DIV, MIN, MAX };
......@@ -16,27 +18,27 @@ const std::map<std::string, ReductionType> reduce2REDUCE = {
[&] { \
switch (reduce2REDUCE.at(reduce)) { \
case SUM: { \
const ReductionType REDUCE = SUM; \
static constexpr ReductionType REDUCE = SUM; \
return __VA_ARGS__(); \
} \
case MEAN: { \
const ReductionType REDUCE = MEAN; \
static constexpr ReductionType REDUCE = MEAN; \
return __VA_ARGS__(); \
} \
case MUL: { \
const ReductionType REDUCE = MUL; \
static constexpr ReductionType REDUCE = MUL; \
return __VA_ARGS__(); \
} \
case DIV: { \
const ReductionType REDUCE = DIV; \
static constexpr ReductionType REDUCE = DIV; \
return __VA_ARGS__(); \
} \
case MIN: { \
const ReductionType REDUCE = MIN; \
static constexpr ReductionType REDUCE = MIN; \
return __VA_ARGS__(); \
} \
case MAX: { \
const ReductionType REDUCE = MAX; \
static constexpr ReductionType REDUCE = MAX; \
return __VA_ARGS__(); \
} \
} \
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "scatter_hip.h"
#include "../hip/scatter_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 1024
#define THREADS 256
#define BLOCKS(N) (N + THREADS - 1) / THREADS
template <typename scalar_t, ReductionType REDUCE>
......@@ -112,7 +114,7 @@ scatter_cuda(torch::Tensor src, torch::Tensor index, int64_t dim,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out,
torch::optional<int64_t> dim_size, std::string reduce);
torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
torch::optional<torch::Tensor> optional_out);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "segment_coo_hip.h"
#include "../hip/segment_coo_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "reducer.cuh"
#include "utils.cuh"
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
......@@ -37,8 +39,8 @@ segment_coo_kernel(const scalar_t *src_data,
#pragma unroll
for (int i = 1; i < 32; i *= 2) {
// Parallel reduction inside a single warp.
tmp = __shfl_up_sync(FULL_MASK, val, i);
next_idx = __shfl_up_sync(FULL_MASK, idx, i);
tmp = SHFL_UP_SYNC(FULL_MASK, val, i);
next_idx = SHFL_UP_SYNC(FULL_MASK, idx, i);
if (lane_idx >= i && row_idx / D == (row_idx - i) / D) {
assert(idx >= next_idx);
if (idx == next_idx)
......@@ -46,7 +48,7 @@ segment_coo_kernel(const scalar_t *src_data,
}
}
next_idx = __shfl_down_sync(FULL_MASK, idx, 1);
next_idx = SHFL_DOWN_SYNC(FULL_MASK, idx, 1);
if (lane_idx == 32 - 1 || row_idx / D != (row_idx + 1) / D ||
idx != next_idx)
Reducer<scalar_t, REDUCE>::atomic_write(out_data + out_idx, val);
......@@ -215,7 +217,7 @@ segment_coo_cuda(torch::Tensor src, torch::Tensor index,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -366,7 +368,7 @@ torch::Tensor gather_coo_cuda(torch::Tensor src, torch::Tensor index,
auto index_info = at::cuda::detail::getTensorInfo<int64_t, int>(index);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "../extensions.h"
std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out,
std::string reduce);
torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
torch::optional<torch::Tensor> optional_out);
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include "segment_csr_hip.h"
#include "../hip/segment_csr_cuda.h"
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/detail/IndexUtils.cuh>
#include <ATen/hip/detail/TensorInfo.cuh>
#include "index_info.cuh"
#include "reducer.cuh"
#include "utils.cuh"
#include "../hip/index_info.cuh"
#include "../hip/reducer.cuh"
#include "../hip/utils.cuh"
#define THREADS 256
#define BLOCKS(TB, N) (TB * N + THREADS - 1) / THREADS
......@@ -47,9 +49,9 @@ segment_csr_kernel(const scalar_t *src_data,
for (int i = TB / 2; i > 0; i /= 2) {
// Parallel reduction inside a single warp.
if (REDUCE == MIN || REDUCE == MAX)
arg_tmp = __shfl_down_sync(FULL_MASK, arg, i);
arg_tmp = SHFL_DOWN_SYNC(FULL_MASK, arg, i);
Reducer<scalar_t, REDUCE>::update(
&val, __shfl_down_sync(FULL_MASK, val, i), &arg, arg_tmp);
&val, SHFL_DOWN_SYNC(FULL_MASK, val, i), &arg, arg_tmp);
}
if (lane_idx == 0) {
......@@ -148,7 +150,7 @@ segment_csr_cuda(torch::Tensor src, torch::Tensor indptr,
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......@@ -265,7 +267,7 @@ torch::Tensor gather_csr_cuda(torch::Tensor src, torch::Tensor indptr,
auto indptr_info = at::cuda::detail::getTensorInfo<int64_t, int>(indptr);
auto stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::Half, src.scalar_type(), "_", [&] {
AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, src.scalar_type(), "_", [&] {
auto src_data = src.data_ptr<scalar_t>();
auto out_data = out.data_ptr<scalar_t>();
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <torch/extension.h>
#include "../extensions.h"
#define CHECK_CUDA(x) \
AT_ASSERTM(x.device().is_cuda(), #x " must be CUDA tensor")
......@@ -9,11 +11,32 @@
__device__ __inline__ at::Half __shfl_up_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_up_sync(mask, (__half)var, delta);
return __shfl_up_sync(mask, var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down_sync(const unsigned mask,
const at::Half var,
const unsigned int delta) {
return __shfl_down_sync(mask, (__half)var, delta);
return __shfl_down_sync(mask, var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_up(const at::Half var,
const unsigned int delta) {
return __shfl_up(var.operator __half(), delta);
}
__device__ __inline__ at::Half __shfl_down(const at::Half var,
const unsigned int delta) {
return __shfl_down(var.operator __half(), delta);
}
#ifdef USE_ROCM
__device__ __inline__ at::Half __ldg(const at::Half* ptr) {
return __ldg(reinterpret_cast<const __half*>(ptr));
}
#define SHFL_UP_SYNC(mask, var, delta) __shfl_up(var, delta)
#define SHFL_DOWN_SYNC(mask, var, delta) __shfl_down(var, delta)
#else
#define SHFL_UP_SYNC __shfl_up_sync
#define SHFL_DOWN_SYNC __shfl_down_sync
#endif