Unverified Commit e4605f7c authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #293 from YdrMaster/distinct-cuda

issue291 合并 cuda 代码
parents 5025ebed eac2b0ca
...@@ -4,6 +4,9 @@ ...@@ -4,6 +4,9 @@
#define INFINIOP_CUDA_KERNEL __global__ void #define INFINIOP_CUDA_KERNEL __global__ void
#endif #endif
#include <cuda_bf16.h>
#include <cuda_fp16.h>
// Posible maximum number of threads per block for CUDA architectures // Posible maximum number of threads per block for CUDA architectures
// Used for picking correct kernel launch configuration // Used for picking correct kernel launch configuration
#define CUDA_BLOCK_SIZE_4096 4096 #define CUDA_BLOCK_SIZE_4096 4096
...@@ -12,8 +15,10 @@ ...@@ -12,8 +15,10 @@
#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess) #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
namespace device::cuda { using cuda_bfloat16 = nv_bfloat16;
using cuda_bfloat162 = nv_bfloat162;
namespace device::cuda {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
__forceinline__ __device__ __host__ size_t __forceinline__ __device__ __host__ size_t
indexToReducedOffset( indexToReducedOffset(
...@@ -45,8 +50,6 @@ indexToOffset( ...@@ -45,8 +50,6 @@ indexToOffset(
} }
} // namespace device::cuda } // namespace device::cuda
#ifdef ENABLE_NVIDIA_API
#include <cuda_fp16.h>
__forceinline__ __device__ float __forceinline__ __device__ float
exp_(const float val) { exp_(const float val) {
return expf(val); return expf(val);
...@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16 ...@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
exp_(const __nv_bfloat16 x) { exp_(const __nv_bfloat16 x) {
return hexp(x); return hexp(x);
} }
#endif
#define INFINIOP_MACA_KERNEL __global__ void #define INFINIOP_MACA_KERNEL __global__ void
// Posible maximum number of threads per block for MACA architectures // Posible maximum number of threads per block for MACA architectures
// Used for picking correct kernel launch configuration // Used for picking correct kernel launch configuration
#define MACA_BLOCK_SIZE_1024 1024 #define MACA_BLOCK_SIZE_1024 1024
...@@ -6,6 +7,9 @@ ...@@ -6,6 +7,9 @@
#define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess) #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
using cuda_bfloat16 = hpcc_bfloat16;
using cuda_bfloat162 = hpcc_bfloat162;
namespace device::maca { namespace device::maca {
// return the memory offset of original tensor, given the flattened index of broadcasted tensor // return the memory offset of original tensor, given the flattened index of broadcasted tensor
...@@ -39,8 +43,6 @@ indexToOffset( ...@@ -39,8 +43,6 @@ indexToOffset(
} }
} // namespace device::maca } // namespace device::maca
#ifdef ENABLE_MACA_API
#include <maca_fp16.h>
__forceinline__ __device__ float __forceinline__ __device__ float
exp_(const float val) { exp_(const float val) {
return expf(val); return expf(val);
...@@ -48,7 +50,7 @@ exp_(const float val) { ...@@ -48,7 +50,7 @@ exp_(const float val) {
__forceinline__ __device__ long double __forceinline__ __device__ long double
exp_(const long double val) { exp_(const long double val) {
return expl(val); return exp(val);
} }
__forceinline__ __device__ double __forceinline__ __device__ double
...@@ -61,8 +63,7 @@ exp_(const __half x) { ...@@ -61,8 +63,7 @@ exp_(const __half x) {
return hexp(x); return hexp(x);
} }
__forceinline__ __device__ __hpcc_bfloat16; __forceinline__ __device__ __hpcc_bfloat16
exp_(const __hpcc_bfloat16; x) { exp_(const __hpcc_bfloat16 x) {
return hexp(x); return hexp(x);
} }
#endif
...@@ -12,45 +12,45 @@ ...@@ -12,45 +12,45 @@
#include <numeric> #include <numeric>
#include <vector> #include <vector>
#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE) \ #define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON) \
\ \
namespace op::OP::NAMESPACE { \ namespace op::OP::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \ class Descriptor final : public InfiniopDescriptor { \
infiniDtype_t _dtype; \ infiniDtype_t _dtype; \
op::elementwise::ElementwiseInfo _info; \ op::elementwise::ElementwiseInfo _info; \
std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \ std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
size_t _workspace_size; \ size_t _workspace_size; \
\ \
Descriptor( \ Descriptor( \
infiniDtype_t dtype, \ infiniDtype_t dtype, \
op::elementwise::ElementwiseInfo info, \ op::elementwise::ElementwiseInfo info, \
op::elementwise::NAMESPACE::DeviceImpl *device_info, \ op::elementwise::KERNEL_COMMON::DeviceImpl *device_info, \
size_t workspace_size, \ size_t workspace_size, \
infiniDevice_t device_type, \ infiniDevice_t device_type, \
int device_id) \ int device_id) \
: InfiniopDescriptor{device_type, device_id}, \ : InfiniopDescriptor{device_type, device_id}, \
_dtype(dtype), \ _dtype(dtype), \
_info(std::move(info)), \ _info(std::move(info)), \
_device_info(std::move(device_info)), \ _device_info(std::move(device_info)), \
_workspace_size(workspace_size) {} \ _workspace_size(workspace_size) {} \
\ \
public: \ public: \
~Descriptor(); \ ~Descriptor(); \
\ \
size_t workspaceSize() const { return _workspace_size; } \ size_t workspaceSize() const { return _workspace_size; } \
\ \
static infiniStatus_t create( \ static infiniStatus_t create( \
infiniopHandle_t handle, \ infiniopHandle_t handle, \
Descriptor **desc_ptr, \ Descriptor **desc_ptr, \
infiniopTensorDescriptor_t output_desc, \ infiniopTensorDescriptor_t output_desc, \
std::vector<infiniopTensorDescriptor_t> input_descs); \ std::vector<infiniopTensorDescriptor_t> input_descs); \
\ \
infiniStatus_t calculate( \ infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \ void *workspace, size_t workspace_size, \
void *output, \ void *output, \
std::vector<const void *> inputs, \ std::vector<const void *> inputs, \
void *stream) const; \ void *stream) const; \
}; \ }; \
} }
namespace op::elementwise { namespace op::elementwise {
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu) ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)
namespace op::add::cpu { namespace op::add::cpu {
typedef struct AddOp { typedef struct AddOp {
......
#include "add_cuda.cuh" #include "../cuda/kernel.cuh"
#include "add_cuda_internal.cuh" #include "add_nvidia.cuh"
namespace op::add::cuda { namespace op::add::nvidia {
Descriptor::~Descriptor() = default; Descriptor::~Descriptor() = default;
...@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate( ...@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) { switch (_dtype) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32: case INFINI_DTYPE_F32:
return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
default: default:
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::add::cuda } // namespace op::add::nvidia
...@@ -3,6 +3,6 @@ ...@@ -3,6 +3,6 @@
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
ELEMENTWISE_DESCRIPTOR(add, cuda) ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)
#endif // __ADD_CUDA_API_H__ #endif // __ADD_CUDA_API_H__
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "cpu/add_cpu.h" #include "cpu/add_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/add_cuda.cuh" #include "nvidia/add_nvidia.cuh"
#endif #endif
__C infiniStatus_t infiniopCreateAddDescriptor( __C infiniStatus_t infiniopCreateAddDescriptor(
...@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor( ...@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
...@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz ...@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#define GET(CASE, NAMESPACE) \ #define GET(CASE, NAMESPACE) \
case CASE: \ case CASE: \
*size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ *size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS
switch (desc->device_type) { switch (desc->device_type) {
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
GET(INFINI_DEVICE_CPU, cpu) GET(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda) GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd( ...@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
CALCULATE(INFINI_DEVICE_CPU, cpu); CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
...@@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { ...@@ -99,7 +99,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#define DELETE(CASE, NAMESPACE) \ #define DELETE(CASE, NAMESPACE) \
case CASE: \ case CASE: \
delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \ delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS
switch (desc->device_type) { switch (desc->device_type) {
...@@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { ...@@ -107,7 +107,7 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu); DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
......
#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__ #ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
#define __CAUSAL_SOFTMAX_KERNEL_CUH__ #define __CAUSAL_SOFTMAX_KERNEL_CUH__
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../reduce/cuda/reduce.cuh"
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax( __device__ void causalSoftmaxKernel(
Tdata *y_, const Tdata *x_, Tdata *y_, const Tdata *x_,
size_t batch, size_t height, size_t width, size_t batch, size_t height, size_t width,
ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
...@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax( ...@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
// 2 | * * * ... * * * | // 2 | * * * ... * * * |
// height: 3 col_id-> // height: 3 col_id->
if (width + blockIdx.x >= threadIdx.x + height) { if (width + blockIdx.x >= threadIdx.x + height) {
#ifdef ENABLE_NVIDIA_API if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
y[col] = exp_(x[col] - max_); y[col] = hexp(x[col] - max_);
#else } else {
y[col] = exp(x[col] - max_); y[col] = exp(x[col] - max_);
#endif }
} else { } else {
y[col] = Tdata(0); y[col] = Tdata(0);
} }
......
#ifndef __CAUSAL_SOFTMAX_KERNEL_H__
#define __CAUSAL_SOFTMAX_KERNEL_H__

#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/maca/reduce.h"

// Row-wise causal softmax over a (batch, height, width) tensor.
//
// Expected launch layout (inferred from the indexing below — confirm at call
// site): gridDim.y = batch, gridDim.x = height (one block per row),
// blockDim.x = BLOCK_SIZE threads cooperating on a single row.
// Tdata is the storage type; Tcompute is the (wider) accumulation type used
// for the row sum.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
    Tdata *y_, const Tdata *x_,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
    // Locate this block's row in the (possibly strided) output/input tensors.
    Tdata *y = y_                  // threadIdx.x for col_id
        + blockIdx.y * y_stride_b  // gridDim.y for batch_id
        + blockIdx.x * y_stride_h; // gridDim.x for row_id
    const Tdata *x = x_ + blockIdx.y * x_stride_b + blockIdx.x * x_stride_h;

    // [Reduce] Max over the causally-visible prefix of this row
    // (width - height + 1 + blockIdx.x elements), broadcast via shared memory.
    __shared__ Tdata max_;
    Tdata max_0 = op::common_maca::reduce_op::max<BLOCK_SIZE, Tdata>(x, width - height + 1 + blockIdx.x);
    if (threadIdx.x == 0) {
        max_ = max_0;
    }
    __syncthreads();

    // [Elementwise] exp(x - max) on visible elements, 0 on masked-out ones.
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        // row_id ↓ |<- width ->|
        //  0      | * * * ... *     |
        //  1      | * * * ... * *   |
        //  2      | * * * ... * * * |
        // height: 3           col_id->
        // BUGFIX: the mask must depend on the current column `col`, not on
        // threadIdx.x — they diverge once this loop takes a second pass
        // (width > BLOCK_SIZE). Identical behavior when width <= BLOCK_SIZE.
        if (width + blockIdx.x >= col + height) {
#ifdef ENABLE_MACA_API
            y[col] = exp_(x[col] - max_);
#else
            y[col] = exp(x[col] - max_);
#endif
        } else {
            y[col] = Tdata(0);
        }
    }
    __syncthreads();

    // [Reduce] Sum of the exponentiated row (masked entries contribute 0),
    // accumulated in Tcompute and broadcast via shared memory.
    __shared__ Tcompute sum_;
    Tcompute sum_0 = op::common_maca::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(y, width);
    if (threadIdx.x == 0) {
        sum_ = sum_0;
    }
    __syncthreads();

    // [Elementwise] Normalize each element by the row sum.
    for (size_t col = threadIdx.x; col < width; col += BLOCK_SIZE) {
        y[col] /= Tdata(sum_);
    }
}

#endif // __CAUSAL_SOFTMAX_KERNEL_H__
#ifndef __CAUSAL_SOFTMAX_MACA_H__ #ifndef __CAUSAL_SOFTMAX_METAX_H__
#define __CAUSAL_SOFTMAX_MACA_H__ #define __CAUSAL_SOFTMAX_METAX_H__
#include "../causal_softmax.h" #include "../causal_softmax.h"
DESCRIPTOR(maca) DESCRIPTOR(metax)
#endif #endif
#include "../../../devices/maca/common_maca.h" #include "../../../devices/maca/common_maca.h"
#include "causal_softmax_kernel.h" #include "causal_softmax_metax.h"
#include "causal_softmax_maca.h"
namespace op::causal_softmax::maca { #include <hccub/block/block_reduce.cuh>
#include "../../../devices/maca/maca_kernel_common.h"
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// MACA __global__ entry point for causal softmax: a thin wrapper that
// forwards every argument unchanged to the shared causalSoftmaxKernel
// device routine (brought in via "../cuda/kernel.cuh" above), so the
// numerical logic lives in one place for both backends.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_MACA_KERNEL causalSoftmax(
    Tdata *y, const Tdata *x,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
}
namespace op::causal_softmax::metax {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::maca::Handle::Internal> internal; std::shared_ptr<device::maca::Handle::Internal> internal;
...@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, ...@@ -75,4 +90,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::causal_softmax::maca } // namespace op::causal_softmax::metax
#include "../../../devices/cuda/cuda_common.cuh" #include "../../../devices/cuda/cuda_common.cuh"
#include "causal_softmax_cuda.cuh" #include "causal_softmax_nvidia.cuh"
#include "causal_softmax_kernel.cuh"
namespace op::causal_softmax::cuda { #include "../../../devices/cuda/cuda_kernel_common.cuh"
#include <cub/block/block_reduce.cuh>
#include "../../../reduce/cuda/reduce.cuh"
#include "../cuda/kernel.cuh"
// NVIDIA __global__ entry point for causal softmax: a thin wrapper that
// forwards every argument unchanged to the shared causalSoftmaxKernel
// device routine (brought in via "../cuda/kernel.cuh" above), so the
// numerical logic lives in one place for both backends.
template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
INFINIOP_CUDA_KERNEL causalSoftmax(
    Tdata *y, const Tdata *x,
    size_t batch, size_t height, size_t width,
    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h) {
    causalSoftmaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y, x, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h);
}
namespace op::causal_softmax::nvidia {
struct Descriptor::Opaque { struct Descriptor::Opaque {
std::shared_ptr<device::cuda::Handle::Internal> internal; std::shared_ptr<device::cuda::Handle::Internal> internal;
...@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, ...@@ -79,4 +94,4 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::causal_softmax::cuda } // namespace op::causal_softmax::nvidia
#ifndef __CAUSAL_SOFTMAX_CUDA_H__ #ifndef __CAUSAL_SOFTMAX_NVIDIA_H__
#define __CAUSAL_SOFTMAX_CUDA_H__ #define __CAUSAL_SOFTMAX_NVIDIA_H__
#include "../causal_softmax.h" #include "../causal_softmax.h"
DESCRIPTOR(cuda) DESCRIPTOR(nvidia)
#endif #endif
...@@ -6,10 +6,10 @@ ...@@ -6,10 +6,10 @@
#include "cpu/causal_softmax_cpu.h" #include "cpu/causal_softmax_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/causal_softmax_cuda.cuh" #include "nvidia/causal_softmax_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
#include "maca/causal_softmax_maca.h" #include "metax/causal_softmax_metax.h"
#endif #endif
#ifdef ENABLE_ASCEND_API #ifdef ENABLE_ASCEND_API
#include "ascend/causal_softmax_ascend.h" #include "ascend/causal_softmax_ascend.h"
...@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( ...@@ -34,10 +34,13 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu) CREATE(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda) CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, maca) CREATE(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor( ...@@ -45,14 +48,6 @@ __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
// return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
CREATE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCreateCausalSoftmaxDescriptor((MacaHandle_t)handle, (CausalSoftmaxMacaDescriptor_t *)desc_ptr, y_desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc);
...@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe ...@@ -74,7 +69,13 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
GET(INFINI_DEVICE_CPU, cpu) GET(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda) GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe ...@@ -83,17 +84,6 @@ __C infiniStatus_t infiniopGetCausalSoftmaxWorkspaceSize(infiniopCausalSoftmaxDe
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
GET(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, maca)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMacaDescriptor_t)desc, size);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size);
...@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax( ...@@ -120,10 +110,13 @@ __C infiniStatus_t infiniopCausalSoftmax(
CALCULATE(INFINI_DEVICE_CPU, cpu) CALCULATE(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda) CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, maca) CALCULATE(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax( ...@@ -131,14 +124,6 @@ __C infiniStatus_t infiniopCausalSoftmax(
// return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
CALCULATE(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaCausalSoftmax((CausalSoftmaxMacaDescriptor_t)desc, workspace, workspace_size, data, stream);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: { case DevMthreadsGpu: {
return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream);
...@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD ...@@ -160,10 +145,13 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
DESTROY(INFINI_DEVICE_CPU, cpu) DESTROY(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DESTROY(INFINI_DEVICE_NVIDIA, cuda) DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
#ifdef ENABLE_METAX_API #ifdef ENABLE_METAX_API
DESTROY(INFINI_DEVICE_METAX, maca) DESTROY(INFINI_DEVICE_METAX, metax)
#endif
#ifdef ENABLE_ASCEND_API
DESTROY(INFINI_DEVICE_ASCEND, ascend)
#endif #endif
#ifdef ENABLE_CAMBRICON_MLU #ifdef ENABLE_CAMBRICON_MLU
case DevCambriconMlu: { case DevCambriconMlu: {
...@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD ...@@ -171,14 +159,6 @@ __C infiniStatus_t infiniopDestroyCausalSoftmaxDescriptor(infiniopCausalSoftmaxD
// return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc);
} }
#endif #endif
#ifdef ENABLE_ASCEND_API
DESTROY(INFINI_DEVICE_ASCEND, ascend)
#endif
#ifdef ENABLE_METAX_GPU
case DevMetaxGpu: {
return macaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMacaDescriptor_t)desc);
}
#endif
#ifdef ENABLE_MTHREADS_GPU #ifdef ENABLE_MTHREADS_GPU
case DevMthreadsGpu: case DevMthreadsGpu:
return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc);
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
#include "../../../elementwise/cpu/elementwise_cpu.h" #include "../../../elementwise/cpu/elementwise_cpu.h"
#include "infiniop/ops/clip.h" #include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cpu) ELEMENTWISE_DESCRIPTOR(clip, cpu, cpu)
namespace op::clip::cpu { namespace op::clip::cpu {
......
#include "clip_cuda.cuh" #include "../cuda/kernel.cuh"
#include "clip_cuda_internal.cuh" #include "clip_nvidia.cuh"
namespace op::clip::cuda { namespace op::clip::nvidia {
Descriptor::~Descriptor() = default; Descriptor::~Descriptor() = default;
...@@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate( ...@@ -45,17 +45,17 @@ infiniStatus_t Descriptor::calculate(
switch (_dtype) { switch (_dtype) {
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
return _device_info->calculate<256, ClipOp, half>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32: case INFINI_DTYPE_F32:
return _device_info->calculate<256, ClipOp, float>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
return _device_info->calculate<256, ClipOp, double>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
default: default:
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
return INFINI_STATUS_SUCCESS; return INFINI_STATUS_SUCCESS;
} }
} // namespace op::clip::cuda } // namespace op::clip::nvidia
...@@ -2,8 +2,7 @@ ...@@ -2,8 +2,7 @@
#define __CLIP_CUDA_API_H__ #define __CLIP_CUDA_API_H__
#include "../../../elementwise/cuda/elementwise_cuda_api.cuh" #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
#include "infiniop/ops/clip.h"
ELEMENTWISE_DESCRIPTOR(clip, cuda) ELEMENTWISE_DESCRIPTOR(clip, nvidia, cuda)
#endif // __CLIP_CUDA_API_H__ #endif // __CLIP_CUDA_API_H__
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include "cpu/clip_cpu.h" #include "cpu/clip_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "cuda/clip_cuda.cuh" #include "nvidia/clip_nvidia.cuh"
#endif #endif
__C infiniStatus_t infiniopCreateClipDescriptor( __C infiniStatus_t infiniopCreateClipDescriptor(
...@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor( ...@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
CREATE(INFINI_DEVICE_CPU, cpu); CREATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, cuda); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
...@@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s ...@@ -53,7 +53,7 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
GET(INFINI_DEVICE_CPU, cpu) GET(INFINI_DEVICE_CPU, cpu)
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, cuda) GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif #endif
} }
...@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip( ...@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopClip(
CALCULATE(INFINI_DEVICE_CPU, cpu); CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, cuda); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
...@@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { ...@@ -107,7 +107,7 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
DELETE(INFINI_DEVICE_CPU, cpu); DELETE(INFINI_DEVICE_CPU, cpu);
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, cuda); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
default: default:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment