Unverified Commit d76a2607 authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #324 from InfiniTensor/issue/312-metax

issue/312 沐曦add clip sub mul 支持bf16
parents 55f60c1c a6aa9816
#ifndef __ADD_CUDA_H__ #ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__ #define __ADD_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::add::cuda { namespace op::add::cuda {
typedef struct AddOp { typedef struct AddOp {
public: public:
...@@ -13,7 +9,7 @@ public: ...@@ -13,7 +9,7 @@ public:
__device__ __forceinline__ T operator()(const T &a, const T &b) const { __device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) { if constexpr (std::is_same_v<T, half2>) {
return __hadd2(a, b); return __hadd2(a, b);
} else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>) { } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
return __hadd(a, b); return __hadd(a, b);
} else if constexpr (std::is_same_v<T, float>) { } else if constexpr (std::is_same_v<T, float>) {
return __fadd_rd(a, b); return __fadd_rd(a, b);
......
#ifndef __ADD_METAX_API_H__
#define __ADD_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(add, metax, metax)
#endif // __ADD_METAX_API_H__
#include "add_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::add::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::metax
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "add_nvidia.cuh" #include "add_nvidia.cuh"
...@@ -45,7 +47,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -45,7 +47,7 @@ infiniStatus_t Descriptor::calculate(
case INFINI_DTYPE_F16: case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32: case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
......
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "nvidia/add_nvidia.cuh" #include "nvidia/add_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API
#include "metax/add_metax.h"
#endif
__C infiniStatus_t infiniopCreateAddDescriptor( __C infiniStatus_t infiniopCreateAddDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor( ...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz ...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia); GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif #endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopAdd( ...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopAdd(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -109,6 +121,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { ...@@ -109,6 +121,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __CLIP_CUDA_H__ #ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__ #define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::clip::cuda { namespace op::clip::cuda {
typedef struct ClipOp { typedef struct ClipOp {
...@@ -13,7 +9,7 @@ public: ...@@ -13,7 +9,7 @@ public:
template <typename T> template <typename T>
__device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const { __device__ __forceinline__ T operator()(const T &x, const T &min_val, const T &max_val) const {
if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) { if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
#ifndef ENABLE_ILUVATAR_CUDA_API #ifndef ENABLE_ILUVATAR_CUDA_API
return __hmax2(__hmin2(x, max_val), min_val); return __hmax2(__hmin2(x, max_val), min_val);
#else #else
......
#ifndef __CLIP_METAX_API_H__
#define __CLIP_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(clip, metax, metax)
#endif // __CLIP_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "clip_metax.h"
namespace op::clip::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ClipOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::metax
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh" #include "clip_nvidia.cuh"
...@@ -51,7 +53,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -51,7 +53,7 @@ infiniStatus_t Descriptor::calculate(
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ClipOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::ClipOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default: default:
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
......
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "nvidia/clip_nvidia.cuh" #include "nvidia/clip_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API
#include "metax/clip_metax.h"
#endif
__C infiniStatus_t infiniopCreateClipDescriptor( __C infiniStatus_t infiniopCreateClipDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor( ...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s ...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia) GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax)
#endif #endif
} }
...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopClip( ...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -109,6 +121,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { ...@@ -109,6 +121,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __MUL_CUDA_H__ #ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__ #define __MUL_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::mul::cuda { namespace op::mul::cuda {
typedef struct MulOp { typedef struct MulOp {
static constexpr size_t num_inputs = 2; static constexpr size_t num_inputs = 2;
template <typename T> template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const { __device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) { if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
return __hmul2(a, b); return __hmul2(a, b);
} else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>) { } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
return __hmul(a, b); return __hmul(a, b);
} else if constexpr (std::is_same_v<T, float>) { } else if constexpr (std::is_same_v<T, float>) {
return __fmul_rn(a, b); return __fmul_rn(a, b);
......
#ifndef __MUL_METAX_API_H__
#define __MUL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(mul, metax, metax)
#endif // __MUL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "mul_metax.h"
namespace op::mul::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::MulOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::metax
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "mul_nvidia.cuh" #include "mul_nvidia.cuh"
...@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::MulOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::MulOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default: default:
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
......
...@@ -5,10 +5,12 @@ ...@@ -5,10 +5,12 @@
#ifdef ENABLE_CPU_API #ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h" #include "cpu/mul_cpu.h"
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "nvidia/mul_nvidia.cuh" #include "nvidia/mul_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API
#include "metax/mul_metax.h"
#endif
__C infiniStatus_t infiniopCreateMulDescriptor( __C infiniStatus_t infiniopCreateMulDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -34,6 +36,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor( ...@@ -34,6 +36,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -55,6 +60,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz ...@@ -55,6 +60,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia); GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif #endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -86,6 +94,9 @@ __C infiniStatus_t infiniopMul( ...@@ -86,6 +94,9 @@ __C infiniStatus_t infiniopMul(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -110,6 +121,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { ...@@ -110,6 +121,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __SUB_CUDA_H__ #ifndef __SUB_CUDA_H__
#define __SUB_CUDA_H__ #define __SUB_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace op::sub::cuda { namespace op::sub::cuda {
typedef struct SubOp { typedef struct SubOp {
public: public:
static constexpr size_t num_inputs = 2; static constexpr size_t num_inputs = 2;
template <typename T> template <typename T>
__device__ __forceinline__ T operator()(const T &a, const T &b) const { __device__ __forceinline__ T operator()(const T &a, const T &b) const {
if constexpr (std::is_same_v<T, half2>) { if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, cuda_bfloat162>) {
return __hsub2(a, b); return __hsub2(a, b);
} else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, __nv_bfloat16>) { } else if constexpr (std::is_same_v<T, half> || std::is_same_v<T, cuda_bfloat16>) {
return __hsub(a, b); return __hsub(a, b);
} else if constexpr (std::is_same_v<T, float>) { } else if constexpr (std::is_same_v<T, float>) {
return __fsub_rd(a, b); return __fsub_rd(a, b);
......
#ifndef __SUB_METAX_API_H__
#define __SUB_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR(sub, metax, metax)
#endif // __SUB_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "sub_metax.h"
namespace op::sub::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SubOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sub::metax
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh" #include "../cuda/kernel.cuh"
#include "sub_nvidia.cuh" #include "sub_nvidia.cuh"
...@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate( ...@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
case INFINI_DTYPE_F64: case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16: case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SubOp, __nv_bfloat16>(_info, workspace, output, inputs, stream); return _device_info->calculate<256, cuda::SubOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default: default:
return INFINI_STATUS_BAD_TENSOR_DTYPE; return INFINI_STATUS_BAD_TENSOR_DTYPE;
} }
......
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
#include "nvidia/sub_nvidia.cuh" #include "nvidia/sub_nvidia.cuh"
#endif #endif
#ifdef ENABLE_METAX_API
#include "metax/sub_metax.h"
#endif
__C infiniStatus_t infiniopCreateSubDescriptor( __C infiniStatus_t infiniopCreateSubDescriptor(
infiniopHandle_t handle, infiniopHandle_t handle,
...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor( ...@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CREATE(INFINI_DEVICE_NVIDIA, nvidia); CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CREATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz ...@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#endif #endif
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
GET(INFINI_DEVICE_NVIDIA, nvidia); GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_METAX_API
GET(INFINI_DEVICE_METAX, metax);
#endif #endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopSub( ...@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
...@@ -109,6 +121,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { ...@@ -109,6 +121,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API #ifdef ENABLE_NVIDIA_API
DELETE(INFINI_DEVICE_NVIDIA, nvidia); DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif #endif
#ifdef ENABLE_METAX_API
DELETE(INFINI_DEVICE_METAX, metax);
#endif
default: default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment