Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
d76a2607
Unverified
Commit
d76a2607
authored
Jul 12, 2025
by
PanZezhong1725
Committed by
GitHub
Jul 12, 2025
Browse files
Merge pull request #324 from InfiniTensor/issue/312-metax
issue/312 沐曦add clip sub mul 支持bf16
parents
55f60c1c
a6aa9816
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
357 additions
and
27 deletions
+357
-27
src/infiniop/ops/add/cuda/kernel.cuh
src/infiniop/ops/add/cuda/kernel.cuh
+1
-5
src/infiniop/ops/add/metax/add_metax.h
src/infiniop/ops/add/metax/add_metax.h
+8
-0
src/infiniop/ops/add/metax/add_metax.maca
src/infiniop/ops/add/metax/add_metax.maca
+62
-0
src/infiniop/ops/add/nvidia/add_nvidia.cu
src/infiniop/ops/add/nvidia/add_nvidia.cu
+3
-1
src/infiniop/ops/add/operator.cc
src/infiniop/ops/add/operator.cc
+15
-0
src/infiniop/ops/clip/cuda/kernel.cuh
src/infiniop/ops/clip/cuda/kernel.cuh
+1
-5
src/infiniop/ops/clip/metax/clip_metax.h
src/infiniop/ops/clip/metax/clip_metax.h
+8
-0
src/infiniop/ops/clip/metax/clip_metax.maca
src/infiniop/ops/clip/metax/clip_metax.maca
+63
-0
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
+3
-1
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+15
-0
src/infiniop/ops/mul/cuda/kernel.cuh
src/infiniop/ops/mul/cuda/kernel.cuh
+2
-6
src/infiniop/ops/mul/metax/mul_metax.h
src/infiniop/ops/mul/metax/mul_metax.h
+8
-0
src/infiniop/ops/mul/metax/mul_metax.maca
src/infiniop/ops/mul/metax/mul_metax.maca
+61
-0
src/infiniop/ops/mul/nvidia/mul_nvidia.cu
src/infiniop/ops/mul/nvidia/mul_nvidia.cu
+3
-1
src/infiniop/ops/mul/operator.cc
src/infiniop/ops/mul/operator.cc
+15
-1
src/infiniop/ops/sub/cuda/kernel.cuh
src/infiniop/ops/sub/cuda/kernel.cuh
+2
-6
src/infiniop/ops/sub/metax/sub_metax.h
src/infiniop/ops/sub/metax/sub_metax.h
+8
-0
src/infiniop/ops/sub/metax/sub_metax.maca
src/infiniop/ops/sub/metax/sub_metax.maca
+61
-0
src/infiniop/ops/sub/nvidia/sub_nvidia.cu
src/infiniop/ops/sub/nvidia/sub_nvidia.cu
+3
-1
src/infiniop/ops/sub/operator.cc
src/infiniop/ops/sub/operator.cc
+15
-0
No files found.
src/infiniop/ops/add/cuda/kernel.cuh
View file @
d76a2607
#ifndef __ADD_CUDA_H__
#ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__
#define __ADD_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
add
::
cuda
{
namespace
op
::
add
::
cuda
{
typedef
struct
AddOp
{
typedef
struct
AddOp
{
public:
public:
...
@@ -13,7 +9,7 @@ public:
...
@@ -13,7 +9,7 @@ public:
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
return
__hadd2
(
a
,
b
);
return
__hadd2
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
__nv
_bfloat16
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
cuda
_bfloat16
>
)
{
return
__hadd
(
a
,
b
);
return
__hadd
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fadd_rd
(
a
,
b
);
return
__fadd_rd
(
a
,
b
);
...
...
src/infiniop/ops/add/metax/add_metax.h
0 → 100644
View file @
d76a2607
#ifndef __ADD_METAX_API_H__
#define __ADD_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
add
,
metax
,
metax
)
#endif // __ADD_METAX_API_H__
src/infiniop/ops/add/metax/add_metax.maca
0 → 100644
View file @
d76a2607
#include "add_metax.h"
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
namespace op::add::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::AddOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::metax
src/infiniop/ops/add/nvidia/add_nvidia.cu
View file @
d76a2607
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "add_nvidia.cuh"
#include "add_nvidia.cuh"
...
@@ -45,7 +47,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -45,7 +47,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F16
:
case
INFINI_DTYPE_F16
:
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
half
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F32
:
case
INFINI_DTYPE_F32
:
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
AddOp
,
float
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
...
...
src/infiniop/ops/add/operator.cc
View file @
d76a2607
...
@@ -8,6 +8,9 @@
...
@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "nvidia/add_nvidia.cuh"
#include "nvidia/add_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#include "metax/add_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateAddDescriptor
(
__C
infiniStatus_t
infiniopCreateAddDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopAdd(
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopAdd(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -109,6 +121,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
...
@@ -109,6 +121,9 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/clip/cuda/kernel.cuh
View file @
d76a2607
#ifndef __CLIP_CUDA_H__
#ifndef __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#define __CLIP_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
clip
::
cuda
{
namespace
op
::
clip
::
cuda
{
typedef
struct
ClipOp
{
typedef
struct
ClipOp
{
...
@@ -13,7 +9,7 @@ public:
...
@@ -13,7 +9,7 @@ public:
template
<
typename
T
>
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
x
,
const
T
&
min_val
,
const
T
&
max_val
)
const
{
__device__
__forceinline__
T
operator
()(
const
T
&
x
,
const
T
&
min_val
,
const
T
&
max_val
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
nv
_bfloat162
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
cuda
_bfloat162
>
)
{
#ifndef ENABLE_ILUVATAR_CUDA_API
#ifndef ENABLE_ILUVATAR_CUDA_API
return
__hmax2
(
__hmin2
(
x
,
max_val
),
min_val
);
return
__hmax2
(
__hmin2
(
x
,
max_val
),
min_val
);
#else
#else
...
...
src/infiniop/ops/clip/metax/clip_metax.h
0 → 100644
View file @
d76a2607
#ifndef __CLIP_METAX_API_H__
#define __CLIP_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
clip
,
metax
,
metax
)
#endif // __CLIP_METAX_API_H__
src/infiniop/ops/clip/metax/clip_metax.maca
0 → 100644
View file @
d76a2607
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "clip_metax.h"
namespace op::clip::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &in_desc = input_desc_vec.at(0);
const auto &min_desc = input_desc_vec.at(1);
const auto &max_desc = input_desc_vec.at(2);
const auto &out_shape = out_desc->shape();
const auto &in_shape = in_desc->shape();
const auto &min_shape = min_desc->shape();
const auto &max_shape = max_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(out_shape, in_shape);
CHECK_SAME_SHAPE(out_shape, min_shape);
CHECK_SAME_SHAPE(out_shape, max_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::ClipOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::ClipOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::ClipOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::ClipOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::clip::metax
src/infiniop/ops/clip/nvidia/clip_nvidia.cu
View file @
d76a2607
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "clip_nvidia.cuh"
#include "clip_nvidia.cuh"
...
@@ -51,7 +53,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -51,7 +53,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
ClipOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
...
...
src/infiniop/ops/clip/operator.cc
View file @
d76a2607
...
@@ -8,6 +8,9 @@
...
@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "nvidia/clip_nvidia.cuh"
#include "nvidia/clip_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#include "metax/clip_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateClipDescriptor(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, s
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
)
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
)
#endif
#endif
}
}
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopClip(
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopClip(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -109,6 +121,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
...
@@ -109,6 +121,9 @@ infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/mul/cuda/kernel.cuh
View file @
d76a2607
#ifndef __MUL_CUDA_H__
#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__
#define __MUL_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
mul
::
cuda
{
namespace
op
::
mul
::
cuda
{
typedef
struct
MulOp
{
typedef
struct
MulOp
{
static
constexpr
size_t
num_inputs
=
2
;
static
constexpr
size_t
num_inputs
=
2
;
template
<
typename
T
>
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
cuda_bfloat162
>
)
{
return
__hmul2
(
a
,
b
);
return
__hmul2
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
__nv
_bfloat16
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
cuda
_bfloat16
>
)
{
return
__hmul
(
a
,
b
);
return
__hmul
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fmul_rn
(
a
,
b
);
return
__fmul_rn
(
a
,
b
);
...
...
src/infiniop/ops/mul/metax/mul_metax.h
0 → 100644
View file @
d76a2607
#ifndef __MUL_METAX_API_H__
#define __MUL_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
mul
,
metax
,
metax
)
#endif // __MUL_METAX_API_H__
src/infiniop/ops/mul/metax/mul_metax.maca
0 → 100644
View file @
d76a2607
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "mul_metax.h"
namespace op::mul::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::MulOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::MulOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::MulOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::MulOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::metax
src/infiniop/ops/mul/nvidia/mul_nvidia.cu
View file @
d76a2607
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "mul_nvidia.cuh"
#include "mul_nvidia.cuh"
...
@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
MulOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
...
...
src/infiniop/ops/mul/operator.cc
View file @
d76a2607
...
@@ -5,10 +5,12 @@
...
@@ -5,10 +5,12 @@
#ifdef ENABLE_CPU_API
#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#include "cpu/mul_cpu.h"
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "nvidia/mul_nvidia.cuh"
#include "nvidia/mul_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#include "metax/mul_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateMulDescriptor
(
__C
infiniStatus_t
infiniopCreateMulDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
...
@@ -34,6 +36,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
...
@@ -34,6 +36,9 @@ __C infiniStatus_t infiniopCreateMulDescriptor(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -55,6 +60,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
...
@@ -55,6 +60,9 @@ __C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, siz
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -86,6 +94,9 @@ __C infiniStatus_t infiniopMul(
...
@@ -86,6 +94,9 @@ __C infiniStatus_t infiniopMul(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -110,6 +121,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
...
@@ -110,6 +121,9 @@ infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/sub/cuda/kernel.cuh
View file @
d76a2607
#ifndef __SUB_CUDA_H__
#ifndef __SUB_CUDA_H__
#define __SUB_CUDA_H__
#define __SUB_CUDA_H__
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_bf16.h>
#include <cuda_fp16.h>
namespace
op
::
sub
::
cuda
{
namespace
op
::
sub
::
cuda
{
typedef
struct
SubOp
{
typedef
struct
SubOp
{
public:
public:
static
constexpr
size_t
num_inputs
=
2
;
static
constexpr
size_t
num_inputs
=
2
;
template
<
typename
T
>
template
<
typename
T
>
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
__device__
__forceinline__
T
operator
()(
const
T
&
a
,
const
T
&
b
)
const
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
)
{
if
constexpr
(
std
::
is_same_v
<
T
,
half2
>
||
std
::
is_same_v
<
T
,
cuda_bfloat162
>
)
{
return
__hsub2
(
a
,
b
);
return
__hsub2
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
__nv
_bfloat16
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
half
>
||
std
::
is_same_v
<
T
,
cuda
_bfloat16
>
)
{
return
__hsub
(
a
,
b
);
return
__hsub
(
a
,
b
);
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
}
else
if
constexpr
(
std
::
is_same_v
<
T
,
float
>
)
{
return
__fsub_rd
(
a
,
b
);
return
__fsub_rd
(
a
,
b
);
...
...
src/infiniop/ops/sub/metax/sub_metax.h
0 → 100644
View file @
d76a2607
#ifndef __SUB_METAX_API_H__
#define __SUB_METAX_API_H__
#include "../../../elementwise/metax/elementwise_metax_api.h"
ELEMENTWISE_DESCRIPTOR
(
sub
,
metax
,
metax
)
#endif // __SUB_METAX_API_H__
src/infiniop/ops/sub/metax/sub_metax.maca
0 → 100644
View file @
d76a2607
#include "../../../elementwise/metax/elementwise_metax.h"
#include "../cuda/kernel.cuh"
#include "sub_metax.h"
namespace op::sub::metax {
Descriptor::~Descriptor() = default;
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
auto dtype = out_desc->dtype();
const auto &a_desc = input_desc_vec.at(0);
const auto &b_desc = input_desc_vec.at(1);
const auto &c_shape = out_desc->shape();
const auto &a_shape = a_desc->shape();
const auto &b_shape = b_desc->shape();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
// create CUDA elementwise descriptor
CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *output,
std::vector<const void *> inputs,
void *stream) const {
if (workspace_size < _workspace_size) {
return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
}
switch (_dtype) {
case INFINI_DTYPE_F16:
return _device_info->calculate<256, cuda::SubOp, half>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F32:
return _device_info->calculate<256, cuda::SubOp, float>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_F64:
return _device_info->calculate<256, cuda::SubOp, double>(_info, workspace, output, inputs, stream);
case INFINI_DTYPE_BF16:
return _device_info->calculate<256, cuda::SubOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
default:
return INFINI_STATUS_BAD_TENSOR_DTYPE;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::sub::metax
src/infiniop/ops/sub/nvidia/sub_nvidia.cu
View file @
d76a2607
#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include "../cuda/kernel.cuh"
#include "../cuda/kernel.cuh"
#include "sub_nvidia.cuh"
#include "sub_nvidia.cuh"
...
@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
...
@@ -49,7 +51,7 @@ infiniStatus_t Descriptor::calculate(
case
INFINI_DTYPE_F64
:
case
INFINI_DTYPE_F64
:
return
_device_info
->
calculate
<
256
,
cuda
::
SubOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SubOp
,
double
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
case
INFINI_DTYPE_BF16
:
case
INFINI_DTYPE_BF16
:
return
_device_info
->
calculate
<
256
,
cuda
::
SubOp
,
__nv
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
return
_device_info
->
calculate
<
256
,
cuda
::
SubOp
,
cuda
_bfloat16
>
(
_info
,
workspace
,
output
,
inputs
,
stream
);
default:
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
...
...
src/infiniop/ops/sub/operator.cc
View file @
d76a2607
...
@@ -8,6 +8,9 @@
...
@@ -8,6 +8,9 @@
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
#include "nvidia/sub_nvidia.cuh"
#include "nvidia/sub_nvidia.cuh"
#endif
#endif
#ifdef ENABLE_METAX_API
#include "metax/sub_metax.h"
#endif
__C
infiniStatus_t
infiniopCreateSubDescriptor
(
__C
infiniStatus_t
infiniopCreateSubDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
...
@@ -33,6 +36,9 @@ __C infiniStatus_t infiniopCreateSubDescriptor(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CREATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
...
@@ -54,6 +60,9 @@ __C infiniStatus_t infiniopGetSubWorkspaceSize(infiniopSubDescriptor_t desc, siz
#endif
#endif
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
GET
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
metax
);
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopSub(
...
@@ -85,6 +94,9 @@ __C infiniStatus_t infiniopSub(
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -109,6 +121,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
...
@@ -109,6 +121,9 @@ infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) {
#ifdef ENABLE_NVIDIA_API
#ifdef ENABLE_NVIDIA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
DELETE
(
INFINI_DEVICE_NVIDIA
,
nvidia
);
#endif
#endif
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
metax
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment