jerrrrry / infinicore · Commit 4e4d3415

Unverified commit 4e4d3415, authored Apr 29, 2025 by Catheriany; committed by GitHub, Apr 29, 2025.

    Merge branch 'main' into issue/150

Parents: d1c46889, 1a4cfb99

Changes: 51 files in total; this page shows 20 changed files with 1393 additions and 53 deletions (+1393 −53).
src/infiniop/ops/add/cuda/add_cuda.cu                            +57   −0
src/infiniop/ops/add/cuda/add_cuda.cuh                           +8    −0
src/infiniop/ops/add/cuda/add_cuda_internal.cuh                  +26   −0
src/infiniop/ops/add/operator.cc                                 +118  −0
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc   +24   −17
src/infiniop/ops/gemm/ascend/gemm_ascend.cc                      +15   −13
src/infiniop/ops/mul/cpu/mul_cpu.cc                              +51   −0
src/infiniop/ops/mul/cpu/mul_cpu.h                               +19   −0
src/infiniop/ops/mul/cuda/mul_cuda.cu                            +57   −0
src/infiniop/ops/mul/cuda/mul_cuda.cuh                           +8    −0
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh                  +26   −0
src/infiniop/ops/mul/operator.cc                                 +119  −0
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc                  +8    −10
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu                +485  −0
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh               +8    −0
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh             +330  −0
src/infiniop/ops/rearrange/operator.cc                           +16   −0
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc               +13   −9
src/infiniop/ops/rope/cuda/rope_cuda.cu                          +3    −2
src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh            +2    −2
src/infiniop/ops/add/cuda/add_cuda.cu — new file (mode 100644)

#include "add_cuda.cuh"
#include "add_cuda_internal.cuh"

namespace op::add::cuda {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::add::cuda
src/infiniop/ops/add/cuda/add_cuda.cuh — new file (mode 100644)

#ifndef __ADD_CUDA_API_H__
#define __ADD_CUDA_API_H__

#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"

ELEMENTWISE_DESCRIPTOR(add, cuda)

#endif // __ADD_CUDA_API_H__
src/infiniop/ops/add/cuda/add_cuda_internal.cuh — new file (mode 100644)

#ifndef __ADD_CUDA_H__
#define __ADD_CUDA_H__

#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>

namespace op::add::cuda {

typedef struct AddOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hadd2(a, b);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hadd(a, b);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fadd_rd(a, b);
        } else {
            return a + b;
        }
    }
} AddOp;
} // namespace op::add::cuda

#endif // __ADD_CUDA_H__
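AddOp is a stateless functor that the elementwise framework instantiates inside its kernels; the half2 branch lets packed fp16 data be added two values per instruction. As a rough illustration of how such a functor is applied, here is a minimal standalone CUDA sketch — this is not the project's elementwise_cuda.cuh machinery (which handles strides, broadcasting and packing); the kernel, names and launch below are assumptions:

#include <cstdio>
#include <cuda_fp16.h>

// Simplified stand-in for op::add::cuda::AddOp (generic path only).
struct AddOpSketch {
    static constexpr size_t num_inputs = 2;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        return a + b; // the real functor dispatches to __hadd2/__hadd/__fadd_rd by type
    }
};

// One thread per element; a contiguous-only simplification of an elementwise kernel.
template <typename Op, typename T>
__global__ void elementwiseKernel(T *out, const T *a, const T *b, size_t n) {
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = Op{}(a[i], b[i]);
    }
}

int main() {
    const size_t n = 1024;
    float *a, *b, *c;
    cudaMallocManaged(&a, n * sizeof(float));
    cudaMallocManaged(&b, n * sizeof(float));
    cudaMallocManaged(&c, n * sizeof(float));
    for (size_t i = 0; i < n; ++i) { a[i] = 1.0f; b[i] = float(i); }

    // 256 threads per block, matching the block size hard-coded in calculate().
    elementwiseKernel<AddOpSketch, float><<<(n + 255) / 256, 256>>>(c, a, b, n);
    cudaDeviceSynchronize();

    printf("c[10] = %f\n", c[10]); // expect 11.0
    cudaFree(a); cudaFree(b); cudaFree(c);
    return 0;
}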
src/infiniop/ops/add/operator.cc — new file (mode 100644)

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/add.h"

#ifdef ENABLE_CPU_API
#include "cpu/add_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/add_cuda.cuh"
#endif

__C infiniStatus_t infiniopCreateAddDescriptor(
    infiniopHandle_t handle,
    infiniopAddDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

#define CREATE(CASE, NAMESPACE)                                            \
    case CASE:                                                             \
        return op::add::NAMESPACE::Descriptor::create(                     \
            handle,                                                        \
            reinterpret_cast<op::add::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                        \
            {a_desc,                                                       \
             b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopGetAddWorkspaceSize(
    infiniopAddDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                               \
    case CASE:                                                                             \
        *size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__C infiniStatus_t infiniopAdd(
    infiniopAddDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyAddDescriptor(
    infiniopAddDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                \
    case CASE:                                                                 \
        delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
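Taken together, the four entry points above give the usual create/query/run/destroy life cycle. A hedged usage sketch follows — only the infiniop*Add* calls are defined in this commit; the handle, tensor descriptors, device buffers, and stream are assumed to come from elsewhere in the API:

#include "infiniop/ops/add.h"

infiniStatus_t runAdd(infiniopHandle_t handle,
                      infiniopTensorDescriptor_t c_desc,
                      infiniopTensorDescriptor_t a_desc,
                      infiniopTensorDescriptor_t b_desc,
                      void *c, const void *a, const void *b,
                      void *workspace, size_t workspace_cap,
                      void *stream) {
    infiniopAddDescriptor_t desc = nullptr;
    infiniStatus_t st = infiniopCreateAddDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }

    size_t workspace_size = 0;
    st = infiniopGetAddWorkspaceSize(desc, &workspace_size);
    if (st == INFINI_STATUS_SUCCESS && workspace_size <= workspace_cap) {
        // c = a + b on the device the handle was created for.
        st = infiniopAdd(desc, workspace, workspace_size, c, a, b, stream);
    }

    infiniopDestroyAddDescriptor(desc);
    return st;
}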
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc

@@ -6,22 +6,21 @@
 namespace op::causal_softmax::ascend {
 
 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
-    mutable aclOpExecutor *mask_executor;
     aclnnTensorDescriptor_t x;
     aclnnTensorDescriptor_t mask;
     aclnnTensorDescriptor_t y;
+    aclnnTensorDescriptor_t value;
     void *mask_addr;
-    size_t workspacesize_softmax;
-    size_t workspacesize_mask;
+    void *value_addr;
 
     ~Opaque() {
         delete x;
         delete mask;
         delete y;
-        aclDestroyAclOpExecutor(executor);
-        aclDestroyAclOpExecutor(mask_executor);
+        delete value;
         aclrtFree(mask_addr);
+        aclrtFree(value_addr);
     }
 };

@@ -64,13 +63,13 @@ infiniStatus_t Descriptor::create(
         auto size = aclDataTypeSize(aclDataType::ACL_FLOAT16);
         CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
         CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {}, value_addr);
+        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT16, {}, {});
     } else {
         uint32_t mask_value = 0xff800000;
         auto size = aclDataTypeSize(aclDataType::ACL_FLOAT);
         CHECK_ACL(aclrtMalloc(&value_addr, size, ACL_MEM_MALLOC_HUGE_FIRST));
         CHECK_ACL(aclrtMemcpy(value_addr, size, &mask_value, size, ACL_MEMCPY_HOST_TO_DEVICE));
-        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {}, value_addr);
+        value = new aclnnTensorDescriptor(aclDataType::ACL_FLOAT, {}, {});
     }
 
     // Fill Mask Tensor

@@ -93,17 +92,19 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
-    aclSetAclOpExecutorRepeatable(mask_executor);
     int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    aclSetAclOpExecutorRepeatable(executor);
 
     // Create the descriptor
     size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
     *desc_ptr = new Descriptor(
-        new Opaque{executor, mask_executor, x, mask, y, mask_addr, workspacesize_softmax, workspacesize_mask},
+        new Opaque{x, mask, y, value, mask_addr, value_addr},
         std::move(info), all_workspacesize,
         handle_ascend->device, handle_ascend->device_id);
+
+    // Delete useless executor
+    aclDestroyAclOpExecutor(executor);
+    aclDestroyAclOpExecutor(mask_executor);
     return INFINI_STATUS_SUCCESS;
 }

@@ -114,18 +115,24 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
-    auto executor = _opaque->executor;
-    auto mask_executor = _opaque->mask_executor;
-    auto mask_addr = _opaque->mask_addr;
+    auto tvalue = _opaque->value->tensor;
+    aclOpExecutor *executor = nullptr;
+    aclOpExecutor *mask_executor = nullptr;
+    size_t workspacesize_softmax = 0;
+    size_t workspacesize_mask = 0;
+    int64_t dim = 2;
 
+    CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
-    AclSetTensorAddr(mask_executor, 1, tmask, mask_addr);
-    CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, _opaque->workspacesize_mask, mask_executor, stream));
+    AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
+    AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
+    CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
+    CHECK_ACL(aclrtSynchronizeStream(stream));
 
+    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
     AclSetTensorAddr(executor, 0, tx, (void *)x);
     AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize_softmax, executor, stream));
+    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
 
     return INFINI_STATUS_SUCCESS;
 }
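One detail worth a worked check: the float branch above fills masked positions with the raw pattern 0xff800000 before the softmax. That is exactly the IEEE-754 bit pattern of −inf, so exp(−inf) = 0 and masked positions drop out of the softmax sum. A small self-contained verification in standard C++ (independent of ACL):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    uint32_t mask_value = 0xff800000; // the fill value used in the diff
    float f;
    std::memcpy(&f, &mask_value, sizeof(f)); // reinterpret the bits as float
    // Expect: f is -inf, and exp(f) is exactly 0, which is why softmax
    // assigns zero probability to masked positions.
    printf("f = %f, isinf = %d, signbit = %d, exp(f) = %f\n",
           f, std::isinf(f), std::signbit(f), std::exp(f));
    return 0;
}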
src/infiniop/ops/gemm/ascend/gemm_ascend.cc

@@ -6,7 +6,6 @@
 namespace op::gemm::ascend {
 
 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
     aclnnTensorDescriptor_t c, a, b;
     // cubeMathType
     // see doc:

@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
         delete c;
         delete a;
         delete b;
-        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -56,8 +54,8 @@ infiniStatus_t Descriptor::create(
          ta = a->tensor,
          tb = b->tensor;
-    aclOpExecutor *executor;
-    size_t workspace_size;
+    aclOpExecutor *executor = nullptr;
+    size_t workspace_size = 0;
     // aclnnGemm support C = alpha * A @ B + beta * C
     // see
     // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md

@@ -69,13 +67,15 @@ infiniStatus_t Descriptor::create(
     *desc_ptr = new Descriptor(
         dtype, info, workspace_size,
         new Opaque{
-            executor,
             c, a, b,
             mt,
         },
         handle->device, handle->device_id);
+
+    aclDestroyAclOpExecutor(executor);
     return INFINI_STATUS_SUCCESS;
 }

@@ -93,22 +93,24 @@ infiniStatus_t Descriptor::calculate(
          ta = _opaque->a->tensor,
          tb = _opaque->b->tensor;
-    size_t workspace_size;
+    size_t workspace_size = 0;
+    aclOpExecutor *executor = nullptr;
     CHECK_ACL(aclnnGemmGetWorkspaceSize(ta, tb, tc, alpha, beta, 0, 0, tc, _opaque->mt,
-                                        &workspace_size, &(_opaque->executor)));
+                                        &workspace_size, &executor));
     if (workspaceSize_ < workspace_size) {
         return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
     }
-    aclSetAclOpExecutorRepeatable(_opaque->executor);
+    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
 
     auto unit = infiniSizeOf(_dtype);
     for (size_t i = 0; i < _info.batch; ++i) {
-        AclSetTensorAddr(_opaque->executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
-        AclSetTensorAddr(_opaque->executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
-        CHECK_ACL(aclnnGemm(workspace, workspace_size, _opaque->executor, stream));
+        AclSetTensorAddr(executor, 0, ta, ((char *)a) + i * _info.a_matrix.stride * unit);
+        AclSetTensorAddr(executor, 1, tb, ((char *)b) + i * _info.b_matrix.stride * unit);
+        AclSetTensorAddr(executor, 2, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
+        AclSetTensorAddr(executor, 3, tc, ((char *)c) + i * _info.c_matrix.stride * unit);
+        CHECK_ACL(aclnnGemm(workspace, workspace_size, executor, stream));
     }
 
     return INFINI_STATUS_SUCCESS;
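The batched loop above advances each operand's base pointer by i * matrix_stride * unit bytes per batch, where unit = infiniSizeOf(_dtype). A tiny standalone sketch of that pointer arithmetic — the stride and element size below are made-up values for illustration, not taken from the diff:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t batch = 4;
    const ptrdiff_t a_stride = 64 * 128; // elements between consecutive A matrices (assumed)
    const size_t unit = 2;               // bytes per element, e.g. fp16 (assumed)
    char *a = nullptr;                   // stands in for the device base pointer

    for (size_t i = 0; i < batch; ++i) {
        // Same expression shape as ((char *)a) + i * _info.a_matrix.stride * unit.
        const char *a_i = a + i * a_stride * unit;
        printf("batch %zu: A offset = %td bytes\n", i, a_i - a);
    }
    return 0;
}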
src/infiniop/ops/mul/cpu/mul_cpu.cc — new file (mode 100644)

#include "mul_cpu.h"

namespace op::mul::cpu {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &out_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);

    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<MulOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cpu
src/infiniop/ops/mul/cpu/mul_cpu.h — new file (mode 100644)

#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(mul, cpu)

namespace op::mul::cpu {
typedef struct MulOp {
public:
    static constexpr size_t num_inputs = 2;

    template <typename T>
    T operator()(const T &a, const T &b) const {
        return a * b;
    }
} MulOp;
} // namespace op::mul::cpu

#endif // __MUL_CPU_H__
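Because the CPU MulOp is an ordinary stateless functor, the same call pattern works outside the elementwise framework too; a host-only stand-in makes that concrete (MulOpSketch below is a copy for illustration, not the project's type):

#include <algorithm>
#include <cstdio>
#include <vector>

// Host-side copy of the functor shape used by op::mul::cpu::MulOp.
struct MulOpSketch {
    static constexpr size_t num_inputs = 2;
    template <typename T>
    T operator()(const T &a, const T &b) const { return a * b; }
};

int main() {
    std::vector<float> a{1.f, 2.f, 3.f}, b{4.f, 5.f, 6.f}, c(3);
    // Apply the functor elementwise, as the CPU elementwise backend does per index.
    std::transform(a.begin(), a.end(), b.begin(), c.begin(), MulOpSketch{});
    printf("%g %g %g\n", c[0], c[1], c[2]); // 4 10 18
    return 0;
}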
src/infiniop/ops/mul/cuda/mul_cuda.cu — new file (mode 100644)

#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"

namespace op::mul::cuda {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
src/infiniop/ops/mul/cuda/mul_cuda.cuh — new file (mode 100644)

#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__

#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"

ELEMENTWISE_DESCRIPTOR(mul, cuda)

#endif // __MUL_CUDA_API_H__
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh — new file (mode 100644)

#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__

#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>

namespace op::mul::cuda {

typedef struct MulOp {
    static constexpr size_t num_inputs = 2;

    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(a, b);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(a, b);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(a, b);
        } else {
            return a * b;
        }
    }
} MulOp;
} // namespace op::mul::cuda

#endif // __MUL_CUDA_H__
src/infiniop/ops/mul/operator.cc — new file (mode 100644)

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"

#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif

__C infiniStatus_t infiniopCreateMulDescriptor(
    infiniopHandle_t handle,
    infiniopMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

#define CREATE(CASE, NAMESPACE)                                            \
    case CASE:                                                             \
        return op::mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                        \
            reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                        \
            {a_desc,                                                       \
             b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopGetMulWorkspaceSize(
    infiniopMulDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                               \
    case CASE:                                                                             \
        *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__C infiniStatus_t infiniopMul(
    infiniopMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyMulDescriptor(
    infiniopMulDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                \
    case CASE:                                                                 \
        delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc

@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
     auto dtype = y_desc->dtype();
     auto ndim = y_desc->ndim();
-    auto shape = y_desc->shape().data();
-    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
-    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    for (size_t i = 0; i < ndim; ++i) {
-        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    }
+    auto y_shape = y_desc->shape();
+    auto x_shape = x_desc->shape();
+    CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+    CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
+    CHECK_SAME_SHAPE(x_shape, y_shape);
 
-    auto dst_strides = y_desc->strides().data();
-    auto src_strides = x_desc->strides().data();
+    auto dst_strides = y_desc->strides();
+    auto src_strides = x_desc->strides();
     auto element_size = infiniSizeOf(dtype);
-    auto result = utils::RearrangeMeta::create(shape, dst_strides, src_strides, ndim, element_size);
+    auto result = utils::RearrangeMeta::create(y_shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
     CHECK_RESULT(result);
 
     *desc_ptr = new Descriptor(
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu — new file (mode 100644)

#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>

namespace op::rearrange::cuda {

struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {

    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);

    // Keep the temporary vector objects alive.
    auto x_shape = x_desc->shape();
    auto y_shape = y_desc->shape();
    auto y_strides = y_desc->strides();
    auto x_strides = x_desc->strides();

    CHECK_SAME_SHAPE(x_shape, y_shape);

    auto meta = utils::RearrangeMeta::create(
        y_shape.data(), y_strides.data(), x_strides.data(), ndim, infiniSizeOf(dtype));
    CHECK_RESULT(meta);

    *desc_ptr = new Descriptor(
        std::move(*meta),
        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Dimension information
struct Dim {
    size_t len;
    ARRAY_TYPE_STRIDE src_stride;
    ARRAY_TYPE_STRIDE dst_stride;
};

// Split-dimension information
struct SplitDim {
    size_t choose_idx;
    size_t num_per_block;
    size_t num_per_grid;
    int array_struct_idx_block;
    int array_struct_idx_grid;
    size_t dim_len;
};

/**
 * Prepares the tensor-rearrange parameters from the given metadata. This function:
 * 1. Adjusts the unit size from the original metadata to one better suited to the GPU.
 * 2. Assigns dimensions to CUDA block dimensions and grid dimensions.
 *    This is the core step: the goal is to give each block as much relatively
 *    contiguous data as possible. Dimensions that cannot fit entirely into a
 *    block are split, and the split information is recorded to keep the kernel
 *    from accessing out of bounds, maximizing memory locality and efficiency.
 */
utils::Result<RearrangeParams> prepareRearrangeParams(
    const utils::RearrangeMeta &original_meta,
    int max_threads) {
    RearrangeParams params;

    // Pick a unit size better suited to the GPU (powers of two).
    auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1});
    CHECK_RESULT(meta_result);
    const utils::RearrangeMeta &meta = meta_result.take();

    // Dimension information
    const size_t ndim = meta.ndim();
    const size_t unit = meta.unit();

    // Special case: no dimensions — a plain copy suffices.
    if (ndim == 0) {
        params.block_dim = 0;
        params.block_len_total = 1;
        params.block_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
        params.src_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.dst_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.grid_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
        params.src_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.dst_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.unit_size = unit;
        return utils::Result<RearrangeParams>(params);
    }

    // Extract the necessary information from the metadata.
    const ptrdiff_t *idx_strides = meta.idx_strides();
    const ptrdiff_t *dst_strides = meta.dst_strides();
    const ptrdiff_t *src_strides = meta.src_strides();

    // Build the dimension table.
    std::vector<Dim> dims;
    std::vector<size_t> shape;
    dims.reserve(ndim);
    shape.reserve(ndim);

    auto prev_idx_stride = meta.count();
    for (size_t i = 0; i < ndim; ++i) {
        size_t len = prev_idx_stride / idx_strides[i];
        shape.push_back(len);
        dims.push_back({len, src_strides[i], dst_strides[i]});
        prev_idx_stride = idx_strides[i];
    }

    // Sort indices by descending |src_stride|, like src_strides_desc_idx in the Rust version.
    std::vector<size_t> src_strides_desc_idx(ndim);
    for (size_t i = 0; i < ndim; ++i) {
        src_strides_desc_idx[i] = i;
    }
    std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(),
              [&dims](size_t a, size_t b) {
                  return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride);
              });

    // Choose block and grid dimensions based on the maximum thread count.
    const size_t block_size = max_threads;
    std::vector<bool> block_dim_choose(ndim, false);

    // Counters
    size_t block_elements = 1;
    size_t block_src_elements = 1;
    size_t block_dst_elements = 1;
    size_t src_choose_idx = ndim;
    size_t dst_choose_idx = ndim;

    // Records of split dimensions
    std::vector<SplitDim> split_dims;

    // Dimension-selection loop
    while (src_choose_idx > 0 && dst_choose_idx > 0) {
        // Indices of the dimensions currently under consideration.
        size_t src_idx = src_strides_desc_idx[src_choose_idx - 1];
        size_t dst_idx = dst_choose_idx - 1;

        if (src_idx == dst_idx) {
            // Source and destination agree on this dimension; handle it once.
            size_t idx = src_idx;
            size_t len = shape[idx];
            // Can the whole dimension fit into the block?
            if (block_elements * len <= block_size) {
                // Choose this dimension.
                block_dim_choose[idx] = true;
                block_elements *= len;
                block_src_elements *= len;
                block_dst_elements *= len;
                src_choose_idx--;
                dst_choose_idx--;
            } else {
                // The dimension must be split.
                size_t num_per_block = block_size / block_elements;
                // Require num_per_block > 0 and len >= num_per_block.
                if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) {
                    size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // round up
                    SplitDim split_dim = {
                        idx,           // choose_idx
                        num_per_block, // num_per_block
                        num_per_grid,  // num_per_grid
                        0,             // array_struct_idx_block (updated later)
                        0,             // array_struct_idx_grid (updated later)
                        len            // original dimension length
                    };
                    split_dims.push_back(split_dim);
                }
                break;
            }
        } else {
            // Source and destination differ; handle them separately.
            // Block ratio
            double src_div_dst = static_cast<double>(block_src_elements) / block_dst_elements;
            double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst);
            double dst_num_per_block = src_num_per_block * src_div_dst;

            size_t src_current_dim_len = shape[src_idx];
            size_t dst_current_dim_len = shape[dst_idx];
            if (static_cast<double>(src_current_dim_len) < src_num_per_block) {
                // The source dimension fits entirely into the block.
                block_dim_choose[src_idx] = true;
                block_elements *= src_current_dim_len;
                block_src_elements *= src_current_dim_len;
                src_choose_idx--;
            } else if (static_cast<double>(dst_current_dim_len) < dst_num_per_block) {
                // The destination dimension fits entirely into the block.
                block_dim_choose[dst_idx] = true;
                block_elements *= dst_current_dim_len;
                block_dst_elements *= dst_current_dim_len;
                dst_choose_idx--;
            } else {
                // Both dimensions must be split.
                size_t src_num_per_block_int = static_cast<size_t>(std::floor(src_num_per_block));
                size_t dst_num_per_block_int = static_cast<size_t>(std::floor(dst_num_per_block));

                // Grid sizes (rounded up)
                size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int;
                size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int;

                // Source dimension
                if (src_num_per_block_int > 1) {
                    if (src_num_per_grid == 1) {
                        // Fits entirely into the block.
                        block_dim_choose[src_idx] = true;
                        block_elements *= src_current_dim_len;
                        block_src_elements *= src_current_dim_len;
                        src_choose_idx--;
                    } else {
                        // Must be split.
                        SplitDim split_dim = {
                            src_idx,               // choose_idx
                            src_num_per_block_int, // num_per_block
                            src_num_per_grid,      // num_per_grid
                            0,                     // array_struct_idx_block (updated later)
                            0,                     // array_struct_idx_grid (updated later)
                            src_current_dim_len    // original dimension length
                        };
                        split_dims.push_back(split_dim);
                    }
                }
                // Destination dimension
                if (dst_num_per_block_int > 1) {
                    if (dst_num_per_grid == 1) {
                        // Fits entirely into the block.
                        block_dim_choose[dst_idx] = true;
                        block_elements *= dst_current_dim_len;
                        block_dst_elements *= dst_current_dim_len;
                        dst_choose_idx--;
                    } else {
                        // Must be split.
                        SplitDim split_dim = {
                            dst_idx,               // choose_idx
                            dst_num_per_block_int, // num_per_block
                            dst_num_per_grid,      // num_per_grid
                            0,                     // array_struct_idx_block (updated later)
                            0,                     // array_struct_idx_grid (updated later)
                            dst_current_dim_len    // original dimension length
                        };
                        split_dims.push_back(split_dim);
                    }
                }
                break;
            }
        }
    }

    // Block-dimension parameters
    size_t block_dim = 0;
    size_t block_len_total = 1;
    std::vector<ARRAY_TYPE_SIZE> block_len;
    std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
    std::vector<ARRAY_TYPE_SIZE> grid_len;
    std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;

    // Fill block_len and the block strides.
    for (size_t i = 0; i < ndim; ++i) {
        if (block_dim_choose[i]) {
            block_len.push_back(shape[i]);
            src_block_stride.push_back(dims[i].src_stride);
            dst_block_stride.push_back(dims[i].dst_stride);
            block_dim += 1;
            block_len_total *= shape[i];
        }
        // Block part of split dimensions
        for (size_t j = 0; j < split_dims.size(); ++j) {
            if (i == split_dims[j].choose_idx) {
                block_len.push_back(split_dims[j].num_per_block);
                src_block_stride.push_back(dims[i].src_stride);
                dst_block_stride.push_back(dims[i].dst_stride);
                split_dims[j].array_struct_idx_block = block_dim;
                block_dim += 1;
                block_len_total *= split_dims[j].num_per_block;
            }
        }
    }

    // Fill grid_len and the grid strides.
    for (size_t i = 0; i < ndim; ++i) {
        if (!block_dim_choose[i]) {
            bool is_split = false;
            // Is this a split dimension?
            for (size_t j = 0; j < split_dims.size(); ++j) {
                if (i == split_dims[j].choose_idx) {
                    is_split = true;
                    grid_len.push_back(split_dims[j].num_per_grid);
                    src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
                    dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
                    split_dims[j].array_struct_idx_grid = grid_len.size() - 1;
                }
            }
            // Otherwise it is a full grid dimension.
            if (!is_split) {
                grid_len.push_back(shape[i]);
                src_grid_stride.push_back(dims[i].src_stride);
                dst_grid_stride.push_back(dims[i].dst_stride);
            }
        }
    }

    // If grid_len is empty, add a default entry.
    if (grid_len.empty()) {
        grid_len.push_back(1);
        src_grid_stride.push_back(0);
        dst_grid_stride.push_back(0);
    }

    // Handle constraints — logic similar to the Rust version.
    std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
    // At most two constraints are handled.
    for (size_t i = 0; i < split_dims.size(); ++i) {
        if (split_dims[i].dim_len % split_dims[i].num_per_block == 0) {
            continue;
        }
        Constraint<ARRAY_TYPE_SIZE> constraint;
        constraint.grid_idx = split_dims[i].array_struct_idx_grid;
        constraint.block_idx = split_dims[i].array_struct_idx_block;
        constraint.grid_div_block = split_dims[i].num_per_block;
        constraint.total_len = split_dims[i].dim_len;
        constraints.push_back(constraint);
    }

    // Assemble the parameters.
    params.block_dim = block_dim;
    params.block_len_total = block_len_total;
    params.block_len = block_len;
    params.src_block_stride = src_block_stride;
    params.dst_block_stride = dst_block_stride;
    params.grid_len = grid_len;
    params.src_grid_stride = src_grid_stride;
    params.dst_grid_stride = dst_grid_stride;
    params.constraints = constraints;
    params.unit_size = unit;

    return utils::Result<RearrangeParams>(params);
}

// Kernel-launch template with constraints.
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    void *y, const void *x,
    size_t grid_size,
    const RearrangeParams &params,
    size_t unit_size,
    cudaStream_t stream) {

    // Look up the kernel function.
    RearrangeParams params_copy = params; // create a non-const copy
    auto kernel_func_result = getRearrangeKernel(params_copy);
    CHECK_RESULT(kernel_func_result);
    auto kernel_func = kernel_func_result.take();

    // Non-const temporaries
    size_t block_dim = params.block_dim;
    size_t block_len_total = params.block_len_total;

    // Sanity-check the vector sizes.
    if (params.block_len.size() < block_dim
        || params.src_block_stride.size() < block_dim
        || params.dst_block_stride.size() < block_dim) {
        return INFINI_STATUS_BAD_PARAM;
    }
    if (params.grid_len.empty() || params.src_grid_stride.empty() || params.dst_grid_stride.empty()) {
        return INFINI_STATUS_BAD_PARAM;
    }

    const Constraint<ARRAY_TYPE_SIZE> *constraints_data;
    auto empty_constraints = Constraint<ARRAY_TYPE_SIZE>();
    if (params.constraints.empty()) {
        constraints_data = &empty_constraints;
    } else {
        constraints_data = params.constraints.data();
    }

    void *args[] = {
        &y, &x,
        &block_dim, &block_len_total,
        const_cast<void *>(static_cast<const void *>(params.block_len.data())),
        const_cast<void *>(static_cast<const void *>(params.src_block_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.dst_block_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.grid_len.data())),
        const_cast<void *>(static_cast<const void *>(params.src_grid_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.dst_grid_stride.data())),
        const_cast<void *>(static_cast<const void *>(constraints_data))};

    CHECK_OR_RETURN(
        cudaLaunchKernel(kernel_func, grid_size, BLOCK_SIZE, args, 0, stream) == cudaSuccess,
        INFINI_STATUS_INTERNAL_ERROR);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y, const void *x, void *stream) const {

    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // With no dimensions, a plain memory copy suffices.
    if (_meta.ndim() == 0) {
        CHECK_OR_RETURN(
            cudaMemcpyAsync(y, x, _meta.unit(), cudaMemcpyDeviceToDevice, cuda_stream) == cudaSuccess,
            INFINI_STATUS_INTERNAL_ERROR);
        return INFINI_STATUS_SUCCESS;
    }

    // Device properties
    int max_threads = _opaque->internal->maxThreadsPerBlock();

    // Prepare the parameters.
    auto params_result = prepareRearrangeParams(_meta, std::min(CUDA_BLOCK_SIZE_1024, max_threads));
    CHECK_RESULT(params_result);
    auto params = params_result.take();

    // Compute the grid size.
    size_t grid_size = 1;
    for (size_t i = 0; i < params.grid_len.size(); ++i) {
        grid_size *= params.grid_len[i];
    }
    // Guard against a zero-sized grid.
    if (grid_size == 0) {
        return INFINI_STATUS_BAD_PARAM;
    }

    // Pick a suitable kernel according to the block size.
    infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    size_t block_size = params.block_len_total;
    if (block_size <= CUDA_BLOCK_SIZE_512) {
        status = launchKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
    } else if (block_size <= CUDA_BLOCK_SIZE_1024) {
        status = launchKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

    return status;
}
} // namespace op::rearrange::cuda
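To make the splitting logic concrete, here is one worked instance of the arithmetic prepareRearrangeParams applies when a dimension cannot fit into the remaining block capacity (all numbers below are invented for illustration):

#include <cstddef>
#include <cstdio>

int main() {
    const size_t block_size = 1024;  // max threads per block
    const size_t block_elements = 8; // elements already assigned to the block
    const size_t len = 300;          // length of the dimension to place

    // 300 * 8 > 1024, so the dimension is split exactly as in the code above.
    size_t num_per_block = block_size / block_elements;              // 128 elements per block
    size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // ceil(300/128) = 3 slices

    // 300 % 128 != 0, so a Constraint is recorded: the last grid slice is
    // partial and the kernel must not index past the end of the dimension.
    printf("num_per_block=%zu num_per_grid=%zu needs_constraint=%d\n",
           num_per_block, num_per_grid, (int)(len % num_per_block != 0));
    return 0;
}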
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh — new file (mode 100644)

#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__

#include "../rearrange.h"

DESCRIPTOR(cuda)

#endif // __REARRANGE_CUDA_H__
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh — new file (mode 100644)

(Diff collapsed in the original view: 330 added lines not shown.)
src/infiniop/ops/rearrange/operator.cc

@@ -6,6 +6,10 @@
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_CUDA_API
+#include "cuda/rearrange_cuda.cuh"
+#endif
 
 __C infiniStatus_t infiniopCreateRearrangeDescriptor(
     infiniopHandle_t handle,
     infiniopRearrangeDescriptor_t *desc_ptr,

@@ -26,6 +30,10 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -50,6 +58,10 @@ __C infiniStatus_t infiniopRearrange(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }

@@ -71,6 +83,10 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

@@ -5,7 +5,6 @@
 namespace op::rms_norm::ascend {
 
 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
     aclnnTensorDescriptor_t y;
     aclnnTensorDescriptor_t x;
     aclnnTensorDescriptor_t w;

@@ -17,7 +16,6 @@ struct Descriptor::Opaque {
         delete x;
         delete w;
         delete rstd;
-        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -64,16 +62,17 @@ infiniStatus_t Descriptor::create(
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd,
                                            &workspace_size, &executor));
-    aclSetAclOpExecutorRepeatable(executor);
 
     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
 
     *desc_ptr = new Descriptor(
-        new Opaque{executor, y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size},
         std::move(info), all_workspace_size,
         handle_ascend->device, handle_ascend->device_id);
+
+    aclDestroyAclOpExecutor(executor);
     return INFINI_STATUS_SUCCESS;
 }

@@ -89,16 +88,21 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;
+    size_t workspace_size_ = 0;
+    aclOpExecutor *executor = nullptr;
+    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd,
+                                           &workspace_size_, &executor));
+    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));
 
     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
-    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(executor, 1, tw, (void *)w);
+    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
+        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
     }
 
     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/rope/cuda/rope_cuda.cu

@@ -46,8 +46,9 @@ infiniStatus_t calculateRoPE(const RoPEInfo &info,
                              const Tdata *sin_table,
                              const Tdata *cos_table,
                              cudaStream_t stream) {
-    auto dimx = static_cast<unsigned int>(info.seqlen);
-    auto dimy = static_cast<unsigned int>(info.nhead);
+    auto dimx = uint32_t(info.seqlen),
+         dimy = uint32_t(info.nhead);
     int nthreads = std::max(int(info.table_dim), block_size);
 
     ropeThreadPerItem<<<dim3(dimx, dimy), nthreads, 0, stream>>>(
src/infiniop/ops/swiglu/cuda/swiglu_cuda_internal.cuh

@@ -14,7 +14,7 @@ private:
     } else if constexpr (std::is_same_v<T, half>) {
         return hrcp(__hadd(half(1.f), __float2half(__expf(__half2float(__hneg(x))))));
     } else if constexpr (std::is_same_v<T, float>) {
-        return __frcp_rd(__fadd_rd(1, __expf(-x)));
+        return __frcp_rn(__fadd_rn(1, __expf(-x)));
     } else {
         return 1 / (1 + std::exp(-x));
     }

@@ -29,7 +29,7 @@ public:
     } else if constexpr (std::is_same_v<T, half>) {
         return __hmul(__hmul(gate, sigmoid(gate)), up);
     } else if constexpr (std::is_same_v<T, float>) {
-        return __fmul_rd(__fmul_rd(gate, sigmoid(gate)), up);
+        return __fmul_rn(__fmul_rn(gate, sigmoid(gate)), up);
     } else {
         return gate * sigmoid(gate) * up;
     }
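For reference, the quantity both hunks compute is SwiGLU: out = gate · sigmoid(gate) · up, i.e. SiLU of the gate scaled by up. The commit only swaps the float path from round-down (_rd) to round-to-nearest (_rn) intrinsics; the formula itself is unchanged. A plain double-precision sketch of the same formula, useful as a ground truth when comparing the two variants:

#include <cmath>
#include <cstdio>

// Reference sigmoid, matching the generic branch: 1 / (1 + exp(-x)).
static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

int main() {
    double gate = 0.5, up = 2.0;
    // swiglu(gate, up) = gate * sigmoid(gate) * up
    printf("swiglu(%g, %g) = %g\n", gate, up, gate * sigmoid(gate) * up);
    return 0;
}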