jerrrrry / infinicore · Commit b2b4526b
Authored May 16, 2025 by wooway777

issue/216 - Merge remote-tracking branch main into issue/216

Parents: 3c696932, 0c803397

Showing 17 changed files with 486 additions and 57 deletions (+486 -57)
src/infiniop/devices/ascend/CMakeLists.txt                        +8   -4
src/infiniop/devices/ascend/ascend_kernel_common.h                +11  -0
src/infiniop/devices/ascend/common_ascend.cc                      +11  -5
src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.cc   +17  -17
src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.h    +0   -0
src/infiniop/ops/causal_softmax/operator.cc                       +1   -1
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc             +94  -0
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h              +8   -0
src/infiniop/ops/rearrange/operator.cc                            +12  -0
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc                +10  -13
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc                   +53  -0
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h                    +73  -0
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp           +170 -0
src/infiniop/ops/swiglu/operator.cc                               +10  -12
test/infiniop/causal_softmax.py                                   +4   -1
test/infiniop/libinfiniop/utils.py                                +2   -1
xmake/ascend.lua                                                  +2   -3
src/infiniop/devices/ascend/CMakeLists.txt

@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)

@@ -19,10 +19,14 @@ else()
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+include_directories(${CMAKE_SOURCE_DIR}/../../../../include/infiniop/)

 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+    # ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
src/infiniop/devices/ascend/ascend_kernel_common.h (new file, 0 → 100644)

#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__

#include "../../../../include/infinicore.h"
#include "kernel_operator.h"

constexpr int32_t BLOCK_NUM = 8;
constexpr int32_t BUFFER_NUM = 2;
constexpr int32_t BYTE_ALIGN = 32;

#endif
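Note: these three constants drive the new SwiGLU kernel later in this commit: BLOCK_NUM kernel blocks, double buffering, and 32-byte-aligned copies. As a quick illustration (a hypothetical standalone helper, not part of the commit), this mirrors the rounding the kernel applies to its per-block copy length:

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the _copy_len computation in
// swiglu_ascend_kernel.cpp: round a tile of `len` elements of `elem_size`
// bytes up to the next BYTE_ALIGN (32-byte) boundary.
constexpr int64_t align_elements(int64_t len, int64_t elem_size, int64_t byte_align = 32) {
    int64_t bytes = len * elem_size;
    return bytes % byte_align == 0 ? len : (bytes + (byte_align - bytes % byte_align)) / elem_size;
}

int main() {
    // 13 half-precision elements occupy 26 bytes; padding to 32 bytes gives 16 elements.
    std::printf("%lld\n", (long long)align_elements(13, 2)); // prints 16
    return 0;
}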
src/infiniop/devices/ascend/common_ascend.cc

 #include "common_ascend.h"

 std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
-    auto index = std::max_element(strides.begin(), strides.end());
-    uint64_t max_stride_index = std::distance(strides.begin(), index);
-    auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]});
+    if (shape.size() != strides.size()) {
+        throw std::invalid_argument("Shape and strides must have the same length.");
+    }
+    int64_t max_offset = 0;
+    for (size_t i = 0; i < shape.size(); ++i) {
+        max_offset += (shape[i] - 1) * strides[i];
+    }
-    return storageShape;
+    // storage shape is 1D buffer that must cover all accessed elements
+    return {max_offset + 1};
 }

 size_t aclnnTensorDescriptor::numel() const {

@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
         this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
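Note: the rewritten inferStorageShape sizes the flat backing buffer from the largest reachable offset instead of from the single largest stride. A minimal standalone check of that rule (hypothetical code, not part of the commit):

#include <cstdint>
#include <cstdio>
#include <vector>

// Same rule as the rewritten inferStorageShape: the flat backing buffer must
// hold the element at offset sum((shape[i] - 1) * strides[i]), so it needs
// that offset + 1 elements.
int64_t storage_elems(const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1;
}

int main() {
    // shape (2, 100) with strides (100, 2): the farthest element sits at
    // offset 1*100 + 99*2 = 298, so 299 elements are required, while the old
    // max-stride heuristic (shape[0] * strides[0]) would report only 200.
    std::printf("%lld\n", (long long)storage_elems({2, 100}, {100, 2})); // prints 299
    return 0;
}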
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.cc → src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.cc

-#include "causal_softmax_aclnn.h"
+#include "causal_softmax_ascend.h"
 #include "../../../devices/ascend/common_ascend.h"

 #include <aclnnop/aclnn_masked_fill_tensor.h>
 #include <aclnnop/aclnn_softmax.h>

@@ -12,6 +12,8 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t value;
     void *mask_addr;
     void *value_addr;
+    uint64_t workspacesize;
+    aclOpExecutor *executor;

     ~Opaque() {
         delete x;

@@ -21,6 +23,9 @@ struct Descriptor::Opaque {
         aclrtFree(mask_addr);
         aclrtFree(value_addr);
+
+        // Delete useless executor
+        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -92,18 +97,18 @@ infiniStatus_t Descriptor::create(
     aclTensor *tvalue = value->tensor;

     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     int64_t dim = 2;
     CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
+    // set executor reusable
+    aclSetAclOpExecutorRepeatable(executor);

-    // Create the descriptor
-    size_t all_workspacesize = workspacesize_softmax + workspacesize_mask;
-    *desc_ptr = new Descriptor(
-        new Opaque{x, mask, y, value, mask_addr, value_addr},
-        std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);
+    // Create the descripto
+    size_t all_workspacesize = std::max(workspacesize_softmax, workspacesize_mask);
     // Delete useless executor
-    aclDestroyAclOpExecutor(executor);
     aclDestroyAclOpExecutor(mask_executor);
+    *desc_ptr = new Descriptor(
+        new Opaque{x, mask, y, value, mask_addr, value_addr, workspacesize_softmax, executor},
+        std::move(info), all_workspacesize, handle_ascend->device, handle_ascend->device_id);

     return INFINI_STATUS_SUCCESS;
 }

@@ -116,23 +121,18 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, voi
     auto ty = _opaque->y->tensor;
     auto tmask = _opaque->mask->tensor;
     auto tvalue = _opaque->value->tensor;

-    aclOpExecutor *executor = nullptr;
     aclOpExecutor *mask_executor = nullptr;
-    size_t workspacesize_softmax = 0;
     size_t workspacesize_mask = 0;
-    int64_t dim = 2;

     AclSetTensorAddr(mask_executor, 0, tx, (void *)x);
     AclSetTensorAddr(mask_executor, 1, tmask, _opaque->mask_addr);
     AclSetTensorAddr(mask_executor, 2, tvalue, _opaque->value_addr);
     CHECK_ACL(aclnnInplaceMaskedFillTensorGetWorkspaceSize(tx, tmask, tvalue, &workspacesize_mask, &mask_executor));
     CHECK_ACL(aclnnInplaceMaskedFillTensor(workspace, workspacesize_mask, mask_executor, stream));
     CHECK_ACL(aclrtSynchronizeStream(stream));

-    AclSetTensorAddr(executor, 0, tx, (void *)x);
-    AclSetTensorAddr(executor, 1, ty, y);
-    CHECK_ACL(aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &workspacesize_softmax, &executor));
-    CHECK_ACL(aclnnSoftmax(workspace, workspacesize_softmax, executor, stream));
+    AclSetTensorAddr(_opaque->executor, 0, tx, (void *)x);
+    AclSetTensorAddr(_opaque->executor, 1, ty, y);
+    CHECK_ACL(aclnnSoftmax(workspace, _opaque->workspacesize, _opaque->executor, stream));

     return INFINI_STATUS_SUCCESS;
 }
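Note: the net effect of these causal-softmax changes is the usual CANN executor-reuse pattern: build the softmax executor once in create(), mark it repeatable, and only rebind device addresses before replaying it in calculate(). A condensed sketch using the same calls as the diff (AclSetTensorAddr is the project helper from common_ascend.h; error handling and the masked-fill step are elided, so this is an illustration rather than a drop-in implementation):

#include <aclnnop/aclnn_softmax.h>
#include "../../../devices/ascend/common_ascend.h"

struct SoftmaxPlan {
    aclOpExecutor *executor = nullptr; // built once, reused across calls
    uint64_t workspacesize = 0;
};

// create(): query the workspace once and keep the executor alive.
static void plan_softmax(aclTensor *tx, aclTensor *ty, int64_t dim, SoftmaxPlan &plan) {
    aclnnSoftmaxGetWorkspaceSize(tx, dim, ty, &plan.workspacesize, &plan.executor);
    aclSetAclOpExecutorRepeatable(plan.executor); // keep it valid after the first launch
}

// calculate(): rebind the device buffers, then replay the stored executor.
static void run_softmax(const SoftmaxPlan &plan, aclTensor *tx, aclTensor *ty,
                        void *x, void *y, void *workspace, void *stream) {
    AclSetTensorAddr(plan.executor, 0, tx, x); // rebind input buffer
    AclSetTensorAddr(plan.executor, 1, ty, y); // rebind output buffer
    aclnnSoftmax(workspace, plan.workspacesize, plan.executor, stream);
}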
src/infiniop/ops/causal_softmax/ascend/causal_softmax_aclnn.h → src/infiniop/ops/causal_softmax/ascend/causal_softmax_ascend.h

File moved
src/infiniop/ops/causal_softmax/operator.cc

@@ -9,7 +9,7 @@
 #include "cuda/causal_softmax_cuda.cuh"
 #endif
 #ifdef ENABLE_ASCEND_API
-#include "ascend/causal_softmax_aclnn.h"
+#include "ascend/causal_softmax_ascend.h"
 #endif

 __C infiniStatus_t infiniopCreateCausalSoftmaxDescriptor(
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc (new file, 0 → 100644)

#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

#include <aclnnop/aclnn_copy.h>

namespace op::rearrange::ascend {

struct Descriptor::Opaque {
    aclnnTensorDescriptor_t dst;
    aclnnTensorDescriptor_t src;
    void *workspace; // aclnnInplaceCopy workspace
    uint64_t workspace_size;

    ~Opaque() {
        delete dst;
        delete src;
        aclrtFree(workspace);
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
};

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    auto shape = y_desc->shape();

    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < ndim; ++i) {
        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
    }

    auto dst_strides = y_desc->strides();
    auto src_strides = x_desc->strides();
    auto element_size = infiniSizeOf(dtype);

    auto result = utils::RearrangeMeta::create(shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
    CHECK_RESULT(result);

    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    void *workspace = nullptr;
    aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor, &workspace_size, &executor);
    if (workspace_size != 0) {
        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    }

    *desc_ptr = new Descriptor(
        result.take(),
        new Opaque{dst, src, workspace, workspace_size},
        handle->device, handle->device_id);

    // Delete useless executor
    aclDestroyAclOpExecutor(executor);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {
    auto tdst = _opaque->dst->tensor;
    auto tsrc = _opaque->src->tensor;

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    AclSetTensorAddr(executor, 0, tdst, y);
    AclSetTensorAddr(executor, 1, tsrc, (void *)x);
    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));

    // Execute InplaceCopy
    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size, executor, stream));

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::rearrange::ascend
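Note: the rearrange operator is a pure layout copy: every element of a strided source view is written into a (possibly differently) strided destination view, which is what the aclnnInplaceCopy call provides on device. A hypothetical plain-C++ illustration of those semantics (example shapes and strides chosen here, not taken from the commit):

#include <cstdio>
#include <vector>

int main() {
    std::vector<float> src = {0, 1, 2, 3, 4, 5}; // shape (2, 3), strides (3, 1): row-major
    std::vector<float> dst(6, 0.0f);             // shape (2, 3), strides (1, 2): column-major
    const long shape[2] = {2, 3};
    const long src_strides[2] = {3, 1};
    const long dst_strides[2] = {1, 2};
    // Copy element (i, j) of the source layout into element (i, j) of the destination layout.
    for (long i = 0; i < shape[0]; ++i) {
        for (long j = 0; j < shape[1]; ++j) {
            dst[i * dst_strides[0] + j * dst_strides[1]] = src[i * src_strides[0] + j * src_strides[1]];
        }
    }
    for (float v : dst) {
        std::printf("%g ", v); // prints: 0 3 1 4 2 5
    }
    return 0;
}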
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h (new file, 0 → 100644)

#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__

#include "../rearrange.h"

DESCRIPTOR(ascend)

#endif // __REARRANGE_ASCEND_H__
src/infiniop/ops/rearrange/operator.cc

@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
 #ifdef ENABLE_CUDA_API
 #include "cuda/rearrange_cuda.cuh"

@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CREATE(INFINI_DEVICE_NVIDIA, cuda);

@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
     CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     CALCULATE(INFINI_DEVICE_NVIDIA, cuda);

@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
     DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
 #ifdef ENABLE_CUDA_API
     DELETE(INFINI_DEVICE_NVIDIA, cuda);
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;

     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+
+        aclDestroyAclOpExecutor(executor);
     }
 };

@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
     // Get WorkspaceSize and set executor
     CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(epsilon), ty, trstd,
                                            &workspace_size, &executor));
+    aclSetAclOpExecutorRepeatable(executor);

     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);

     *desc_ptr = new Descriptor(
-        new Opaque{y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info), all_workspace_size, handle_ascend->device, handle_ascend->device_id);

-    aclDestroyAclOpExecutor(executor);
-
     return INFINI_STATUS_SUCCESS;
 }

@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
     auto tx = _opaque->x->tensor;
     auto ty = _opaque->y->tensor;
     auto trstd = _opaque->rstd->tensor;

-    size_t workspace_size_ = 0;
-    aclOpExecutor *executor = nullptr;
-    CHECK_ACL(aclnnRmsNormGetWorkspaceSize(tx, tw, static_cast<double>(_info.epsilon), ty, trstd,
-                                           &workspace_size_, &executor));
-    CHECK_ACL(aclSetAclOpExecutorRepeatable(executor));

     void *rstdPtr = (void *)((uint8_t *)workspace + _opaque->workspaceSize);
     auto unit = infiniSizeOf(_info.atype);
-    AclSetTensorAddr(executor, 1, tw, (void *)w);
-    AclSetTensorAddr(executor, 3, trstd, rstdPtr);
+    AclSetTensorAddr(_opaque->executor, 1, tw, (void *)w);
+    AclSetTensorAddr(_opaque->executor, 3, trstd, rstdPtr);
     for (size_t i = 0; i < (_info.shape)[0]; ++i) {
-        AclSetTensorAddr(executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
-        AclSetTensorAddr(executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
-        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, executor, stream));
+        AclSetTensorAddr(_opaque->executor, 0, tx, ((char *)x) + i * (_info.x_strides)[0] * unit);
+        AclSetTensorAddr(_opaque->executor, 2, ty, ((char *)y) + i * (_info.y_strides)[0] * unit);
+        CHECK_ACL(aclnnRmsNorm(workspace, _opaque->workspaceSize, _opaque->executor, stream));
     }

     return INFINI_STATUS_SUCCESS;
 }
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc (new file, 0 → 100644)

#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"

namespace op::swiglu::ascend {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    std::vector<infiniopTensorDescriptor_t> input_descs) {
    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
    auto dtype = c_desc->dtype();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);

    const auto &a_desc = input_descs[0];
    const auto &b_desc = input_descs[1];
    auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
    CHECK_RESULT(result);
    SwigluInfo info = result.take();

    // https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
    size_t workspace_size = 0;

    *desc_ptr = new Descriptor(
        std::move(info), workspace_size,
        handle_ascend->device, handle_ascend->device_id);

    return INFINI_STATUS_SUCCESS;
}

extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream);

infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *c,
    std::vector<const void *> inputs,
    void *stream) const {
    auto batch = _info.ndim == 2 ? 1 : _info.shape[0];
    auto seq_len = _info.ndim == 2 ? _info.shape[0] : _info.shape[1];
    auto hidden_size = _info.shape[_info.ndim - 1];
    auto stride_batch_c = _info.ndim == 2 ? 1 : _info.c_strides[0];
    auto stride_batch_a = _info.ndim == 2 ? 1 : _info.a_strides[0];
    auto stride_batch_b = _info.ndim == 2 ? 1 : _info.b_strides[0];
    auto stride_seq_c = _info.ndim == 2 ? _info.c_strides[0] : _info.c_strides[1];
    auto stride_seq_a = _info.ndim == 2 ? _info.a_strides[0] : _info.a_strides[1];
    auto stride_seq_b = _info.ndim == 2 ? _info.b_strides[0] : _info.b_strides[1];

    auto status = swiglu_kernel_launch(
        c, (void *)inputs[0], (void *)inputs[1], _info.dtype,
        batch, seq_len, hidden_size,
        stride_batch_c, stride_batch_a, stride_batch_b,
        stride_seq_c, stride_seq_a, stride_seq_b, stream);
    return status;
}
} // namespace op::swiglu::ascend
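Note: Descriptor::calculate() collapses the 2-D (seq, hd) and 3-D (batch, seq, hd) cases into one launch signature by treating a 2-D input as batch == 1. A hypothetical worked example of that mapping (values chosen for illustration, not taken from the commit):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    // A 2-D, contiguous (seq, hd) tensor, as handled by the ndim == 2 branch above.
    std::vector<size_t> shape = {7, 128};
    std::vector<ptrdiff_t> strides = {128, 1};
    int ndim = (int)shape.size();

    size_t batch = ndim == 2 ? 1 : shape[0];
    size_t seq = ndim == 2 ? shape[0] : shape[1];
    size_t hd = shape[ndim - 1];
    ptrdiff_t stride_batch = ndim == 2 ? 1 : strides[0]; // unused when batch == 1
    ptrdiff_t stride_seq = ndim == 2 ? strides[0] : strides[1];

    std::printf("batch=%zu seq=%zu hd=%zu stride_batch=%td stride_seq=%td\n",
                batch, seq, hd, stride_batch, stride_seq);
    // prints: batch=1 seq=7 hd=128 stride_batch=1 stride_seq=128
    return 0;
}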
src/infiniop/ops/swiglu/ascend/swiglu_ascend.h (new file, 0 → 100644)

#ifndef __ACLNN_SWIGLU_H__
#define __ACLNN_SWIGLU_H__

#include "../../../../utils.h"
#include "../../../../utils/check.h"
#include "../../../operator.h"
#include "../../../tensor.h"

namespace op::swiglu::ascend {
class SwigluInfo {
    SwigluInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> shape;
    int32_t ndim;
    std::vector<ptrdiff_t> c_strides;
    std::vector<ptrdiff_t> a_strides;
    std::vector<ptrdiff_t> b_strides;

    static utils::Result<SwigluInfo> create(
        infiniopTensorDescriptor_t c_desc,
        infiniopTensorDescriptor_t a_desc,
        infiniopTensorDescriptor_t b_desc) {
        CHECK_OR_RETURN(c_desc && a_desc && b_desc, INFINI_STATUS_BAD_PARAM);
        CHECK_OR_RETURN(!c_desc->hasBroadcastDim(), INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->ndim() == a_desc->ndim()
                            && c_desc->ndim() == b_desc->ndim()
                            && (c_desc->ndim() == 2 || c_desc->ndim() == 3),
                        INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_SAME_SHAPE(c_desc->shape(), a_desc->shape(), b_desc->shape());
        int32_t ndim = c_desc->ndim();
        CHECK_OR_RETURN(c_desc->stride(ndim - 1) == 1
                            && a_desc->stride(ndim - 1) == 1
                            && b_desc->stride(ndim - 1) == 1,
                        INFINI_STATUS_BAD_TENSOR_STRIDES);
        CHECK_OR_RETURN(c_desc->dtype() == a_desc->dtype() && c_desc->dtype() == b_desc->dtype(),
                        INFINI_STATUS_BAD_TENSOR_DTYPE);

        return utils::Result<SwigluInfo>(SwigluInfo{
            c_desc->dtype(),
            c_desc->shape(),
            ndim,
            c_desc->strides(),
            a_desc->strides(),
            b_desc->strides(),
        });
    }
};

class Descriptor final : public InfiniopDescriptor {
    SwigluInfo _info;
    size_t _workspace_size;

    Descriptor(SwigluInfo info, size_t workspace_size, infiniDevice_t device_type, int device_id)
        : InfiniopDescriptor{device_type, device_id},
          _info(info), _workspace_size(workspace_size) {}

public:
    ~Descriptor();

    static infiniStatus_t create(
        infiniopHandle_t handle,
        Descriptor **desc_ptr,
        infiniopTensorDescriptor_t c_desc,
        std::vector<infiniopTensorDescriptor_t> input_descs);

    size_t workspaceSize() const { return _workspace_size; }

    infiniStatus_t calculate(
        void *workspace, size_t workspace_size,
        void *c,
        std::vector<const void *> inputs,
        void *stream) const;
};
} // namespace op::swiglu::ascend

#endif // __ACLNN_SWIGLU_H__
src/infiniop/ops/swiglu/ascend/swiglu_ascend_kernel.cpp (new file, 0 → 100644)

#include "../../../devices/ascend/ascend_kernel_common.h"

using namespace AscendC;

template <typename T>
class SwigluKernel {
public:
    __aicore__ inline SwigluKernel() {}
    __aicore__ inline void init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                int64_t batch_, int64_t seq, int64_t hd,
                                int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
                                int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b);
    __aicore__ inline void process();

private:
    __aicore__ inline void copyIn(int64_t i);
    __aicore__ inline void compute(int64_t i);
    __aicore__ inline void copyOut(int64_t i);

private:
    GlobalTensor<T> _c_gm, _a_gm, _b_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_queue_a, _in_queue_b;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_queue_c;
    TPipe _pipe;
    float _beta_value = 1.0f;
    int64_t _block_idx, _tile_len, _copy_len,
        _batch, _seq_len, _hidden_size,
        _stride_seq_a, _stride_seq_b, _stride_seq_c;
    int64_t _stride_batch_a = 1, _stride_batch_b = 1, _stride_batch_c = 1;
};

template <typename T>
__aicore__ inline void SwigluKernel<T>::init(GM_ADDR c, GM_ADDR a, GM_ADDR b,
                                             int64_t batch_, int64_t seq, int64_t hd,
                                             int64_t stride_batch_c, int64_t stride_batch_a, int64_t stride_batch_b,
                                             int64_t stride_seq_c, int64_t stride_seq_a, int64_t stride_seq_b) {
    // Init Shape & Stride Variables
    _batch = batch_;
    _seq_len = seq;
    _hidden_size = hd;
    _stride_batch_a = stride_batch_a;
    _stride_batch_b = stride_batch_b;
    _stride_batch_c = stride_batch_c;
    _stride_seq_a = stride_seq_a;
    _stride_seq_b = stride_seq_b;
    _stride_seq_c = stride_seq_c;

    _block_idx = GetBlockIdx();
    _tile_len = _block_idx < (_hidden_size % BLOCK_NUM)
                    ? (_hidden_size / BLOCK_NUM) + 1
                    : (_hidden_size / BLOCK_NUM);
    _copy_len = (_tile_len * sizeof(T)) % BYTE_ALIGN == 0
                    ? _tile_len
                    : (_tile_len * sizeof(T) + (BYTE_ALIGN - _tile_len * sizeof(T) % BYTE_ALIGN)) / sizeof(T);

    // Set global tensor
    _a_gm.SetGlobalBuffer((__gm__ T *)a);
    _b_gm.SetGlobalBuffer((__gm__ T *)b);
    _c_gm.SetGlobalBuffer((__gm__ T *)c);

    // _pipe alloc memory to queue, the unit is bytes
    _pipe.InitBuffer(_in_queue_a, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_in_queue_b, BUFFER_NUM, _copy_len * sizeof(T));
    _pipe.InitBuffer(_out_queue_c, BUFFER_NUM, _copy_len * sizeof(T));
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::copyIn(int64_t i) {
    // Alloc tensor from queue memory
    LocalTensor<T> aLocal = _in_queue_a.AllocTensor<T>();
    LocalTensor<T> bLocal = _in_queue_b.AllocTensor<T>();
    // Get idx of current tile
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    int64_t idxa = batch_idx * _stride_batch_a + seq_idx * _stride_seq_a + _block_idx * _tile_len;
    int64_t idxb = batch_idx * _stride_batch_b + seq_idx * _stride_seq_b + _block_idx * _tile_len;
    // Copy process_th tile from global tensor to local tensor
    DataCopy(aLocal, _a_gm[idxa], _copy_len);
    DataCopy(bLocal, _b_gm[idxb], _copy_len);
    // Enque input tensor to VECIN queue
    _in_queue_a.EnQue(aLocal);
    _in_queue_b.EnQue(bLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::compute(int64_t i) {
    // Deque input tensors from VECIN queue
    LocalTensor<T> aLocal = _in_queue_a.DeQue<T>();
    LocalTensor<T> bLocal = _in_queue_b.DeQue<T>();
    LocalTensor<T> cLocal = _out_queue_c.AllocTensor<T>();
    // Call SwiGLU ascend api
    SwiGLU<T, false>(cLocal, aLocal, bLocal, _beta_value, _copy_len);
    // Enque result and free input
    _out_queue_c.EnQue<T>(cLocal);
    _in_queue_a.FreeTensor(aLocal);
    _in_queue_b.FreeTensor(bLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::copyOut(int64_t i) {
    // Deque output tensor from VECOUT queue
    LocalTensor<T> cLocal = _out_queue_c.DeQue<T>();
    auto batch_idx = _batch == 1 ? 0 : i / _seq_len;
    auto seq_idx = _batch == 1 ? i : i % _seq_len;
    int64_t idxc = batch_idx * _stride_batch_c + seq_idx * _stride_seq_c + _block_idx * _tile_len;
    // Copy progress_th tile from local tensor to global tensor
    if (_tile_len * sizeof(T) % BYTE_ALIGN != 0) {
        DataCopyExtParams dcep = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
        DataCopyPad(_c_gm[idxc], cLocal, dcep);
    } else {
        DataCopy(_c_gm[idxc], cLocal, _tile_len);
    }
    // Free output Local tensor
    _out_queue_c.FreeTensor(cLocal);
}

template <typename T>
__aicore__ inline void SwigluKernel<T>::process() {
    for (int64_t i = 0; i < _batch * _seq_len; ++i) {
        copyIn(i);
        compute(i);
        copyOut(i);
    }
}

#define DEFINE_SWIGLU_KERNEL(KERNEL_NAME, TYPE)                              \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR c, GM_ADDR a, GM_ADDR b,  \
                                           int64_t batch, int64_t seq, int64_t hd, \
                                           int64_t stride_batch_c,           \
                                           int64_t stride_batch_a,           \
                                           int64_t stride_batch_b,           \
                                           int64_t stride_seq_c,             \
                                           int64_t stride_seq_a,             \
                                           int64_t stride_seq_b) {           \
        SwigluKernel<TYPE> op;                                               \
        op.init(c, a, b,                                                     \
                batch, seq, hd,                                              \
                stride_batch_c, stride_batch_a, stride_batch_b,              \
                stride_seq_c, stride_seq_a, stride_seq_b);                   \
        op.process();                                                        \
    }

DEFINE_SWIGLU_KERNEL(swiglu_kernel_half, half)
DEFINE_SWIGLU_KERNEL(swiglu_kernel_float, float)

#undef DEFINE_SWIGLU_KERNEL

extern "C" infiniStatus_t swiglu_kernel_launch(
    void *c, void *a, void *b,
    infiniDtype_t dtype, size_t batch, size_t seq, size_t hd,
    ptrdiff_t stride_batch_c, ptrdiff_t stride_batch_a, ptrdiff_t stride_batch_b,
    ptrdiff_t stride_seq_c, ptrdiff_t stride_seq_a, ptrdiff_t stride_seq_b,
    void *stream) {

#define LAUNCH_SWIGLU_KERNEL(DTYPE_ENUM, KERNEL_NAME)            \
    case DTYPE_ENUM:                                             \
        KERNEL_NAME<<<BLOCK_NUM, nullptr, stream>>>(             \
            c, a, b,                                             \
            static_cast<int64_t>(batch),                         \
            static_cast<int64_t>(seq),                           \
            static_cast<int64_t>(hd),                            \
            stride_batch_c, stride_batch_a, stride_batch_b,      \
            stride_seq_c, stride_seq_a, stride_seq_b);           \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F16, swiglu_kernel_half)
        LAUNCH_SWIGLU_KERNEL(INFINI_DTYPE_F32, swiglu_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

#undef LAUNCH_SWIGLU_KERNEL
}
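Note: in init(), the hidden dimension is split across BLOCK_NUM blocks, with the first hd % BLOCK_NUM blocks taking one extra element; copyIn()/copyOut() then offset into global memory by _block_idx * _tile_len. A hypothetical host-side check of that split (not part of the commit):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t BLOCK_NUM = 8; // mirrors ascend_kernel_common.h
    const int64_t hd = 100;
    for (int64_t block_idx = 0; block_idx < BLOCK_NUM; ++block_idx) {
        // Same expression as _tile_len in SwigluKernel<T>::init().
        int64_t tile_len = block_idx < (hd % BLOCK_NUM) ? hd / BLOCK_NUM + 1 : hd / BLOCK_NUM;
        std::printf("block %lld: %lld elements\n", (long long)block_idx, (long long)tile_len);
    }
    // blocks 0-3 get 13 elements, blocks 4-7 get 12 (4*13 + 4*12 = 100)
    return 0;
}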
src/infiniop/ops/swiglu/operator.cc

@@ -11,6 +11,9 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/swiglu_kunlun.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/swiglu_ascend.h"
+#endif

 __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
     infiniopHandle_t handle,

@@ -46,11 +49,8 @@ __C infiniStatus_t infiniopCreateSwiGLUDescriptor(
             c_desc,
             a_desc,
             b_desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendCreateSwiGLUDescriptor(
-            (AscendHandle_t)handle,
-            (SwiGLUAscendDescriptor_t *)desc_ptr,
-            c_desc, a_desc, b_desc);
+#ifdef ENABLE_ASCEND_API
+    CREATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu: {

@@ -94,7 +94,7 @@ __C infiniStatus_t infiniopGetSwiGLUWorkspaceSize(infiniopSwiGLUDescriptor_t des
         return bangGetSwiGLUWorkspaceSize((SwiGLUBangDescriptor_t)desc, size);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
+#ifdef ENABLE_ASCEND_API
     GET(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU

@@ -144,9 +144,8 @@ __C infiniStatus_t infiniopSwiGLU(
         return bangSwiGLU((SwiGLUBangDescriptor_t)desc, c, a, b, stream);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendSwiGLU((SwiGLUAscendDescriptor_t)desc, c, a, b, stream);
+#ifdef ENABLE_ASCEND_API
+    CALCULATE(INFINI_DEVICE_ASCEND, ascend);
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:

@@ -188,9 +187,8 @@ infiniopDestroySwiGLUDescriptor(infiniopSwiGLUDescriptor_t desc) {
         return bangDestroySwiGLUDescriptor((SwiGLUBangDescriptor_t)desc);
     }
 #endif
-#ifdef ENABLE_ASCEND_NPU
-    case DevAscendNpu:
-        return ascendDestroySwiGLUDescriptor((SwiGLUAscendDescriptor_t)desc);
+#ifdef ENABLE_ASCEND_API
+    DELETE(INFINI_DEVICE_ASCEND, ascend)
 #endif
 #ifdef ENABLE_METAX_GPU
     case DevMetaxGpu:
test/infiniop/causal_softmax.py

@@ -37,7 +37,7 @@ _TENSOR_DTYPES = [torch.float16]
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    torch.float16: {"atol": 0, "rtol": 1e-2},
+    torch.float16: {"atol": 1e-3, "rtol": 1e-2},
 }

@@ -143,6 +143,9 @@ def test(
     )

     lib_causal_softmax()

+    if sync is not None:
+        sync()
+
     atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
     if DEBUG:
test/infiniop/libinfiniop/utils.py

@@ -476,10 +476,11 @@ def get_test_devices(args):
 def get_sync_func(device):
     import torch

+    device_str = infiniDeviceEnum_str_map[device]
     if device == InfiniDeviceEnum.CPU:
         sync = None
     else:
-        sync = getattr(torch, infiniDeviceEnum_str_map[device]).synchronize
+        sync = getattr(torch, device_str).synchronize
     return sync
xmake/ascend.lua

@@ -50,9 +50,8 @@ target("infiniop-ascend")
     add_files("$(projectdir)/src/infiniop/devices/ascend/*.cc", "$(projectdir)/src/infiniop/ops/*/ascend/*.cc")

     -- Add operator
-    -- TODO: add it back after ascend-kernels is fixed
-    -- add_rules("ascend-kernels")
-    -- add_links(builddir.."/libascend_kernels.a")
+    add_rules("ascend-kernels")
+    add_links(builddir.."/libascend_kernels.a")

 target_end()

 target("infinirt-ascend")