Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
9b32b4b1
Commit
9b32b4b1
authored
Jun 04, 2025
by
Catheriany
Browse files
Merge remote-tracking branch 'origin/main' into issue/150
parents
15bcbdfc
4799ddbf
Changes
103
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1248 additions
and
156 deletions
+1248
-156
src/infiniop/ops/clip/operator.cc
src/infiniop/ops/clip/operator.cc
+118
-0
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
+46
-15
src/infiniop/ops/gemm/kunlun/gemm_kunlun.cc
src/infiniop/ops/gemm/kunlun/gemm_kunlun.cc
+1
-1
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
+33
-96
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
+101
-0
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cuh
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cuh
+8
-0
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
+254
-0
src/infiniop/ops/random_sample/info.h
src/infiniop/ops/random_sample/info.h
+33
-0
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+17
-1
src/infiniop/ops/random_sample/random_sample.h
src/infiniop/ops/random_sample/random_sample.h
+97
-9
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc
+94
-0
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h
+8
-0
src/infiniop/ops/rearrange/operator.cc
src/infiniop/ops/rearrange/operator.cc
+12
-0
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc
+10
-13
src/infiniop/ops/rms_norm/kunlun/rms_norm_kernel.xpu
src/infiniop/ops/rms_norm/kunlun/rms_norm_kernel.xpu
+3
-1
src/infiniop/ops/rope/ascend/rope_ascend.cc
src/infiniop/ops/rope/ascend/rope_ascend.cc
+50
-0
src/infiniop/ops/rope/ascend/rope_ascend.h
src/infiniop/ops/rope/ascend/rope_ascend.h
+25
-0
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
+280
-0
src/infiniop/ops/rope/operator.cc
src/infiniop/ops/rope/operator.cc
+11
-20
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc
+47
-0
No files found.
src/infiniop/ops/clip/operator.cc
0 → 100644
View file @
9b32b4b1
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/clip.h"
#ifdef ENABLE_CPU_API
#include "cpu/clip_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/clip_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateClipDescriptor
(
infiniopHandle_t
handle
,
infiniopClipDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
y
,
infiniopTensorDescriptor_t
x
,
infiniopTensorDescriptor_t
min_val
,
infiniopTensorDescriptor_t
max_val
)
{
#define CREATE(CASE, NAMESPACE) \
case CASE: \
return op::clip::NAMESPACE::Descriptor::create( \
handle, \
reinterpret_cast<op::clip::NAMESPACE::Descriptor **>(desc_ptr), \
y, \
{x, min_val, max_val})
switch
(
handle
->
device
)
{
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CREATE
}
__C
infiniStatus_t
infiniopGetClipWorkspaceSize
(
infiniopClipDescriptor_t
desc
,
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE: \
*size = reinterpret_cast<op::clip::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
)
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
)
#endif
}
#undef GET
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
__C
infiniStatus_t
infiniopClip
(
infiniopClipDescriptor_t
desc
,
void
*
workspace
,
size_t
workspace_size
,
void
*
y
,
const
void
*
x
,
const
void
*
min_val
,
const
void
*
max_val
,
void
*
stream
)
{
#define CALCULATE(CASE, NAMESPACE) \
case CASE: \
return reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc) \
->calculate(workspace, workspace_size, y, {x, min_val, max_val}, stream)
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef CALCULATE
}
__C
infiniStatus_t
infiniopDestroyClipDescriptor
(
infiniopClipDescriptor_t
desc
)
{
#define DELETE(CASE, NAMESPACE) \
case CASE: \
delete reinterpret_cast<const op::clip::NAMESPACE::Descriptor *>(desc); \
return INFINI_STATUS_SUCCESS;
switch
(
desc
->
device_type
)
{
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
}
#undef DELETE
}
src/infiniop/ops/gemm/ascend/gemm_ascend.cc
View file @
9b32b4b1
...
...
@@ -3,6 +3,26 @@
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/level2/aclnn_gemm.h>
#include <cstring>
#include <unordered_map>
// Custom hash function for alpha beta pair<float, float>
struct
FloatPairHash
{
size_t
operator
()(
const
std
::
pair
<
float
,
float
>
&
p
)
const
{
uint64_t
combined
;
std
::
memcpy
(
reinterpret_cast
<
char
*>
(
&
combined
),
&
p
.
first
,
sizeof
(
float
));
std
::
memcpy
(
reinterpret_cast
<
char
*>
(
&
combined
)
+
sizeof
(
float
),
&
p
.
second
,
sizeof
(
float
));
return
std
::
hash
<
uint64_t
>
()(
combined
);
}
};
struct
FloatPairEqual
{
bool
operator
()(
const
std
::
pair
<
float
,
float
>
&
a
,
const
std
::
pair
<
float
,
float
>
&
b
)
const
{
return
a
.
first
==
b
.
first
&&
a
.
second
==
b
.
second
;
}
};
namespace
op
::
gemm
::
ascend
{
struct
Descriptor
::
Opaque
{
...
...
@@ -11,11 +31,17 @@ struct Descriptor::Opaque {
// see doc:
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha002/apiref/appdevgapi/context/aclnnBatchMatMul.md
int8_t
mt
;
// alpha&beta hashmap
std
::
unordered_map
<
std
::
pair
<
float
,
float
>
,
aclOpExecutor
*
,
FloatPairHash
,
FloatPairEqual
>
lookup
;
~
Opaque
()
{
delete
c
;
delete
a
;
delete
b
;
for
(
auto
&
item
:
lookup
)
{
aclDestroyAclOpExecutor
(
item
.
second
);
}
lookup
.
clear
();
}
};
...
...
@@ -54,15 +80,16 @@ infiniStatus_t Descriptor::create(
ta
=
a
->
tensor
,
tb
=
b
->
tensor
;
std
::
unordered_map
<
std
::
pair
<
float
,
float
>
,
aclOpExecutor
*
,
FloatPairHash
,
FloatPairEqual
>
lookup
;
aclOpExecutor
*
executor
=
nullptr
;
size_t
workspace_size
=
0
;
// aclnnGemm support C = alpha * A @ B + beta * C
// see
// https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC3alpha003/apiref/aolapi/context/aclnnGemm.md
// use alpha = 0.5, beta = 0.5 temporarily
int8_t
mt
=
1
;
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
.5
,
.5
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
1.
,
0.
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
lookup
[
std
::
make_pair
(
1.0
f
,
0.0
f
)]
=
executor
;
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
1.
,
1.
,
0
,
0
,
tc
,
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
lookup
[
std
::
make_pair
(
1.0
f
,
1.0
f
)]
=
executor
;
*
desc_ptr
=
new
Descriptor
(
dtype
,
info
,
workspace_size
,
...
...
@@ -71,11 +98,9 @@ infiniStatus_t Descriptor::create(
a
,
b
,
mt
,
},
std
::
move
(
lookup
)
},
handle
->
device
,
handle
->
device_id
);
aclDestroyAclOpExecutor
(
executor
);
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -93,16 +118,22 @@ infiniStatus_t Descriptor::calculate(
ta
=
_opaque
->
a
->
tensor
,
tb
=
_opaque
->
b
->
tensor
;
size_t
workspace_size
=
0
;
aclOpExecutor
*
executor
=
nullptr
;
size_t
workspace_size
=
_workspace_size
;
aclOpExecutor
*
executor
;
auto
key
=
std
::
make_pair
(
alpha
,
beta
);
if
(
_opaque
->
lookup
.
find
(
key
)
!=
_opaque
->
lookup
.
end
())
{
executor
=
_opaque
->
lookup
[
key
];
}
else
{
CHECK_ACL
(
aclnnGemmGetWorkspaceSize
(
ta
,
tb
,
tc
,
alpha
,
beta
,
0
,
0
,
tc
,
_opaque
->
mt
,
&
workspace_size
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
_opaque
->
lookup
[
key
]
=
executor
;
}
if
(
workspaceSize_
<
workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
auto
unit
=
infiniSizeOf
(
_dtype
);
for
(
size_t
i
=
0
;
i
<
_info
.
batch
;
++
i
)
{
...
...
src/infiniop/ops/gemm/kunlun/gemm_kunlun.cc
View file @
9b32b4b1
...
...
@@ -62,7 +62,7 @@ infiniStatus_t calculate(
(
kunlunStream_t
)
stream
,
[
&
](
xdnnHandle_t
handle
)
{
for
(
size_t
i
=
0
;
i
<
info
.
batch
;
i
++
)
{
CHECK_
XDN
N
((
xdnn
::
fc_fusion
<
Tdata
,
Tdata
,
Tdata
,
int16_t
>
(
CHECK_
KUNLU
N
((
xdnn
::
fc_fusion
<
Tdata
,
Tdata
,
Tdata
,
int16_t
>
(
handle
,
(
Tdata
*
)((
char
*
)
a
+
i
*
info
.
a_matrix
.
stride
*
unit
),
(
Tdata
*
)((
char
*
)
b
+
i
*
info
.
b_matrix
.
stride
*
unit
),
...
...
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc
View file @
9b32b4b1
#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../
../../devices/cpu/cpu_handle
.h"
#include "
../../../tens
or.h"
#include "../
info
.h"
#include "
infinic
or
e
.h"
#include <algorithm>
namespace
op
::
random_sample
::
cpu
{
...
...
@@ -15,29 +15,14 @@ infiniStatus_t Descriptor::create(
infiniopTensorDescriptor_t
probs_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
cpu
::
Handle
*>
(
handle_
);
auto
dt_i
=
result_desc
->
dtype
();
auto
dt_p
=
probs_desc
->
dtype
();
CHECK_DTYPE
(
dt_i
,
INFINI_DTYPE_U8
,
INFINI_DTYPE_U16
,
INFINI_DTYPE_U32
,
INFINI_DTYPE_U64
,
INFINI_DTYPE_I8
,
INFINI_DTYPE_I16
,
INFINI_DTYPE_I32
,
INFINI_DTYPE_I64
);
CHECK_DTYPE
(
dt_p
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_API_OR
(
result_desc
->
ndim
(),
0
,
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_API_OR
(
probs_desc
->
ndim
(),
1
,
return
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_API_OR
(
probs_desc
->
stride
(
0
),
1
,
return
INFINI_STATUS_BAD_TENSOR_STRIDES
);
auto
result
=
RandomSampleInfo
::
create
(
result_desc
,
probs_desc
);
CHECK_RESULT
(
result
);
*
desc_ptr
=
new
Descriptor
(
dt_i
,
dt_p
,
probs_desc
->
dim
(
0
),
result
.
take
(),
0
,
nullptr
,
handle
->
device
,
handle
->
device_id
);
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -55,36 +40,42 @@ struct ComputeType<fp16_t> {
using
type
=
float
;
};
template
<
class
Tidx
,
class
Tval
>
struct
Scheme
{
using
Tcompute
=
typename
ComputeType
<
Tval
>::
type
;
struct
Algo
{
static
Tcompute
get
(
void
const
*
ptr
,
size_t
i
)
{
return
utils
::
cast
<
Tcompute
,
Tval
>
(
reinterpret_cast
<
Tval
const
*>
(
ptr
)[
i
]);
template
<
class
Tidx
,
class
Tval
>
static
auto
get
(
void
const
*
ptr
,
size_t
i
)
{
return
utils
::
cast
<
typename
ComputeType
<
Tval
>::
type
,
Tval
>
(
reinterpret_cast
<
Tval
const
*>
(
ptr
)[
i
]);
}
static
void
argmax
(
void
*
result
,
void
const
*
probs
,
size_t
n
)
{
template
<
class
Tidx
,
class
Tval
>
infiniStatus_t
argmax
(
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
size_t
n
,
void
*
stream
)
{
auto
idx
=
reinterpret_cast
<
Tidx
*>
(
result
);
*
idx
=
0
;
auto
max_val
=
get
(
probs
,
0
);
auto
max_val
=
get
<
Tidx
,
Tval
>
(
probs
,
0
);
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
if
(
auto
val
=
get
(
probs
,
i
);
val
>
max_val
)
{
if
(
auto
val
=
get
<
Tidx
,
Tval
>
(
probs
,
i
);
val
>
max_val
)
{
max_val
=
val
;
*
idx
=
static_cast
<
Tidx
>
(
i
);
}
}
return
INFINI_STATUS_SUCCESS
;
}
static
void
random
(
template
<
class
Tidx
,
class
Tval
>
infiniStatus_t
random
(
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
size_t
n
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
)
{
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
{
struct
KVPair
{
Tidx
idx
;
Tcomput
e
val
;
typename
ComputeType
<
Tval
>::
typ
e
val
;
bool
operator
<
(
const
KVPair
&
other
)
const
{
return
val
>
other
.
val
;
...
...
@@ -95,7 +86,7 @@ struct Scheme {
// build & sort
std
::
vector
<
KVPair
>
pairs
(
n
);
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
pairs
[
i
]
=
{
static_cast
<
Tidx
>
(
i
),
get
(
probs
,
i
)};
pairs
[
i
]
=
{
static_cast
<
Tidx
>
(
i
),
get
<
Tidx
,
Tval
>
(
probs
,
i
)};
}
std
::
sort
(
pairs
.
begin
(),
pairs
.
end
());
// softmax & sum
...
...
@@ -115,68 +106,10 @@ struct Scheme {
break
;
}
}
}
};
template
<
class
Tidx
,
class
Tval
>
void
switch_f
(
size_t
n
,
void
*
result
,
const
void
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
)
{
if
(
random_val
==
0
||
topp
==
0
||
topk
==
1
||
temperature
==
0
)
{
Scheme
<
Tidx
,
Tval
>::
argmax
(
result
,
probs
,
n
);
}
else
{
Scheme
<
Tidx
,
Tval
>::
random
(
result
,
probs
,
n
,
random_val
,
topp
,
topk
,
temperature
);
}
}
template
<
class
Tidx
>
void
switch_val
(
infiniDtype_t
dt_p
,
size_t
n
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
)
{
switch
(
dt_p
)
{
case
INFINI_DTYPE_F16
:
switch_f
<
Tidx
,
fp16_t
>
(
n
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
);
break
;
case
INFINI_DTYPE_F32
:
switch_f
<
Tidx
,
float
>
(
n
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
);
break
;
case
INFINI_DTYPE_F64
:
switch_f
<
Tidx
,
double
>
(
n
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
);
break
;
default:
// unreachable
std
::
abort
();
}
}
void
switch_idx
(
infiniDtype_t
dt_i
,
infiniDtype_t
dt_p
,
size_t
n
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
)
{
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>(dt_p, n, result, probs, random_val, topp, topk, temperature); \
break
switch
(
dt_i
)
{
CASE
(
INFINI_DTYPE_I8
,
int8_t
);
CASE
(
INFINI_DTYPE_I16
,
int16_t
);
CASE
(
INFINI_DTYPE_I32
,
int32_t
);
CASE
(
INFINI_DTYPE_I64
,
int64_t
);
CASE
(
INFINI_DTYPE_U8
,
uint8_t
);
CASE
(
INFINI_DTYPE_U16
,
uint16_t
);
CASE
(
INFINI_DTYPE_U32
,
uint32_t
);
CASE
(
INFINI_DTYPE_U64
,
uint64_t
);
default:
// unreachable
std
::
abort
();
return
INFINI_STATUS_SUCCESS
;
}
#undef CASE
}
};
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
...
...
@@ -189,7 +122,11 @@ infiniStatus_t Descriptor::calculate(
float
temperature
,
void
*
stream
)
const
{
switch_idx
(
_dt_i
,
_dt_p
,
_n
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
);
Calculate
::
calculate
<
Algo
>
(
Algo
{},
_info
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
INFINI_STATUS_SUCCESS
;
}
...
...
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu
0 → 100644
View file @
9b32b4b1
#
include
"../../../devices/cuda/cuda_handle.cuh"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"
namespace
op
::
random_sample
::
cuda
{
struct
Descriptor
::
Opaque
{
std
::
shared_ptr
<
device
::
cuda
::
Handle
::
Internal
>
internal
;
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
}
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
result_desc
,
infiniopTensorDescriptor_t
probs_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
cuda
::
Handle
*>
(
handle_
);
auto
result
=
RandomSampleInfo
::
create
(
result_desc
,
probs_desc
);
CHECK_RESULT
(
result
);
auto
info
=
result
.
take
();
size_t
workspace_size
;
#define CASE_P(CASE, Tidx, Tval) \
case CASE: { \
auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n); \
CHECK_RESULT(workspace_result); \
workspace_size = workspace_result.take(); \
} break
#define CASE_I(CASE, Tidx) \
case CASE: \
switch (info.dt_p) { \
CASE_P(INFINI_DTYPE_F16, Tidx, half); \
CASE_P(INFINI_DTYPE_F32, Tidx, float); \
CASE_P(INFINI_DTYPE_F64, Tidx, double); \
default: \
abort(); \
} \
break
switch
(
info
.
dt_i
)
{
CASE_I
(
INFINI_DTYPE_I8
,
int8_t
);
CASE_I
(
INFINI_DTYPE_I16
,
int16_t
);
CASE_I
(
INFINI_DTYPE_I32
,
int32_t
);
CASE_I
(
INFINI_DTYPE_I64
,
int64_t
);
CASE_I
(
INFINI_DTYPE_U8
,
uint8_t
);
CASE_I
(
INFINI_DTYPE_U16
,
uint16_t
);
CASE_I
(
INFINI_DTYPE_U32
,
uint32_t
);
CASE_I
(
INFINI_DTYPE_U64
,
uint64_t
);
default:
abort
();
}
#undef CASE_I
#undef CASE_P
*
desc_ptr
=
new
Descriptor
(
info
,
workspace_size
,
new
Opaque
{
handle
->
internal
()},
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
size_t
Descriptor
::
minWorkspaceSize
()
const
{
return
_min_workspace_size
;
}
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
const
void
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
const
{
if
(
workspace_size
<
_min_workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
auto
block_size
=
_opaque
->
internal
->
blockSizeX
();
Calculate
::
calculate
<
Algo
>
(
Algo
{
block_size
},
_info
,
workspace
,
workspace_size
,
result
,
probs
,
random_val
,
topp
,
topk
,
temperature
,
stream
);
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::random_sample::cuda
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cuh
0 → 100644
View file @
9b32b4b1
#
ifndef
__RANDOM_SAMPLE_CUDA_CUH__
#define __RANDOM_SAMPLE_CUDA_CUH__
#include "../random_sample.h"
DESCRIPTOR
(
cuda
)
#endif // __RANDOM_SAMPLE_CUDA_CUH__
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh
0 → 100644
View file @
9b32b4b1
#
include
"../../../devices/cuda/cuda_kernel_common.cuh"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>
namespace
op
::
random_sample
::
cuda
{
// ↓↓↓ 重新封装 cub api,减少模板参数,方便调用
template
<
class
T
>
static
cudaError
argMax_
(
cub
::
KeyValuePair
<
int
,
T
>
*
kv_pair
,
const
T
*
logits
,
int
n
,
void
*
workspace_ptr
,
size_t
&
workspace_len
,
cudaStream_t
stream
)
{
return
cub
::
DeviceReduce
::
ArgMax
(
workspace_ptr
,
workspace_len
,
logits
,
kv_pair
,
n
,
stream
);
}
template
<
class
Tval
,
class
Tidx
>
static
cudaError
radixSort
(
void
*
workspace_ptr
,
size_t
&
workspace_len
,
const
Tval
*
key_in
,
Tval
*
key_out
,
const
Tidx
*
val_in
,
Tidx
*
val_out
,
int
n
,
cudaStream_t
stream
)
{
return
cub
::
DeviceRadixSort
::
SortPairsDescending
(
workspace_ptr
,
workspace_len
,
key_in
,
key_out
,
val_in
,
val_out
,
n
,
0
,
sizeof
(
Tval
)
*
8
,
stream
);
}
template
<
class
T
>
static
cudaError
inclusiveSum
(
void
*
workspace_ptr
,
size_t
&
workspace_len
,
T
*
data
,
int
n
,
cudaStream_t
stream
)
{
return
cub
::
DeviceScan
::
InclusiveSum
(
workspace_ptr
,
workspace_len
,
data
,
data
,
n
,
stream
);
}
// ↑↑↑ 重新封装 cub api,减少模板参数,方便调用
// ↓↓↓ 计算 workspace
// 地址对齐到 256
static
constexpr
size_t
align256
(
size_t
size
)
{
return
(
size
+
255
)
&
(
~
255
);
}
template
<
class
Tidx
,
class
Tval
>
utils
::
Result
<
size_t
>
calculateWorkspace
(
size_t
n_
)
{
const
auto
n
=
static_cast
<
int
>
(
n_
);
size_t
argmax
;
CHECK_CUDA
(
argMax_
<
Tval
>
(
nullptr
,
nullptr
,
n
,
nullptr
,
argmax
,
nullptr
));
// 前 256 字节用于 kv pair
argmax
+=
256
;
// indices
size_t
size_random
=
align256
(
sizeof
(
Tidx
)
*
n
);
// sorted
size_random
+=
align256
(
sizeof
(
Tval
)
*
n
);
// indices_out
size_random
+=
align256
(
sizeof
(
Tidx
)
*
n
);
// cub device api
size_t
size_radix_sort
;
CHECK_CUDA
((
radixSort
<
Tval
,
Tidx
>
(
nullptr
,
size_radix_sort
,
nullptr
,
nullptr
,
nullptr
,
nullptr
,
n
,
nullptr
)));
size_t
size_inclusive_sum
;
CHECK_CUDA
(
inclusiveSum
<
Tval
>
(
nullptr
,
size_inclusive_sum
,
nullptr
,
n
,
nullptr
));
size_random
+=
cub
::
Max
()(
size_radix_sort
,
size_inclusive_sum
);
return
utils
::
Result
<
size_t
>
(
cub
::
Max
()(
argmax
,
size_random
));
}
// ↑↑↑ 计算 workspace
// ↓↓↓ 通过特化将 fp16_t 转换为 half
template
<
class
Tval
>
struct
CudaTval
{
using
Type
=
Tval
;
};
template
<
>
struct
CudaTval
<
fp16_t
>
{
using
Type
=
half
;
};
// ↑↑↑ 通过特化将 fp16_t 转换为 half
// ↓↓↓ 用于采样过程的小型 kernel
// cuda toolkit 11.x 带的 cub::DeviceReduce::ArgMax 只接受 cub::KeyValuePair<int, Tval> 输出。
// 这个 kernel 用于取出序号
template
<
class
Tidx
,
class
Tval
>
static
__global__
void
castIdx
(
Tidx
*
result
,
const
cub
::
KeyValuePair
<
int
,
Tval
>
*
kv_pair
)
{
*
result
=
kv_pair
->
key
;
}
// 填充排序要求的序号数组
template
<
class
Tidx
>
static
__global__
void
fillIndices
(
Tidx
*
indices
,
int
n
)
{
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
i
<
n
)
{
indices
[
i
]
=
i
;
}
}
// random sample 使用的 softmax 可以简化为一个基本的线性映射
// 由于已经排序,最大值就是第一个数字
// 第一个数字需要被多个 block 读取,不能写
template
<
class
T
>
static
__global__
void
partialSoftmaxKernel
(
T
*
__restrict__
data
,
int
n
,
float
temperature
)
{
int
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
0
<
i
&&
i
<
n
)
{
float
max
=
__ldg
(
data
);
data
[
i
]
=
(
T
)
expf
(((
float
)
data
[
i
]
-
max
)
/
temperature
);
}
}
// 将第一个数字写成 1,即 exp(0)
template
<
class
T
>
static
__global__
void
setSoftmaxMaxKernel
(
T
*
__restrict__
data
)
{
*
data
=
1
;
}
// 直接 for 循环遍历采样
// 这个 kernel 仅用于避免将数据拷贝到 cpu
template
<
class
Tval
,
class
Tidx
>
static
__global__
void
randomSampleKernel
(
Tidx
*
__restrict__
result
,
const
Tval
*
__restrict__
sorted
,
const
Tidx
*
__restrict__
indices_out
,
size_t
n
,
float
random
,
float
topp
,
size_t
topk
)
{
topk
=
cub
::
Min
()(
topk
,
n
);
auto
p
=
(
Tval
)(
random
*
cub
::
Min
()(
topp
*
(
float
)
sorted
[
n
-
1
],
(
float
)
sorted
[
topk
-
1
]));
for
(
size_t
i
=
0
;;
++
i
)
{
if
((
sorted
[
i
])
>=
p
)
{
*
result
=
indices_out
[
i
];
return
;
}
}
}
// ↑↑↑ 用于采样过程的小型 kernel
struct
Algo
{
int
block_size
;
template
<
class
Tidx
,
class
Tval_
>
infiniStatus_t
argmax
(
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
const
void
*
probs
,
size_t
n
,
void
*
stream_
)
const
{
using
Tval
=
typename
CudaTval
<
Tval_
>::
Type
;
auto
stream
=
(
cudaStream_t
)
stream_
;
auto
logits
=
(
Tval
*
)
probs
;
auto
kv_pair
=
(
cub
::
KeyValuePair
<
int
,
Tval
>
*
)
workspace
;
workspace
=
(
void
*
)((
char
*
)
workspace
+
256
);
workspace_size
-=
256
;
argMax_
(
kv_pair
,
logits
,
n
,
workspace
,
workspace_size
,
stream
);
castIdx
<<<
1
,
1
,
0
,
stream
>>>
((
Tidx
*
)
result
,
kv_pair
);
return
INFINI_STATUS_SUCCESS
;
}
template
<
class
Tidx
,
class
Tval_
>
infiniStatus_t
random
(
void
*
workspace_
,
size_t
workspace_size
,
void
*
result_
,
const
void
*
probs
,
size_t
n
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream_
)
const
{
using
Tval
=
typename
CudaTval
<
Tval_
>::
Type
;
auto
stream
=
(
cudaStream_t
)
stream_
;
auto
logits
=
(
Tval
*
)
probs
;
auto
result
=
(
Tidx
*
)
result_
;
auto
workspace
=
reinterpret_cast
<
size_t
>
(
workspace_
);
auto
workspace_end
=
workspace
+
workspace_size
;
auto
indices
=
reinterpret_cast
<
Tidx
*>
(
workspace
);
workspace
+=
align256
(
sizeof
(
Tidx
)
*
n
);
auto
sorted
=
reinterpret_cast
<
Tval
*>
(
workspace
);
workspace
+=
align256
(
sizeof
(
Tval
)
*
n
);
auto
indices_out
=
reinterpret_cast
<
Tidx
*>
(
workspace
);
workspace
+=
align256
(
sizeof
(
Tidx
)
*
n
);
workspace_
=
reinterpret_cast
<
void
*>
(
workspace
);
workspace_size
=
workspace_end
-
workspace
;
auto
block
=
cub
::
Min
()((
size_t
)
block_size
,
n
);
auto
grid
=
(
n
+
block
-
1
)
/
block
;
// sort
fillIndices
<<<
grid
,
block
,
0
,
stream
>>>
(
indices
,
n
);
CHECK_CUDA
(
radixSort
(
workspace_
,
workspace_size
,
logits
,
sorted
,
indices
,
indices_out
,
n
,
stream
));
// softmax
partialSoftmaxKernel
<<<
grid
,
block
,
0
,
stream
>>>
(
sorted
,
n
,
temperature
);
setSoftmaxMaxKernel
<<<
1
,
1
,
0
,
stream
>>>
(
sorted
);
// sum
CHECK_CUDA
(
inclusiveSum
(
workspace_
,
workspace
,
sorted
,
n
,
stream
));
// sample
randomSampleKernel
<<<
1
,
1
,
0
,
stream
>>>
(
result
,
sorted
,
indices_out
,
n
,
random_val
,
topp
,
topk
);
return
INFINI_STATUS_SUCCESS
;
}
};
}
// namespace op::random_sample::cuda
src/infiniop/ops/random_sample/info.h
0 → 100644
View file @
9b32b4b1
#
ifndef
__RANDOM_SAMPLE_INFO_H__
#define __RANDOM_SAMPLE_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
namespace
op
::
random_sample
{
struct
RandomSampleInfo
{
infiniDtype_t
dt_i
,
dt_p
;
size_t
n
;
static
utils
::
Result
<
RandomSampleInfo
>
create
(
infiniopTensorDescriptor_t
result_desc
,
infiniopTensorDescriptor_t
probs_desc
)
{
auto
dt_i
=
result_desc
->
dtype
();
auto
dt_p
=
probs_desc
->
dtype
();
CHECK_DTYPE_ANY_INT
(
dt_i
);
CHECK_DTYPE
(
dt_p
,
INFINI_DTYPE_F16
,
INFINI_DTYPE_F32
,
INFINI_DTYPE_F64
);
CHECK_OR_RETURN
(
result_desc
->
ndim
()
==
0
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_OR_RETURN
(
probs_desc
->
ndim
()
==
1
,
INFINI_STATUS_BAD_TENSOR_SHAPE
);
CHECK_OR_RETURN
(
probs_desc
->
stride
(
0
)
==
1
,
INFINI_STATUS_BAD_TENSOR_STRIDES
);
return
utils
::
Result
<
RandomSampleInfo
>
({
dt_i
,
dt_p
,
probs_desc
->
dim
(
0
)});
}
};
}
// namespace op::random_sample
#endif // __RANDOM_SAMPLE_INFO_H__
src/infiniop/ops/random_sample/operator.cc
View file @
9b32b4b1
...
...
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/random_sample_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/random_sample_cuda.cuh"
#endif
__C
infiniStatus_t
infiniopCreateRandomSampleDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -25,6 +28,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -38,9 +44,10 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
size_t
*
size
)
{
#define GET(CASE, NAMESPACE) \
case CASE:
\
case CASE:
{
\
using Ptr = const op::random_sample::NAMESPACE::Descriptor *; \
*size = reinterpret_cast<Ptr>(desc)->minWorkspaceSize(); \
} \
return INFINI_STATUS_SUCCESS
switch
(
desc
->
device_type
)
{
...
...
@@ -48,6 +55,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_CPU_API
GET
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
GET
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -82,6 +92,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
@@ -103,6 +116,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
#endif
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/random_sample/random_sample.h
View file @
9b32b4b1
#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__
#include "../../../utils.h"
#include "../../operator.h"
#include "info.h"
#define DESCRIPTOR(NAMESPACE) \
\
...
...
@@ -11,22 +11,18 @@
struct Opaque; \
Opaque *_opaque; \
\
infiniDtype_t _dt_i, _dt_p;
\
size_t
_n,
_min_workspace_size; \
RandomSampleInfo _info;
\
size_t _min_workspace_size;
\
\
Descriptor( \
infiniDtype_t dt_i, \
infiniDtype_t dt_p, \
size_t n, \
RandomSampleInfo info, \
size_t min_workspace_size, \
Opaque *opaque, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_dt_i(dt_i), \
_dt_p(dt_p), \
_n(n), \
_info(info), \
_min_workspace_size(min_workspace_size) {} \
\
public: \
...
...
@@ -53,4 +49,96 @@
}; \
}
namespace
op
::
random_sample
{
struct
CalculateArgs
{
void
*
workspace
;
size_t
workspace_size
;
void
*
result
;
const
void
*
probs
;
float
random_val
,
topp
,
temperature
;
int
topk
;
void
*
stream
;
};
class
Calculate
{
template
<
class
Tidx
,
class
Tval
,
class
Algo
>
static
void
switch_f
(
Algo
algo
,
size_t
n
,
CalculateArgs
args
)
{
if
(
args
.
random_val
==
0
||
args
.
topp
==
0
||
args
.
topk
==
1
||
args
.
temperature
==
0
)
{
algo
.
template
argmax
<
Tidx
,
Tval
>(
args
.
workspace
,
args
.
workspace_size
,
args
.
result
,
args
.
probs
,
n
,
args
.
stream
);
}
else
{
algo
.
template
random
<
Tidx
,
Tval
>(
args
.
workspace
,
args
.
workspace_size
,
args
.
result
,
args
.
probs
,
n
,
args
.
random_val
,
args
.
topp
,
args
.
topk
,
args
.
temperature
,
args
.
stream
);
}
}
template
<
class
Tidx
,
class
Algo
>
static
void
switch_val
(
Algo
algo
,
infiniDtype_t
dt_p
,
size_t
n
,
CalculateArgs
args
)
{
switch
(
dt_p
)
{
case
INFINI_DTYPE_F16
:
switch_f
<
Tidx
,
fp16_t
>
(
algo
,
n
,
args
);
break
;
case
INFINI_DTYPE_F32
:
switch_f
<
Tidx
,
float
>
(
algo
,
n
,
args
);
break
;
case
INFINI_DTYPE_F64
:
switch_f
<
Tidx
,
double
>
(
algo
,
n
,
args
);
break
;
default:
// unreachable
std
::
abort
();
}
}
public:
template
<
class
Algo
>
static
infiniStatus_t
calculate
(
Algo
algo
,
RandomSampleInfo
info
,
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
const
void
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
{
#define CASE(DT_VAL, DT_TYP) \
case DT_VAL: \
switch_val<DT_TYP>( \
algo, info.dt_p, info.n, \
{workspace, workspace_size, \
result, probs, \
random_val, topp, temperature, topk, \
stream}); \
break
switch
(
info
.
dt_i
)
{
CASE
(
INFINI_DTYPE_I8
,
int8_t
);
CASE
(
INFINI_DTYPE_I16
,
int16_t
);
CASE
(
INFINI_DTYPE_I32
,
int32_t
);
CASE
(
INFINI_DTYPE_I64
,
int64_t
);
CASE
(
INFINI_DTYPE_U8
,
uint8_t
);
CASE
(
INFINI_DTYPE_U16
,
uint16_t
);
CASE
(
INFINI_DTYPE_U32
,
uint32_t
);
CASE
(
INFINI_DTYPE_U64
,
uint64_t
);
default:
// unreachable
std
::
abort
();
}
#undef CASE
return
INFINI_STATUS_SUCCESS
;
}
};
}
// namespace op::random_sample
#endif // __RANDOM_SAMPLE_H__
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc
0 → 100644
View file @
9b32b4b1
#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>
namespace
op
::
rearrange
::
ascend
{
struct
Descriptor
::
Opaque
{
aclnnTensorDescriptor_t
dst
;
aclnnTensorDescriptor_t
src
;
void
*
workspace
;
// aclnnInplaceCopy workspace
uint64_t
workspace_size
;
~
Opaque
()
{
delete
dst
;
delete
src
;
aclrtFree
(
workspace
);
}
};
Descriptor
::~
Descriptor
()
{
delete
_opaque
;
};
/// Builds a rearrange descriptor for Ascend.
/// Validates that x and y agree in dtype, rank and shape, precomputes the
/// rearrange metadata, creates ACL tensor descriptors and queries (and, if
/// needed, allocates) the aclnnInplaceCopy workspace.
/// @param handle_  infiniop handle (must wrap a device::ascend::Handle).
/// @param desc_ptr receives the newly created descriptor on success.
/// @param y_desc   destination tensor descriptor.
/// @param x_desc   source tensor descriptor (same dtype/shape as y, strides may differ).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    auto shape = y_desc->shape();

    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < ndim; ++i) {
        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
    }

    auto dst_strides = y_desc->strides();
    auto src_strides = x_desc->strides();
    auto element_size = infiniSizeOf(dtype);
    auto result = utils::RearrangeMeta::create(
        shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
    CHECK_RESULT(result);

    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    void *workspace = nullptr;
    // BUG FIX: the return status of this call was previously ignored;
    // every other ACL call in this file goes through CHECK_ACL.
    // NOTE(review): on failure dst/src leak here (pre-existing pattern in
    // this file) — consider RAII wrappers for aclnnTensorDescriptor.
    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(
        dst->tensor, src->tensor, &workspace_size, &executor));
    if (workspace_size != 0) {
        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    }

    *desc_ptr = new Descriptor(
        result.take(),
        new Opaque{dst, src, workspace, workspace_size},
        handle->device,
        handle->device_id);

    // Delete useless executor: calculate() creates a fresh one per call.
    aclDestroyAclOpExecutor(executor);
    return INFINI_STATUS_SUCCESS;
}
/// Executes the rearrange (strided copy) of x into y on the given stream.
infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {
    auto tdst = _opaque->dst->tensor;
    auto tsrc = _opaque->src->tensor;

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    // BUG FIX: the executor must exist before tensor addresses can be bound
    // to it. Previously AclSetTensorAddr was called while `executor` was
    // still nullptr and the GetWorkspaceSize call only created it afterwards,
    // so the address bindings were lost. Create the executor first (the
    // workspace size it reports is discarded; the size cached at create()
    // time is used for execution).
    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
    AclSetTensorAddr(executor, 0, tdst, y);
    AclSetTensorAddr(executor, 1, tsrc, const_cast<void *>(x));

    // Execute InplaceCopy
    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size, executor, stream));
    return INFINI_STATUS_SUCCESS;
}
}
// namespace op::rearrange::ascend
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h
0 → 100644
View file @
9b32b4b1
#ifndef __REARRANGE_ASCEND_H__
// BUG FIX: the guard macro was misspelled "__REARRANGE_ASCNED_H__", so the
// #ifndef check never became false and the header was not actually guarded
// against multiple inclusion.
#define __REARRANGE_ASCEND_H__

#include "../rearrange.h"

// Declares op::rearrange::ascend::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(ascend)

#endif // __REARRANGE_ASCEND_H__
src/infiniop/ops/rearrange/operator.cc
View file @
9b32b4b1
...
...
@@ -5,6 +5,9 @@
#ifdef ENABLE_CPU_API
#include "cpu/rearrange_cpu.h"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rearrange_ascend.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/rearrange_cuda.cuh"
...
...
@@ -29,6 +32,9 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
#ifdef ENABLE_CPU_API
CREATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_CUDA_API
CREATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
...
...
@@ -57,6 +63,9 @@ __C infiniStatus_t infiniopRearrange(
#ifdef ENABLE_CPU_API
CALCULATE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_CUDA_API
CALCULATE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
...
...
@@ -82,6 +91,9 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
#ifdef ENABLE_CPU_API
DELETE
(
INFINI_DEVICE_CPU
,
cpu
);
#endif
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_CUDA_API
DELETE
(
INFINI_DEVICE_NVIDIA
,
cuda
);
...
...
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc
View file @
9b32b4b1
...
...
@@ -10,12 +10,15 @@ struct Descriptor::Opaque {
aclnnTensorDescriptor_t
w
;
aclnnTensorDescriptor_t
rstd
;
size_t
workspaceSize
;
aclOpExecutor
*
executor
;
~
Opaque
()
{
delete
y
;
delete
x
;
delete
w
;
delete
rstd
;
aclDestroyAclOpExecutor
(
executor
);
}
};
...
...
@@ -62,17 +65,16 @@ infiniStatus_t Descriptor::create(
// Get WorkspaceSize and set executor
CHECK_ACL
(
aclnnRmsNormGetWorkspaceSize
(
tx
,
tw
,
static_cast
<
double
>
(
epsilon
),
ty
,
trstd
,
&
workspace_size
,
&
executor
));
aclSetAclOpExecutorRepeatable
(
executor
);
auto
handle_ascend
=
reinterpret_cast
<
device
::
ascend
::
Handle
*>
(
handle
);
size_t
all_workspace_size
=
workspace_size
+
rstd
->
numel
()
*
aclDataTypeSize
(
rstd
->
dataType
);
*
desc_ptr
=
new
Descriptor
(
new
Opaque
{
y
,
x
,
w
,
rstd
,
workspace_size
},
new
Opaque
{
y
,
x
,
w
,
rstd
,
workspace_size
,
executor
},
std
::
move
(
info
),
all_workspace_size
,
handle_ascend
->
device
,
handle_ascend
->
device_id
);
aclDestroyAclOpExecutor
(
executor
);
return
INFINI_STATUS_SUCCESS
;
}
...
...
@@ -88,21 +90,16 @@ infiniStatus_t Descriptor::calculate(
auto
tx
=
_opaque
->
x
->
tensor
;
auto
ty
=
_opaque
->
y
->
tensor
;
auto
trstd
=
_opaque
->
rstd
->
tensor
;
size_t
workspace_size_
=
0
;
aclOpExecutor
*
executor
=
nullptr
;
CHECK_ACL
(
aclnnRmsNormGetWorkspaceSize
(
tx
,
tw
,
static_cast
<
double
>
(
_info
.
epsilon
),
ty
,
trstd
,
&
workspace_size_
,
&
executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
executor
));
void
*
rstdPtr
=
(
void
*
)((
uint8_t
*
)
workspace
+
_opaque
->
workspaceSize
);
auto
unit
=
infiniSizeOf
(
_info
.
atype
);
AclSetTensorAddr
(
executor
,
1
,
tw
,
(
void
*
)
w
);
AclSetTensorAddr
(
executor
,
3
,
trstd
,
rstdPtr
);
AclSetTensorAddr
(
_opaque
->
executor
,
1
,
tw
,
(
void
*
)
w
);
AclSetTensorAddr
(
_opaque
->
executor
,
3
,
trstd
,
rstdPtr
);
for
(
size_t
i
=
0
;
i
<
(
_info
.
shape
)[
0
];
++
i
)
{
AclSetTensorAddr
(
executor
,
0
,
tx
,
((
char
*
)
x
)
+
i
*
(
_info
.
x_strides
)[
0
]
*
unit
);
AclSetTensorAddr
(
executor
,
2
,
ty
,
((
char
*
)
y
)
+
i
*
(
_info
.
y_strides
)[
0
]
*
unit
);
CHECK_ACL
(
aclnnRmsNorm
(
workspace
,
_opaque
->
workspaceSize
,
executor
,
stream
));
AclSetTensorAddr
(
_opaque
->
executor
,
0
,
tx
,
((
char
*
)
x
)
+
i
*
(
_info
.
x_strides
)[
0
]
*
unit
);
AclSetTensorAddr
(
_opaque
->
executor
,
2
,
ty
,
((
char
*
)
y
)
+
i
*
(
_info
.
y_strides
)[
0
]
*
unit
);
CHECK_ACL
(
aclnnRmsNorm
(
workspace
,
_opaque
->
workspaceSize
,
_opaque
->
executor
,
stream
));
}
return
INFINI_STATUS_SUCCESS
;
}
...
...
src/infiniop/ops/rms_norm/kunlun/rms_norm_kernel.xpu
View file @
9b32b4b1
#ifndef __RMS_NORM_KUNLUN_KERNEL_XPU__
#define __RMS_NORM_KUNLUN_KERNEL_XPU__
#include "../../../devices/kunlun/kunlun_common.h"
#include "../../../devices/kunlun/kunlun_
kernel_
common.h"
#include "../../../reduce/kunlun/reduce_kunlun.h"
using namespace device::kunlun::kernel;
// Element wise mul used in x * w
static inline __device__ void elementwiseMulRms(float *x, float *w, float *y, int count, float rms) {
int remain = count % 16;
...
...
src/infiniop/ops/rope/ascend/rope_ascend.cc
0 → 100644
View file @
9b32b4b1
#include "rope_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace
op
::
rope
::
ascend
{
// No opaque device state is allocated for the Ascend RoPE descriptor
// (create() passes nullptr), so the defaulted destructor is sufficient.
Descriptor::~Descriptor() = default;
/// Creates a RoPE descriptor for Ascend.
/// Validates the tensor descriptors via RoPEInfo and stores the resulting
/// metadata; the Ascend kernel needs no extra workspace or opaque state.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc,
    infiniopTensorDescriptor_t pos_desc,
    infiniopTensorDescriptor_t sin_desc,
    infiniopTensorDescriptor_t cos_desc) {
    auto ascend_handle = reinterpret_cast<device::ascend::Handle *>(handle);

    auto info_result = RoPEInfo::createRoPEInfo(y_desc, x_desc, pos_desc, sin_desc, cos_desc);
    CHECK_RESULT(info_result);

    // The kernel operates fully in-place on device buffers; no workspace.
    constexpr size_t workspace_size = 0;
    *desc_ptr = new Descriptor(
        info_result.take(),
        workspace_size,
        nullptr,
        ascend_handle->device,
        ascend_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
/// Runs the Ascend RoPE kernel on the given stream.
/// Only F16/F32 data tensors are accepted; all geometry and stride
/// information comes from the metadata captured at create() time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *y,
    const void *x,
    const void *pos_ids,
    const void *sin_table,
    const void *cos_table,
    void *stream) const {
    CHECK_DTYPE(_info.data_type, INFINI_DTYPE_F32, INFINI_DTYPE_F16);

    // The launcher takes non-const pointers; the inputs are only read.
    return rope_kernel_launch(
        y,
        const_cast<void *>(x),
        const_cast<void *>(pos_ids),
        const_cast<void *>(sin_table),
        const_cast<void *>(cos_table),
        _info.seqlen,
        _info.nhead,
        _info.dhead,
        _info.data_type,
        _info.pos_type,
        _info.y_stride_seqlen,
        _info.y_stride_nhead,
        _info.x_stride_seqlen,
        _info.x_stride_nhead,
        stream);
}
}
// namespace op::rope::ascend
src/infiniop/ops/rope/ascend/rope_ascend.h
0 → 100644
View file @
9b32b4b1
#ifndef __ACLNN_ROPE_H__
#define __ACLNN_ROPE_H__

#include "../rope.h"

// Entry point into the AscendC RoPE kernel (defined in
// rope_ascend_kernel.cpp, compiled by the device toolchain — hence the
// C linkage).
//
// y / x           : output / input activations, laid out [seq_len, nhead, dhead]
//                   with the given seq/head strides (innermost dim contiguous).
// pos             : position-id vector (element type given by pos_type).
// sin / cos       : precomputed rotation tables indexed by position.
// data_type       : element type of y/x/sin/cos (F16 or F32 — see launcher).
// pos_type        : integer dtype of pos (dispatched inside the kernel).
// *_stride_*      : strides in elements, not bytes.
// stream          : ACL stream to launch on.
extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t data_type,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream);

// Declares op::rope::ascend::Descriptor via the shared DESCRIPTOR macro.
DESCRIPTOR(ascend)

#endif // __ACLNN_ROPE_H__
src/infiniop/ops/rope/ascend/rope_ascend_kernel.cpp
0 → 100644
View file @
9b32b4b1
#include "../../../devices/ascend/ascend_kernel_common.h"
using
namespace
AscendC
;
// AscendC kernel implementing rotary position embedding (RoPE) on one
// [seq_len, nhead, dhead] tensor.
// T = element type of x/y/sin/cos; U = integer type of the position ids.
template <typename T, typename U>
class RoPEKernel {
public:
    __aicore__ inline RoPEKernel() {}
    // Init op
    // pos position vector
    // x input tensor
    // y output tensor
    // tensor shape [nt, nh, dh]
    // make block_num = nh, tile_len = dh
    // (each AI core handles one head; one tile = one head's dhead slice)
    __aicore__ inline void init(GM_ADDR y, GM_ADDR x, GM_ADDR pos, GM_ADDR sin, GM_ADDR cos,
                                size_t dh,
                                ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                ptrdiff_t st_xnt, ptrdiff_t st_xnh);
    // Iterates over all seq_len positions, running the copyIn/compute/copyOut
    // pipeline once per position.
    __aicore__ inline void process(size_t seq_len);

private:
    // Copy a tile into UB
    __aicore__ inline void copyIn(size_t i);
    // Applies the rotation to the tile staged in UB.
    __aicore__ inline void compute(size_t i);
    // Writes the rotated tile back to global memory.
    __aicore__ inline void copyOut(size_t i);

private:
    TPipe pipe;
    // Input queues: activation tile plus the sin/cos rows for this position.
    TQue<QuePosition::VECIN, BUFFER_NUM> _in_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _sin_que;
    TQue<QuePosition::VECIN, BUFFER_NUM> _cos_que;
    TQue<QuePosition::VECOUT, BUFFER_NUM> _out_que;
    // Scratch buffers for the de-interleaved odd/even lanes and their
    // intermediate products (each holds _tile_len / 2 elements).
    TBuf<TPosition::VECCALC> _tmp_odd_buf;
    TBuf<TPosition::VECCALC> _tmp_even_buf;
    TBuf<TPosition::VECCALC> _tmp_odd_buf1;
    TBuf<TPosition::VECCALC> _tmp_odd_buf2;
    TBuf<TPosition::VECCALC> _tmp_even_buf1;
    TBuf<TPosition::VECCALC> _tmp_even_buf2;
    GlobalTensor<T> _x_gm, _y_gm;
    GlobalTensor<U> _p_gm;
    GlobalTensor<T> _sin_gm;
    GlobalTensor<T> _cos_gm;
    size_t _block_idx;      // this core's head index (GetBlockIdx())
    size_t _tile_len;       // dh: elements per head
    size_t _copy_len;       // dh rounded up for aligned DataCopy
    size_t _half_copy_len;  // copy length for the sin/cos rows
    // stridey[_st_ynt, _st_ynh, 1]
    ptrdiff_t _st_ynt;
    ptrdiff_t _st_ynh;
    // stridex[_st_xnt, _st_xnh, 1]
    ptrdiff_t _st_xnt;
    ptrdiff_t _st_xnh;
};
// Binds global buffers, captures strides, and carves out all UB queue and
// scratch storage. Must be called once per core before process().
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::init(GM_ADDR y, GM_ADDR x, GM_ADDR pos, GM_ADDR sin, GM_ADDR cos,
                                              size_t dh,
                                              ptrdiff_t st_ynt, ptrdiff_t st_ynh,
                                              ptrdiff_t st_xnt, ptrdiff_t st_xnh) {
    this->_tile_len = dh;
    this->_st_ynt = st_ynt;
    this->_st_ynh = st_ynh;
    this->_st_xnt = st_xnt;
    this->_st_xnh = st_xnh;
    _copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    // NOTE(review): _half_copy_len is computed from dh exactly like
    // _copy_len, yet copyIn uses it to transfer the half-length (dh / 2)
    // sin/cos rows — confirm whether alignTileLen<T>(dh / 2, BYTE_ALIGN)
    // was intended, as the current value copies past the row's end.
    _half_copy_len = alignTileLen<T>(dh, BYTE_ALIGN);
    _block_idx = GetBlockIdx();
    // Init global buffer
    _x_gm.SetGlobalBuffer((__gm__ T *)x);
    _p_gm.SetGlobalBuffer((__gm__ U *)pos);
    _sin_gm.SetGlobalBuffer((__gm__ T *)sin);
    _cos_gm.SetGlobalBuffer((__gm__ T *)cos);
    _y_gm.SetGlobalBuffer((__gm__ T *)y);
    // Init Queue buffer
    pipe.InitBuffer(_in_que, BUFFER_NUM, _copy_len * sizeof(T));
    pipe.InitBuffer(_out_que, BUFFER_NUM, _tile_len * sizeof(T));
    pipe.InitBuffer(_sin_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    pipe.InitBuffer(_cos_que, BUFFER_NUM, _half_copy_len * sizeof(T));
    // Six half-tile scratch buffers used by compute() for the odd/even
    // lanes and their sin/cos products.
    pipe.InitBuffer(_tmp_odd_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_odd_buf2, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf1, _tile_len / 2 * sizeof(T));
    pipe.InitBuffer(_tmp_even_buf2, _tile_len / 2 * sizeof(T));
}
// Stage position i's inputs into UB: this core's head slice of x, and the
// sin/cos rows selected by the position id read from _p_gm.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyIn(size_t i) {
    LocalTensor<T> input_ub = _in_que.AllocTensor<T>();
    LocalTensor<T> sin_ub = _sin_que.AllocTensor<T>();
    LocalTensor<T> cos_ub = _cos_que.AllocTensor<T>();
    // Get idx of current tile in total input
    auto idx = i * _st_xnt + _block_idx * _st_xnh;
    // Copy tile current tile into UB
    DataCopy(input_ub, _x_gm[idx], _copy_len);
    // Copy sin cos tile
    // (tables are presumably laid out as rows of _tile_len / 2 entries per
    // position — confirm against the table builder)
    auto pos_idx = _p_gm(i);
    DataCopy(sin_ub, _sin_gm[pos_idx * _tile_len / 2], _half_copy_len);
    DataCopy(cos_ub, _cos_gm[pos_idx * _tile_len / 2], _half_copy_len);
    // Push in operands
    _in_que.EnQue(input_ub);
    _sin_que.EnQue(sin_ub);
    _cos_que.EnQue(cos_ub);
}
// Rotate one tile: de-interleave the input into odd/even lanes, apply
//   y_odd  = x_odd * cos - x_even * sin
//   y_even = x_odd * sin + x_even * cos
// and re-interleave into the output tile.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::compute(size_t i) {
    LocalTensor<T> input_ub = _in_que.DeQue<T>();
    LocalTensor<T> sin_ub = _sin_que.DeQue<T>();
    LocalTensor<T> cos_ub = _cos_que.DeQue<T>();
    LocalTensor<T> output_ub = _out_que.AllocTensor<T>();
    LocalTensor<T> tmp_odd = _tmp_odd_buf.Get<T>();
    LocalTensor<T> tmp_even = _tmp_even_buf.Get<T>();
    LocalTensor<T> tmp_odd1 = _tmp_odd_buf1.Get<T>();
    LocalTensor<T> tmp_odd2 = _tmp_odd_buf2.Get<T>();
    LocalTensor<T> tmp_even1 = _tmp_even_buf1.Get<T>();
    LocalTensor<T> tmp_even2 = _tmp_even_buf2.Get<T>();
    // separate odd and even bit elements
    uint64_t rsvdCnt = 0;
    GatherMaskParams gMaskParams = {
        1,
        static_cast<uint16_t>((_tile_len * sizeof(T) + 255) / 256),
        // no more than 256(<=255)
        8,
        8,
    };
    // pattern 1 selects even-indexed lanes, pattern 2 odd-indexed lanes
    GatherMask<T>(tmp_odd, input_ub, 1, false, 0, gMaskParams, rsvdCnt);
    GatherMask<T>(tmp_even, input_ub, 2, false, 0, gMaskParams, rsvdCnt);
    PipeBarrier<PIPE_V>(); // gathers must finish before the multiplies
    // compute odd bit elements
    // y_odd = x_odd * cos - x_even * sin
    Mul<T>(tmp_odd1, tmp_odd, cos_ub, _tile_len / 2);
    Mul<T>(tmp_odd2, tmp_even, sin_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>(); // products ready before the subtract
    Sub<T>(tmp_odd1, tmp_odd1, tmp_odd2, _tile_len / 2);
    // compute even bit elements
    // y_even = x_odd * sin + x_even * cos
    Mul<T>(tmp_even1, tmp_odd, sin_ub, _tile_len / 2);
    Mul<T>(tmp_even2, tmp_even, cos_ub, _tile_len / 2);
    PipeBarrier<PIPE_V>(); // products ready before the add
    Add<T>(tmp_even1, tmp_even1, tmp_even2, _tile_len / 2);
    // combine odd and even bit elements
    // (scalar re-interleave: element j of each half goes to 2j / 2j+1)
    for (uint32_t j = 0; j < _tile_len / 2; j += 1) {
        output_ub(j * 2) = tmp_odd1(j);
        output_ub(j * 2 + 1) = tmp_even1(j);
    }
    _out_que.EnQue<T>(output_ub);
    _in_que.FreeTensor(input_ub);
    _sin_que.FreeTensor(sin_ub);
    _cos_que.FreeTensor(cos_ub);
}
// Write the rotated tile for position i back to this core's head slice of y.
// DataCopyPad is used so exactly _tile_len elements are written even when
// the UB copy length was padded up for alignment.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::copyOut(size_t i) {
    LocalTensor<T> output_ub = _out_que.DeQue<T>();
    auto idy = i * _st_ynt + _block_idx * _st_ynh;
    // {nBurst=1, burstLen in bytes, no src/dst gaps, no padding value}
    DataCopyExtParams params = {1, static_cast<uint32_t>(_tile_len * sizeof(T)), 0, 0, 0};
    DataCopyPad(_y_gm[idy], output_ub, params);
    _out_que.FreeTensor(output_ub);
}
// Drives the three-stage pipeline once per sequence position.
template <typename T, typename U>
__aicore__ inline void RoPEKernel<T, U>::process(size_t seq_len) {
    size_t token = 0;
    while (token < seq_len) {
        copyIn(token);
        compute(token);
        copyOut(token);
        ++token;
    }
}
// Argument pack forwarded from the kernel entry point to RoPEKernel::init.
#define ROPE_KERNEL_INIT_ARGS y, x, pos, sin, cos, dhead, \
                              y_stride_seqlen, y_stride_nhead, \
                              x_stride_seqlen, x_stride_nhead

// Instantiates RoPEKernel<TYPE, POS_T> and runs it for one position dtype.
#define CASE_POSTYPE(POS_TYPE_ENUM, TYPE, POS_T) \
    case POS_TYPE_ENUM: { \
        RoPEKernel<TYPE, POS_T> op; \
        op.init(ROPE_KERNEL_INIT_ARGS); \
        op.process(seq_len); \
        break; \
    }

// Runtime dispatch on the position-id dtype.
// NOTE(review): the default branch silently does nothing for an unsupported
// position dtype — confirm this is intended (host-side validation may be
// relied upon).
#define ROPE_KERNEL(TYPE, POSTYPE) \
    switch (POSTYPE) { \
        CASE_POSTYPE(INFINI_DTYPE_I8, TYPE, int8_t) \
        CASE_POSTYPE(INFINI_DTYPE_I16, TYPE, int16_t) \
        CASE_POSTYPE(INFINI_DTYPE_I32, TYPE, int32_t) \
        CASE_POSTYPE(INFINI_DTYPE_I64, TYPE, int64_t) \
        CASE_POSTYPE(INFINI_DTYPE_U8, TYPE, uint8_t) \
        CASE_POSTYPE(INFINI_DTYPE_U16, TYPE, uint16_t) \
        CASE_POSTYPE(INFINI_DTYPE_U32, TYPE, uint32_t) \
        CASE_POSTYPE(INFINI_DTYPE_U64, TYPE, uint64_t) \
    default: \
        break; \
    }

// Defines one __global__ kernel entry point per data dtype.
#define DEFINE_ROPE_KERNEL(KERNEL_NAME, TYPE) \
    __global__ __aicore__ void KERNEL_NAME(GM_ADDR y, \
                                           GM_ADDR x, \
                                           GM_ADDR pos, \
                                           GM_ADDR sin, \
                                           GM_ADDR cos, \
                                           size_t seq_len, \
                                           size_t dhead, \
                                           ptrdiff_t y_stride_seqlen, \
                                           ptrdiff_t y_stride_nhead, \
                                           ptrdiff_t x_stride_seqlen, \
                                           ptrdiff_t x_stride_nhead, \
                                           int32_t pos_type) { \
        ROPE_KERNEL(TYPE, pos_type) \
    }

DEFINE_ROPE_KERNEL(rope_kernel_float, float)
DEFINE_ROPE_KERNEL(rope_kernel_half, half)

#undef DEFINE_ROPE_KERNEL
#undef ROPE_KERNEL
#undef CASE_POSTYPE
#undef ROPE_KERNEL_INIT_ARGS
// Host-side launcher: dispatches on the activation dtype and launches the
// matching kernel with one block per head (block index = head index inside
// the kernel). Returns BAD_TENSOR_DTYPE for unsupported activation dtypes;
// position-dtype dispatch happens on-device.
extern "C" infiniStatus_t rope_kernel_launch(
    void *y,
    void *x,
    void *pos,
    void *sin,
    void *cos,
    size_t seq_len,
    size_t nhead,
    size_t dhead,
    infiniDtype_t dtype,
    infiniDtype_t pos_type,
    ptrdiff_t y_stride_seqlen,
    ptrdiff_t y_stride_nhead,
    ptrdiff_t x_stride_seqlen,
    ptrdiff_t x_stride_nhead,
    void *stream) {
// <<<blockDim=nhead, l2ctrl=nullptr, stream>>> — AscendC launch syntax.
#define LAUNCH_ROPE_KERNEL(DTYPE_ENUM, KERNEL_NAME) \
    case DTYPE_ENUM: \
        KERNEL_NAME<<<nhead, nullptr, stream>>>(y, x, pos, sin, cos, \
                                                seq_len, \
                                                dhead, \
                                                y_stride_seqlen, \
                                                y_stride_nhead, \
                                                x_stride_seqlen, \
                                                x_stride_nhead, \
                                                pos_type); \
        return INFINI_STATUS_SUCCESS;

    switch (dtype) {
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F16, rope_kernel_half)
        LAUNCH_ROPE_KERNEL(INFINI_DTYPE_F32, rope_kernel_float)
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
src/infiniop/ops/rope/operator.cc
View file @
9b32b4b1
...
...
@@ -8,6 +8,9 @@
#ifdef ENABLE_CUDA_API
#include "cuda/rope_cuda.cuh"
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/rope_ascend.h"
#endif
__C
infiniStatus_t
infiniopCreateRoPEDescriptor
(
infiniopHandle_t
handle
,
...
...
@@ -43,12 +46,8 @@ __C infiniStatus_t infiniopCreateRoPEDescriptor(
pos_ids
,
sin_table
,
cos_table
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendCreateRoPEDescriptor
((
AscendHandle_t
)
handle
,
(
RoPEAscendDescriptor_t
*
)
desc_ptr
,
t
,
pos_ids
,
sin_table
,
cos_table
);
}
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
...
...
@@ -90,10 +89,8 @@ __C infiniStatus_t infiniopGetRoPEWorkspaceSize(infiniopRoPEDescriptor_t desc,
return
bangGetRoPEWorkspaceSize
((
RoPEBangDescriptor_t
)
desc
,
size
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendGetRoPEWorkspaceSize
((
RoPEAscendDescriptor_t
)
desc
,
size
);
}
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
...
...
@@ -141,12 +138,8 @@ __C infiniStatus_t infiniopRoPE(
t
,
pos_ids
,
sin_table
,
cos_table
,
stream
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendRoPE
((
RoPEAscendDescriptor_t
)
desc
,
workspace
,
workspace_size
,
t
,
pos_ids
,
sin_table
,
cos_table
,
stream
);
}
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
...
...
@@ -187,10 +180,8 @@ infiniopDestroyRoPEDescriptor(infiniopRoPEDescriptor_t desc) {
return
bangDestroyRoPEDescriptor
((
RoPEBangDescriptor_t
)
desc
);
}
#endif
#ifdef ENABLE_ASCEND_NPU
case
DevAscendNpu
:
{
return
ascendDestroyRoPEDescriptor
((
RoPEAscendDescriptor_t
)
desc
);
}
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
#ifdef ENABLE_METAX_GPU
case
DevMetaxGpu
:
{
...
...
src/infiniop/ops/swiglu/ascend/swiglu_ascend.cc
0 → 100644
View file @
9b32b4b1
#include "swiglu_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
namespace
op
::
swiglu
::
ascend
{
// No opaque device state is allocated for the Ascend SwiGLU descriptor,
// so the defaulted destructor is sufficient.
Descriptor::~Descriptor() = default;
/// Creates a SwiGLU descriptor for Ascend.
/// Validates the output dtype (F16/F32) and the two input descriptors, then
/// captures the SwigluInfo metadata; the AscendC kernel needs no workspace
/// (see the linked CANN API note).
/// @param input_descs must contain exactly two descriptors: a (up) and b (gate).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    std::vector<infiniopTensorDescriptor_t> input_descs) {
    auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
    auto dtype = c_desc->dtype();
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32);

    // BUG FIX: guard before indexing — input_descs[0]/[1] on a shorter
    // vector is undefined behavior.
    if (input_descs.size() != 2) {
        return INFINI_STATUS_BAD_TENSOR_SHAPE;
    }
    const auto &a_desc = input_descs[0];
    const auto &b_desc = input_descs[1];
    auto result = SwigluInfo::create(c_desc, a_desc, b_desc);
    CHECK_RESULT(result);
    SwigluInfo info = result.take();

    // https://www.hiascend.com/document/detail/zh/canncommercial/800/apiref/ascendcopapi/atlasascendc_api_07_0777.html
    size_t workspace_size = 0;
    *desc_ptr = new Descriptor(
        std::move(info),
        workspace_size,
        handle_ascend->device,
        handle_ascend->device_id);
    return INFINI_STATUS_SUCCESS;
}
/// Runs the Ascend SwiGLU kernel on the given stream.
/// Supports 2-D [seq, hidden] and 3-D [batch, seq, hidden] layouts; for the
/// 2-D case the batch dimension collapses to 1 and the batch strides are
/// irrelevant placeholders.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *c,
    std::vector<const void *> inputs,
    void *stream) const {
    const bool has_batch_dim = _info.ndim != 2;

    auto batch = has_batch_dim ? _info.shape[0] : 1;
    auto seq_len = has_batch_dim ? _info.shape[1] : _info.shape[0];
    auto hidden_size = _info.shape[_info.ndim - 1];

    auto stride_batch_c = has_batch_dim ? _info.c_strides[0] : 1;
    auto stride_batch_a = has_batch_dim ? _info.a_strides[0] : 1;
    auto stride_batch_b = has_batch_dim ? _info.b_strides[0] : 1;
    auto stride_seq_c = has_batch_dim ? _info.c_strides[1] : _info.c_strides[0];
    auto stride_seq_a = has_batch_dim ? _info.a_strides[1] : _info.a_strides[0];
    auto stride_seq_b = has_batch_dim ? _info.b_strides[1] : _info.b_strides[0];

    return swiglu_kernel_launch(
        c,
        const_cast<void *>(inputs[0]),
        const_cast<void *>(inputs[1]),
        _info.dtype,
        batch,
        seq_len,
        hidden_size,
        stride_batch_c,
        stride_batch_a,
        stride_batch_b,
        stride_seq_c,
        stride_seq_a,
        stride_seq_b,
        stream);
}
}
// namespace op::swiglu::ascend
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment