Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
c1fa267c
Commit
c1fa267c
authored
May 20, 2025
by
zhangyunze
Browse files
feat:重构random sample ascend算子
parent
b5c6c7b8
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
381 additions
and
423 deletions
+381
-423
src/infiniop/devices/ascend/CMakeLists.txt
src/infiniop/devices/ascend/CMakeLists.txt
+1
-1
src/infiniop/devices/ascend/ascend_kernel_common.h
src/infiniop/devices/ascend/ascend_kernel_common.h
+1
-0
src/infiniop/ops/random_sample/ascend/random_sample_aclnn.h
src/infiniop/ops/random_sample/ascend/random_sample_aclnn.h
+8
-0
src/infiniop/ops/random_sample/ascend/random_sample_ascend.cc
...infiniop/ops/random_sample/ascend/random_sample_ascend.cc
+0
-148
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
+0
-25
src/infiniop/ops/random_sample/ascend/random_sample_ascend_api.h
...iniop/ops/random_sample/ascend/random_sample_ascend_api.h
+0
-31
src/infiniop/ops/random_sample/ascend/random_sample_kernel.cpp
...nfiniop/ops/random_sample/ascend/random_sample_kernel.cpp
+236
-202
src/infiniop/ops/random_sample/ascend/randomsample_aclnn.cc
src/infiniop/ops/random_sample/ascend/randomsample_aclnn.cc
+110
-0
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+17
-2
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+8
-14
No files found.
src/infiniop/devices/ascend/CMakeLists.txt
View file @
c1fa267c
...
@@ -5,7 +5,7 @@ project(Ascend_C)
...
@@ -5,7 +5,7 @@ project(Ascend_C)
set
(
SOC_VERSION
"Ascend910B3"
CACHE STRING
"system on chip type"
)
set
(
SOC_VERSION
"Ascend910B3"
CACHE STRING
"system on chip type"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
RUN_MODE
"npu"
CACHE STRING
"run mode: npu"
)
set
(
RUN_MODE
"npu"
CACHE STRING
"run mode: npu"
)
set
(
CMAKE_BUILD_TYPE
"
Debug
"
CACHE STRING
"Build type Release/Debug (default Debug)"
FORCE
)
set
(
CMAKE_BUILD_TYPE
"
Release
"
CACHE STRING
"Build type Release/Debug (default Debug)"
FORCE
)
set
(
CMAKE_INSTALL_PREFIX
"
${
CMAKE_CURRENT_LIST_DIR
}
/out"
CACHE STRING
"path for install()"
FORCE
)
set
(
CMAKE_INSTALL_PREFIX
"
${
CMAKE_CURRENT_LIST_DIR
}
/out"
CACHE STRING
"path for install()"
FORCE
)
if
(
EXISTS
${
ASCEND_CANN_PACKAGE_PATH
}
/tools/tikcpp/ascendc_kernel_cmake
)
if
(
EXISTS
${
ASCEND_CANN_PACKAGE_PATH
}
/tools/tikcpp/ascendc_kernel_cmake
)
...
...
src/infiniop/devices/ascend/ascend_kernel_common.h
View file @
c1fa267c
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
constexpr
size_t
BLOCK_NUM
=
8
;
constexpr
size_t
BLOCK_NUM
=
8
;
constexpr
size_t
BUFFER_NUM
=
2
;
constexpr
size_t
BUFFER_NUM
=
2
;
constexpr
size_t
BYTE_ALIGN
=
32
;
constexpr
size_t
BYTE_ALIGN
=
32
;
constexpr
size_t
BLOCK_LEN
=
256
;
template
<
typename
T
>
template
<
typename
T
>
__aicore__
inline
size_t
alignTileLen
(
size_t
tile_len
,
size_t
byte_align
)
{
__aicore__
inline
size_t
alignTileLen
(
size_t
tile_len
,
size_t
byte_align
)
{
...
...
src/infiniop/ops/random_sample/ascend/random_sample_aclnn.h
0 → 100644
View file @
c1fa267c
#ifndef __ACLNN_RANDOM_SAMPLE_H__
#define __ACLNN_RANDOM_SAMPLE_H__
#include "../random_sample.h"
DESCRIPTOR
(
ascend
)
#endif // __ACLNN_RANDOM_SAMPLE_H__
src/infiniop/ops/random_sample/ascend/random_sample_ascend.cc
deleted
100644 → 0
View file @
b5c6c7b8
#include "random_sample_ascend.h"
InfiniopRandomSampleAscendDescriptor
::
InfiniopRandomSampleAscendDescriptor
(
infiniDevice_t
device_
)
{
device
=
device_
;
device_id
=
0
;
pDesc
=
new
aclnnTensorDescriptor
();
topkIdxDesc
=
new
aclnnTensorDescriptor
();
topkValDesc
=
new
aclnnTensorDescriptor
();
resDesc
=
new
aclnnTensorDescriptor
();
}
infiniopStatus_t
ascendCreateRandomSampleDescriptor
(
infiniopAscendHandle_t
handle
,
infiniopRandomSampleAscendDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
result
,
infiniopTensorDescriptor_t
probs
)
{
if
(
probs
->
ndim
!=
1
)
{
return
INFINIOP_STATUS_BAD_TENSOR_SHAPE
;
}
if
(
result
->
ndim
!=
1
&&
result
->
shape
[
0
]
!=
1
)
{
return
INFINIOP_STATUS_BAD_TENSOR_SHAPE
;
}
(
*
desc_ptr
)
=
new
InfiniopRandomSampleAscendDescriptor
(
handle
->
device
);
(
*
desc_ptr
)
->
device_id
=
handle
->
device_id
;
CHECK_STATUS
((
*
desc_ptr
)
->
pDesc
->
fromInfiniOpTensorDescriptor
(
probs
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
((
*
desc_ptr
)
->
resDesc
->
fromInfiniOpTensorDescriptor
(
result
),
INFINIOP_STATUS_SUCCESS
);
// Ascend aclnnTopk doesn't support U64 type
(
*
desc_ptr
)
->
resDesc
->
dataType
=
aclDataType
::
ACL_INT64
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendGetRandomSampleWorkspaceSize
(
infiniopRandomSampleAscendDescriptor_t
desc
,
uint64_t
*
size
)
{
auto
&
pDesc
=
desc
->
pDesc
;
*
size
=
numElements
(
pDesc
->
shape
.
data
(),
pDesc
->
ndim
)
*
aclDataTypeSize
(
pDesc
->
dataType
)
+
numElements
(
pDesc
->
shape
.
data
(),
pDesc
->
ndim
)
*
infiniSizeof
(
infiniDtype_t
::
INFINI_DTYPE_I64
);
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendRandomSample
(
infiniopRandomSampleAscendDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
{
if
(
topk
<=
0
||
topp
<
0
||
topp
>
1.0
)
{
return
INFINIOP_STATUS_BAD_PARAM
;
}
if
(
random_val
<
0
||
random_val
>
1.0
)
{
return
INFINIOP_STATUS_BAD_PARAM
;
}
auto
&
pDesc
=
desc
->
pDesc
;
auto
&
topkIdxDesc
=
desc
->
topkIdxDesc
;
auto
&
topkValDesc
=
desc
->
topkValDesc
;
auto
ndim
=
static_cast
<
int64_t
>
(
pDesc
->
ndim
);
auto
voc
=
pDesc
->
shape
[
0
];
auto
topk_
=
topk
<=
voc
?
topk
:
voc
;
bool
doSample
=
topk_
>
1
&&
temperature
!=
0
&&
topp
!=
0
;
auto
topkShape
=
std
::
vector
<
int64_t
>
(
pDesc
->
shape
);
topkShape
[
ndim
-
1
]
=
doSample
?
topk_
:
1
;
auto
topkStrides
=
std
::
vector
<
int64_t
>
(
pDesc
->
strides
);
// Infer contiguous strides
topkStrides
[
ndim
-
1
]
=
1
;
for
(
int64_t
i
=
ndim
-
2
;
i
>=
0
;
--
i
)
{
topkStrides
[
i
]
=
topkStrides
[
i
+
1
]
*
topkShape
[
i
+
1
];
}
CHECK_STATUS
(
topkValDesc
->
setDescriptor
(
pDesc
->
dataType
,
topkShape
,
topkStrides
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkIdxDesc
->
setDescriptor
(
aclDataType
::
ACL_INT64
,
topkShape
,
topkStrides
),
INFINIOP_STATUS_SUCCESS
);
// Infer data ptr
auto
workspaceTmp
=
workspace
;
auto
topkValAddr
=
workspaceTmp
;
workspaceTmp
=
(
void
*
)((
uint8_t
*
)
workspace
+
numElements
(
topkValDesc
->
shape
.
data
(),
topkValDesc
->
ndim
)
*
aclDataTypeSize
(
topkValDesc
->
dataType
));
auto
topkIdxAddr
=
workspaceTmp
;
auto
pAddr
=
(
void
*
)
probs
;
// Create aclTensor
CHECK_STATUS
(
pDesc
->
createTensor
(
pAddr
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkValDesc
->
createTensor
(
topkValAddr
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkIdxDesc
->
createTensor
(
topkIdxAddr
),
INFINIOP_STATUS_SUCCESS
);
if
(
!
doSample
)
{
CHECK_STATUS
(
desc
->
resDesc
->
createTensor
(
result
),
INFINIOP_STATUS_SUCCESS
);
}
// Do Topk calculate
uint64_t
topkWorkspaceSize
=
0
;
aclOpExecutor
*
topkExecutor
=
nullptr
;
auto
ret
=
aclnnTopkGetWorkspaceSize
(
pDesc
->
t
,
topkShape
[
ndim
-
1
],
ndim
-
1
,
true
,
true
,
topkValDesc
->
t
,
doSample
?
topkIdxDesc
->
t
:
desc
->
resDesc
->
t
,
&
topkWorkspaceSize
,
&
topkExecutor
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnTopkGetWorkspaceSize failed ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
void
*
topkWorkspace
;
CHECK_STATUS
(
mallocWorkspace
(
&
topkWorkspace
,
topkWorkspaceSize
),
INFINIOP_STATUS_SUCCESS
);
ret
=
aclnnTopk
(
topkWorkspace
,
topkWorkspaceSize
,
topkExecutor
,
stream
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnTopk failed ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
CHECK_STATUS
(
freeWorkspace
(
topkWorkspace
),
INFINIOP_STATUS_SUCCESS
);
if
(
doSample
)
{
// Do softmax and topp random sample
random_sample_do
(
pAddr
,
result
,
topkValAddr
,
topkIdxAddr
,
topk
,
static_cast
<
int
>
(
pDesc
->
shape
[
0
]),
topp
,
temperature
,
random_val
,
pDesc
->
dataType
,
stream
);
}
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendDestroyRandomSampleDescriptor
(
infiniopRandomSampleAscendDescriptor_t
desc
)
{
delete
desc
->
pDesc
;
delete
desc
->
topkIdxDesc
;
delete
desc
->
topkValDesc
;
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
}
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
deleted
100644 → 0
View file @
b5c6c7b8
#ifndef __RANDOM_SAMPLE_ASCEND_H__
#define __RANDOM_SAMPLE_ASCEND_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "random_sample_ascend_api.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnnop/aclnn_topk.h>
struct
InfiniopRandomSampleAscendDescriptor
{
infiniDevice_t
device
;
int
device_id
;
aclnnTensorDescriptor_t
pDesc
,
topkValDesc
,
topkIdxDesc
,
resDesc
;
InfiniopRandomSampleAscendDescriptor
(
infiniDevice_t
device_
);
};
extern
"C"
void
random_sample_do
(
void
*
p
,
void
*
res
,
void
*
topkAddr
,
void
*
topkIdxAddr
,
int32_t
topk
,
int32_t
voc
,
float
topp
,
float
temper
,
float
random
,
int
dtype
,
void
*
stream
);
#endif
src/infiniop/ops/random_sample/ascend/random_sample_ascend_api.h
deleted
100644 → 0
View file @
b5c6c7b8
#ifndef __RANDOM_SAMPLE_ASCEND_API_H__
#define __RANDOM_SAMPLE_ASCEND_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct
InfiniopRandomSampleAscendDescriptor
;
typedef
struct
InfiniopRandomSampleAscendDescriptor
*
infiniopRandomSampleAscendDescriptor_t
;
infiniopStatus_t
ascendCreateRandomSampleDescriptor
(
infiniopAscendHandle_t
handle
,
infiniopRandomSampleAscendDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
results
,
infiniopTensorDescriptor_t
probs
);
infiniopStatus_t
ascendGetRandomSampleWorkspaceSize
(
infiniopRandomSampleAscendDescriptor_t
desc
,
uint64_t
*
size
);
infiniopStatus_t
ascendRandomSample
(
infiniopRandomSampleAscendDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
);
infiniopStatus_t
ascendDestroyRandomSampleDescriptor
(
infiniopRandomSampleAscendDescriptor_t
desc
);
#endif
src/infiniop/ops/random_sample/ascend/random_sample_kernel.cpp
View file @
c1fa267c
#include "
kernel_operator
.h"
#include "
../../../devices/ascend/ascend_kernel_common
.h"
using
namespace
AscendC
;
using
namespace
AscendC
;
const
int32_t
BLOCK_LEN
=
256
;
template
<
typename
T
>
template
<
typename
T
>
class
Kernel
RandomSample
{
class
RandomSample
Kernel
{
public:
public:
__aicore__
inline
KernelRandomSample
()
{}
__aicore__
inline
RandomSampleKernel
()
{}
__aicore__
inline
void
Init
(
GM_ADDR
p
,
GM_ADDR
res
,
GM_ADDR
topkAddr
,
__aicore__
inline
void
init
(
GM_ADDR
probs
,
GM_ADDR
result
,
GM_ADDR
topk_val_addr
,
GM_ADDR
topk_idx_addr
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
int32_t
n
);
GM_ADDR
topkIdxAddr
,
int32_t
topk_
,
int32_t
voc_
,
__aicore__
inline
void
process
();
float
topp_
,
float
temper_
,
float
random_
)
{
private:
topk
=
topk_
;
__aicore__
inline
void
copyIn
();
voc
=
voc_
;
__aicore__
inline
void
copyOut
();
topp
=
topp_
;
__aicore__
inline
void
compute
();
invTemperature
=
1.0
f
/
temper_
;
__aicore__
inline
void
SoftMax
(
LocalTensor
<
T
>
&
topkValIn
,
random
=
random_
;
LocalTensor
<
T
>
&
softMaxOut
);
negMax
=
0.
f
;
__aicore__
inline
void
InclusiveSum
(
LocalTensor
<
T
>
&
topkValIn
,
sum
=
0.
f
;
LocalTensor
<
T
>
&
topkValOut
);
__aicore__
inline
void
RandomSample
(
LocalTensor
<
T
>
&
valIn
,
LocalTensor
<
int64_t
>
&
Index
,
LocalTensor
<
int64_t
>
&
result
);
GlobalTensor
<
T
>
_pGM
,
_topk_valGM
;
GlobalTensor
<
int64_t
>
_topk_idxGM
,
_resGM
;
TPipe
pipe
;
TQue
<
QuePosition
::
VECIN
,
1
>
_topk_valQue
;
TQue
<
QuePosition
::
VECIN
,
1
>
_topk_idxQue
;
TQue
<
QuePosition
::
VECOUT
,
1
>
_resQue
;
TBuf
<
TPosition
::
VECCALC
>
_inBuf
;
TBuf
<
TPosition
::
VECCALC
>
_tmp1Buf
;
TBuf
<
TPosition
::
VECCALC
>
_tmp2Buf
;
TBuf
<
TPosition
::
VECCALC
>
_tmp3Buf
;
TBuf
<
TPosition
::
VECCALC
>
_softmax_OutBuf
;
TBuf
<
TPosition
::
VECCALC
>
_inclusive_sum_OutBuf
;
int32_t
_topk
;
int32_t
_voc
;
float
_random_val
;
float
_topp
;
float
_invTemp
;
float
_negMax
=
0.
f
;
float
_sum
=
0.
f
;
int32_t
_topkAligned
;
int32_t
_topkIdxAligned
;
int32_t
_vocAligned
;
int32_t
_bufferLen
;
};
template
<
typename
T
>
__aicore__
inline
void
RandomSampleKernel
<
T
>::
init
(
GM_ADDR
probs
,
GM_ADDR
result
,
GM_ADDR
topk_val_addr
,
GM_ADDR
topk_idx_addr
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
int32_t
n
)
{
_topk
=
topk
;
_voc
=
n
;
_random_val
=
random_val
;
_topp
=
topp
;
_invTemp
=
1.0
f
/
temperature
;
// CumSumInfo
// CumSumInfo
topkAligned
=
topk
*
sizeof
(
T
)
%
32
==
0
_topkAligned
=
alignTileLen
<
T
>
(
_topk
,
BYTE_ALIGN
);
?
topk
_vocAligned
=
alignTileLen
<
T
>
(
_voc
,
BYTE_ALIGN
);
:
(
topk
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
_topkIdxAligned
=
(
_topk
+
3
)
/
4
*
4
;
vocAligned
=
voc
*
sizeof
(
T
)
%
32
==
0
_bufferLen
=
_topkAligned
>
BLOCK_LEN
?
_topkAligned
:
BLOCK_LEN
;
?
voc
:
(
voc
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
// Set GlobalTensor
topkIdxAligned
=
(
topk
+
3
)
/
4
*
4
;
_pGM
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
probs
),
_voc
);
_topk_valGM
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
topk_val_addr
),
_topk
);
bufferLen
=
topkAligned
>
BLOCK_LEN
?
topkAligned
:
BLOCK_LEN
;
_topk_idxGM
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
topk_idx_addr
),
_topk
);
_resGM
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
result
),
1
);
// Set Gm
pGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
p
),
voc
);
topkGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
topkAddr
),
topk
);
topkIdxGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
topkIdxAddr
),
topk
);
resGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
res
),
1
);
// Global input and output
// Global input and output
pipe
.
InitBuffer
(
topkQue
,
1
,
topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_topk_valQue
,
1
,
_topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
topkIdxQue
,
1
,
topkIdxAligned
*
sizeof
(
int64_t
));
pipe
.
InitBuffer
(
_topk_idxQue
,
1
,
_topkIdxAligned
*
sizeof
(
int64_t
));
pipe
.
InitBuffer
(
resQue
,
1
,
32
);
// 32 bytes for aligned
pipe
.
InitBuffer
(
_resQue
,
1
,
BYTE_ALIGN
);
// 32 bytes for aligned
pipe
.
InitBuffer
(
inBuf
,
BLOCK_LEN
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_inBuf
,
BLOCK_LEN
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf1
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_tmp1Buf
,
_bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf2
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_tmp2Buf
,
_bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf3
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_tmp3Buf
,
_bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
softMaxOutBuf
,
topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_softmax_OutBuf
,
_topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
_inclusive_sum_OutBuf
,
_topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
inclusiveSumOutBuf
,
topkAligned
*
sizeof
(
T
));
}
}
__aicore__
inline
void
Process
()
{
CopyIn
();
Compute
();
CopyOut
();
}
private:
template
<
typename
T
>
// Softmax
__aicore__
inline
void
RandomSampleKernel
<
T
>::
process
()
{
__aicore__
inline
void
SoftMax
(
LocalTensor
<
T
>
&
topkValIn
,
copyIn
();
compute
();
copyOut
();
}
template
<
typename
T
>
__aicore__
inline
void
RandomSampleKernel
<
T
>::
SoftMax
(
LocalTensor
<
T
>
&
topkValIn
,
LocalTensor
<
T
>
&
softMaxOut
)
{
LocalTensor
<
T
>
&
softMaxOut
)
{
float
invSum
=
1.0
f
/
sum
;
float
invSum
=
1.0
f
/
_
sum
;
LocalTensor
<
T
>
tmpBuffer
=
tmpBuf
1
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer
=
_
tmp
1
Buf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
tmpBuf
2
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
_
tmp
2
Buf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
tmpBuf
3
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
_
tmp
3
Buf
.
Get
<
T
>
();
Adds
(
tmpBuffer
,
topkValIn
,
static_cast
<
T
>
(
negMax
),
topk
);
Adds
(
tmpBuffer
,
topkValIn
,
static_cast
<
T
>
(
_
negMax
),
_
topk
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemp
erature
),
topk
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
_
invTemp
),
_
topk
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
topk
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
_
topk
);
Muls
(
softMaxOut
,
tmpBuffer3
,
static_cast
<
T
>
(
invSum
),
topk
);
Muls
(
softMaxOut
,
tmpBuffer3
,
static_cast
<
T
>
(
invSum
),
_
topk
);
}
}
// Cumsum
template
<
typename
T
>
__aicore__
inline
void
InclusiveSum
(
LocalTensor
<
T
>
&
topkValIn
,
__aicore__
inline
void
RandomSampleKernel
<
T
>::
InclusiveSum
(
LocalTensor
<
T
>
&
topkValIn
,
LocalTensor
<
T
>
&
topkValOut
)
{
LocalTensor
<
T
>
&
topkValOut
)
{
static
constexpr
CumSumConfig
cumSumConfig
{
true
,
false
,
false
};
static
constexpr
CumSumConfig
cumSumConfig
{
true
,
false
,
false
};
LocalTensor
<
T
>
lastRowLocal
;
LocalTensor
<
T
>
lastRowLocal
;
CumSum
<
T
,
cumSumConfig
>
(
topkValOut
,
lastRowLocal
,
topkValIn
,
CumSum
<
T
,
cumSumConfig
>
(
topkValOut
,
lastRowLocal
,
topkValIn
,
{
1
,
static_cast
<
uint32_t
>
(
topkAligned
)});
{
1
,
static_cast
<
uint32_t
>
(
_
topkAligned
)});
}
}
// Random sample
template
<
typename
T
>
__aicore__
inline
void
RandomSample
(
LocalTensor
<
T
>
&
valIn
,
__aicore__
inline
void
RandomSampleKernel
<
T
>::
RandomSample
(
LocalTensor
<
T
>
&
valIn
,
LocalTensor
<
int64_t
>
&
Index
,
LocalTensor
<
int64_t
>
&
Index
,
LocalTensor
<
int64_t
>
&
result
)
{
LocalTensor
<
int64_t
>
&
result
)
{
int
end
=
0
;
int
end
=
0
;
for
(
end
=
0
;
end
<
topk
;
end
++
)
{
for
(
end
=
0
;
end
<
_
topk
;
end
++
)
{
if
(
static_cast
<
float
>
(
valIn
(
end
))
>=
topp
)
{
if
(
static_cast
<
float
>
(
valIn
(
end
))
>=
_
topp
)
{
break
;
break
;
}
}
}
}
if
(
end
<
topk
-
1
)
{
if
(
end
<
_
topk
-
1
)
{
end
+=
1
;
end
+=
1
;
}
else
{
}
else
{
end
=
topk
;
end
=
_
topk
;
}
}
auto
random
V
al
=
random
*
static_cast
<
float
>
(
valIn
(
end
-
1
));
auto
random
_v
al
=
_
random
_val
*
static_cast
<
float
>
(
valIn
(
end
-
1
));
for
(
int
i
=
0
;
i
<
end
;
i
++
)
{
for
(
int
i
=
0
;
i
<
end
;
i
++
)
{
if
(
random
V
al
<
static_cast
<
float
>
(
valIn
(
i
)))
{
if
(
random
_v
al
<
static_cast
<
float
>
(
valIn
(
i
)))
{
result
(
0
)
=
Index
(
i
);
result
(
0
)
=
Index
(
i
);
return
;
return
;
}
}
}
}
result
(
0
)
=
Index
(
end
-
1
);
result
(
0
)
=
Index
(
end
-
1
);
}
}
__aicore__
inline
void
CopyIn
()
{
template
<
typename
T
>
LocalTensor
<
T
>
topkValLocal
=
topkQue
.
AllocTensor
<
T
>
();
__aicore__
inline
void
RandomSampleKernel
<
T
>::
copyIn
()
{
LocalTensor
<
int64_t
>
topkIdxLocal
=
topkIdxQue
.
AllocTensor
<
int64_t
>
();
LocalTensor
<
T
>
topkValLocal
=
_topk_valQue
.
AllocTensor
<
T
>
();
DataCopy
(
topkValLocal
,
topkGm
,
topkAligned
);
LocalTensor
<
int64_t
>
topkIdxLocal
=
_topk_idxQue
.
AllocTensor
<
int64_t
>
();
DataCopy
(
topkIdxLocal
,
topkIdxGm
,
topkIdxAligned
);
DataCopy
(
topkValLocal
,
_topk_valGM
,
_topkAligned
);
DataCopy
(
topkIdxLocal
,
_topk_idxGM
,
_topkIdxAligned
);
// Get Max val of input
// Get Max val of input
negMax
=
-
static_cast
<
float
>
(
topkValLocal
(
0
));
_
negMax
=
-
static_cast
<
float
>
(
topkValLocal
(
0
));
// Copy in p and compute sum
// Copy in p and compute
_
sum
int32_t
repeatTimes
=
voc
/
BLOCK_LEN
;
int32_t
repeatTimes
=
_
voc
/
BLOCK_LEN
;
int32_t
remainder
=
voc
%
BLOCK_LEN
;
int32_t
remainder
=
_
voc
%
BLOCK_LEN
;
float
sum_s
=
0.
f
;
float
sum_s
=
0.
f
;
LocalTensor
<
T
>
inBuffer
=
inBuf
.
Get
<
T
>
();
LocalTensor
<
T
>
_
inBuffer
=
_
inBuf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer
=
tmpBuf
1
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer
=
_
tmp
1
Buf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
tmpBuf
2
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
_
tmp
2
Buf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
tmpBuf
3
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
_
tmp
3
Buf
.
Get
<
T
>
();
for
(
int32_t
i
=
0
;
i
<
repeatTimes
;
i
++
)
{
for
(
int32_t
i
=
0
;
i
<
repeatTimes
;
i
++
)
{
DataCopy
(
inBuffer
,
pG
m
[
i
*
BLOCK_LEN
],
BLOCK_LEN
);
DataCopy
(
_
inBuffer
,
_
pG
M
[
i
*
BLOCK_LEN
],
BLOCK_LEN
);
Adds
(
tmpBuffer
,
inBuffer
,
static_cast
<
T
>
(
negMax
),
BLOCK_LEN
);
Adds
(
tmpBuffer
,
_
inBuffer
,
static_cast
<
T
>
(
_
negMax
),
BLOCK_LEN
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemp
erature
),
BLOCK_LEN
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
_
invTemp
),
BLOCK_LEN
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
BLOCK_LEN
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
BLOCK_LEN
);
sum_s
=
0.
f
;
sum_s
=
0.
f
;
for
(
int
j
=
0
;
j
<
BLOCK_LEN
;
++
j
)
{
for
(
int
j
=
0
;
j
<
BLOCK_LEN
;
++
j
)
{
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
j
));
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
j
));
}
}
sum
+=
sum_s
;
_
sum
+=
sum_s
;
}
}
if
(
remainder
!=
0
)
{
if
(
remainder
!=
0
)
{
int32_t
remainderAligned
=
remainder
*
sizeof
(
T
)
%
32
==
0
int32_t
remainderAligned
=
alignTileLen
<
T
>
(
remainder
,
BYTE_ALIGN
);
?
remainder
DataCopy
(
_inBuffer
,
_pGM
[
repeatTimes
*
BLOCK_LEN
],
remainderAligned
);
:
(
remainder
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
Adds
(
tmpBuffer
,
_inBuffer
,
static_cast
<
T
>
(
_negMax
),
remainder
);
DataCopy
(
inBuffer
,
pGm
[
repeatTimes
*
BLOCK_LEN
],
remainderAligned
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
_invTemp
),
remainder
);
Adds
(
tmpBuffer
,
inBuffer
,
static_cast
<
T
>
(
negMax
),
remainder
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemperature
),
remainder
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
remainder
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
remainder
);
sum_s
=
0.
f
;
sum_s
=
0.
f
;
for
(
int
i
=
0
;
i
<
remainder
;
++
i
)
{
for
(
int
i
=
0
;
i
<
remainder
;
++
i
)
{
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
i
));
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
i
));
}
}
sum
+=
sum_s
;
_
sum
+=
sum_s
;
}
}
topk
Que
.
EnQue
(
topkValLocal
);
_topk_val
Que
.
EnQue
(
topkValLocal
);
topk
I
dxQue
.
EnQue
(
topkIdxLocal
);
_
topk
_i
dxQue
.
EnQue
(
topkIdxLocal
);
}
}
__aicore__
inline
void
Compute
()
{
template
<
typename
T
>
__aicore__
inline
void
RandomSampleKernel
<
T
>::
compute
()
{
// Get input data
// Get input data
LocalTensor
<
T
>
topkValLocal
=
topkQue
.
DeQue
<
T
>
();
LocalTensor
<
T
>
topkValLocal
=
_
topk
_val
Que
.
DeQue
<
T
>
();
// SoftMax
// SoftMax
LocalTensor
<
T
>
softMaxOutLocal
=
soft
M
axOutBuf
.
Get
<
T
>
();
LocalTensor
<
T
>
softMaxOutLocal
=
_
soft
m
ax
_
OutBuf
.
Get
<
T
>
();
SoftMax
(
topkValLocal
,
softMaxOutLocal
);
SoftMax
(
topkValLocal
,
softMaxOutLocal
);
// InclusiveSum
// InclusiveSum
LocalTensor
<
T
>
inclusiveOutLocal
=
inclusive
S
umOutBuf
.
Get
<
T
>
();
LocalTensor
<
T
>
inclusiveOutLocal
=
_
inclusive
_s
um
_
OutBuf
.
Get
<
T
>
();
InclusiveSum
(
softMaxOutLocal
,
inclusiveOutLocal
);
InclusiveSum
(
softMaxOutLocal
,
inclusiveOutLocal
);
// randomSample
// randomSample
LocalTensor
<
int64_t
>
topkIdxLocal
=
topk
I
dxQue
.
DeQue
<
int64_t
>
();
LocalTensor
<
int64_t
>
topkIdxLocal
=
_
topk
_i
dxQue
.
DeQue
<
int64_t
>
();
LocalTensor
<
int64_t
>
resultLocal
=
resQue
.
AllocTensor
<
int64_t
>
();
LocalTensor
<
int64_t
>
resultLocal
=
_
resQue
.
AllocTensor
<
int64_t
>
();
RandomSample
(
inclusiveOutLocal
,
topkIdxLocal
,
resultLocal
);
RandomSample
(
inclusiveOutLocal
,
topkIdxLocal
,
resultLocal
);
topkQue
.
FreeTensor
(
topkValLocal
);
_topk_valQue
.
FreeTensor
(
topkValLocal
);
topkIdxQue
.
FreeTensor
(
topkIdxLocal
);
_topk_idxQue
.
FreeTensor
(
topkIdxLocal
);
resQue
.
EnQue
(
resultLocal
);
_resQue
.
EnQue
(
resultLocal
);
}
}
__aicore__
inline
void
CopyOut
()
{
LocalTensor
<
int64_t
>
resLocal
=
resQue
.
DeQue
<
int64_t
>
();
DataCopy
(
resGm
,
resLocal
,
32
/
sizeof
(
int64_t
));
resQue
.
FreeTensor
(
resLocal
);
}
private:
GlobalTensor
<
T
>
pGm
;
GlobalTensor
<
T
>
topkGm
;
GlobalTensor
<
int64_t
>
topkIdxGm
;
GlobalTensor
<
int64_t
>
resGm
;
TPipe
pipe
;
TQue
<
QuePosition
::
VECIN
,
1
>
topkQue
;
template
<
typename
T
>
TQue
<
QuePosition
::
VECIN
,
1
>
topkIdxQue
;
__aicore__
inline
void
RandomSampleKernel
<
T
>::
copyOut
()
{
TQue
<
QuePosition
::
VECOUT
,
1
>
resQue
;
LocalTensor
<
int64_t
>
resLocal
=
_resQue
.
DeQue
<
int64_t
>
();
DataCopy
(
_resGM
,
resLocal
,
BYTE_ALIGN
/
sizeof
(
int64_t
));
TBuf
<
TPosition
::
VECCALC
>
inBuf
;
_resQue
.
FreeTensor
(
resLocal
);
TBuf
<
TPosition
::
VECCALC
>
tmpBuf1
;
}
TBuf
<
TPosition
::
VECCALC
>
tmpBuf2
;
TBuf
<
TPosition
::
VECCALC
>
tmpBuf3
;
TBuf
<
TPosition
::
VECCALC
>
softMaxOutBuf
;
TBuf
<
TPosition
::
VECCALC
>
inclusiveSumOutBuf
;
// Kernel params
int32_t
topk
;
int32_t
voc
;
float
topp
;
float
invTemperature
;
float
random
;
float
negMax
;
float
sum
;
int32_t
topkAligned
;
int32_t
topkIdxAligned
;
int32_t
vocAligned
;
int32_t
bufferLen
;
};
extern
"C"
__global__
__aicore__
void
extern
"C"
__global__
__aicore__
void
random_sample_kernel_fp16
(
random_sample_kernel_f16
(
GM_ADDR
p
,
GM_ADDR
res
,
GM_ADDR
topkAddr
,
GM_ADDR
probs
,
GM_ADDR
topkIdxAddr
,
int32_t
topk_
,
int32_t
voc_
,
GM_ADDR
result
,
float
topp_
,
float
temper_
,
float
random_
)
{
GM_ADDR
topk_val_addr
,
KernelRandomSample
<
half
>
op
;
GM_ADDR
topk_idx_addr
,
op
.
Init
(
p
,
res
,
topkAddr
,
topkIdxAddr
,
topk_
,
voc_
,
topp_
,
temper_
,
random_
);
float
random_val
,
op
.
Process
();
float
topp
,
int
topk
,
float
temperature
,
int32_t
n
)
{
RandomSampleKernel
<
half
>
op
;
op
.
init
(
probs
,
result
,
topk_val_addr
,
topk_idx_addr
,
random_val
,
topp
,
topk
,
temperature
,
n
);
op
.
process
();
}
}
extern
"C"
void
extern
"C"
__global__
__aicore__
void
random_sample_kernel_fp32
(
random_sample_do
(
void
*
p
,
void
*
res
,
void
*
topkAddr
,
void
*
topkIdxAddr
,
GM_ADDR
probs
,
int32_t
topk
,
int32_t
voc
,
float
topp
,
float
temper
,
GM_ADDR
result
,
float
random
,
int
dtype
,
void
*
stream
)
{
GM_ADDR
topk_val_addr
,
GM_ADDR
topk_idx_addr
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
int32_t
n
)
{
RandomSampleKernel
<
float
>
op
;
op
.
init
(
probs
,
result
,
topk_val_addr
,
topk_idx_addr
,
random_val
,
topp
,
topk
,
temperature
,
n
);
op
.
process
();
}
switch
(
dtype
)
{
extern
"C"
infiniStatus_t
random_sample_kernel_launch
(
case
1
:
void
*
probs
,
random_sample_kernel_f16
<<<
1
,
nullptr
,
stream
>>>
(
void
*
result
,
p
,
res
,
topkAddr
,
topkIdxAddr
,
topk
,
voc
,
topp
,
temper
,
random
);
void
*
topk_val_addr
,
void
*
topk_idx_addr
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
uint64_t
n
,
infiniDtype_t
dt_p
,
void
*
stream
)
{
switch
(
dt_p
)
{
case
INFINI_DTYPE_F16
:
random_sample_kernel_fp16
<<<
1
,
nullptr
,
stream
>>>
(
probs
,
result
,
topk_val_addr
,
topk_idx_addr
,
random_val
,
topp
,
topk
,
temperature
,
n
);
break
;
case
INFINI_DTYPE_F32
:
random_sample_kernel_fp32
<<<
1
,
nullptr
,
stream
>>>
(
probs
,
result
,
topk_val_addr
,
topk_idx_addr
,
random_val
,
topp
,
topk
,
temperature
,
n
);
break
;
default:
return
INFINI_STATUS_BAD_TENSOR_DTYPE
;
}
}
return
INFINI_STATUS_SUCCESS
;
}
}
src/infiniop/ops/random_sample/ascend/randomsample_aclnn.cc
0 → 100644
View file @
c1fa267c
#include "../../../devices/ascend/common_ascend.h"
#include "random_sample_aclnn.h"
#include <aclnnop/aclnn_topk.h>
namespace
op
::
random_sample
::
ascend
{
struct
Descriptor
::
Opaque
{
aclnnTensorDescriptor_t
probs
;
aclnnTensorDescriptor_t
result
;
~
Opaque
()
{
delete
probs
;
delete
result
;
}
};
Descriptor
::~
Descriptor
()
=
default
;
infiniStatus_t
Descriptor
::
create
(
infiniopHandle_t
handle_
,
Descriptor
**
desc_ptr
,
infiniopTensorDescriptor_t
result_desc
,
infiniopTensorDescriptor_t
probs_desc
)
{
auto
handle
=
reinterpret_cast
<
device
::
ascend
::
Handle
*>
(
handle_
);
auto
result
=
RandomSampleInfo
::
create
(
result_desc
,
probs_desc
);
CHECK_RESULT
(
result
);
CHECK_DTYPE
(
result
->
dt_i
,
INFINI_DTYPE_I64
);
auto
workspace_size
=
probs_desc
->
numel
()
*
infiniSizeOf
(
probs_desc
->
dtype
())
+
probs_desc
->
numel
()
*
infiniSizeOf
(
infiniDtype_t
::
INFINI_DTYPE_I64
);
auto
tresult
=
new
aclnnTensorDescriptor
(
result_desc
);
auto
tprobs
=
new
aclnnTensorDescriptor
(
probs_desc
);
*
desc_ptr
=
new
Descriptor
(
result
.
take
(),
workspace_size
,
new
Opaque
{
tprobs
,
tresult
},
handle
->
device
,
handle
->
device_id
);
return
INFINI_STATUS_SUCCESS
;
}
size_t
Descriptor
::
minWorkspaceSize
()
const
{
return
_min_workspace_size
;
}
extern
"C"
infiniStatus_t
random_sample_kernel_launch
(
void
*
probs
,
void
*
result
,
void
*
topk_val_addr
,
void
*
topk_idx_addr
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
uint64_t
n
,
infiniDtype_t
dt_p
,
void
*
stream
);
infiniStatus_t
Descriptor
::
calculate
(
void
*
workspace
,
size_t
workspace_size
,
void
*
result
,
const
void
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
const
{
if
(
workspace_size
<
_min_workspace_size
)
{
return
INFINI_STATUS_INSUFFICIENT_WORKSPACE
;
}
auto
topk_
=
topk
<=
(
int
)
_info
.
n
?
topk
:
(
int
)
_info
.
n
;
bool
dosample
=
topk_
>
1
&&
temperature
!=
0.0
f
&&
topp
!=
0.0
f
&&
random_val
!=
0.0
f
;
auto
topk_shape
=
std
::
vector
<
int64_t
>
{
dosample
?
topk_
:
1
};
auto
topk_stride
=
std
::
vector
<
int64_t
>
{
1
};
auto
topk_idx
=
new
aclnnTensorDescriptor
(
toAclDataType
(
_info
.
dt_i
),
topk_shape
,
topk_stride
);
auto
topk_val
=
new
aclnnTensorDescriptor
(
toAclDataType
(
_info
.
dt_p
),
topk_shape
,
topk_stride
);
auto
topk_val_addr
=
workspace
;
auto
topk_idx_addr
=
(
void
*
)((
uint8_t
*
)
workspace
+
topk_
*
infiniSizeOf
(
_info
.
dt_p
));
uint64_t
topk_workspace_size
=
0
;
aclOpExecutor
*
topk_executor
=
nullptr
;
CHECK_ACL
(
aclnnTopkGetWorkspaceSize
(
_opaque
->
probs
->
tensor
,
topk_shape
[
0
],
0
,
true
,
true
,
topk_val
->
tensor
,
dosample
?
topk_idx
->
tensor
:
_opaque
->
result
->
tensor
,
&
topk_workspace_size
,
&
topk_executor
));
CHECK_ACL
(
aclSetAclOpExecutorRepeatable
(
topk_executor
));
void
*
topk_workspace
;
CHECK_ACL
(
aclrtMalloc
(
&
topk_workspace
,
topk_workspace_size
,
ACL_MEM_MALLOC_HUGE_FIRST
));
AclSetTensorAddr
(
topk_executor
,
0
,
_opaque
->
probs
->
tensor
,
(
void
*
)
probs
);
AclSetTensorAddr
(
topk_executor
,
1
,
topk_val
->
tensor
,
topk_val_addr
);
if
(
!
dosample
)
{
AclSetTensorAddr
(
topk_executor
,
2
,
_opaque
->
result
->
tensor
,
result
);
}
else
{
AclSetTensorAddr
(
topk_executor
,
2
,
topk_idx
->
tensor
,
topk_idx_addr
);
}
CHECK_ACL
(
aclnnTopk
(
topk_workspace
,
topk_workspace_size
,
topk_executor
,
stream
));
CHECK_ACL
(
aclrtFree
(
topk_workspace
));
if
(
dosample
)
{
auto
status
=
random_sample_kernel_launch
((
void
*
)
probs
,
result
,
topk_val_addr
,
topk_idx_addr
,
random_val
,
topp
,
topk_
,
temperature
,
_info
.
n
,
_info
.
dt_p
,
stream
);
CHECK_STATUS
(
status
);
}
return
INFINI_STATUS_SUCCESS
;
}
}
// namespace op::random_sample::ascend
src/infiniop/ops/random_sample/operator.cc
View file @
c1fa267c
...
@@ -11,8 +11,12 @@
...
@@ -11,8 +11,12 @@
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
#include "maca/random_sample_maca.h"
#include "maca/random_sample_maca.h"
#endif
#endif
#ifdef ENABLE_ASCEND_API
#include "ascend/random_sample_aclnn.h"
#endif
__C
infiniStatus_t
infiniopCreateRandomSampleDescriptor
(
__C
infiniStatus_t
infiniopCreateRandomSampleDescriptor
(
infiniopHandle_t
handle
,
infiniopHandle_t
handle
,
infiniopRandomSampleDescriptor_t
*
desc_ptr
,
infiniopRandomSampleDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
result
,
infiniopTensorDescriptor_t
result
,
...
@@ -37,6 +41,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
...
@@ -37,6 +41,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
CREATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
CREATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -67,6 +74,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
...
@@ -67,6 +74,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
GET
(
INFINI_DEVICE_METAX
,
maca
);
GET
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
GET
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -107,6 +117,9 @@ __C infiniStatus_t infiniopRandomSample(
...
@@ -107,6 +117,9 @@ __C infiniStatus_t infiniopRandomSample(
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
CALCULATE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
CALCULATE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -134,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
...
@@ -134,6 +147,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#ifdef ENABLE_METAX_API
#ifdef ENABLE_METAX_API
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
DELETE
(
INFINI_DEVICE_METAX
,
maca
);
#endif
#endif
#ifdef ENABLE_ASCEND_API
DELETE
(
INFINI_DEVICE_ASCEND
,
ascend
);
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -141,4 +157,3 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
...
@@ -141,4 +157,3 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#undef DELETE
#undef DELETE
}
}
test/infiniop/random_sample.py
View file @
c1fa267c
...
@@ -43,6 +43,7 @@ _TOLERANCE_MAP = {
...
@@ -43,6 +43,7 @@ _TOLERANCE_MAP = {
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
0
},
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
0
},
}
}
DEBUG
=
False
DEBUG
=
False
PROFILE
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_PRERUN
=
10
...
@@ -73,15 +74,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
...
@@ -73,15 +74,6 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
return
torch
.
argmax
(
data
)
return
torch
.
argmax
(
data
)
def random_sample(data, random_val, topp, topk, voc, temperature):
    """Dispatch to the appropriate reference sampler.

    Uses the nucleus/top-k reference implementation when both top-p and
    top-k sampling are enabled; otherwise falls back to greedy argmax.
    """
    sampling_enabled = topp > 0 and topk > 1
    if not sampling_enabled:
        return random_sample_0(data)
    # Reference top-p/top-k path runs on CPU regardless of input device.
    return random_sample_1(
        data.to("cpu"), random_val, topp, topk, voc, temperature
    )
def
test
(
def
test
(
lib
,
lib
,
handle
,
handle
,
...
@@ -92,7 +84,7 @@ def test(
...
@@ -92,7 +84,7 @@ def test(
topk
,
topk
,
temperature
,
temperature
,
dtype
=
torch
.
float16
,
dtype
=
torch
.
float16
,
sync
=
None
sync
=
None
,
):
):
print
(
print
(
f
"Testing RandomSample on
{
torch_device
}
with voc:
{
voc
}
random_val:
{
random_val
}
topp:
{
topp
}
topk:
{
topk
}
temperature:
{
temperature
}
dtype:
{
dtype
}
"
f
"Testing RandomSample on
{
torch_device
}
with voc:
{
voc
}
random_val:
{
random_val
}
topp:
{
topp
}
topk:
{
topk
}
temperature:
{
temperature
}
dtype:
{
dtype
}
"
...
@@ -136,6 +128,7 @@ def test(
...
@@ -136,6 +128,7 @@ def test(
)
)
)
)
workspace
=
create_workspace
(
workspace_size
.
value
,
torch_device
)
workspace
=
create_workspace
(
workspace_size
.
value
,
torch_device
)
def
lib_random_sample
():
def
lib_random_sample
():
check_error
(
check_error
(
lib
.
infiniopRandomSample
(
lib
.
infiniopRandomSample
(
...
@@ -223,4 +216,5 @@ if __name__ == "__main__":
...
@@ -223,4 +216,5 @@ if __name__ == "__main__":
# Execute tests
# Execute tests
for
device
in
get_test_devices
(
args
):
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment