Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
b5c6c7b8
Commit
b5c6c7b8
authored
Feb 18, 2025
by
zhangyue
Committed by
zhangyunze
Jun 17, 2025
Browse files
issue/11: add random sample ascend
parent
384cb5bf
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
451 additions
and
5 deletions
+451
-5
src/infiniop/devices/ascend/CMakeLists.txt
src/infiniop/devices/ascend/CMakeLists.txt
+3
-2
src/infiniop/ops/random_sample/ascend/random_sample_ascend.cc
...infiniop/ops/random_sample/ascend/random_sample_ascend.cc
+148
-0
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
+25
-0
src/infiniop/ops/random_sample/ascend/random_sample_ascend_api.h
...iniop/ops/random_sample/ascend/random_sample_ascend_api.h
+31
-0
src/infiniop/ops/random_sample/ascend/random_sample_kernel.cpp
...nfiniop/ops/random_sample/ascend/random_sample_kernel.cpp
+234
-0
src/infiniop/ops/random_sample/operator.cc
src/infiniop/ops/random_sample/operator.cc
+1
-0
test/infiniop/random_sample.py
test/infiniop/random_sample.py
+9
-3
No files found.
src/infiniop/devices/ascend/CMakeLists.txt
View file @
b5c6c7b8
...
@@ -5,7 +5,7 @@ project(Ascend_C)
...
@@ -5,7 +5,7 @@ project(Ascend_C)
set
(
SOC_VERSION
"Ascend910B3"
CACHE STRING
"system on chip type"
)
set
(
SOC_VERSION
"Ascend910B3"
CACHE STRING
"system on chip type"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH
"ASCEND CANN package installation directory"
)
set
(
RUN_MODE
"npu"
CACHE STRING
"run mode: npu"
)
set
(
RUN_MODE
"npu"
CACHE STRING
"run mode: npu"
)
set
(
CMAKE_BUILD_TYPE
"
Release
"
CACHE STRING
"Build type Release/Debug (default Debug)"
FORCE
)
set
(
CMAKE_BUILD_TYPE
"
Debug
"
CACHE STRING
"Build type Release/Debug (default Debug)"
FORCE
)
set
(
CMAKE_INSTALL_PREFIX
"
${
CMAKE_CURRENT_LIST_DIR
}
/out"
CACHE STRING
"path for install()"
FORCE
)
set
(
CMAKE_INSTALL_PREFIX
"
${
CMAKE_CURRENT_LIST_DIR
}
/out"
CACHE STRING
"path for install()"
FORCE
)
if
(
EXISTS
${
ASCEND_CANN_PACKAGE_PATH
}
/tools/tikcpp/ascendc_kernel_cmake
)
if
(
EXISTS
${
ASCEND_CANN_PACKAGE_PATH
}
/tools/tikcpp/ascendc_kernel_cmake
)
...
@@ -26,6 +26,7 @@ include_directories(
...
@@ -26,6 +26,7 @@ include_directories(
ascendc_library
(
ascend_kernels STATIC
ascendc_library
(
ascend_kernels STATIC
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
../../ops/rope/ascend/rope_ascend_kernel.cpp
../../ops/rope/ascend/rope_ascend_kernel.cpp
#
../../ops/random_sample/ascend/random_sample_kernel.cpp
../../ops/random_sample/ascend/random_sample_kernel.cpp
)
)
target_include_directories
(
ascend_kernels PRIVATE ../../../../include
)
src/infiniop/ops/random_sample/ascend/random_sample_ascend.cc
0 → 100644
View file @
b5c6c7b8
#include "random_sample_ascend.h"
InfiniopRandomSampleAscendDescriptor
::
InfiniopRandomSampleAscendDescriptor
(
infiniDevice_t
device_
)
{
device
=
device_
;
device_id
=
0
;
pDesc
=
new
aclnnTensorDescriptor
();
topkIdxDesc
=
new
aclnnTensorDescriptor
();
topkValDesc
=
new
aclnnTensorDescriptor
();
resDesc
=
new
aclnnTensorDescriptor
();
}
infiniopStatus_t
ascendCreateRandomSampleDescriptor
(
infiniopAscendHandle_t
handle
,
infiniopRandomSampleAscendDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
result
,
infiniopTensorDescriptor_t
probs
)
{
if
(
probs
->
ndim
!=
1
)
{
return
INFINIOP_STATUS_BAD_TENSOR_SHAPE
;
}
if
(
result
->
ndim
!=
1
&&
result
->
shape
[
0
]
!=
1
)
{
return
INFINIOP_STATUS_BAD_TENSOR_SHAPE
;
}
(
*
desc_ptr
)
=
new
InfiniopRandomSampleAscendDescriptor
(
handle
->
device
);
(
*
desc_ptr
)
->
device_id
=
handle
->
device_id
;
CHECK_STATUS
((
*
desc_ptr
)
->
pDesc
->
fromInfiniOpTensorDescriptor
(
probs
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
((
*
desc_ptr
)
->
resDesc
->
fromInfiniOpTensorDescriptor
(
result
),
INFINIOP_STATUS_SUCCESS
);
// Ascend aclnnTopk doesn't support U64 type
(
*
desc_ptr
)
->
resDesc
->
dataType
=
aclDataType
::
ACL_INT64
;
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendGetRandomSampleWorkspaceSize
(
infiniopRandomSampleAscendDescriptor_t
desc
,
uint64_t
*
size
)
{
auto
&
pDesc
=
desc
->
pDesc
;
*
size
=
numElements
(
pDesc
->
shape
.
data
(),
pDesc
->
ndim
)
*
aclDataTypeSize
(
pDesc
->
dataType
)
+
numElements
(
pDesc
->
shape
.
data
(),
pDesc
->
ndim
)
*
infiniSizeof
(
infiniDtype_t
::
INFINI_DTYPE_I64
);
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendRandomSample
(
infiniopRandomSampleAscendDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
)
{
if
(
topk
<=
0
||
topp
<
0
||
topp
>
1.0
)
{
return
INFINIOP_STATUS_BAD_PARAM
;
}
if
(
random_val
<
0
||
random_val
>
1.0
)
{
return
INFINIOP_STATUS_BAD_PARAM
;
}
auto
&
pDesc
=
desc
->
pDesc
;
auto
&
topkIdxDesc
=
desc
->
topkIdxDesc
;
auto
&
topkValDesc
=
desc
->
topkValDesc
;
auto
ndim
=
static_cast
<
int64_t
>
(
pDesc
->
ndim
);
auto
voc
=
pDesc
->
shape
[
0
];
auto
topk_
=
topk
<=
voc
?
topk
:
voc
;
bool
doSample
=
topk_
>
1
&&
temperature
!=
0
&&
topp
!=
0
;
auto
topkShape
=
std
::
vector
<
int64_t
>
(
pDesc
->
shape
);
topkShape
[
ndim
-
1
]
=
doSample
?
topk_
:
1
;
auto
topkStrides
=
std
::
vector
<
int64_t
>
(
pDesc
->
strides
);
// Infer contiguous strides
topkStrides
[
ndim
-
1
]
=
1
;
for
(
int64_t
i
=
ndim
-
2
;
i
>=
0
;
--
i
)
{
topkStrides
[
i
]
=
topkStrides
[
i
+
1
]
*
topkShape
[
i
+
1
];
}
CHECK_STATUS
(
topkValDesc
->
setDescriptor
(
pDesc
->
dataType
,
topkShape
,
topkStrides
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkIdxDesc
->
setDescriptor
(
aclDataType
::
ACL_INT64
,
topkShape
,
topkStrides
),
INFINIOP_STATUS_SUCCESS
);
// Infer data ptr
auto
workspaceTmp
=
workspace
;
auto
topkValAddr
=
workspaceTmp
;
workspaceTmp
=
(
void
*
)((
uint8_t
*
)
workspace
+
numElements
(
topkValDesc
->
shape
.
data
(),
topkValDesc
->
ndim
)
*
aclDataTypeSize
(
topkValDesc
->
dataType
));
auto
topkIdxAddr
=
workspaceTmp
;
auto
pAddr
=
(
void
*
)
probs
;
// Create aclTensor
CHECK_STATUS
(
pDesc
->
createTensor
(
pAddr
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkValDesc
->
createTensor
(
topkValAddr
),
INFINIOP_STATUS_SUCCESS
);
CHECK_STATUS
(
topkIdxDesc
->
createTensor
(
topkIdxAddr
),
INFINIOP_STATUS_SUCCESS
);
if
(
!
doSample
)
{
CHECK_STATUS
(
desc
->
resDesc
->
createTensor
(
result
),
INFINIOP_STATUS_SUCCESS
);
}
// Do Topk calculate
uint64_t
topkWorkspaceSize
=
0
;
aclOpExecutor
*
topkExecutor
=
nullptr
;
auto
ret
=
aclnnTopkGetWorkspaceSize
(
pDesc
->
t
,
topkShape
[
ndim
-
1
],
ndim
-
1
,
true
,
true
,
topkValDesc
->
t
,
doSample
?
topkIdxDesc
->
t
:
desc
->
resDesc
->
t
,
&
topkWorkspaceSize
,
&
topkExecutor
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnTopkGetWorkspaceSize failed ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
void
*
topkWorkspace
;
CHECK_STATUS
(
mallocWorkspace
(
&
topkWorkspace
,
topkWorkspaceSize
),
INFINIOP_STATUS_SUCCESS
);
ret
=
aclnnTopk
(
topkWorkspace
,
topkWorkspaceSize
,
topkExecutor
,
stream
);
CHECK_RET
(
ret
==
ACL_SUCCESS
,
LOG_PRINT
(
"aclnnTopk failed ERROR: %d
\n
"
,
ret
);
return
INFINIOP_STATUS_INTERNAL_ERROR
);
CHECK_STATUS
(
freeWorkspace
(
topkWorkspace
),
INFINIOP_STATUS_SUCCESS
);
if
(
doSample
)
{
// Do softmax and topp random sample
random_sample_do
(
pAddr
,
result
,
topkValAddr
,
topkIdxAddr
,
topk
,
static_cast
<
int
>
(
pDesc
->
shape
[
0
]),
topp
,
temperature
,
random_val
,
pDesc
->
dataType
,
stream
);
}
return
INFINIOP_STATUS_SUCCESS
;
}
infiniopStatus_t
ascendDestroyRandomSampleDescriptor
(
infiniopRandomSampleAscendDescriptor_t
desc
)
{
delete
desc
->
pDesc
;
delete
desc
->
topkIdxDesc
;
delete
desc
->
topkValDesc
;
delete
desc
;
return
INFINIOP_STATUS_SUCCESS
;
}
src/infiniop/ops/random_sample/ascend/random_sample_ascend.h
0 → 100644
View file @
b5c6c7b8
#ifndef __RANDOM_SAMPLE_ASCEND_H__
#define __RANDOM_SAMPLE_ASCEND_H__
#include "../../../devices/ascend/tensor_aclnn.h"
#include "../../utils.h"
#include "random_sample_ascend_api.h"
#include <acl/acl.h>
#include <acl/acl_base.h>
#include <acl/acl_rt.h>
#include <aclnnop/aclnn_topk.h>
struct
InfiniopRandomSampleAscendDescriptor
{
infiniDevice_t
device
;
int
device_id
;
aclnnTensorDescriptor_t
pDesc
,
topkValDesc
,
topkIdxDesc
,
resDesc
;
InfiniopRandomSampleAscendDescriptor
(
infiniDevice_t
device_
);
};
extern
"C"
void
random_sample_do
(
void
*
p
,
void
*
res
,
void
*
topkAddr
,
void
*
topkIdxAddr
,
int32_t
topk
,
int32_t
voc
,
float
topp
,
float
temper
,
float
random
,
int
dtype
,
void
*
stream
);
#endif
src/infiniop/ops/random_sample/ascend/random_sample_ascend_api.h
0 → 100644
View file @
b5c6c7b8
#ifndef __RANDOM_SAMPLE_ASCEND_API_H__
#define __RANDOM_SAMPLE_ASCEND_API_H__
#include "../../../devices/ascend/ascend_handle.h"
#include "infiniop/operator.h"
struct
InfiniopRandomSampleAscendDescriptor
;
typedef
struct
InfiniopRandomSampleAscendDescriptor
*
infiniopRandomSampleAscendDescriptor_t
;
infiniopStatus_t
ascendCreateRandomSampleDescriptor
(
infiniopAscendHandle_t
handle
,
infiniopRandomSampleAscendDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
results
,
infiniopTensorDescriptor_t
probs
);
infiniopStatus_t
ascendGetRandomSampleWorkspaceSize
(
infiniopRandomSampleAscendDescriptor_t
desc
,
uint64_t
*
size
);
infiniopStatus_t
ascendRandomSample
(
infiniopRandomSampleAscendDescriptor_t
desc
,
void
*
workspace
,
uint64_t
workspace_size
,
void
*
result
,
void
const
*
probs
,
float
random_val
,
float
topp
,
int
topk
,
float
temperature
,
void
*
stream
);
infiniopStatus_t
ascendDestroyRandomSampleDescriptor
(
infiniopRandomSampleAscendDescriptor_t
desc
);
#endif
src/infiniop/ops/random_sample/ascend/random_sample_kernel.cpp
0 → 100644
View file @
b5c6c7b8
#include "kernel_operator.h"
using
namespace
AscendC
;
const
int32_t
BLOCK_LEN
=
256
;
template
<
typename
T
>
class
KernelRandomSample
{
public:
__aicore__
inline
KernelRandomSample
()
{}
__aicore__
inline
void
Init
(
GM_ADDR
p
,
GM_ADDR
res
,
GM_ADDR
topkAddr
,
GM_ADDR
topkIdxAddr
,
int32_t
topk_
,
int32_t
voc_
,
float
topp_
,
float
temper_
,
float
random_
)
{
topk
=
topk_
;
voc
=
voc_
;
topp
=
topp_
;
invTemperature
=
1.0
f
/
temper_
;
random
=
random_
;
negMax
=
0.
f
;
sum
=
0.
f
;
// CumSumInfo
topkAligned
=
topk
*
sizeof
(
T
)
%
32
==
0
?
topk
:
(
topk
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
vocAligned
=
voc
*
sizeof
(
T
)
%
32
==
0
?
voc
:
(
voc
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
topkIdxAligned
=
(
topk
+
3
)
/
4
*
4
;
bufferLen
=
topkAligned
>
BLOCK_LEN
?
topkAligned
:
BLOCK_LEN
;
// Set Gm
pGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
p
),
voc
);
topkGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
T
*>
(
topkAddr
),
topk
);
topkIdxGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
topkIdxAddr
),
topk
);
resGm
.
SetGlobalBuffer
(
reinterpret_cast
<
__gm__
int64_t
*>
(
res
),
1
);
// Global input and output
pipe
.
InitBuffer
(
topkQue
,
1
,
topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
topkIdxQue
,
1
,
topkIdxAligned
*
sizeof
(
int64_t
));
pipe
.
InitBuffer
(
resQue
,
1
,
32
);
// 32 bytes for aligned
pipe
.
InitBuffer
(
inBuf
,
BLOCK_LEN
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf1
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf2
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
tmpBuf3
,
bufferLen
*
sizeof
(
T
));
pipe
.
InitBuffer
(
softMaxOutBuf
,
topkAligned
*
sizeof
(
T
));
pipe
.
InitBuffer
(
inclusiveSumOutBuf
,
topkAligned
*
sizeof
(
T
));
}
__aicore__
inline
void
Process
()
{
CopyIn
();
Compute
();
CopyOut
();
}
private:
// Softmax
__aicore__
inline
void
SoftMax
(
LocalTensor
<
T
>
&
topkValIn
,
LocalTensor
<
T
>
&
softMaxOut
)
{
float
invSum
=
1.0
f
/
sum
;
LocalTensor
<
T
>
tmpBuffer
=
tmpBuf1
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
tmpBuf2
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
tmpBuf3
.
Get
<
T
>
();
Adds
(
tmpBuffer
,
topkValIn
,
static_cast
<
T
>
(
negMax
),
topk
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemperature
),
topk
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
topk
);
Muls
(
softMaxOut
,
tmpBuffer3
,
static_cast
<
T
>
(
invSum
),
topk
);
}
// Cumsum
__aicore__
inline
void
InclusiveSum
(
LocalTensor
<
T
>
&
topkValIn
,
LocalTensor
<
T
>
&
topkValOut
)
{
static
constexpr
CumSumConfig
cumSumConfig
{
true
,
false
,
false
};
LocalTensor
<
T
>
lastRowLocal
;
CumSum
<
T
,
cumSumConfig
>
(
topkValOut
,
lastRowLocal
,
topkValIn
,
{
1
,
static_cast
<
uint32_t
>
(
topkAligned
)});
}
// Random sample
__aicore__
inline
void
RandomSample
(
LocalTensor
<
T
>
&
valIn
,
LocalTensor
<
int64_t
>
&
Index
,
LocalTensor
<
int64_t
>
&
result
)
{
int
end
=
0
;
for
(
end
=
0
;
end
<
topk
;
end
++
)
{
if
(
static_cast
<
float
>
(
valIn
(
end
))
>=
topp
)
{
break
;
}
}
if
(
end
<
topk
-
1
)
{
end
+=
1
;
}
else
{
end
=
topk
;
}
auto
randomVal
=
random
*
static_cast
<
float
>
(
valIn
(
end
-
1
));
for
(
int
i
=
0
;
i
<
end
;
i
++
)
{
if
(
randomVal
<
static_cast
<
float
>
(
valIn
(
i
)))
{
result
(
0
)
=
Index
(
i
);
return
;
}
}
result
(
0
)
=
Index
(
end
-
1
);
}
__aicore__
inline
void
CopyIn
()
{
LocalTensor
<
T
>
topkValLocal
=
topkQue
.
AllocTensor
<
T
>
();
LocalTensor
<
int64_t
>
topkIdxLocal
=
topkIdxQue
.
AllocTensor
<
int64_t
>
();
DataCopy
(
topkValLocal
,
topkGm
,
topkAligned
);
DataCopy
(
topkIdxLocal
,
topkIdxGm
,
topkIdxAligned
);
// Get Max val of input
negMax
=
-
static_cast
<
float
>
(
topkValLocal
(
0
));
// Copy in p and compute sum
int32_t
repeatTimes
=
voc
/
BLOCK_LEN
;
int32_t
remainder
=
voc
%
BLOCK_LEN
;
float
sum_s
=
0.
f
;
LocalTensor
<
T
>
inBuffer
=
inBuf
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer
=
tmpBuf1
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer2
=
tmpBuf2
.
Get
<
T
>
();
LocalTensor
<
T
>
tmpBuffer3
=
tmpBuf3
.
Get
<
T
>
();
for
(
int32_t
i
=
0
;
i
<
repeatTimes
;
i
++
)
{
DataCopy
(
inBuffer
,
pGm
[
i
*
BLOCK_LEN
],
BLOCK_LEN
);
Adds
(
tmpBuffer
,
inBuffer
,
static_cast
<
T
>
(
negMax
),
BLOCK_LEN
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemperature
),
BLOCK_LEN
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
BLOCK_LEN
);
sum_s
=
0.
f
;
for
(
int
j
=
0
;
j
<
BLOCK_LEN
;
++
j
)
{
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
j
));
}
sum
+=
sum_s
;
}
if
(
remainder
!=
0
)
{
int32_t
remainderAligned
=
remainder
*
sizeof
(
T
)
%
32
==
0
?
remainder
:
(
remainder
*
sizeof
(
T
)
+
31
)
/
32
*
32
/
sizeof
(
T
);
DataCopy
(
inBuffer
,
pGm
[
repeatTimes
*
BLOCK_LEN
],
remainderAligned
);
Adds
(
tmpBuffer
,
inBuffer
,
static_cast
<
T
>
(
negMax
),
remainder
);
Muls
(
tmpBuffer2
,
tmpBuffer
,
static_cast
<
T
>
(
invTemperature
),
remainder
);
Exp
(
tmpBuffer3
,
tmpBuffer2
,
remainder
);
sum_s
=
0.
f
;
for
(
int
i
=
0
;
i
<
remainder
;
++
i
)
{
sum_s
+=
static_cast
<
float
>
(
tmpBuffer3
(
i
));
}
sum
+=
sum_s
;
}
topkQue
.
EnQue
(
topkValLocal
);
topkIdxQue
.
EnQue
(
topkIdxLocal
);
}
__aicore__
inline
void
Compute
()
{
// Get input data
LocalTensor
<
T
>
topkValLocal
=
topkQue
.
DeQue
<
T
>
();
// SoftMax
LocalTensor
<
T
>
softMaxOutLocal
=
softMaxOutBuf
.
Get
<
T
>
();
SoftMax
(
topkValLocal
,
softMaxOutLocal
);
// InclusiveSum
LocalTensor
<
T
>
inclusiveOutLocal
=
inclusiveSumOutBuf
.
Get
<
T
>
();
InclusiveSum
(
softMaxOutLocal
,
inclusiveOutLocal
);
// randomSample
LocalTensor
<
int64_t
>
topkIdxLocal
=
topkIdxQue
.
DeQue
<
int64_t
>
();
LocalTensor
<
int64_t
>
resultLocal
=
resQue
.
AllocTensor
<
int64_t
>
();
RandomSample
(
inclusiveOutLocal
,
topkIdxLocal
,
resultLocal
);
topkQue
.
FreeTensor
(
topkValLocal
);
topkIdxQue
.
FreeTensor
(
topkIdxLocal
);
resQue
.
EnQue
(
resultLocal
);
}
__aicore__
inline
void
CopyOut
()
{
LocalTensor
<
int64_t
>
resLocal
=
resQue
.
DeQue
<
int64_t
>
();
DataCopy
(
resGm
,
resLocal
,
32
/
sizeof
(
int64_t
));
resQue
.
FreeTensor
(
resLocal
);
}
private:
GlobalTensor
<
T
>
pGm
;
GlobalTensor
<
T
>
topkGm
;
GlobalTensor
<
int64_t
>
topkIdxGm
;
GlobalTensor
<
int64_t
>
resGm
;
TPipe
pipe
;
TQue
<
QuePosition
::
VECIN
,
1
>
topkQue
;
TQue
<
QuePosition
::
VECIN
,
1
>
topkIdxQue
;
TQue
<
QuePosition
::
VECOUT
,
1
>
resQue
;
TBuf
<
TPosition
::
VECCALC
>
inBuf
;
TBuf
<
TPosition
::
VECCALC
>
tmpBuf1
;
TBuf
<
TPosition
::
VECCALC
>
tmpBuf2
;
TBuf
<
TPosition
::
VECCALC
>
tmpBuf3
;
TBuf
<
TPosition
::
VECCALC
>
softMaxOutBuf
;
TBuf
<
TPosition
::
VECCALC
>
inclusiveSumOutBuf
;
// Kernel params
int32_t
topk
;
int32_t
voc
;
float
topp
;
float
invTemperature
;
float
random
;
float
negMax
;
float
sum
;
int32_t
topkAligned
;
int32_t
topkIdxAligned
;
int32_t
vocAligned
;
int32_t
bufferLen
;
};
extern
"C"
__global__
__aicore__
void
random_sample_kernel_f16
(
GM_ADDR
p
,
GM_ADDR
res
,
GM_ADDR
topkAddr
,
GM_ADDR
topkIdxAddr
,
int32_t
topk_
,
int32_t
voc_
,
float
topp_
,
float
temper_
,
float
random_
)
{
KernelRandomSample
<
half
>
op
;
op
.
Init
(
p
,
res
,
topkAddr
,
topkIdxAddr
,
topk_
,
voc_
,
topp_
,
temper_
,
random_
);
op
.
Process
();
}
extern
"C"
void
random_sample_do
(
void
*
p
,
void
*
res
,
void
*
topkAddr
,
void
*
topkIdxAddr
,
int32_t
topk
,
int32_t
voc
,
float
topp
,
float
temper
,
float
random
,
int
dtype
,
void
*
stream
)
{
switch
(
dtype
)
{
case
1
:
random_sample_kernel_f16
<<<
1
,
nullptr
,
stream
>>>
(
p
,
res
,
topkAddr
,
topkIdxAddr
,
topk
,
voc
,
topp
,
temper
,
random
);
}
}
src/infiniop/ops/random_sample/operator.cc
View file @
b5c6c7b8
...
@@ -141,3 +141,4 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
...
@@ -141,3 +141,4 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
#undef DELETE
#undef DELETE
}
}
test/infiniop/random_sample.py
View file @
b5c6c7b8
...
@@ -43,7 +43,6 @@ _TOLERANCE_MAP = {
...
@@ -43,7 +43,6 @@ _TOLERANCE_MAP = {
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
0
},
torch
.
float16
:
{
"atol"
:
0
,
"rtol"
:
0
},
}
}
DEBUG
=
False
DEBUG
=
False
PROFILE
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_PRERUN
=
10
...
@@ -74,6 +73,15 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
...
@@ -74,6 +73,15 @@ def random_sample(data, random_val, topp, topk, voc, temperature):
return
torch
.
argmax
(
data
)
return
torch
.
argmax
(
data
)
def
random_sample
(
data
,
random_val
,
topp
,
topk
,
voc
,
temperature
):
if
topp
>
0
and
topk
>
1
:
ans
=
random_sample_1
(
data
.
to
(
"cpu"
),
random_val
,
topp
,
topk
,
voc
,
temperature
)
else
:
ans
=
random_sample_0
(
data
)
return
ans
def
test
(
def
test
(
lib
,
lib
,
handle
,
handle
,
...
@@ -128,7 +136,6 @@ def test(
...
@@ -128,7 +136,6 @@ def test(
)
)
)
)
workspace
=
create_workspace
(
workspace_size
.
value
,
torch_device
)
workspace
=
create_workspace
(
workspace_size
.
value
,
torch_device
)
def
lib_random_sample
():
def
lib_random_sample
():
check_error
(
check_error
(
lib
.
infiniopRandomSample
(
lib
.
infiniopRandomSample
(
...
@@ -216,5 +223,4 @@ if __name__ == "__main__":
...
@@ -216,5 +223,4 @@ if __name__ == "__main__":
# Execute tests
# Execute tests
for
device
in
get_test_devices
(
args
):
for
device
in
get_test_devices
(
args
):
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
test_operator
(
lib
,
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment