jerrrrry / infinicore · Commit c2e87202

Merge remote-tracking branch 'origin/main' into issue/142

Authored Jun 04, 2025 by Catheriany
Parents: 41818f84, c203635b
Changes: 175 in total; showing 20 changed files with 1736 additions and 118 deletions (+1736 -118)
src/infiniop/ops/mul/cpu/mul_cpu.h                              +19   -0
src/infiniop/ops/mul/cuda/mul_cuda.cu                           +57   -0
src/infiniop/ops/mul/cuda/mul_cuda.cuh                          +8    -0
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh                 +26   -0
src/infiniop/ops/mul/operator.cc                                +119  -0
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc         +33   -96
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu       +101  -0
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cuh      +8    -0
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh    +254  -0
src/infiniop/ops/random_sample/info.h                           +33   -0
src/infiniop/ops/random_sample/operator.cc                      +17   -1
src/infiniop/ops/random_sample/random_sample.h                  +97   -9
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc           +94   -0
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h            +8    -0
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc                 +8    -10
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu               +485  -0
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh              +8    -0
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh            +330  -0
src/infiniop/ops/rearrange/operator.cc                          +28   -0
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc              +3    -2
src/infiniop/ops/mul/cpu/mul_cpu.h (new file)

#ifndef __MUL_CPU_H__
#define __MUL_CPU_H__

#include "../../../elementwise/cpu/elementwise_cpu.h"

ELEMENTWISE_DESCRIPTOR(mul, cpu)

namespace op::mul::cpu {
typedef struct MulOp {
public:
    static constexpr size_t num_inputs = 2;
    template <typename T>
    T operator()(const T &a, const T &b) const {
        return a * b;
    }
} MulOp;
} // namespace op::mul::cpu

#endif // __MUL_CPU_H__
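The ELEMENTWISE_DESCRIPTOR machinery that consumes this functor is not part of the diff. As a rough sketch of the pattern (a hypothetical driver over plain contiguous arrays; the real elementwise_cpu.h also handles strides and broadcasting), the functor is simply applied per element:

#include <cstddef>

// Hypothetical contiguous-array driver, not the library's elementwise_cpu
// implementation: applies a binary functor Op to each element pair.
template <class Op, class T>
void apply_elementwise(T *out, const T *a, const T *b, size_t n) {
    Op op;
    for (size_t i = 0; i < n; ++i) {
        out[i] = op(a[i], b[i]); // for MulOp this is a[i] * b[i]
    }
}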
src/infiniop/ops/mul/cuda/mul_cuda.cu (new file)

#include "mul_cuda.cuh"
#include "mul_cuda_internal.cuh"

namespace op::mul::cuda {

Descriptor::~Descriptor() = default;

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {

    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
    auto dtype = out_desc->dtype();

    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();

    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);

    // create CUDA elementwise descriptor
    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {

    if (workspace_size < _workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<256, MulOp, half>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<256, MulOp, float>(_info, workspace, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<256, MulOp, double>(_info, workspace, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }

    return INFINI_STATUS_SUCCESS;
}
} // namespace op::mul::cuda
src/infiniop/ops/mul/cuda/mul_cuda.cuh (new file)

#ifndef __MUL_CUDA_API_H__
#define __MUL_CUDA_API_H__

#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"

ELEMENTWISE_DESCRIPTOR(mul, cuda)

#endif // __MUL_CUDA_API_H__
src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh (new file)

#ifndef __MUL_CUDA_H__
#define __MUL_CUDA_H__

#include "../../../elementwise/cuda/elementwise_cuda.cuh"
#include <cuda_fp16.h>

namespace op::mul::cuda {
typedef struct MulOp {
    static constexpr size_t num_inputs = 2;
    template <typename T>
    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
        if constexpr (std::is_same_v<T, half2>) {
            return __hmul2(a, b);
        } else if constexpr (std::is_same_v<T, half>) {
            return __hmul(a, b);
        } else if constexpr (std::is_same_v<T, float>) {
            return __fmul_rn(a, b);
        } else {
            return a * b;
        }
    }
} MulOp;
} // namespace op::mul::cuda

#endif // __MUL_CUDA_H__
src/infiniop/ops/mul/operator.cc (new file)

#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/mul.h"

#ifdef ENABLE_CPU_API
#include "cpu/mul_cpu.h"
#endif
#ifdef ENABLE_CUDA_API
#include "cuda/mul_cuda.cuh"
#endif

__C infiniStatus_t infiniopCreateMulDescriptor(
    infiniopHandle_t handle,
    infiniopMulDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t c_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t b_desc) {

#define CREATE(CASE, NAMESPACE)                                            \
    case CASE:                                                             \
        return op::mul::NAMESPACE::Descriptor::create(                     \
            handle,                                                        \
            reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
            c_desc,                                                        \
            {a_desc,                                                       \
             b_desc})

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CREATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CREATE
}

__C infiniStatus_t infiniopGetMulWorkspaceSize(
    infiniopMulDescriptor_t desc,
    size_t *size) {

#define GET(CASE, NAMESPACE)                                                                \
    case CASE:                                                                              \
        *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize();  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu)
#endif
#ifdef ENABLE_CUDA_API
        GET(INFINI_DEVICE_NVIDIA, cuda)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef GET

    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}

__C infiniStatus_t infiniopMul(
    infiniopMulDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *c,
    const void *a,
    const void *b,
    void *stream) {

#define CALCULATE(CASE, NAMESPACE)                                            \
    case CASE:                                                                \
        return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, c, {a, b}, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef CALCULATE
}

__C infiniStatus_t infiniopDestroyMulDescriptor(
    infiniopMulDescriptor_t desc) {

#define DELETE(CASE, NAMESPACE)                                                 \
    case CASE:                                                                  \
        delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc);  \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_CUDA_API
        DELETE(INFINI_DEVICE_NVIDIA, cuda);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }

#undef DELETE
}
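Taken together, these four entry points give the usual descriptor lifecycle. A minimal usage sketch, assuming a handle, device buffers, and three tensor descriptors already exist (their creation APIs and the CHECK/device_malloc helpers below are hypothetical, not part of this diff):

// Hypothetical call sequence for the new mul operator.
infiniopMulDescriptor_t desc;
CHECK(infiniopCreateMulDescriptor(handle, &desc, c_desc, a_desc, b_desc));

size_t workspace_size = 0;
CHECK(infiniopGetMulWorkspaceSize(desc, &workspace_size));
void *workspace = device_malloc(workspace_size); // hypothetical allocator

// c = a * b, elementwise, on the device selected by the handle
CHECK(infiniopMul(desc, workspace, workspace_size, c, a, b, stream));
CHECK(infiniopDestroyMulDescriptor(desc));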
src/infiniop/ops/random_sample/cpu/random_sample_cpu.cc

#include "random_sample_cpu.h"
#include "../../../devices/cpu/common_cpu.h"
#include "../../../devices/cpu/cpu_handle.h"
#include "../../../tensor.h"
#include "../info.h"
#include "infinicore.h"
#include <algorithm>

namespace op::random_sample::cpu {

...

@@ -15,29 +15,14 @@ infiniStatus_t Descriptor::create(
     infiniopTensorDescriptor_t probs_desc) {
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dt_i = result_desc->dtype();
-    auto dt_p = probs_desc->dtype();
-    CHECK_DTYPE(dt_i, INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64,
-                INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
-    CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
-    CHECK_API_OR(result_desc->ndim(), 0, return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    CHECK_API_OR(probs_desc->ndim(), 1, return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    CHECK_API_OR(probs_desc->stride(0), 1, return INFINI_STATUS_BAD_TENSOR_STRIDES);
+    auto result = RandomSampleInfo::create(result_desc, probs_desc);
+    CHECK_RESULT(result);
     *desc_ptr = new Descriptor(
-        dt_i, dt_p, probs_desc->dim(0), 0, nullptr,
+        result.take(), 0, nullptr,
         handle->device, handle->device_id);
     return INFINI_STATUS_SUCCESS;
 }

...

@@ -55,36 +40,42 @@ struct ComputeType<fp16_t> {
     using type = float;
 };

-template <class Tidx, class Tval>
-struct Scheme {
-    using Tcompute = typename ComputeType<Tval>::type;
-
-    static Tcompute get(void const *ptr, size_t i) {
-        return utils::cast<Tcompute, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
+struct Algo {
+    template <class Tidx, class Tval>
+    static auto get(void const *ptr, size_t i) {
+        return utils::cast<typename ComputeType<Tval>::type, Tval>(reinterpret_cast<Tval const *>(ptr)[i]);
     }

-    static void argmax(void *result, void const *probs, size_t n) {
+    template <class Tidx, class Tval>
+    infiniStatus_t argmax(
+        void *workspace, size_t workspace_size,
+        void *result, void const *probs, size_t n,
+        void *stream) {
         auto idx = reinterpret_cast<Tidx *>(result);
         *idx = 0;

-        auto max_val = get(probs, 0);
+        auto max_val = get<Tidx, Tval>(probs, 0);
         for (size_t i = 0; i < n; i++) {
-            if (auto val = get(probs, i); val > max_val) {
+            if (auto val = get<Tidx, Tval>(probs, i); val > max_val) {
                 max_val = val;
                 *idx = static_cast<Tidx>(i);
             }
         }
+        return INFINI_STATUS_SUCCESS;
     }

-    static void random(
+    template <class Tidx, class Tval>
+    infiniStatus_t random(
+        void *workspace, size_t workspace_size,
         void *result, void const *probs, size_t n,
-        float random_val, float topp, int topk, float temperature) {
+        float random_val, float topp, int topk, float temperature,
+        void *stream) {
         struct KVPair {
             Tidx idx;
-            Tcompute val;
+            typename ComputeType<Tval>::type val;

             bool operator<(const KVPair &other) const {
                 return val > other.val;

...

@@ -95,7 +86,7 @@ struct Scheme {
         // build & sort
         std::vector<KVPair> pairs(n);
         for (size_t i = 0; i < n; i++) {
-            pairs[i] = {static_cast<Tidx>(i), get(probs, i)};
+            pairs[i] = {static_cast<Tidx>(i), get<Tidx, Tval>(probs, i)};
         }
         std::sort(pairs.begin(), pairs.end());

         // softmax & sum

...

@@ -115,68 +106,10 @@ struct Scheme {
                 break;
             }
         }
+        return INFINI_STATUS_SUCCESS;
     }
 };
-
-template <class Tidx, class Tval>
-void switch_f(
-    size_t n, void *result, const void *probs,
-    float random_val, float topp, int topk, float temperature) {
-    if (random_val == 0 || topp == 0 || topk == 1 || temperature == 0) {
-        Scheme<Tidx, Tval>::argmax(result, probs, n);
-    } else {
-        Scheme<Tidx, Tval>::random(result, probs, n, random_val, topp, topk, temperature);
-    }
-}
-
-template <class Tidx>
-void switch_val(
-    infiniDtype_t dt_p,
-    size_t n, void *result, void const *probs,
-    float random_val, float topp, int topk, float temperature) {
-    switch (dt_p) {
-    case INFINI_DTYPE_F16:
-        switch_f<Tidx, fp16_t>(n, result, probs, random_val, topp, topk, temperature);
-        break;
-    case INFINI_DTYPE_F32:
-        switch_f<Tidx, float>(n, result, probs, random_val, topp, topk, temperature);
-        break;
-    case INFINI_DTYPE_F64:
-        switch_f<Tidx, double>(n, result, probs, random_val, topp, topk, temperature);
-        break;
-    default:
-        // unreachable
-        std::abort();
-    }
-}
-
-void switch_idx(
-    infiniDtype_t dt_i, infiniDtype_t dt_p,
-    size_t n, void *result, void const *probs,
-    float random_val, float topp, int topk, float temperature) {
-#define CASE(DT_VAL, DT_TYP)                                                              \
-    case DT_VAL:                                                                          \
-        switch_val<DT_TYP>(dt_p, n, result, probs, random_val, topp, topk, temperature);  \
-        break
-    switch (dt_i) {
-        CASE(INFINI_DTYPE_I8, int8_t);
-        CASE(INFINI_DTYPE_I16, int16_t);
-        CASE(INFINI_DTYPE_I32, int32_t);
-        CASE(INFINI_DTYPE_I64, int64_t);
-        CASE(INFINI_DTYPE_U8, uint8_t);
-        CASE(INFINI_DTYPE_U16, uint16_t);
-        CASE(INFINI_DTYPE_U32, uint32_t);
-        CASE(INFINI_DTYPE_U64, uint64_t);
-    default:
-        // unreachable
-        std::abort();
-    }
-#undef CASE
-}

 infiniStatus_t Descriptor::calculate(
     void *workspace,

...

@@ -189,7 +122,11 @@ infiniStatus_t Descriptor::calculate(
     float temperature,
     void *stream) const {
-    switch_idx(_dt_i, _dt_p, _n, result, probs, random_val, topp, topk, temperature);
+    Calculate::calculate<Algo>(
+        Algo{}, _info,
+        workspace, workspace_size,
+        result, probs,
+        random_val, topp, topk, temperature, stream);
     return INFINI_STATUS_SUCCESS;
 }

...
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cu (new file)

#include "../../../devices/cuda/cuda_handle.cuh"
#include "../info.h"
#include "random_sample_cuda.cuh"
#include "random_sample_kernel.cuh"

namespace op::random_sample::cuda {

struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t result_desc,
    infiniopTensorDescriptor_t probs_desc) {
    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);

    auto result = RandomSampleInfo::create(result_desc, probs_desc);
    CHECK_RESULT(result);
    auto info = result.take();

    size_t workspace_size;

#define CASE_P(CASE, Tidx, Tval)                                          \
    case CASE: {                                                          \
        auto workspace_result = calculateWorkspace<Tidx, Tval>(info.n);   \
        CHECK_RESULT(workspace_result);                                   \
        workspace_size = workspace_result.take();                         \
    } break

#define CASE_I(CASE, Tidx)                          \
    case CASE:                                      \
        switch (info.dt_p) {                        \
            CASE_P(INFINI_DTYPE_F16, Tidx, half);   \
            CASE_P(INFINI_DTYPE_F32, Tidx, float);  \
            CASE_P(INFINI_DTYPE_F64, Tidx, double); \
        default:                                    \
            abort();                                \
        }                                           \
        break

    switch (info.dt_i) {
        CASE_I(INFINI_DTYPE_I8, int8_t);
        CASE_I(INFINI_DTYPE_I16, int16_t);
        CASE_I(INFINI_DTYPE_I32, int32_t);
        CASE_I(INFINI_DTYPE_I64, int64_t);
        CASE_I(INFINI_DTYPE_U8, uint8_t);
        CASE_I(INFINI_DTYPE_U16, uint16_t);
        CASE_I(INFINI_DTYPE_U32, uint32_t);
        CASE_I(INFINI_DTYPE_U64, uint64_t);
    default:
        abort();
    }

#undef CASE_I
#undef CASE_P

    *desc_ptr = new Descriptor(
        info,
        workspace_size,
        new Opaque{handle->internal()},
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

size_t Descriptor::minWorkspaceSize() const {
    return _min_workspace_size;
}

infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *result,
    const void *probs,
    float random_val,
    float topp,
    int topk,
    float temperature,
    void *stream) const {
    if (workspace_size < _min_workspace_size) {
        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    auto block_size = _opaque->internal->blockSizeX();
    Calculate::calculate<Algo>(
        Algo{block_size}, _info,
        workspace, workspace_size,
        result, probs,
        random_val, topp, topk, temperature, stream);
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::random_sample::cuda
src/infiniop/ops/random_sample/cuda/random_sample_cuda.cuh (new file)

#ifndef __RANDOM_SAMPLE_CUDA_CUH__
#define __RANDOM_SAMPLE_CUDA_CUH__

#include "../random_sample.h"

DESCRIPTOR(cuda)

#endif // __RANDOM_SAMPLE_CUDA_CUH__
src/infiniop/ops/random_sample/cuda/random_sample_kernel.cuh (new file)

#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "infinicore.h"
#include <cub/device/device_radix_sort.cuh>
#include <cub/device/device_reduce.cuh>
#include <cub/device/device_scan.cuh>

namespace op::random_sample::cuda {

// ↓↓↓ Re-wrap the cub API to reduce template parameters and simplify calls

template <class T>
static cudaError argMax_(
    cub::KeyValuePair<int, T> *kv_pair,
    const T *logits,
    int n,
    void *workspace_ptr, size_t &workspace_len,
    cudaStream_t stream) {
    return cub::DeviceReduce::ArgMax(
        workspace_ptr, workspace_len,
        logits, kv_pair, n,
        stream);
}

template <class Tval, class Tidx>
static cudaError radixSort(
    void *workspace_ptr, size_t &workspace_len,
    const Tval *key_in, Tval *key_out,
    const Tidx *val_in, Tidx *val_out,
    int n,
    cudaStream_t stream) {
    return cub::DeviceRadixSort::SortPairsDescending(
        workspace_ptr, workspace_len,
        key_in, key_out,
        val_in, val_out,
        n,
        0, sizeof(Tval) * 8,
        stream);
}

template <class T>
static cudaError inclusiveSum(
    void *workspace_ptr, size_t &workspace_len,
    T *data, int n,
    cudaStream_t stream) {
    return cub::DeviceScan::InclusiveSum(
        workspace_ptr, workspace_len,
        data, data, n,
        stream);
}

// ↑↑↑ Re-wrap the cub API to reduce template parameters and simplify calls

// ↓↓↓ Workspace size computation

// Align addresses to 256 bytes
static constexpr size_t align256(size_t size) {
    return (size + 255) & (~255);
}

template <class Tidx, class Tval>
utils::Result<size_t> calculateWorkspace(size_t n_) {
    const auto n = static_cast<int>(n_);

    size_t argmax;
    CHECK_CUDA(argMax_<Tval>(nullptr, nullptr, n, nullptr, argmax, nullptr));
    // The first 256 bytes hold the kv pair
    argmax += 256;

    // indices
    size_t size_random = align256(sizeof(Tidx) * n);
    // sorted
    size_random += align256(sizeof(Tval) * n);
    // indices_out
    size_random += align256(sizeof(Tidx) * n);
    // cub device api
    size_t size_radix_sort;
    CHECK_CUDA((radixSort<Tval, Tidx>(nullptr, size_radix_sort, nullptr, nullptr, nullptr, nullptr, n, nullptr)));
    size_t size_inclusive_sum;
    CHECK_CUDA(inclusiveSum<Tval>(nullptr, size_inclusive_sum, nullptr, n, nullptr));
    size_random += cub::Max()(size_radix_sort, size_inclusive_sum);

    return utils::Result<size_t>(cub::Max()(argmax, size_random));
}

// ↑↑↑ Workspace size computation

// ↓↓↓ Map fp16_t to half via template specialization

template <class Tval>
struct CudaTval {
    using Type = Tval;
};

template <>
struct CudaTval<fp16_t> {
    using Type = half;
};

// ↑↑↑ Map fp16_t to half via template specialization

// ↓↓↓ Small kernels used during sampling

// The cub::DeviceReduce::ArgMax shipped with CUDA Toolkit 11.x only accepts a
// cub::KeyValuePair<int, Tval> output; this kernel extracts the index from it.
template <class Tidx, class Tval>
static __global__ void castIdx(Tidx *result, const cub::KeyValuePair<int, Tval> *kv_pair) {
    *result = kv_pair->key;
}

// Fill the index array required by the sort
template <class Tidx>
static __global__ void fillIndices(Tidx *indices, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        indices[i] = i;
    }
}

// The softmax used by random sample can be simplified to a basic linear mapping.
// Since the data is already sorted, the maximum is the first element.
// The first element is read by multiple blocks, so it must not be written.
template <class T>
static __global__ void partialSoftmaxKernel(
    T *__restrict__ data,
    int n,
    float temperature) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (0 < i && i < n) {
        float max = __ldg(data);
        data[i] = (T)expf(((float)data[i] - max) / temperature);
    }
}

// Write the first element as 1, i.e. exp(0)
template <class T>
static __global__ void setSoftmaxMaxKernel(T *__restrict__ data) {
    *data = 1;
}

// Sample by iterating with a plain for loop.
// This kernel exists only to avoid copying the data back to the CPU.
template <class Tval, class Tidx>
static __global__ void randomSampleKernel(
    Tidx *__restrict__ result,
    const Tval *__restrict__ sorted,
    const Tidx *__restrict__ indices_out,
    size_t n,
    float random,
    float topp,
    size_t topk) {
    topk = cub::Min()(topk, n);
    auto p = (Tval)(random * cub::Min()(topp * (float)sorted[n - 1], (float)sorted[topk - 1]));
    for (size_t i = 0;; ++i) {
        if ((sorted[i]) >= p) {
            *result = indices_out[i];
            return;
        }
    }
}

// ↑↑↑ Small kernels used during sampling

struct Algo {
    int block_size;

    template <class Tidx, class Tval_>
    infiniStatus_t argmax(
        void *workspace, size_t workspace_size,
        void *result,
        const void *probs,
        size_t n,
        void *stream_) const {
        using Tval = typename CudaTval<Tval_>::Type;

        auto stream = (cudaStream_t)stream_;
        auto logits = (Tval *)probs;
        auto kv_pair = (cub::KeyValuePair<int, Tval> *)workspace;
        workspace = (void *)((char *)workspace + 256);
        workspace_size -= 256;

        argMax_(kv_pair, logits, n, workspace, workspace_size, stream);
        castIdx<<<1, 1, 0, stream>>>((Tidx *)result, kv_pair);

        return INFINI_STATUS_SUCCESS;
    }

    template <class Tidx, class Tval_>
    infiniStatus_t random(
        void *workspace_, size_t workspace_size,
        void *result_,
        const void *probs,
        size_t n,
        float random_val,
        float topp,
        int topk,
        float temperature,
        void *stream_) const {
        using Tval = typename CudaTval<Tval_>::Type;

        auto stream = (cudaStream_t)stream_;
        auto logits = (Tval *)probs;
        auto result = (Tidx *)result_;

        auto workspace = reinterpret_cast<size_t>(workspace_);
        auto workspace_end = workspace + workspace_size;

        auto indices = reinterpret_cast<Tidx *>(workspace);
        workspace += align256(sizeof(Tidx) * n);

        auto sorted = reinterpret_cast<Tval *>(workspace);
        workspace += align256(sizeof(Tval) * n);

        auto indices_out = reinterpret_cast<Tidx *>(workspace);
        workspace += align256(sizeof(Tidx) * n);

        workspace_ = reinterpret_cast<void *>(workspace);
        workspace_size = workspace_end - workspace;

        auto block = cub::Min()((size_t)block_size, n);
        auto grid = (n + block - 1) / block;

        // sort
        fillIndices<<<grid, block, 0, stream>>>(indices, n);
        CHECK_CUDA(radixSort(workspace_, workspace_size, logits, sorted, indices, indices_out, n, stream));
        // softmax
        partialSoftmaxKernel<<<grid, block, 0, stream>>>(sorted, n, temperature);
        setSoftmaxMaxKernel<<<1, 1, 0, stream>>>(sorted);
        // sum
        CHECK_CUDA(inclusiveSum(workspace_, workspace_size, sorted, n, stream));
        // sample
        randomSampleKernel<<<1, 1, 0, stream>>>(result, sorted, indices_out, n, random_val, topp, topk);

        return INFINI_STATUS_SUCCESS;
    }
};

} // namespace op::random_sample::cuda
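The kernels above implement the sampling rule piecewise across several launches. A host-side reference of the final selection step may help when testing the device path; this is a sketch assuming `sorted` already holds the inclusive prefix sums of exp((x_i - x_0) / temperature) over descending-sorted logits, mirroring randomSampleKernel:

#include <algorithm>
#include <cstddef>

// Host reference of the selection in randomSampleKernel (sketch).
size_t sample_reference(const float *sorted, const size_t *indices_out,
                        size_t n, float random, float topp, size_t topk) {
    topk = std::min(topk, n);
    // Clamp the cutoff by both the top-p mass and the top-k prefix sum,
    // then scale by the random draw in [0, 1).
    float p = random * std::min(topp * sorted[n - 1], sorted[topk - 1]);
    for (size_t i = 0;; ++i) {
        if (sorted[i] >= p) {
            return indices_out[i]; // first prefix sum reaching the cutoff
        }
    }
}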
src/infiniop/ops/random_sample/info.h (new file)

#ifndef __RANDOM_SAMPLE_INFO_H__
#define __RANDOM_SAMPLE_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"

namespace op::random_sample {

struct RandomSampleInfo {
    infiniDtype_t dt_i, dt_p;
    size_t n;

    static utils::Result<RandomSampleInfo> create(
        infiniopTensorDescriptor_t result_desc,
        infiniopTensorDescriptor_t probs_desc) {
        auto dt_i = result_desc->dtype();
        auto dt_p = probs_desc->dtype();

        CHECK_DTYPE_ANY_INT(dt_i);
        CHECK_DTYPE(dt_p, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);

        CHECK_OR_RETURN(result_desc->ndim() == 0, INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(probs_desc->ndim() == 1, INFINI_STATUS_BAD_TENSOR_SHAPE);
        CHECK_OR_RETURN(probs_desc->stride(0) == 1, INFINI_STATUS_BAD_TENSOR_STRIDES);

        return utils::Result<RandomSampleInfo>({dt_i, dt_p, probs_desc->dim(0)});
    }
};

} // namespace op::random_sample

#endif // __RANDOM_SAMPLE_INFO_H__
src/infiniop/ops/random_sample/operator.cc

...

@@ -5,6 +5,9 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/random_sample_cpu.h"
 #endif
+#ifdef ENABLE_CUDA_API
+#include "cuda/random_sample_cuda.cuh"
+#endif

 __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
     infiniopHandle_t handle,

...

@@ -25,6 +28,9 @@ __C infiniStatus_t infiniopCreateRandomSampleDescriptor(
 #ifdef ENABLE_CPU_API
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...

@@ -38,9 +44,10 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
     size_t *size) {
 #define GET(CASE, NAMESPACE)                                           \
-    case CASE:                                                         \
+    case CASE: {                                                       \
+        using Ptr = const op::random_sample::NAMESPACE::Descriptor *;  \
+        *size = reinterpret_cast<Ptr>(desc)->minWorkspaceSize();       \
+    }                                                                  \
         return INFINI_STATUS_SUCCESS
     switch (desc->device_type) {

...

@@ -48,6 +55,9 @@ __C infiniStatus_t infiniopGetRandomSampleWorkspaceSize(
 #ifdef ENABLE_CPU_API
         GET(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        GET(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...

@@ -82,6 +92,9 @@ __C infiniStatus_t infiniopRandomSample(
 #ifdef ENABLE_CPU_API
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...

@@ -103,6 +116,9 @@ __C infiniStatus_t infiniopDestroyRandomSampleDescriptor(
 #ifdef ENABLE_CPU_API
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_CUDA_API
+        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...
src/infiniop/ops/random_sample/random_sample.h

#ifndef __RANDOM_SAMPLE_H__
#define __RANDOM_SAMPLE_H__

#include "../../../utils.h"
#include "../../operator.h"
#include "info.h"

#define DESCRIPTOR(NAMESPACE) \
                              \

...

@@ -11,22 +11,18 @@
     struct Opaque;                                    \
     Opaque *_opaque;                                  \
                                                       \
-    infiniDtype_t _dt_i, _dt_p;                       \
-    size_t _n, _min_workspace_size;                   \
+    RandomSampleInfo _info;                           \
+    size_t _min_workspace_size;                       \
                                                       \
     Descriptor(                                       \
-        infiniDtype_t dt_i,                           \
-        infiniDtype_t dt_p,                           \
-        size_t n,                                     \
+        RandomSampleInfo info,                        \
         size_t min_workspace_size,                    \
         Opaque *opaque,                               \
         infiniDevice_t device_type,                   \
         int device_id)                                \
         : InfiniopDescriptor{device_type, device_id}, \
           _opaque(opaque),                            \
-          _dt_i(dt_i),                                \
-          _dt_p(dt_p),                                \
-          _n(n),                                      \
+          _info(info),                                \
           _min_workspace_size(min_workspace_size) {}  \
                                                       \
 public:                                               \

...

@@ -53,4 +49,96 @@
     };                                                \
     }

namespace op::random_sample {

struct CalculateArgs {
    void *workspace;
    size_t workspace_size;
    void *result;
    const void *probs;
    float random_val, topp, temperature;
    int topk;
    void *stream;
};

class Calculate {
    template <class Tidx, class Tval, class Algo>
    static void switch_f(Algo algo, size_t n, CalculateArgs args) {
        if (args.random_val == 0 || args.topp == 0 || args.topk == 1 || args.temperature == 0) {
            algo.template argmax<Tidx, Tval>(
                args.workspace, args.workspace_size,
                args.result, args.probs, n,
                args.stream);
        } else {
            algo.template random<Tidx, Tval>(
                args.workspace, args.workspace_size,
                args.result, args.probs, n,
                args.random_val, args.topp, args.topk, args.temperature,
                args.stream);
        }
    }

    template <class Tidx, class Algo>
    static void switch_val(Algo algo, infiniDtype_t dt_p, size_t n, CalculateArgs args) {
        switch (dt_p) {
        case INFINI_DTYPE_F16:
            switch_f<Tidx, fp16_t>(algo, n, args);
            break;
        case INFINI_DTYPE_F32:
            switch_f<Tidx, float>(algo, n, args);
            break;
        case INFINI_DTYPE_F64:
            switch_f<Tidx, double>(algo, n, args);
            break;
        default:
            // unreachable
            std::abort();
        }
    }

public:
    template <class Algo>
    static infiniStatus_t calculate(
        Algo algo,
        RandomSampleInfo info,
        void *workspace, size_t workspace_size,
        void *result,
        const void *probs,
        float random_val,
        float topp,
        int topk,
        float temperature,
        void *stream) {
#define CASE(DT_VAL, DT_TYP)                      \
    case DT_VAL:                                  \
        switch_val<DT_TYP>(                       \
            algo, info.dt_p, info.n,              \
            {workspace, workspace_size,           \
             result, probs,                       \
             random_val, topp, temperature, topk, \
             stream});                            \
        break

        switch (info.dt_i) {
            CASE(INFINI_DTYPE_I8, int8_t);
            CASE(INFINI_DTYPE_I16, int16_t);
            CASE(INFINI_DTYPE_I32, int32_t);
            CASE(INFINI_DTYPE_I64, int64_t);
            CASE(INFINI_DTYPE_U8, uint8_t);
            CASE(INFINI_DTYPE_U16, uint16_t);
            CASE(INFINI_DTYPE_U32, uint32_t);
            CASE(INFINI_DTYPE_U64, uint64_t);
        default:
            // unreachable
            std::abort();
        }
#undef CASE
        return INFINI_STATUS_SUCCESS;
    }
};

} // namespace op::random_sample

#endif // __RANDOM_SAMPLE_H__
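Calculate only requires that an Algo type expose argmax and random member templates with the signatures used above; the CPU and CUDA backends in this commit both satisfy that. A minimal stub showing the expected interface (a sketch for illustration, not shipped code):

// Do-nothing Algo illustrating the interface Calculate::calculate<Algo> expects.
struct NullAlgo {
    template <class Tidx, class Tval>
    infiniStatus_t argmax(void *workspace, size_t workspace_size,
                          void *result, const void *probs, size_t n,
                          void *stream) const {
        return INFINI_STATUS_SUCCESS; // a real backend writes the argmax index to result
    }
    template <class Tidx, class Tval>
    infiniStatus_t random(void *workspace, size_t workspace_size,
                          void *result, const void *probs, size_t n,
                          float random_val, float topp, int topk,
                          float temperature, void *stream) const {
        return INFINI_STATUS_SUCCESS; // a real backend samples an index into result
    }
};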
src/infiniop/ops/rearrange/ascend/rearrange_ascend.cc (new file)

#include "rearrange_ascend.h"
#include "../../../devices/ascend/common_ascend.h"
#include <aclnnop/aclnn_copy.h>

namespace op::rearrange::ascend {

struct Descriptor::Opaque {
    aclnnTensorDescriptor_t dst;
    aclnnTensorDescriptor_t src;
    void *workspace; // aclnnInplaceCopy workspace
    uint64_t workspace_size;

    ~Opaque() {
        delete dst;
        delete src;
        aclrtFree(workspace);
    }
};

Descriptor::~Descriptor() {
    delete _opaque;
};

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto handle = reinterpret_cast<device::ascend::Handle *>(handle_);
    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    auto shape = y_desc->shape();

    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
    for (size_t i = 0; i < ndim; ++i) {
        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
    }

    auto dst_strides = y_desc->strides();
    auto src_strides = x_desc->strides();
    auto element_size = infiniSizeOf(dtype);
    auto result = utils::RearrangeMeta::create(
        shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
    CHECK_RESULT(result);

    aclnnTensorDescriptor_t dst = new aclnnTensorDescriptor(y_desc);
    aclnnTensorDescriptor_t src = new aclnnTensorDescriptor(x_desc);

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;
    void *workspace = nullptr;
    aclnnInplaceCopyGetWorkspaceSize(dst->tensor, src->tensor, &workspace_size, &executor);
    if (workspace_size != 0) {
        CHECK_ACL(aclrtMalloc(&workspace, workspace_size, ACL_MEM_MALLOC_HUGE_FIRST));
    }

    *desc_ptr = new Descriptor(
        result.take(),
        new Opaque{dst, src, workspace, workspace_size},
        handle->device, handle->device_id);

    // Delete useless executor
    aclDestroyAclOpExecutor(executor);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {
    auto tdst = _opaque->dst->tensor;
    auto tsrc = _opaque->src->tensor;

    uint64_t workspace_size = 0;
    aclOpExecutor *executor = nullptr;

    AclSetTensorAddr(executor, 0, tdst, y);
    AclSetTensorAddr(executor, 1, tsrc, (void *)x);
    CHECK_ACL(aclnnInplaceCopyGetWorkspaceSize(tdst, tsrc, &workspace_size, &executor));
    // Execute InplaceCopy
    CHECK_ACL(aclnnInplaceCopy(_opaque->workspace, _opaque->workspace_size, executor, stream));

    return INFINI_STATUS_SUCCESS;
}

} // namespace op::rearrange::ascend
src/infiniop/ops/rearrange/ascend/rearrange_ascend.h (new file)

#ifndef __REARRANGE_ASCEND_H__
#define __REARRANGE_ASCEND_H__

#include "../rearrange.h"

DESCRIPTOR(ascend)

#endif // __REARRANGE_ASCEND_H__
src/infiniop/ops/rearrange/cpu/rearrange_cpu.cc

...

@@ -15,20 +15,18 @@ infiniStatus_t Descriptor::create(
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
     auto dtype = y_desc->dtype();
     auto ndim = y_desc->ndim();
-    auto shape = y_desc->shape().data();
-    CHECK_API_OR(x_desc->dtype(), dtype, return INFINI_STATUS_BAD_TENSOR_DTYPE);
-    CHECK_API_OR(x_desc->ndim(), ndim, return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    for (size_t i = 0; i < ndim; ++i) {
-        CHECK_API_OR(x_desc->shape()[i], shape[i], return INFINI_STATUS_BAD_TENSOR_SHAPE);
-    }
-    auto dst_strides = y_desc->strides().data();
-    auto src_strides = x_desc->strides().data();
+    auto y_shape = y_desc->shape();
+    auto x_shape = x_desc->shape();
+    CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
+    CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);
+    CHECK_SAME_SHAPE(x_shape, y_shape);
+    auto dst_strides = y_desc->strides();
+    auto src_strides = x_desc->strides();
     auto element_size = infiniSizeOf(dtype);
-    auto result = utils::RearrangeMeta::create(shape, dst_strides, src_strides, ndim, element_size);
+    auto result = utils::RearrangeMeta::create(y_shape.data(), dst_strides.data(), src_strides.data(), ndim, element_size);
     CHECK_RESULT(result);
     *desc_ptr = new Descriptor(

...
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cu (new file)

#include "../../../devices/cuda/cuda_common.cuh"
#include "../../../devices/cuda/cuda_kernel_common.cuh"
#include "../../../tensor.h"
#include "rearrange_cuda.cuh"
#include "rearrange_kernel.cuh"
#include <algorithm>
#include <cmath>
#include <memory>
#include <stdint.h>
#include <vector>

namespace op::rearrange::cuda {

struct Descriptor::Opaque {
    std::shared_ptr<device::cuda::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t y_desc,
    infiniopTensorDescriptor_t x_desc) {
    auto dtype = y_desc->dtype();
    auto ndim = y_desc->ndim();
    CHECK_OR_RETURN(x_desc->dtype() == dtype, INFINI_STATUS_BAD_TENSOR_DTYPE);
    CHECK_OR_RETURN(x_desc->ndim() == ndim, INFINI_STATUS_BAD_TENSOR_SHAPE);

    // Keep the temporary vector objects alive
    auto x_shape = x_desc->shape();
    auto y_shape = y_desc->shape();
    auto y_strides = y_desc->strides();
    auto x_strides = x_desc->strides();

    CHECK_SAME_SHAPE(x_shape, y_shape);

    auto meta = utils::RearrangeMeta::create(
        y_shape.data(), y_strides.data(), x_strides.data(), ndim, infiniSizeOf(dtype));
    CHECK_RESULT(meta);

    *desc_ptr = new Descriptor(
        std::move(*meta),
        new Opaque{reinterpret_cast<device::cuda::Handle *>(handle)->internal()},
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// Dimension info
struct Dim {
    size_t len;
    ARRAY_TYPE_STRIDE src_stride;
    ARRAY_TYPE_STRIDE dst_stride;
};

// Split-dimension info
struct SplitDim {
    size_t choose_idx;
    size_t num_per_block;
    size_t num_per_grid;
    int array_struct_idx_block;
    int array_struct_idx_grid;
    size_t dim_len;
};

/**
 * Prepares tensor-rearrange parameters from the given metadata. The function mainly:
 * 1. Adjusts the unit size from the original metadata to one better suited to GPU processing.
 * 2. Assigns dimensions to CUDA block dimensions and grid dimensions:
 *    This step is the core. The goal is to give each block as much relatively contiguous
 *    data as possible; dimensions that cannot fit entirely into a block are split, and the
 *    split info is recorded to keep kernels from accessing out of bounds while maximizing
 *    memory locality and compute efficiency.
 */
utils::Result<RearrangeParams> prepareRearrangeParams(
    const utils::RearrangeMeta &original_meta, int max_threads) {
    RearrangeParams params;

    // Pick a unit size better suited to the GPU; powers of two are used here
    auto meta_result = original_meta.distributeUnit({32, 16, 8, 4, 2, 1});
    CHECK_RESULT(meta_result);
    const utils::RearrangeMeta &meta = meta_result.take();

    // Fetch the dimension info
    const size_t ndim = meta.ndim();
    const size_t unit = meta.unit();

    // Special case: no dimensions, a plain copy is enough
    if (ndim == 0) {
        params.block_dim = 0;
        params.block_len_total = 1;
        params.block_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
        params.src_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.dst_block_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.grid_len = {static_cast<ARRAY_TYPE_SIZE>(1)};
        params.src_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.dst_grid_stride = {static_cast<ARRAY_TYPE_STRIDE>(0)};
        params.unit_size = unit;
        return utils::Result<RearrangeParams>(params);
    }

    // Extract the necessary info from the metadata
    const ptrdiff_t *idx_strides = meta.idx_strides();
    const ptrdiff_t *dst_strides = meta.dst_strides();
    const ptrdiff_t *src_strides = meta.src_strides();

    // Prepare dimension info
    std::vector<Dim> dims;
    std::vector<size_t> shape;
    dims.reserve(ndim);
    shape.reserve(ndim);
    auto prev_idx_stride = meta.count();
    for (size_t i = 0; i < ndim; ++i) {
        size_t len = prev_idx_stride / idx_strides[i];
        shape.push_back(len);
        dims.push_back({len, src_strides[i], dst_strides[i]});
        prev_idx_stride = idx_strides[i];
    }

    // Compute indices sorting src_strides in descending order,
    // analogous to src_strides_desc_idx in the Rust version
    std::vector<size_t> src_strides_desc_idx(ndim);
    for (size_t i = 0; i < ndim; ++i) {
        src_strides_desc_idx[i] = i;
    }
    std::sort(src_strides_desc_idx.begin(), src_strides_desc_idx.end(),
              [&dims](size_t a, size_t b) {
                  return std::abs(dims[a].src_stride) > std::abs(dims[b].src_stride);
              });

    // Choose block and grid dimensions based on the maximum thread count
    const size_t block_size = max_threads;
    std::vector<bool> block_dim_choose(ndim, false);

    // Initialize counters
    size_t block_elements = 1;
    size_t block_src_elements = 1;
    size_t block_dst_elements = 1;
    size_t src_choose_idx = ndim;
    size_t dst_choose_idx = ndim;

    // Holds split-dimension info
    std::vector<SplitDim> split_dims;

    // Dimension selection loop
    while (src_choose_idx > 0 && dst_choose_idx > 0) {
        // Indices of the dimensions currently under consideration
        size_t src_idx = src_strides_desc_idx[src_choose_idx - 1];
        size_t dst_idx = dst_choose_idx - 1;

        if (src_idx == dst_idx) {
            // Source and destination dimensions match; handle them together
            size_t idx = src_idx;
            size_t len = shape[idx];
            // Check whether this dimension fits entirely into the block
            if (block_elements * len <= block_size) {
                // Choose this dimension
                block_dim_choose[idx] = true;
                block_elements *= len;
                block_src_elements *= len;
                block_dst_elements *= len;
                src_choose_idx--;
                dst_choose_idx--;
            } else {
                // The dimension must be split
                size_t num_per_block = block_size / block_elements;
                // Ensure num_per_block > 0 and len >= num_per_block
                if (num_per_block > 0 && len >= num_per_block && num_per_block > 1) {
                    size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // round up
                    SplitDim split_dim = {
                        idx,           // choose_idx
                        num_per_block, // num_per_block
                        num_per_grid,  // num_per_grid
                        0,             // array_struct_idx_block (updated later)
                        0,             // array_struct_idx_grid (updated later)
                        len            // original dimension length
                    };
                    split_dims.push_back(split_dim);
                }
                break;
            }
        } else {
            // Source and destination dimensions differ; handle them separately
            // Compute the block ratios
            double src_div_dst = static_cast<double>(block_src_elements) / block_dst_elements;
            double src_num_per_block = std::sqrt(block_size / (double)block_elements / src_div_dst);
            double dst_num_per_block = src_num_per_block * src_div_dst;

            size_t src_current_dim_len = shape[src_idx];
            size_t dst_current_dim_len = shape[dst_idx];
            if (static_cast<double>(src_current_dim_len) < src_num_per_block) {
                // The source dimension fits entirely into the block
                block_dim_choose[src_idx] = true;
                block_elements *= src_current_dim_len;
                block_src_elements *= src_current_dim_len;
                src_choose_idx--;
            } else if (static_cast<double>(dst_current_dim_len) < dst_num_per_block) {
                // The destination dimension fits entirely into the block
                block_dim_choose[dst_idx] = true;
                block_elements *= dst_current_dim_len;
                block_dst_elements *= dst_current_dim_len;
                dst_choose_idx--;
            } else {
                // Both the source and destination dimensions must be split
                size_t src_num_per_block_int = static_cast<size_t>(std::floor(src_num_per_block));
                size_t dst_num_per_block_int = static_cast<size_t>(std::floor(dst_num_per_block));
                // Compute grid sizes (rounded up)
                size_t src_num_per_grid = (src_current_dim_len + src_num_per_block_int - 1) / src_num_per_block_int;
                size_t dst_num_per_grid = (dst_current_dim_len + dst_num_per_block_int - 1) / dst_num_per_block_int;

                // Handle the source dimension
                if (src_num_per_block_int > 1) {
                    if (src_num_per_grid == 1) {
                        // Fits entirely into the block
                        block_dim_choose[src_idx] = true;
                        block_elements *= src_current_dim_len;
                        block_src_elements *= src_current_dim_len;
                        src_choose_idx--;
                    } else {
                        // Must be split
                        SplitDim split_dim = {
                            src_idx,               // choose_idx
                            src_num_per_block_int, // num_per_block
                            src_num_per_grid,      // num_per_grid
                            0,                     // array_struct_idx_block (updated later)
                            0,                     // array_struct_idx_grid (updated later)
                            src_current_dim_len    // original dimension length
                        };
                        split_dims.push_back(split_dim);
                    }
                }
                // Handle the destination dimension
                if (dst_num_per_block_int > 1) {
                    if (dst_num_per_grid == 1) {
                        // Fits entirely into the block
                        block_dim_choose[dst_idx] = true;
                        block_elements *= dst_current_dim_len;
                        block_dst_elements *= dst_current_dim_len;
                        dst_choose_idx--;
                    } else {
                        // Must be split
                        SplitDim split_dim = {
                            dst_idx,               // choose_idx
                            dst_num_per_block_int, // num_per_block
                            dst_num_per_grid,      // num_per_grid
                            0,                     // array_struct_idx_block (updated later)
                            0,                     // array_struct_idx_grid (updated later)
                            dst_current_dim_len    // original dimension length
                        };
                        split_dims.push_back(split_dim);
                    }
                }
                break;
            }
        }
    }

    // Prepare the block-dimension parameters
    size_t block_dim = 0;
    size_t block_len_total = 1;
    std::vector<ARRAY_TYPE_SIZE> block_len;
    std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
    std::vector<ARRAY_TYPE_SIZE> grid_len;
    std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;

    // Handle block dimensions: fill block_len and the block strides
    for (size_t i = 0; i < ndim; ++i) {
        if (block_dim_choose[i]) {
            block_len.push_back(shape[i]);
            src_block_stride.push_back(dims[i].src_stride);
            dst_block_stride.push_back(dims[i].dst_stride);
            block_dim += 1;
            block_len_total *= shape[i];
        }
        // Handle the block part of split dimensions
        for (size_t j = 0; j < split_dims.size(); ++j) {
            if (i == split_dims[j].choose_idx) {
                block_len.push_back(split_dims[j].num_per_block);
                src_block_stride.push_back(dims[i].src_stride);
                dst_block_stride.push_back(dims[i].dst_stride);
                split_dims[j].array_struct_idx_block = block_dim;
                block_dim += 1;
                block_len_total *= split_dims[j].num_per_block;
            }
        }
    }

    // Handle grid dimensions: fill grid_len and the grid strides
    for (size_t i = 0; i < ndim; ++i) {
        if (!block_dim_choose[i]) {
            bool is_split = false;
            // Check whether this is a split dimension
            for (size_t j = 0; j < split_dims.size(); ++j) {
                if (i == split_dims[j].choose_idx) {
                    is_split = true;
                    grid_len.push_back(split_dims[j].num_per_grid);
                    src_grid_stride.push_back(dims[i].src_stride * split_dims[j].num_per_block);
                    dst_grid_stride.push_back(dims[i].dst_stride * split_dims[j].num_per_block);
                    split_dims[j].array_struct_idx_grid = grid_len.size() - 1;
                }
            }
            // If it is not a split dimension, use it as a full grid dimension
            if (!is_split) {
                grid_len.push_back(shape[i]);
                src_grid_stride.push_back(dims[i].src_stride);
                dst_grid_stride.push_back(dims[i].dst_stride);
            }
        }
    }

    // If grid_len is empty, add a default entry
    if (grid_len.empty()) {
        grid_len.push_back(1);
        src_grid_stride.push_back(0);
        dst_grid_stride.push_back(0);
    }

    // Handle constraints, with logic similar to the Rust version
    std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
    // At most 2 constraints are handled
    for (size_t i = 0; i < split_dims.size(); ++i) {
        if (split_dims[i].dim_len % split_dims[i].num_per_block == 0) {
            continue;
        }
        Constraint<ARRAY_TYPE_SIZE> constraint;
        constraint.grid_idx = split_dims[i].array_struct_idx_grid;
        constraint.block_idx = split_dims[i].array_struct_idx_block;
        constraint.grid_div_block = split_dims[i].num_per_block;
        constraint.total_len = split_dims[i].dim_len;
        constraints.push_back(constraint);
    }

    // Set the parameters
    params.block_dim = block_dim;
    params.block_len_total = block_len_total;
    params.block_len = block_len;
    params.src_block_stride = src_block_stride;
    params.dst_block_stride = dst_block_stride;
    params.grid_len = grid_len;
    params.src_grid_stride = src_grid_stride;
    params.dst_grid_stride = dst_grid_stride;
    params.constraints = constraints;
    params.unit_size = unit;

    return utils::Result<RearrangeParams>(params);
}

// Kernel launch helper template (with constraint support)
template <unsigned int BLOCK_SIZE>
infiniStatus_t launchKernel(
    void *y, const void *x,
    size_t grid_size,
    const RearrangeParams &params,
    size_t unit_size,
    cudaStream_t stream) {
    // Get the kernel function
    RearrangeParams params_copy = params; // create a non-const copy
    auto kernel_func_result = getRearrangeKernel(params_copy);
    CHECK_RESULT(kernel_func_result);
    auto kernel_func = kernel_func_result.take();

    // Non-const temporaries
    size_t block_dim = params.block_dim;
    size_t block_len_total = params.block_len_total;

    // Sanity-check the vector sizes
    if (params.block_len.size() < block_dim
        || params.src_block_stride.size() < block_dim
        || params.dst_block_stride.size() < block_dim) {
        return INFINI_STATUS_BAD_PARAM;
    }
    if (params.grid_len.empty()
        || params.src_grid_stride.empty()
        || params.dst_grid_stride.empty()) {
        return INFINI_STATUS_BAD_PARAM;
    }

    const Constraint<ARRAY_TYPE_SIZE> *constraints_data;
    auto empty_constraints = Constraint<ARRAY_TYPE_SIZE>();
    if (params.constraints.empty()) {
        constraints_data = &empty_constraints;
    } else {
        constraints_data = params.constraints.data();
    }

    void *args[] = {
        &y, &x,
        &block_dim,
        &block_len_total,
        const_cast<void *>(static_cast<const void *>(params.block_len.data())),
        const_cast<void *>(static_cast<const void *>(params.src_block_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.dst_block_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.grid_len.data())),
        const_cast<void *>(static_cast<const void *>(params.src_grid_stride.data())),
        const_cast<void *>(static_cast<const void *>(params.dst_grid_stride.data())),
        const_cast<void *>(static_cast<const void *>(constraints_data))};

    CHECK_OR_RETURN(
        cudaLaunchKernel(kernel_func, grid_size, BLOCK_SIZE, args, 0, stream) == cudaSuccess,
        INFINI_STATUS_INTERNAL_ERROR);

    return INFINI_STATUS_SUCCESS;
}

infiniStatus_t Descriptor::calculate(
    void *y,
    const void *x,
    void *stream) const {
    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);

    // With no dimensions, a plain memory copy suffices
    if (_meta.ndim() == 0) {
        CHECK_OR_RETURN(
            cudaMemcpyAsync(y, x, _meta.unit(), cudaMemcpyDeviceToDevice, cuda_stream) == cudaSuccess,
            INFINI_STATUS_INTERNAL_ERROR);
        return INFINI_STATUS_SUCCESS;
    }

    // Query device properties
    int max_threads = _opaque->internal->maxThreadsPerBlock();

    // Prepare the parameters
    auto params_result = prepareRearrangeParams(_meta, std::min(CUDA_BLOCK_SIZE_1024, max_threads));
    CHECK_RESULT(params_result);
    auto params = params_result.take();

    // Compute the grid size
    size_t grid_size = 1;
    for (size_t i = 0; i < params.grid_len.size(); ++i) {
        grid_size *= params.grid_len[i];
    }
    // Guard against a zero-sized grid
    if (grid_size == 0) {
        return INFINI_STATUS_BAD_PARAM;
    }

    // Choose a kernel that fits the device properties
    infiniStatus_t status = INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    size_t block_size = params.block_len_total;
    if (block_size <= CUDA_BLOCK_SIZE_512) {
        status = launchKernel<CUDA_BLOCK_SIZE_512>(y, x, grid_size, params, _meta.unit(), cuda_stream);
    } else if (block_size <= CUDA_BLOCK_SIZE_1024) {
        status = launchKernel<CUDA_BLOCK_SIZE_1024>(y, x, grid_size, params, _meta.unit(), cuda_stream);
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return status;
}

} // namespace op::rearrange::cuda
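To make the splitting arithmetic in prepareRearrangeParams concrete, here is a worked example with illustrative numbers (chosen for this example, not taken from the diff): a single dimension of length 10000 against a 1024-thread block:

// Worked example of the split arithmetic (illustrative numbers).
size_t len = 10000, block_size = 1024, block_elements = 1;
size_t num_per_block = block_size / block_elements;              // 1024 elements per block
size_t num_per_grid = (len + num_per_block - 1) / num_per_block; // 10 blocks (rounded up)
// Since 10000 % 1024 != 0, a Constraint is recorded, and kernel threads with
// grid_idx * 1024 + block_idx >= 10000 return early instead of reading out of bounds.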
src/infiniop/ops/rearrange/cuda/rearrange_cuda.cuh (new file)

#ifndef __REARRANGE_CUDA_H__
#define __REARRANGE_CUDA_H__

#include "../rearrange.h"

DESCRIPTOR(cuda)

#endif // __REARRANGE_CUDA_H__
src/infiniop/ops/rearrange/cuda/rearrange_kernel.cuh (new file)

#ifndef __REARRANGE_CUDA_KERNEL_H__
#define __REARRANGE_CUDA_KERNEL_H__

#include "../../../devices/cuda/cuda_common.cuh"

#define ARRAY_TYPE_STRIDE ptrdiff_t
#define ARRAY_TYPE_SIZE size_t

// Coupled with DEFINE_KERNELS_BY_CONSTRAINT; both must be changed together
#define MAX_BLOCK_ARRAY_SIZE 5
#define MAX_GRID_ARRAY_SIZE 5

template <int ArrSize, typename ArrayType>
struct ArrayStruct {
    ArrayType a[ArrSize];
};

// The members are: [grid_idx, block_idx, the grid stride as a multiple of the block stride, the total length limit]
template <typename ElementType>
struct Constraint {
    ElementType grid_idx;
    ElementType block_idx;
    ElementType grid_div_block;
    ElementType total_len;
};

#define IF_CONSTRAINT_0 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_1 , const ArrayStruct<1, Constraint<ARRAY_TYPE_SIZE>> constraints
#define IF_CONSTRAINT_2 , const ArrayStruct<2, Constraint<ARRAY_TYPE_SIZE>> constraints

// Macro that generates the kernel functions
#define DEFINE_REARRANGE_KERNEL(Tmem_type, constraint_num, block_array_size, grid_array_size) \
    extern "C" __global__ void rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num( \
        void *__restrict__ dst, \
        const void *__restrict__ src, \
        const size_t block_dim, \
        const size_t block_len_total, \
        const ArrayStruct<block_array_size, ARRAY_TYPE_SIZE> block_len, \
        const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> src_block_stride, /* strides in bytes */ \
        const ArrayStruct<block_array_size, ARRAY_TYPE_STRIDE> dst_block_stride, /* strides in bytes */ \
        const ArrayStruct<grid_array_size, ARRAY_TYPE_SIZE> grid_len, \
        const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> src_grid_stride, /* strides in bytes */ \
        const ArrayStruct<grid_array_size, ARRAY_TYPE_STRIDE> dst_grid_stride /* strides in bytes */ \
        IF_CONSTRAINT_##constraint_num) { \
        size_t remaining = threadIdx.x; \
        if (remaining >= block_len_total) { \
            return; \
        } \
        \
        /* Declare shared memory */ \
        __shared__ ptrdiff_t shared_src_offset; \
        __shared__ ptrdiff_t shared_dst_offset; \
        \
        if (constraint_num > 0) { \
            __shared__ ARRAY_TYPE_SIZE shared_constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
            \
            if (threadIdx.x == 0) { /* only thread 0 computes */ \
                /* Compute the base offsets (bytes) into src and dst for this block */ \
                ptrdiff_t src_offset = 0; \
                ptrdiff_t dst_offset = 0; \
                ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
                \
                size_t remaining = blockIdx.x; \
                \
                for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
                    size_t idx = remaining % grid_len.a[i]; \
                    remaining /= grid_len.a[i]; \
                    src_offset += idx * src_grid_stride.a[i]; \
                    dst_offset += idx * dst_grid_stride.a[i]; \
                    if (constraint_num > 0) { \
                        for (ssize_t j = 0; j < constraint_num; j++) { \
                            if (i == constraints.a[j].grid_idx) { \
                                constraints_grid_idx_multiple[j] = idx * constraints.a[j].grid_div_block; \
                            } \
                        } \
                    } \
                } \
                \
                /* Store the results in shared memory */ \
                shared_src_offset = src_offset; \
                shared_dst_offset = dst_offset; \
                for (ssize_t j = 0; j < constraint_num; j++) { \
                    shared_constraints_grid_idx_multiple[j] = constraints_grid_idx_multiple[j]; \
                } \
            } \
            \
            /* Make sure every thread sees the shared-memory values */ \
            __syncthreads(); \
            \
            /* All threads use the precomputed offsets directly */ \
            ptrdiff_t src_offset = shared_src_offset; \
            ptrdiff_t dst_offset = shared_dst_offset; \
            ARRAY_TYPE_SIZE constraints_grid_idx_multiple[constraint_num > 0 ? constraint_num : 1]; \
            for (ssize_t j = 0; j < constraint_num; j++) { \
                constraints_grid_idx_multiple[j] = shared_constraints_grid_idx_multiple[j]; \
            } \
            \
            for (ssize_t i = block_array_size - 1; i >= 0; i--) { \
                size_t idx = remaining % block_len.a[i]; \
                remaining /= block_len.a[i]; \
                /* Accumulate the offsets */ \
                src_offset += idx * src_block_stride.a[i]; \
                dst_offset += idx * dst_block_stride.a[i]; \
                if (constraint_num > 0) { \
                    for (ssize_t j = 0; j < constraint_num; j++) { \
                        if (i == constraints.a[j].block_idx) { \
                            if (constraints_grid_idx_multiple[j] + idx >= constraints.a[j].total_len) { \
                                return; \
                            } \
                        } \
                    } \
                } \
            } \
            \
            src_offset += remaining * src_block_stride.a[0]; \
            dst_offset += remaining * dst_block_stride.a[0]; \
            for (ssize_t j = 0; j < constraint_num; j++) { \
                if (0 == constraints.a[j].block_idx) { \
                    if (constraints_grid_idx_multiple[j] + remaining >= constraints.a[j].total_len) { \
                        return; \
                    } \
                } \
            } \
            \
            /* Copy the data; the offsets are already in bytes */ \
            *reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
            \
        } else { \
            if (threadIdx.x == 0) { /* only thread 0 computes */ \
                /* Compute the base offsets (bytes) into src and dst for this block */ \
                ptrdiff_t src_offset = 0; \
                ptrdiff_t dst_offset = 0; \
                size_t remaining = blockIdx.x; \
                \
                for (ssize_t i = grid_array_size - 1; i >= 0; i--) { \
                    size_t idx = remaining % grid_len.a[i]; \
                    remaining /= grid_len.a[i]; \
                    src_offset += idx * src_grid_stride.a[i]; \
                    dst_offset += idx * dst_grid_stride.a[i]; \
                } \
                \
                /* Store the results in shared memory */ \
                shared_src_offset = src_offset; \
                shared_dst_offset = dst_offset; \
            } \
            \
            /* Make sure every thread sees the shared-memory values */ \
            __syncthreads(); \
            \
            /* All threads use the precomputed offsets directly */ \
            ptrdiff_t src_offset = shared_src_offset; \
            ptrdiff_t dst_offset = shared_dst_offset; \
            \
            for (ssize_t i = block_array_size - 1; i > 0; i--) { \
                size_t idx = remaining % block_len.a[i]; \
                remaining /= block_len.a[i]; \
                /* Accumulate the offsets */ \
                src_offset += idx * src_block_stride.a[i]; \
                dst_offset += idx * dst_block_stride.a[i]; \
            } \
            \
            src_offset += remaining * src_block_stride.a[0]; \
            dst_offset += remaining * dst_block_stride.a[0]; \
            \
            /* Copy the data; the offsets are already in bytes */ \
            *reinterpret_cast<Tmem_type *>(reinterpret_cast<char *>(dst) + dst_offset) = *reinterpret_cast<const Tmem_type *>(reinterpret_cast<const char *>(src) + src_offset); \
        } \
    }

// Supported constraint-count combinations
#define DEFINE_KERNELS_BY_CONSTRAINT(block_array_size, grid_array_size) \
    DEFINE_KERNELS_BY_TYPE(0, block_array_size, grid_array_size)        \
    DEFINE_KERNELS_BY_TYPE(1, block_array_size, grid_array_size)        \
    DEFINE_KERNELS_BY_TYPE(2, block_array_size, grid_array_size)

// Supported unit types
#define DEFINE_KERNELS_BY_TYPE(constraint_num, block_array_size, grid_array_size)       \
    DEFINE_REARRANGE_KERNEL(uchar1, constraint_num, block_array_size, grid_array_size)  \
    DEFINE_REARRANGE_KERNEL(uchar2, constraint_num, block_array_size, grid_array_size)  \
    DEFINE_REARRANGE_KERNEL(float1, constraint_num, block_array_size, grid_array_size)  \
    DEFINE_REARRANGE_KERNEL(float2, constraint_num, block_array_size, grid_array_size)  \
    DEFINE_REARRANGE_KERNEL(float4, constraint_num, block_array_size, grid_array_size)  \
    DEFINE_REARRANGE_KERNEL(double4, constraint_num, block_array_size, grid_array_size)

// Coupled with MAX_BLOCK_ARRAY_SIZE and MAX_GRID_ARRAY_SIZE; change them together
// Generate kernels for every combination of 1-5 block dims and 1-5 grid dims
DEFINE_KERNELS_BY_CONSTRAINT(1, 1)
DEFINE_KERNELS_BY_CONSTRAINT(1, 2)
DEFINE_KERNELS_BY_CONSTRAINT(1, 3)
DEFINE_KERNELS_BY_CONSTRAINT(1, 4)
DEFINE_KERNELS_BY_CONSTRAINT(1, 5)
DEFINE_KERNELS_BY_CONSTRAINT(2, 1)
DEFINE_KERNELS_BY_CONSTRAINT(2, 2)
DEFINE_KERNELS_BY_CONSTRAINT(2, 3)
DEFINE_KERNELS_BY_CONSTRAINT(2, 4)
DEFINE_KERNELS_BY_CONSTRAINT(2, 5)
DEFINE_KERNELS_BY_CONSTRAINT(3, 1)
DEFINE_KERNELS_BY_CONSTRAINT(3, 2)
DEFINE_KERNELS_BY_CONSTRAINT(3, 3)
DEFINE_KERNELS_BY_CONSTRAINT(3, 4)
DEFINE_KERNELS_BY_CONSTRAINT(3, 5)
DEFINE_KERNELS_BY_CONSTRAINT(4, 1)
DEFINE_KERNELS_BY_CONSTRAINT(4, 2)
DEFINE_KERNELS_BY_CONSTRAINT(4, 3)
DEFINE_KERNELS_BY_CONSTRAINT(4, 4)
DEFINE_KERNELS_BY_CONSTRAINT(4, 5)
DEFINE_KERNELS_BY_CONSTRAINT(5, 1)
DEFINE_KERNELS_BY_CONSTRAINT(5, 2)
DEFINE_KERNELS_BY_CONSTRAINT(5, 3)
DEFINE_KERNELS_BY_CONSTRAINT(5, 4)
DEFINE_KERNELS_BY_CONSTRAINT(5, 5)

// Parameter struct
struct RearrangeParams {
    std::vector<ARRAY_TYPE_SIZE> block_len;
    std::vector<ARRAY_TYPE_STRIDE> src_block_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_block_stride;
    std::vector<ARRAY_TYPE_SIZE> grid_len;
    std::vector<ARRAY_TYPE_STRIDE> src_grid_stride;
    std::vector<ARRAY_TYPE_STRIDE> dst_grid_stride;
    size_t block_dim;
    size_t block_len_total;
    std::vector<Constraint<ARRAY_TYPE_SIZE>> constraints;
    size_t unit_size;
};

utils::Result<void *> getRearrangeKernel(const RearrangeParams &params) {
    auto grid_num = params.grid_len.size();
    auto block_num = params.block_len.size();
    auto constraint_num = params.constraints.size();
    auto unit_size = params.unit_size;

    CHECK_OR_RETURN(grid_num <= MAX_GRID_ARRAY_SIZE && grid_num != 0, INFINI_STATUS_BAD_PARAM);
    CHECK_OR_RETURN(block_num <= MAX_BLOCK_ARRAY_SIZE && block_num != 0, INFINI_STATUS_BAD_PARAM);
    CHECK_OR_RETURN(constraint_num <= 2, INFINI_STATUS_BAD_PARAM);

    auto block_len = params.block_len.data();
    auto src_block_stride = params.src_block_stride.data();
    auto dst_block_stride = params.dst_block_stride.data();
    auto grid_len = params.grid_len.data();
    auto src_grid_stride = params.src_grid_stride.data();
    auto dst_grid_stride = params.dst_grid_stride.data();
    auto constrain = params.constraints.data();

    void *kernel_func = nullptr;

#define GET_REARRANGE_KERNEL(Tmem_type, block_array_size, grid_array_size, constraint_num) \
    kernel_func = (void *)rearrange_unit_##Tmem_type##_block_##block_array_size##_grid_##grid_array_size##_constrain_##constraint_num;

#define GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, constraint_num)    \
    switch (unit_size) {                                                                   \
    case 1:                                                                                \
        GET_REARRANGE_KERNEL(uchar1, block_array_size, grid_array_size, constraint_num);  \
        break;                                                                             \
    case 2:                                                                                \
        GET_REARRANGE_KERNEL(uchar2, block_array_size, grid_array_size, constraint_num);  \
        break;                                                                             \
    case 4:                                                                                \
        GET_REARRANGE_KERNEL(float1, block_array_size, grid_array_size, constraint_num);  \
        break;                                                                             \
    case 8:                                                                                \
        GET_REARRANGE_KERNEL(float2, block_array_size, grid_array_size, constraint_num);  \
        break;                                                                             \
    case 16:                                                                               \
        GET_REARRANGE_KERNEL(float4, block_array_size, grid_array_size, constraint_num);  \
        break;                                                                             \
    case 32:                                                                               \
        GET_REARRANGE_KERNEL(double4, block_array_size, grid_array_size, constraint_num); \
        break;                                                                             \
    default:                                                                               \
        return INFINI_STATUS_BAD_PARAM;                                                    \
    }

#define GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, grid_array_size)  \
    switch (constraint_num) {                                                  \
    case 0:                                                                    \
        GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 0);    \
        break;                                                                 \
    case 1:                                                                    \
        GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 1);    \
        break;                                                                 \
    case 2:                                                                    \
        GET_REARRANGE_KERNEL_BY_TYPE(block_array_size, grid_array_size, 2);    \
        break;                                                                 \
    }

#define GET_REARRANGE_KERNEL_BY_GRID_NUM(block_array_size)        \
    switch (grid_num) {                                           \
    case 1:                                                       \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 1);  \
        break;                                                    \
    case 2:                                                       \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 2);  \
        break;                                                    \
    case 3:                                                       \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 3);  \
        break;                                                    \
    case 4:                                                       \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 4);  \
        break;                                                    \
    case 5:                                                       \
        GET_REARRANGE_KERNEL_BY_CONSTRAINT(block_array_size, 5);  \
        break;                                                    \
    }

#define GET_REARRANGE_KERNEL_BY_BLOCK_NUM      \
    switch (block_num) {                       \
    case 1:                                    \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(1);   \
        break;                                 \
    case 2:                                    \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(2);   \
        break;                                 \
    case 3:                                    \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(3);   \
        break;                                 \
    case 4:                                    \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(4);   \
        break;                                 \
    case 5:                                    \
        GET_REARRANGE_KERNEL_BY_GRID_NUM(5);   \
        break;                                 \
    }

    GET_REARRANGE_KERNEL_BY_BLOCK_NUM

    return utils::Result<void *>(kernel_func);
}

#endif // __REARRANGE_CUDA_KERNEL_H__
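For reference, the token pasting in DEFINE_REARRANGE_KERNEL determines which symbol getRearrangeKernel resolves; for example, with unit_size = 16 (float4), two block dimensions, one grid dimension, and no constraints, the macro chain selects:

// Symbol chosen for unit_size = 16, block_num = 2, grid_num = 1, constraint_num = 0:
kernel_func = (void *)rearrange_unit_float4_block_2_grid_1_constrain_0;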
src/infiniop/ops/rearrange/operator.cc

...

@@ -5,6 +5,13 @@
 #ifdef ENABLE_CPU_API
 #include "cpu/rearrange_cpu.h"
 #endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/rearrange_ascend.h"
+#endif
+#ifdef ENABLE_CUDA_API
+#include "cuda/rearrange_cuda.cuh"
+#endif

 __C infiniStatus_t infiniopCreateRearrangeDescriptor(
     infiniopHandle_t handle,

...

@@ -25,6 +32,13 @@ __C infiniStatus_t infiniopCreateRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_CUDA_API
+        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...

@@ -49,6 +63,13 @@ __C infiniStatus_t infiniopRearrange(
 #ifdef ENABLE_CPU_API
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_CUDA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...

@@ -70,6 +91,13 @@ __C infiniStatus_t infiniopDestroyRearrangeDescriptor(
 #ifdef ENABLE_CPU_API
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_CUDA_API
+        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;

...
src/infiniop/ops/rms_norm/ascend/rms_norm_aclnn.cc

...

@@ -5,18 +5,19 @@
 namespace op::rms_norm::ascend {

 struct Descriptor::Opaque {
-    mutable aclOpExecutor *executor;
     aclnnTensorDescriptor_t y;
     aclnnTensorDescriptor_t x;
     aclnnTensorDescriptor_t w;
     aclnnTensorDescriptor_t rstd;
     size_t workspaceSize;
+    aclOpExecutor *executor;

     ~Opaque() {
         delete y;
         delete x;
         delete w;
         delete rstd;
+        aclDestroyAclOpExecutor(executor);
     }
 };

...

@@ -69,7 +70,7 @@ infiniStatus_t Descriptor::create(
     auto handle_ascend = reinterpret_cast<device::ascend::Handle *>(handle);
     size_t all_workspace_size = workspace_size + rstd->numel() * aclDataTypeSize(rstd->dataType);
     *desc_ptr = new Descriptor(
-        new Opaque{executor, y, x, w, rstd, workspace_size},
+        new Opaque{y, x, w, rstd, workspace_size, executor},
         std::move(info),
         all_workspace_size,
         handle_ascend->device, handle_ascend->device_id);

...