Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
93191613
Unverified
Commit
93191613
authored
Mar 13, 2026
by
thatPepe
Committed by
GitHub
Mar 13, 2026
Browse files
Merge pull request #1075 from InfiniTensor/RevertT_1-1-4
Revert T1-1-4
parents
6ab911c3
def22a08
Changes
203
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
1944 deletions
+0
-1944
src/infiniop/ops/topk/topk_desc.h
src/infiniop/ops/topk/topk_desc.h
+0
-57
src/infiniop/ops/var/cpu/var_cpu.cc
src/infiniop/ops/var/cpu/var_cpu.cc
+0
-94
src/infiniop/ops/var/cpu/var_cpu.h
src/infiniop/ops/var/cpu/var_cpu.h
+0
-8
src/infiniop/ops/var/cuda/kernel.cuh
src/infiniop/ops/var/cuda/kernel.cuh
+0
-370
src/infiniop/ops/var/info.h
src/infiniop/ops/var/info.h
+0
-67
src/infiniop/ops/var/metax/var_metax.h
src/infiniop/ops/var/metax/var_metax.h
+0
-8
src/infiniop/ops/var/metax/var_metax.maca
src/infiniop/ops/var/metax/var_metax.maca
+0
-124
src/infiniop/ops/var/moore/var_moore.h
src/infiniop/ops/var/moore/var_moore.h
+0
-8
src/infiniop/ops/var/moore/var_moore.mu
src/infiniop/ops/var/moore/var_moore.mu
+0
-124
src/infiniop/ops/var/nvidia/var_nvidia.cu
src/infiniop/ops/var/nvidia/var_nvidia.cu
+0
-124
src/infiniop/ops/var/nvidia/var_nvidia.cuh
src/infiniop/ops/var/nvidia/var_nvidia.cuh
+0
-8
src/infiniop/ops/var/operator.cc
src/infiniop/ops/var/operator.cc
+0
-197
src/infiniop/ops/var/var_desc.h
src/infiniop/ops/var/var_desc.h
+0
-53
src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc
src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc
+0
-107
src/infiniop/ops/var_mean/cpu/var_mean_cpu.h
src/infiniop/ops/var_mean/cpu/var_mean_cpu.h
+0
-8
src/infiniop/ops/var_mean/cuda/kernel.cuh
src/infiniop/ops/var_mean/cuda/kernel.cuh
+0
-378
src/infiniop/ops/var_mean/info.h
src/infiniop/ops/var_mean/info.h
+0
-67
src/infiniop/ops/var_mean/metax/var_mean_metax.h
src/infiniop/ops/var_mean/metax/var_mean_metax.h
+0
-8
src/infiniop/ops/var_mean/metax/var_mean_metax.maca
src/infiniop/ops/var_mean/metax/var_mean_metax.maca
+0
-126
src/infiniop/ops/var_mean/moore/var_mean_moore.h
src/infiniop/ops/var_mean/moore/var_mean_moore.h
+0
-8
No files found.
src/infiniop/ops/topk/topk_desc.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef INFINIOP_TOPK_DESCRIPTOR_H_
#define INFINIOP_TOPK_DESCRIPTOR_H_
#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"
// Declares the TopK operator descriptor class inside
// namespace op::topk::<NAMESPACE>. A device backend expands
// DESCRIPTOR(<backend>) in its header and then defines the declared
// members (~Descriptor, create, calculate) in its own source file.
//
// Descriptor state:
//   _opaque         - backend-private data (struct Opaque is defined per backend)
//   _info           - TopKInfo metadata validated by create()
//   _workspace_size - scratch bytes the caller must provide to calculate()
//
// create() validates the value/index output descriptors and the input
// descriptor for a top-k selection of size `k` along dimension `dim`;
// `largest`/`sorted` presumably select max-vs-min and ordered output —
// semantics live in each backend's implementation. calculate() runs the
// selection on the given stream using the caller-supplied workspace.
#define DESCRIPTOR(NAMESPACE) \
\
namespace op::topk::NAMESPACE { \
class Descriptor final : public InfiniopDescriptor { \
struct Opaque; \
Opaque *_opaque; \
TopKInfo _info; \
size_t _workspace_size; \
\
Descriptor( \
Opaque *opaque, \
TopKInfo info, \
size_t workspace_size, \
infiniDevice_t device_type, \
int device_id) \
: InfiniopDescriptor{device_type, device_id}, \
_opaque(opaque), \
_info(info), \
_workspace_size(workspace_size) {} \
\
public: \
~Descriptor(); \
size_t workspaceSize() const { return _workspace_size; } \
\
static infiniStatus_t create( \
infiniopHandle_t handle, \
Descriptor **desc_ptr, \
infiniopTensorDescriptor_t values_output_desc, \
infiniopTensorDescriptor_t indices_output_desc, \
infiniopTensorDescriptor_t input_desc, \
size_t k, \
size_t dim, \
bool largest, \
bool sorted); \
\
infiniStatus_t calculate( \
void *workspace, size_t workspace_size, \
void *values_output, \
void *indices_output, \
const void *input, \
size_t k, \
size_t dim, \
bool largest, \
bool sorted, \
void *stream) const; \
}; \
}
#endif
src/infiniop/ops/var/cpu/var_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "var_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
namespace op::var::cpu {

Descriptor::~Descriptor() {}

/// Validate shapes/dtypes via VarInfo and build a CPU descriptor.
/// The CPU path needs no opaque state and no workspace.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(
        nullptr,        // no opaque state on CPU
        result.take(),
        0,              // no workspace needed
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

// welford
namespace {

/// Variance is undefined (NaN output) when there are no elements to reduce,
/// or when there is a single element and the unbiased (N-1) divisor is used.
bool IsNanOut(const VarInfo &info) {
    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
}

/// Accumulate variance over flat indices [start, end) of the permuted input
/// using Welford's numerically stable single-pass algorithm.
/// Accumulation is done in float regardless of Tdata.
/// Precondition (guaranteed by the caller via IsNanOut): when
/// info.unbiased_var is set, end - start >= 2, so the (count - 1)
/// divisor below is never zero.
template <typename Tdata>
void computeVarUsingWelfordCpu(
    const Tdata *input_ptr,
    float &var_output,
    size_t start,
    size_t end,
    const VarInfo &info) {
    if (start >= end) {
        return;
    }
    float old_mean = 0.0f; // previous mean
    float mean = 0.0f;     // new mean
    float M2 = 0.0f;       // variance sum
    size_t count = 0;      // element count of new sum
    for (size_t idx = start; idx < end; ++idx) {
        size_t input_offset = op::common_cpu::indexToOffset(
            idx,
            info.permuted_input_shape.size(),
            info.permuted_input_shape.data(),
            info.permuted_input_strides.data());
        float value = utils::cast<float>(input_ptr[input_offset]);
        count++;
        old_mean = mean;
        mean += (value - mean) / count;
        M2 += (value - old_mean) * (value - mean);
    }
    var_output = M2 / (info.unbiased_var ? (count - 1) : count);
}

/// Compute the variance for every output element. Each output element i
/// reduces the contiguous flat-index range [i * reduce_num, (i+1) * reduce_num)
/// of the permuted input (or the whole input for a full reduction).
template <typename Tdata>
infiniStatus_t calculateVar(
    const VarInfo &info,
    Tdata *var_output,
    const Tdata *input) {
    Tdata nan_value = utils::cast<Tdata>(NAN);
    // All dimensions reduced => scalar output over the entire input.
    bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size());
    for (size_t i = 0; i < info.output_size; ++i) {
        size_t output_offset = op::common_cpu::indexToOffset(
            i,
            info.output_shape.size(),
            info.output_shape.data(),
            info.output_strides.data());
        if (IsNanOut(info)) {
            var_output[output_offset] = nan_value;
        } else {
            size_t start = is_scalar ? 0 : i * info.reduce_num;
            size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num;
            float var = 0.0f;
            computeVarUsingWelfordCpu(input, var, start, end, info);
            var_output[output_offset] = utils::cast<Tdata>(var);
        }
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

/// Dispatch on dtype and run the CPU variance reduction.
/// workspace/workspace_size/unbiased/keepdim/stream are unused here:
/// the CPU path is synchronous and all reduction parameters were baked
/// into _info at create() time.
infiniStatus_t Descriptor::calculate(
    void *workspace, size_t workspace_size,
    void *var_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return calculateVar<fp16_t>(_info, reinterpret_cast<fp16_t *>(var_output), reinterpret_cast<const fp16_t *>(input));
    case INFINI_DTYPE_F32:
        return calculateVar<float>(_info, reinterpret_cast<float *>(var_output), reinterpret_cast<const float *>(input));
    case INFINI_DTYPE_BF16:
        return calculateVar<bf16_t>(_info, reinterpret_cast<bf16_t *>(var_output), reinterpret_cast<const bf16_t *>(input));
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Note: every switch path returns, so no trailing return is needed.
}

} // namespace op::var::cpu
src/infiniop/ops/var/cpu/var_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __INFINIOP_VAR_CPU_H__
#define __INFINIOP_VAR_CPU_H__

#include "../var_desc.h"

// Expand the shared var Descriptor declaration for the CPU backend
// (declares class op::var::cpu::Descriptor).
DESCRIPTOR(cpu);

#endif // __INFINIOP_VAR_CPU_H__
src/infiniop/ops/var/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_CUDA_H__
#define __VAR_CUDA_H__

#include <cmath> // NAN

// Convert a flat (row-major) element index into a strided memory offset,
// walking the dimensions from innermost to outermost.
__forceinline__ __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    for (size_t axis = ndim; axis-- > 0;) {
        offset += (flat_index % shape[axis]) * strides[axis];
        flat_index /= shape[axis];
    }
    return offset;
}
namespace device {
namespace cuda {

// NaN constant for each supported element type.
template <typename Tdata>
__inline__ __device__ Tdata Nan();

template <>
__inline__ __device__ float Nan<float>() {
    return NAN;
}

template <>
__inline__ __device__ double Nan<double>() {
    return NAN;
}

template <>
__inline__ __device__ half Nan<half>() {
    return __float2half(NAN);
}

#if defined(ENABLE_MOORE_API)
using bf16_t = __mt_bfloat16;
#elif defined(ENABLE_METAX_API)
using bf16_t = __hpcc_bfloat16;
#else
using bf16_t = __nv_bfloat16;
#endif

/* bf16 */
template <>
__inline__ __device__ bf16_t Nan<bf16_t>() {
    return __float2bfloat16_rn(NAN);
}

// Type-specific division, using hardware half/bf16 division where available.
template <typename Tdata>
__inline__ __device__ Tdata Div(Tdata a, Tdata b);

template <>
__inline__ __device__ float Div<float>(float a, float b) {
#ifdef OF_LAYER_NORM_USE_FAST_MATH
    return __fdividef(a, b);
#else
    return a / b;
#endif
}

template <>
__inline__ __device__ double Div<double>(double a, double b) {
    return a / b;
}

template <>
__inline__ __device__ half Div<half>(half a, half b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
    return __hdiv(a, b);
#else
    return __float2half(__half2float(a) / __half2float(b));
#endif
}

template <>
__inline__ __device__ bf16_t Div<bf16_t>(bf16_t a, bf16_t b) {
#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
    return __hdiv(a, b);
#else
    return __float2bfloat16_rn(__bfloat162float(a) / __bfloat162float(b));
#endif
}

// Welford accumulation over flat indices [start, end) stepping by `step`,
// folding each strided input element into (mean, m2, count).
template <typename Tdata, typename ComputeType>
inline __device__ void WelfordReduce(
    const Tdata *input_ptr,
    ComputeType &mean,
    ComputeType &m2,
    ComputeType &count,
    const size_t start,
    const size_t end,
    const size_t step,
    const size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    ComputeType old_mean = 0.0;
    for (size_t i = start; i < end; i += step) {
        ++count;
        old_mean = mean;
        size_t input_offset = indexToOffset(i, ndim, shape, strides);
        ComputeType input_value = static_cast<ComputeType>(input_ptr[input_offset]);
        mean += (input_value - mean) / count;
        m2 += (input_value - mean) * (input_value - old_mean);
    }
}

// Fold a single value into a Welford accumulator.
template <typename Tdata>
inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) {
    count += 1;
    Tdata delta1 = val - mean;
    mean += Div(delta1, count);
    Tdata delta2 = val - mean;
    m2 += delta1 * delta2;
}

// Merge a second Welford accumulator (b_mean, b_m2, b_count) into
// (mean, m2, count) — Chan et al.'s parallel combination.
template <typename Tdata>
inline __device__ void WelfordCombine(
    Tdata b_mean, Tdata b_m2, Tdata b_count,
    Tdata &mean, Tdata &m2, Tdata &count) {
    if (b_count == 0) {
        return;
    }
    Tdata new_count = count + b_count;         // n1 + n2
    Tdata nb_over_n = Div(b_count, new_count); // n2 / (n1 + n2)
    Tdata delta = b_mean - mean;               // mean2 - mean1
    mean += delta * nb_over_n;                 // mean1 + n2 * (mean2 - mean1) / (n1 + n2)
    m2 += b_m2 + delta * delta * count * nb_over_n;
    // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2)
    count = new_count;
}

// Merge accumulator arrays b_mean/b_m2/b_count over indices
// [start, end) stepping by `step` into one accumulator.
template <typename Tdata>
inline __device__ void WelfordCombineLoop(
    const Tdata *b_mean,
    const Tdata *b_m2,
    const Tdata *b_count,
    Tdata &mean,
    Tdata &m2,
    Tdata &count,
    const size_t start,
    const size_t end,
    const size_t step) {
    for (size_t i = start; i < end; i += step) {
        WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count);
    }
}

// Butterfly reduction of Welford accumulators within a warp (or a smaller
// thread group) using __shfl_down_sync; lane 0 holds the combined result.
template <typename Tdata, int thread_group_width = 32>
__inline__ __device__ void WelfordWarpReduce(
    Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
    Tdata &mean, Tdata &m2, Tdata &count) {
    mean = thread_mean;
    m2 = thread_m2;
    count = thread_count;
    for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) {
        Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width);
        Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width);
        Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width);
        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
    }
}

// Two-level block-wide Welford reduction (warp reduce, then reduce the
// per-warp partials in warp 0) followed by a shared-memory broadcast so
// every thread leaves with the block-wide result.
template <typename Tdata, size_t kWarpSize = 32>
__inline__ __device__ void WelfordBlockAllReduce(
    Tdata thread_mean, Tdata thread_m2, Tdata thread_count,
    Tdata &result_mean, Tdata &result_m2, Tdata &result_count) {
    __shared__ Tdata mean_shared[kWarpSize];
    __shared__ Tdata m2_shared[kWarpSize];
    __shared__ Tdata count_shared[kWarpSize];
    __shared__ Tdata mean_result_broadcast;
    __shared__ Tdata m2_result_broadcast;
    __shared__ Tdata count_result_broadcast;
    const int lid = threadIdx.x % kWarpSize;
    const int wid = threadIdx.x / kWarpSize;
    // Intra-warp reduction.
    Tdata warp_mean = 0.0;
    Tdata warp_m2 = 0.0;
    Tdata warp_count = 0;
    WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count);
    __syncthreads();
    if (lid == 0) {
        // Lane 0 of each warp publishes its warp's partial result.
        mean_shared[wid] = warp_mean;
        m2_shared[wid] = warp_m2;
        count_shared[wid] = warp_count;
    }
    __syncthreads();
    // Inter-warp reduction, carried out by warp 0 only.
    if (wid == 0) {
        if (threadIdx.x < blockDim.x / kWarpSize) {
            warp_mean = mean_shared[lid];
            warp_m2 = m2_shared[lid];
            warp_count = count_shared[lid];
        } else {
            warp_mean = static_cast<Tdata>(0);
            warp_m2 = static_cast<Tdata>(0);
            warp_count = static_cast<Tdata>(0);
        }
        __syncwarp();
        Tdata block_mean = 0;
        Tdata block_m2 = 0;
        Tdata block_count = 0;
        WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count);
        if (lid == 0) {
            mean_result_broadcast = block_mean;
            m2_result_broadcast = block_m2;
            count_result_broadcast = block_count;
        }
    }
    __syncthreads();
    result_mean = mean_result_broadcast;
    result_m2 = m2_result_broadcast;
    result_count = count_result_broadcast;
}

} // namespace cuda
} // namespace device
// Global completion counter used by ComputeVarScalarOut to elect the last
// block for the final reduction. It is reset to 0 by that block before the
// kernel finishes.
__device__ int32_t done_block_counts = 0;

// Grid-wide variance of the entire input (scalar output), using per-thread
// Welford accumulation, a block-level reduction, and a cross-block merge
// performed by the last block to finish.
template <typename Tdata, typename ComputeType>
__global__ void ComputeVarScalarOut(
    const Tdata *input_ptr,
    Tdata *var_output_ptr,
    ComputeType *tmp_buffer_ptr,
    // Tdata *mean_output_ptr,
    size_t input_size,
    size_t input_ndim,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased,
    bool is_nan) {
    // Handle the NaN case (empty reduction, or single element with the
    // unbiased divisor): one thread writes NaN and everyone returns.
    if (is_nan) {
        if (blockIdx.x == 0 && threadIdx.x == 0) {
            *var_output_ptr = device::cuda::Nan<Tdata>();
        }
        // mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan<Tdata>() : input_ptr[0];}
        return;
    }

    // Work distribution across blocks and threads.
    const size_t elems_per_block = input_size / gridDim.x;
    const size_t elems_per_thread = elems_per_block / blockDim.x;

    // Thread-level Welford accumulation.
    ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0;

    // Each thread consumes the regular elements of its block (strided access).
    if (elems_per_thread > 0) {
        const size_t block_start = blockIdx.x * elems_per_block;
        const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x);
        device::cuda::WelfordReduce<Tdata, ComputeType>(
            input_ptr, thread_mean, thread_m2, thread_count,
            /*start=*/block_start + threadIdx.x,
            /*end=*/block_start + regular_elems,
            /*step=*/blockDim.x,
            /*ndim=*/input_ndim,
            /*shape=*/permuted_input_shape,
            /*strides=*/permuted_input_strides);
    }

    // Thread 0 sweeps this block's tail elements, plus the grid-level tail
    // in the last block (single-threaded).
    if (threadIdx.x == 0) {
        size_t tail_count = elems_per_block % blockDim.x;
        // The last block also takes the remainder of the total element count.
        if (blockIdx.x == gridDim.x - 1) {
            tail_count += input_size % gridDim.x;
        }
        if (tail_count > 0) {
            const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread;
            device::cuda::WelfordReduce<Tdata, ComputeType>(
                input_ptr, thread_mean, thread_m2, thread_count,
                /*start=*/tail_start,
                /*end=*/tail_start + tail_count,
                /*step=*/1,
                /*ndim=*/input_ndim,
                /*shape=*/permuted_input_shape,
                /*strides=*/permuted_input_strides);
        }
    }

    // Block-level reduction.
    ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0;
    device::cuda::WelfordBlockAllReduce<ComputeType>(
        thread_mean, thread_m2, thread_count,
        block_mean, block_m2, block_count);

    // Single-block grid: write the result directly.
    if (gridDim.x == 1) {
        if (threadIdx.x == 0) {
            ComputeType divisor = unbiased ? block_count - 1 : block_count;
            var_output_ptr[0] = device::cuda::Div(block_m2, divisor);
        }
        return;
    }

    // Multi-block grid: stage per-block partials in the temporary buffer,
    // laid out as [means | m2s | counts], each gridDim.x long.
    ComputeType *tmp_mean_ptr = tmp_buffer_ptr;
    ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x;
    ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x;

    // Publish this block's partial result.
    if (threadIdx.x == 0) {
        tmp_mean_ptr[blockIdx.x] = block_mean;
        tmp_m2_ptr[blockIdx.x] = block_m2;
        tmp_count_ptr[blockIdx.x] = block_count;
    }

    // The last block to arrive performs the final reduction.
    __shared__ bool is_last_block;
    if (threadIdx.x == 0) {
        is_last_block = (atomicAdd(&done_block_counts, 1) == gridDim.x - 1);
    }
    __syncthreads();

    if (is_last_block) {
        // Each thread merges a strided subset of the per-block partials.
        ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0;
        const size_t blocks_per_thread = gridDim.x / blockDim.x;
        const size_t regular_blocks = blocks_per_thread * blockDim.x;
        if (blocks_per_thread > 0) {
            device::cuda::WelfordCombineLoop(
                tmp_mean_ptr, tmp_m2_ptr, tmp_count_ptr,
                final_thread_mean, final_thread_m2, final_thread_count,
                /*start=*/threadIdx.x,
                /*end=*/regular_blocks,
                /*step=*/blockDim.x);
        }
        // Thread 0 merges the leftover blocks.
        if (threadIdx.x == 0 && regular_blocks < gridDim.x) {
            device::cuda::WelfordCombineLoop(
                &tmp_mean_ptr[regular_blocks],
                &tmp_m2_ptr[regular_blocks],
                &tmp_count_ptr[regular_blocks],
                final_thread_mean, final_thread_m2, final_thread_count,
                /*start=*/0,
                /*end=*/gridDim.x - regular_blocks,
                /*step=*/1);
        }
        // Final block-level reduction and output.
        ComputeType final_mean = 0, final_m2 = 0, final_count = 0;
        device::cuda::WelfordBlockAllReduce<ComputeType>(
            final_thread_mean, final_thread_m2, final_thread_count,
            final_mean, final_m2, final_count);
        if (threadIdx.x == 0) {
            ComputeType divisor = unbiased ? final_count - 1 : final_count;
            var_output_ptr[0] = device::cuda::Div(final_m2, divisor);
            done_block_counts = 0; // reset the counter for the next launch
        }
    }
}
// CUDA: grid stride looping
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
         i += step)

// Serial Welford reduction over `reduce_num` consecutive flat indices of the
// permuted input, starting at `offset`. Used for the non-scalar (per-output-
// element) reduction path; accumulates in ComputeType and casts back to Tdata.
// Caller guarantees count >= 2 when `unbiased` is set (see the NaN pre-check),
// so the (count - 1) divisor is never zero.
template <typename Tdata, typename ComputeType>
__forceinline__ __device__ __host__ void ComputeVarUsingWelford(
    const Tdata *input_ptr,
    size_t offset,
    Tdata &var_output,
    size_t reduce_num,
    size_t input_ndim,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased) {
    size_t count = 0;
    ComputeType mean = 0.0;
    ComputeType old_mean = 0.0;
    ComputeType m2 = 0.0;
    for (size_t i = 0; i < reduce_num; ++i) {
        size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides);
        count++;
        old_mean = mean;
        mean = old_mean + (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) / count;
        m2 += (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean)
            * (static_cast<ComputeType>(input_ptr[input_offset]) - mean);
    }
    var_output = static_cast<Tdata>(m2 / (unbiased ? count - 1 : count));
}

// Kernel wrapper: one grid-stride loop over the output elements, each of
// which reduces `reduce_num` consecutive flat input indices. When the
// reduction is mathematically undefined (`is_nan`: reduce_num == 0, or
// reduce_num == 1 with the unbiased divisor), every output is filled with
// NaN instead.
template <typename Tdata, typename ComputeType>
__global__ void ComputeVarUsingWelfordWrapper(
    const Tdata *input_ptr,
    Tdata *var_output_ptr,
    size_t input_ndim,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased,
    bool is_nan) {
    if (is_nan) {
        // Both undefined cases (reduce_num == 0 and reduce_num == 1 with
        // unbiased) produce the same NaN fill.
        CUDA_1D_KERNEL_LOOP(i, output_size) {
            var_output_ptr[i] = device::cuda::Nan<Tdata>();
        }
    } else {
        CUDA_1D_KERNEL_LOOP(i, output_size) {
            ComputeVarUsingWelford<Tdata, ComputeType>(
                input_ptr, i * reduce_num, var_output_ptr[i], reduce_num,
                input_ndim, permuted_input_shape, permuted_input_strides, unbiased);
        }
    }
}
#endif // __VAR_CUDA_H__
src/infiniop/ops/var/info.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_INFO_H__
#define __VAR_INFO_H__

#include "../../../utils.h"
#include "../../tensor.h"

#include <algorithm>
#include <cstddef>
#include <vector>

namespace op::var {

// Validated metadata for a variance reduction: the input is logically
// permuted so that all reduced dimensions come last, which lets the kernels
// treat each output element's reduction as a contiguous flat-index range.
class VarInfo {
    VarInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> permuted_input_shape;      // need to permute
    std::vector<size_t> output_shape;
    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
    std::vector<ptrdiff_t> output_strides;
    size_t reduce_dim_size; // reduce dim size
    size_t reduce_num;      // number of elements to reduce for each output element
    size_t input_size;      // total number of input elements
    size_t output_size;     // total number of output elements
    bool unbiased_var;

    // Build a VarInfo from the tensor descriptors. `dim` points at the
    // `dim_size` axes being reduced. `keepdim` is accepted for interface
    // symmetry but not consulted here — the output descriptor already
    // reflects it.
    static utils::Result<VarInfo> create(
        infiniopTensorDescriptor_t var_output_desc,
        infiniopTensorDescriptor_t input_desc,
        size_t *dim,
        size_t dim_size,
        bool unbiased,
        bool keepdim) {
        auto input_shape = input_desc->shape();
        auto input_strides = input_desc->strides();
        size_t input_ndim = input_desc->ndim();

        // Total number of elements folded into each output element.
        size_t reduce_num = 1;
        for (size_t r = 0; r < dim_size; r++) {
            reduce_num *= input_shape[dim[r]];
        }

        // Permutation order: kept axes first, reduced axes last.
        std::vector<size_t> permute_order;
        for (size_t axis = 0; axis < input_ndim; axis++) {
            if (std::find(dim, dim + dim_size, axis) == dim + dim_size) {
                permute_order.push_back(axis);
            }
        }
        for (size_t r = 0; r < dim_size; r++) {
            permute_order.push_back(dim[r]);
        }

        // Apply the permutation to the input shape and strides.
        std::vector<size_t> permuted_input_shape;
        std::vector<ptrdiff_t> permuted_input_strides;
        for (size_t pos = 0; pos < permute_order.size(); pos++) {
            permuted_input_shape.push_back(input_shape[permute_order[pos]]);
            permuted_input_strides.push_back(input_strides[permute_order[pos]]);
        }

        return utils::Result<VarInfo>(VarInfo{
            input_desc->dtype(),
            permuted_input_shape,
            var_output_desc->shape(),
            permuted_input_strides,
            var_output_desc->strides(),
            dim_size,
            reduce_num,
            input_desc->numel(),
            var_output_desc->numel(),
            unbiased});
    }
};

} // namespace op::var

#endif
src/infiniop/ops/var/metax/var_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_METAX_H__
#define __VAR_METAX_H__

#include "../var_desc.h"

// Expand the shared var Descriptor declaration for the Metax backend
// (declares class op::var::metax::Descriptor).
DESCRIPTOR(metax);

#endif // __VAR_METAX_H__
src/infiniop/ops/var/metax/var_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "var_metax.h"
namespace op::var::metax {

// Backend-private state: keeps the Metax handle internals alive for the
// descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validate shapes via VarInfo and size the workspace that calculate() will
// use to stage the permuted shape/stride arrays on the device.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

// Variance is undefined for an empty reduction, or for a single element with
// the unbiased (N-1) divisor.
bool IsNanOut(const VarInfo &info) {
    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
}

// Copy shape/stride metadata into the workspace and dispatch either the
// scalar-output kernel (full reduction) or the per-output-element kernel.
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
    const VarInfo &info,
    Tdata *var_output, const Tdata *input,
    bool unbiased, bool keepdim,
    hcStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    size_t output_ndim = info.output_shape.size(); // currently unused; kept for symmetry
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;

    // Carve the shape and stride arrays out of the caller-provided workspace.
    unsigned char *ws_base = reinterpret_cast<unsigned char *>(workspace);
    size_t ws_offset = 0;
    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(ws_base + ws_offset);
    ws_offset += input_ndim * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(ws_base + ws_offset);
    ws_offset += input_ndim * sizeof(ptrdiff_t);
    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));

    bool is_nan = IsNanOut(info);
    if (info.reduce_num == input_size) { // scalar output
        ComputeType *tmp_buffer;
        constexpr size_t MAX_GRID_SIZE = 128;
        size_t grid_size = std::min(MAX_GRID_SIZE,
                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, tmp_buffer, input_size, input_ndim,
            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
        // NOTE(review): tmp_buffer is freed immediately after the asynchronous
        // kernel launch — confirm that hcFree synchronizes with (or is
        // stream-ordered after) the launch on this platform.
        CHECK_METAX(hcFree(tmp_buffer));
    } else {
        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, input_ndim, output_size, reduce_num,
            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

// Dispatch on dtype (accumulating in double) and launch on the Metax stream.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *var_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream_) const {
    hcStream_t stream = (hcStream_t)stream_;
#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(     \
        _info,                                        \
        (Tdata *)var_output, (const Tdata *)input,    \
        unbiased, keepdim,                            \
        stream, workspace, workspace_size)
#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE)                         \
    {                                                                     \
        if (_info.dtype == INFINI_DTYPE_BF16)                             \
            return CALCULATE_VAR(BLOCK_SIZE, __hpcc_bfloat16, double);    \
        else if (_info.dtype == INFINI_DTYPE_F16)                         \
            return CALCULATE_VAR(BLOCK_SIZE, half, double);               \
        else if (_info.dtype == INFINI_DTYPE_F32)                         \
            return CALCULATE_VAR(BLOCK_SIZE, float, double);              \
        else                                                              \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;                        \
    }
    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::var::metax
src/infiniop/ops/var/moore/var_moore.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_MOORE_H__
#define __VAR_MOORE_H__

#include "../var_desc.h"

// Expand the shared var Descriptor declaration for the Moore backend
// (declares class op::var::moore::Descriptor).
DESCRIPTOR(moore);

#endif // __VAR_MOORE_H__
src/infiniop/ops/var/moore/var_moore.mu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "var_moore.h"
namespace op::var::moore {

// Backend-private state: keeps the Moore handle internals alive for the
// descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}

// Validate shapes via VarInfo and size the workspace that calculate() will
// use to stage the permuted shape/stride arrays on the device.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}

namespace {

// Variance is undefined for an empty reduction, or for a single element with
// the unbiased (N-1) divisor.
bool IsNanOut(const VarInfo &info) {
    return (info.reduce_num == 0) || (info.reduce_num == 1 && info.unbiased_var == true);
}

// Copy shape/stride metadata into the workspace and dispatch either the
// scalar-output kernel (full reduction) or the per-output-element kernel.
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
    const VarInfo &info,
    Tdata *var_output, const Tdata *input,
    bool unbiased, bool keepdim,
    musaStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    size_t output_ndim = info.output_shape.size(); // currently unused; kept for symmetry
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;

    // Carve the shape and stride arrays out of the caller-provided workspace.
    unsigned char *ws_base = reinterpret_cast<unsigned char *>(workspace);
    size_t ws_offset = 0;
    size_t *permuted_input_shape_musa = reinterpret_cast<size_t *>(ws_base + ws_offset);
    ws_offset += input_ndim * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_musa = reinterpret_cast<ptrdiff_t *>(ws_base + ws_offset);
    ws_offset += input_ndim * sizeof(ptrdiff_t);
    CHECK_MOORE(musaMemcpyAsync(permuted_input_shape_musa, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), musaMemcpyHostToDevice, stream));
    CHECK_MOORE(musaMemcpyAsync(permuted_input_strides_musa, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), musaMemcpyHostToDevice, stream));

    bool is_nan = IsNanOut(info);
    if (info.reduce_num == input_size) { // scalar output
        ComputeType *tmp_buffer;
        constexpr size_t MAX_GRID_SIZE = 128;
        size_t grid_size = std::min(MAX_GRID_SIZE,
                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        CHECK_MOORE(musaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, tmp_buffer, input_size, input_ndim,
            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
        // NOTE(review): tmp_buffer is freed immediately after the asynchronous
        // kernel launch — confirm that musaFree synchronizes with (or is
        // stream-ordered after) the launch on this platform.
        CHECK_MOORE(musaFree(tmp_buffer));
    } else {
        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, input_ndim, output_size, reduce_num,
            permuted_input_shape_musa, permuted_input_strides_musa, unbiased, is_nan);
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace

// Dispatch on dtype (accumulating in double) and launch on the Moore stream.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *var_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream_) const {
    musaStream_t stream = (musaStream_t)stream_;
#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
    launchKernel<BLOCK_SIZE, Tdata, ComputeType>(     \
        _info,                                        \
        (Tdata *)var_output, (const Tdata *)input,    \
        unbiased, keepdim,                            \
        stream, workspace, workspace_size)
#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE)                      \
    {                                                                  \
        if (_info.dtype == INFINI_DTYPE_BF16)                          \
            return CALCULATE_VAR(BLOCK_SIZE, __mt_bfloat16, double);   \
        else if (_info.dtype == INFINI_DTYPE_F16)                      \
            return CALCULATE_VAR(BLOCK_SIZE, half, double);            \
        else if (_info.dtype == INFINI_DTYPE_F32)                      \
            return CALCULATE_VAR(BLOCK_SIZE, float, double);           \
        else                                                           \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;                     \
    }
    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    return INFINI_STATUS_SUCCESS;
}

} // namespace op::var::moore
src/infiniop/ops/var/nvidia/var_nvidia.cu
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/nvidia/nvidia_common.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#include "../cuda/kernel.cuh"
#include "var_nvidia.cuh"
namespace op::var::nvidia {

// Backend-private state: keeps the NVIDIA handle internals alive for the
// descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::nvidia::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    delete _opaque;
}
// Validate shapes via VarInfo and size the workspace that calculate() will
// use to stage the permuted shape/stride arrays on the device.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace
{
// True when the variance result is mathematically undefined and must be NaN:
// nothing to reduce, or a single element under Bessel's correction
// (unbiased divides by count - 1 == 0).
bool IsNanOut(const VarInfo &info) {
    if (info.reduce_num == 0) {
        return true;
    }
    return info.reduce_num == 1 && info.unbiased_var;
}
// Copies the permuted shape/stride metadata into the caller-provided device
// workspace and launches the appropriate variance kernel:
//   - full reduction to a scalar -> ComputeVarScalarOut (multi-block Welford)
//   - partial reduction          -> ComputeVarUsingWelfordWrapper (one output
//                                   element per grid-stride iteration)
// `keepdim` is accepted for signature parity but not consulted here.
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
    const VarInfo &info,
    Tdata *var_output,
    const Tdata *input,
    bool unbiased,
    bool keepdim,
    cudaStream_t stream,
    void *workspace,
    size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    // size_t output_ndim = info.output_shape.size();
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;
    // Carve shape then strides out of the workspace blob.
    // NOTE(review): workspace_size is not validated against the bytes consumed
    // here — create() sized it identically, but verify if either side changes.
    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
    size_t workspace_offset = 0;
    size_t *permuted_input_shape_cuda = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
    workspace_offset += input_ndim * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_cuda = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
    workspace_offset += input_ndim * sizeof(ptrdiff_t);
    // Async H2D copies: source buffers live in `info`, which outlives the
    // launch because `_info` is a descriptor member.
    CHECK_CUDA(cudaMemcpyAsync(permuted_input_shape_cuda, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), cudaMemcpyHostToDevice, stream));
    CHECK_CUDA(cudaMemcpyAsync(permuted_input_strides_cuda, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), cudaMemcpyHostToDevice, stream));
    bool is_nan = IsNanOut(info);
    if (info.reduce_num == input_size) { // scalar output
        // Temporary per-block partials: mean, m2, count (3 values per block).
        ComputeType *tmp_buffer;
        constexpr size_t MAX_GRID_SIZE = 128;
        size_t grid_size = std::min(MAX_GRID_SIZE, (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        CHECK_CUDA(cudaMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input,
            var_output,
            tmp_buffer,
            input_size,
            input_ndim,
            permuted_input_shape_cuda,
            permuted_input_strides_cuda,
            unbiased,
            is_nan);
        // NOTE(review): cudaFree right after an async launch relies on the
        // implicit device synchronization cudaFree performs; confirm, or use
        // cudaFreeAsync on the same stream to make the ordering explicit.
        CHECK_CUDA(cudaFree(tmp_buffer));
    } else {
        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        ComputeVarUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input,
            var_output,
            input_ndim,
            output_size,
            reduce_num,
            permuted_input_shape_cuda,
            permuted_input_strides_cuda,
            unbiased,
            is_nan);
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace
// Runs the variance reduction on the CUDA stream, dispatching on the input
// dtype recorded in _info (bf16/f16/f32; accumulation in double).
// `workspace` must hold at least workspaceSize() bytes; `keepdim` is forwarded
// to launchKernel unchanged.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *var_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream_) const {
    cudaStream_t stream = (cudaStream_t)stream_;

// Instantiates launchKernel for one concrete block size / element type.
#define CALCULATE_VAR(BLOCK_SIZE, Tdata, ComputeType) \
    launchKernel<BLOCK_SIZE, Tdata, ComputeType>( \
        _info, \
        (Tdata *)var_output, (const Tdata *)input, \
        unbiased, keepdim, \
        stream, workspace, workspace_size)

// Maps the runtime dtype onto a template instantiation.
#define CALCULATE_VAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \
    { \
        if (_info.dtype == INFINI_DTYPE_BF16) \
            return CALCULATE_VAR(BLOCK_SIZE, __nv_bfloat16, double); \
        else if (_info.dtype == INFINI_DTYPE_F16) \
            return CALCULATE_VAR(BLOCK_SIZE, half, double); \
        else if (_info.dtype == INFINI_DTYPE_F32) \
            return CALCULATE_VAR(BLOCK_SIZE, float, double); \
        else \
            return INFINI_STATUS_BAD_TENSOR_DTYPE; \
    }

    if (_opaque->internal->maxThreadsPerBlock() >= 256) {
        CALCULATE_VAR_WITH_BLOCK_SIZE(256)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

// Fix: undefine the function-local helper macros so they do not leak into the
// rest of the translation unit (previously they stayed defined past this
// function and could collide with other code in the same file).
#undef CALCULATE_VAR_WITH_BLOCK_SIZE
#undef CALCULATE_VAR
    return INFINI_STATUS_SUCCESS;
}
}
// namespace op::var::nvidia
src/infiniop/ops/var/nvidia/var_nvidia.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_NVIDIA_H__
#define __VAR_NVIDIA_H__

#include "../var_desc.h"

// Declares op::var::nvidia::Descriptor via the shared DESCRIPTOR macro;
// the member definitions live in var_nvidia.cu.
DESCRIPTOR(nvidia);

#endif // __VAR_NVIDIA_H__
src/infiniop/ops/var/operator.cc
deleted
100644 → 0
View file @
6ab911c3
#include "../../operator.h"
#include "../../handle.h"
#include "infiniop/ops/var.h"
#include <vector>
#ifdef ENABLE_CPU_API
#include "cpu/var_cpu.h"
#endif
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
#include "nvidia/var_nvidia.cuh"
#endif
#ifdef ENABLE_METAX_API
#include "metax/var_metax.h"
#endif
#ifdef ENABLE_KUNLUN_API
#include "kunlun/var_kunlun.h"
#endif
#ifdef ENABLE_MOORE_API
#include "moore/var_moore.h"
#endif
// C ABI entry point: creates a Var descriptor for the device owned by
// `handle` by dispatching to the matching backend's Descriptor::create.
// Devices compiled out of this build fall through to
// INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED.
__INFINI_C infiniStatus_t infiniopCreateVarDescriptor(
    infiniopHandle_t handle,
    infiniopVarDescriptor_t *desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
// Expands to one `case` per enabled backend namespace.
#define CREATE(CASE, NAMESPACE) \
    case CASE: \
        return op::var::NAMESPACE::Descriptor::create( \
            handle, \
            reinterpret_cast<op::var::NAMESPACE::Descriptor **>(desc_ptr), \
            var_output_desc, \
            input_desc, \
            dim, \
            dim_size, \
            unbiased, \
            keepdim)

    switch (handle->device) {
#ifdef ENABLE_CPU_API
        CREATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CREATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CREATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CREATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CREATE
}
// C ABI entry point: reports the device workspace (in bytes) the Var
// descriptor requires, dispatching on the descriptor's device type.
__INFINI_C infiniStatus_t infiniopGetVarWorkspaceSize(
    infiniopVarDescriptor_t desc,
    size_t *size) {
// Expands to one `case` per enabled backend namespace.
#define GET(CASE, NAMESPACE) \
    case CASE: \
        *size = reinterpret_cast<op::var::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
        return INFINI_STATUS_SUCCESS

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        GET(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        GET(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        GET(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        GET(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        GET(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        GET(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        GET(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef GET
    // Fix: removed the duplicated, unreachable
    // `return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;` that followed the
    // switch — every case and the default already return, and the sibling
    // entry points in this file carry no such trailing return.
}
// C ABI entry point: executes the variance reduction described by `desc`.
// NOTE(review): `dim` and `dim_size` are accepted but never forwarded — the
// reduction axes were baked into the descriptor at creation time; confirm the
// public header documents them as ignored here.
__INFINI_C infiniStatus_t infiniopVar(
    infiniopVarDescriptor_t desc,
    void *workspace,
    size_t workspace_size,
    void *var_output,
    const void *input,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim,
    void *stream) {
// Expands to one `case` per enabled backend namespace.
#define CALCULATE(CASE, NAMESPACE) \
    case CASE: \
        return reinterpret_cast<const op::var::NAMESPACE::Descriptor *>(desc) \
            ->calculate(workspace, workspace_size, var_output, input, unbiased, keepdim, stream)

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        CALCULATE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        CALCULATE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        CALCULATE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        CALCULATE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef CALCULATE
}
// C ABI entry point: destroys a Var descriptor created by
// infiniopCreateVarDescriptor, dispatching on its device type.
__INFINI_C infiniStatus_t infiniopDestroyVarDescriptor(
    infiniopVarDescriptor_t desc) {
// Expands to one `case` per enabled backend namespace.
#define DELETE(CASE, NAMESPACE) \
    case CASE: \
        delete reinterpret_cast<const op::var::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;

    switch (desc->device_type) {
#ifdef ENABLE_CPU_API
        DELETE(INFINI_DEVICE_CPU, cpu);
#endif
#ifdef ENABLE_NVIDIA_API
        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
#endif
#ifdef ENABLE_ILUVATAR_API
        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
#endif
#ifdef ENABLE_QY_API
        DELETE(INFINI_DEVICE_QY, nvidia);
#endif
#ifdef ENABLE_METAX_API
        DELETE(INFINI_DEVICE_METAX, metax);
#endif
#ifdef ENABLE_KUNLUN_API
        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
#endif
#ifdef ENABLE_MOORE_API
        DELETE(INFINI_DEVICE_MOORE, moore);
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
#undef DELETE
}
src/infiniop/ops/var/var_desc.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef INFINIOP_VAR_DESCRIPTOR_H_
#define INFINIOP_VAR_DESCRIPTOR_H_

#include "../../../utils.h"
#include "../../operator.h"
#include "../../tensor.h"
#include "info.h"

// Declares the Var operator descriptor class inside op::var::<NAMESPACE>.
// Each backend defines the members declared here out of line:
//   create()        - validates tensors, builds VarInfo + opaque backend state
//   calculate()     - runs the reduction (stream type is backend-specific)
//   workspaceSize() - scratch bytes the caller must supply to calculate()
// Comments cannot live inside the macro body (every line ends in a
// continuation backslash), hence this header block.
#define DESCRIPTOR(NAMESPACE) \
    \
    namespace op::var::NAMESPACE { \
    class Descriptor final : public InfiniopDescriptor { \
        struct Opaque; \
        Opaque *_opaque; \
        VarInfo _info; \
        size_t _workspace_size; \
    \
        Descriptor( \
            Opaque *opaque, \
            VarInfo info, \
            size_t workspace_size, \
            infiniDevice_t device_type, \
            int device_id) \
            : InfiniopDescriptor{device_type, device_id}, \
              _opaque(opaque), \
              _info(info), \
              _workspace_size(workspace_size) {} \
    \
    public: \
        ~Descriptor(); \
        size_t workspaceSize() const { return _workspace_size; } \
    \
        static infiniStatus_t create( \
            infiniopHandle_t handle, \
            Descriptor **desc_ptr, \
            infiniopTensorDescriptor_t var_output_desc, \
            infiniopTensorDescriptor_t input_desc, \
            size_t *dim, \
            size_t dim_size, \
            bool unbiased, \
            bool keepdim); \
    \
        infiniStatus_t calculate( \
            void *workspace, size_t workspace_size, \
            void *var_output, \
            const void *input, \
            bool unbiased, \
            bool keepdim, \
            void *stream) const; \
    }; \
    }

#endif
src/infiniop/ops/var_mean/cpu/var_mean_cpu.cc
deleted
100644 → 0
View file @
6ab911c3
#include "var_mean_cpu.h"
#include "../../../../utils.h"
#include "../../../devices/cpu/common_cpu.h"
namespace
op
::
var_mean
::
cpu
{
// CPU backend owns no opaque state, so destruction is a no-op.
Descriptor::~Descriptor() {}
// Builds the reduction metadata (VarMeanInfo) and allocates the CPU
// descriptor. No workspace and no opaque state are needed on CPU.
// NOTE(review): `mean_output_desc` is accepted but not passed to
// VarMeanInfo::create — the mean output is assumed to share the variance
// output's shape/strides; confirm against the Python bindings.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t mean_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    *desc_ptr = new Descriptor(
        nullptr,        // no opaque backend state on CPU
        result.take(),
        0,              // no workspace required
        handle->device,
        handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// welford
namespace
{
// True when the var/mean result is mathematically undefined and must be NaN:
// nothing to reduce, or a single element under Bessel's correction
// (unbiased divides by count - 1 == 0).
bool IsNanOut(const VarMeanInfo &info) {
    if (info.reduce_num == 0) {
        return true;
    }
    return info.reduce_num == 1 && info.unbiased_var;
}
// 直接用float计算
// 直接用float计算 -> accumulates directly in float.
// Single-pass Welford accumulation over the flat logical indices
// [start, end) of the permuted input; writes the running mean and the
// variance (M2 / count or M2 / (count - 1) for the unbiased estimator).
// Callers guarantee count > 1 when unbiased (see IsNanOut), so the
// (count - 1) divisor cannot be zero here.
template <typename Tdata>
void computeVarMeanUsingWelfordCpu(
    const Tdata *input_ptr,
    float &var_output,
    float &mean_output,
    size_t start,
    size_t end,
    const VarMeanInfo &info) {
    if (start >= end) {
        return; // empty range: leave outputs untouched
    }
    float old_mean = 0.0f; // previous mean
    float mean = 0.0f;     // new mean
    float M2 = 0.0f;       // variance sum
    size_t count = 0;      // element count of new sum
    for (size_t idx = start; idx < end; ++idx) {
        // Map the flat logical index onto the strided physical offset.
        size_t input_offset = op::common_cpu::indexToOffset(idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data());
        ;
        float value = utils::cast<float>(input_ptr[input_offset]);
        count++;
        old_mean = mean;
        mean += (value - mean) / count;
        M2 += (value - old_mean) * (value - mean);
    }
    mean_output = mean;
    var_output = M2 / (info.unbiased_var ? (count - 1) : count);
}
// Computes variance and mean for every output element. Each output element i
// reduces the contiguous logical index range [i * reduce_num, (i+1) *
// reduce_num) of the permuted input (or the whole input when all dims are
// reduced). Degenerate reductions produce NaN variance; the mean is the lone
// input element when reduce_num == 1 and NaN when reduce_num == 0.
template <typename Tdata>
infiniStatus_t calculateVarMean(
    const VarMeanInfo &info,
    Tdata *var_output,
    Tdata *mean_output,
    const Tdata *input) {
    Tdata nan_value = utils::cast<Tdata>(NAN);
    // All input dims reduced -> single scalar output over the whole input.
    bool is_scalar = (info.reduce_dim_size == info.permuted_input_shape.size());
    // #pragma omp parallel for
    for (size_t i = 0; i < info.output_size; ++i) {
        size_t output_offset = op::common_cpu::indexToOffset(i, info.output_shape.size(), info.output_shape.data(), info.output_strides.data());
        if (IsNanOut(info)) {
            // Degenerate reduction: variance undefined.
            var_output[output_offset] = nan_value;
            if (info.reduce_num == 0) {
                mean_output[output_offset] = nan_value;
            } else {
                // reduce_num == 1: the mean is the single contributing element.
                size_t input_idx = is_scalar ? 0 : i * info.reduce_num;
                size_t input_offset = op::common_cpu::indexToOffset(input_idx, info.permuted_input_shape.size(), info.permuted_input_shape.data(), info.permuted_input_strides.data());
                mean_output[output_offset] = input[input_offset];
            }
        } else {
            size_t start = is_scalar ? 0 : i * info.reduce_num;
            size_t end = is_scalar ? info.input_size : (i + 1) * info.reduce_num;
            float var = 0.0f, mean = 0.0f;
            computeVarMeanUsingWelfordCpu(input, var, mean, start, end, info);
            var_output[output_offset] = utils::cast<Tdata>(var);
            mean_output[output_offset] = utils::cast<Tdata>(mean);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
}
// namespace
// Dispatches the CPU var/mean computation on the input dtype recorded in
// _info. `workspace`, `unbiased`, `keepdim`, and `stream` are unused here:
// the CPU path needs no scratch memory and the reduction options were baked
// into _info at creation time.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *var_output,
    void *mean_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream) const {
    switch (_info.dtype) {
    case INFINI_DTYPE_F16:
        return calculateVarMean<fp16_t>(_info, (fp16_t *)var_output, (fp16_t *)mean_output, reinterpret_cast<const fp16_t *>(input));
    case INFINI_DTYPE_F32:
        return calculateVarMean<float>(_info, (float *)var_output, (float *)mean_output, reinterpret_cast<const float *>(input));
    case INFINI_DTYPE_BF16:
        return calculateVarMean<bf16_t>(_info, (bf16_t *)var_output, (bf16_t *)mean_output, reinterpret_cast<const bf16_t *>(input));
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    return INFINI_STATUS_SUCCESS; // unreachable: every switch path returns
}
}
// namespace op::var_mean::cpu
src/infiniop/ops/var_mean/cpu/var_mean_cpu.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __INFINIOP_VAR_MEAN_CPU_H__
#define __INFINIOP_VAR_MEAN_CPU_H__

#include "../var_mean_desc.h"

// Declares op::var_mean::cpu::Descriptor via the shared DESCRIPTOR macro;
// the member definitions live in var_mean_cpu.cc.
DESCRIPTOR(cpu);

#endif // __INFINIOP_VAR_MEAN_CPU_H__
src/infiniop/ops/var_mean/cuda/kernel.cuh
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_MEAN_CUDA_H__
#define __VAR_MEAN_CUDA_H__
#include <cmath> // NAN
// Converts a flat logical index into a strided physical offset by peeling
// one dimension's index digit per step, innermost dimension first.
// Callable from both host and device code.
__forceinline__ __device__ __host__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t offset = 0;
    size_t d = ndim;
    while (d > 0) {
        --d;
        const size_t dim_index = flat_index % shape[d];
        offset += dim_index * strides[d];
        flat_index /= shape[d];
    }
    return offset;
}
namespace
device
{
namespace
cuda
{
// Device-side NaN constant for each supported element type. The primary
// template is declared but not defined so that unsupported types fail at
// link/compile time rather than silently.
template <typename Tdata>
__inline__ __device__ Tdata Nan();
template <>
__inline__ __device__ float Nan<float>() {
    return NAN;
}
template <>
__inline__ __device__ double Nan<double>() {
    return NAN;
}
template <>
__inline__ __device__ half Nan<half>() {
    return __float2half(NAN);
}
// Select the vendor's bfloat16 type for the backend being compiled.
#if defined(ENABLE_MOORE_API)
using bf16_t = __mt_bfloat16;
#elif defined(ENABLE_METAX_API)
using bf16_t = __hpcc_bfloat16;
#else
using bf16_t = __nv_bfloat16;
#endif
/* bf16 */
template <>
__inline__ __device__ bf16_t Nan<bf16_t>() {
    return __float2bfloat16_rn(NAN);
}
// Device-side division for each supported element type. Half/bf16 fall back
// to float arithmetic on architectures without native half division.
template <typename Tdata>
__inline__ __device__ Tdata Div(Tdata a, Tdata b);
template <>
__inline__ __device__ float Div<float>(float a, float b) {
#ifdef OF_LAYER_NORM_USE_FAST_MATH
    // Fast, lower-precision hardware divide when fast math is requested.
    return __fdividef(a, b);
#else
    return a / b;
#endif
}
template <>
__inline__ __device__ double Div<double>(double a, double b) {
    return a / b;
}
template <>
__inline__ __device__ half Div<half>(half a, half b) {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
    return __hdiv(a, b);
#else
    // No native fp16 divide: round-trip through float.
    return __float2half(__half2float(a) / __half2float(b));
#endif
}
template <>
__inline__ __device__ bf16_t Div<bf16_t>(bf16_t a, bf16_t b) {
#if defined(ENABLE_NVIDIA_API) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
    return __hdiv(a, b);
#else
    // No native bf16 divide: round-trip through float.
    return __float2bfloat16_rn(__bfloat162float(a) / __bfloat162float(b));
#endif
}
// Accumulates the strided logical index range [start, end) step `step` into a
// thread-local Welford state (mean, m2, count). Offsets are resolved through
// the permuted shape/strides so the input need not be contiguous.
template <typename Tdata, typename ComputeType>
inline __device__ void WelfordReduce(
    const Tdata *input_ptr,
    ComputeType &mean,
    ComputeType &m2,
    ComputeType &count,
    const size_t start,
    const size_t end,
    const size_t step,
    const size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    ComputeType old_mean = 0.0;
    for (size_t i = start; i < end; i += step) {
        ++count;
        old_mean = mean;
        size_t input_offset = indexToOffset(i, ndim, shape, strides);
        ComputeType input_value = static_cast<ComputeType>(input_ptr[input_offset]);
        mean += (input_value - mean) / count;
        m2 += (input_value - mean) * (input_value - old_mean);
    }
}
// Folds a single observation `val` into the running Welford state
// (mean, m2, count).
template <typename Tdata>
inline __device__ void WelfordCombine(Tdata val, Tdata &mean, Tdata &m2, Tdata &count) {
    count += 1;
    Tdata delta1 = val - mean;
    mean += Div(delta1, count);
    Tdata delta2 = val - mean;
    m2 += delta1 * delta2;
}
// Merges a partial Welford state (b_mean, b_m2, b_count) into the running
// state (mean, m2, count) — the pairwise combination step of the parallel
// variance algorithm.
template <typename Tdata>
inline __device__ void WelfordCombine(
    Tdata b_mean,
    Tdata b_m2,
    Tdata b_count,
    Tdata &mean,
    Tdata &m2,
    Tdata &count) {
    if (b_count == 0) {
        return; // nothing to merge
    }
    Tdata new_count = count + b_count;          // n1 + n2
    Tdata nb_over_n = Div(b_count, new_count);  // n2 / (n1 + n2)
    Tdata delta = b_mean - mean;                // mean2 - mean1
    mean += delta * nb_over_n;                  // mean1 + n2 * (mean2 - mean1) / (n1 + n2)
    m2 += b_m2 + delta * delta * count * nb_over_n; // m21 + m22 + n2 * (mean2 - mean1) ^ 2 / (n1 + n2)
    count = new_count;
}
// Merges the partial states stored in the parallel arrays b_mean/b_m2/b_count
// over indices [start, end) step `step` into (mean, m2, count).
template <typename Tdata>
inline __device__ void WelfordCombineLoop(
    const Tdata *b_mean,
    const Tdata *b_m2,
    const Tdata *b_count,
    Tdata &mean,
    Tdata &m2,
    Tdata &count,
    const size_t start,
    const size_t end,
    const size_t step) {
    for (size_t i = start; i < end; i += step) {
        WelfordCombine(b_mean[i], b_m2[i], b_count[i], mean, m2, count);
    }
}
// Butterfly reduction of per-thread Welford states within one warp (or a
// `thread_group_width`-wide subgroup) using __shfl_down_sync. After the loop,
// lane 0 of the group holds the combined (mean, m2, count); other lanes hold
// partial results.
template <typename Tdata, int thread_group_width = 32>
__inline__ __device__ void WelfordWarpReduce(
    Tdata thread_mean,
    Tdata thread_m2,
    Tdata thread_count,
    Tdata &mean,
    Tdata &m2,
    Tdata &count) {
    mean = thread_mean;
    m2 = thread_m2;
    count = thread_count;
    for (int lane_mask = thread_group_width / 2; lane_mask > 0; lane_mask /= 2) {
        Tdata b_mean = __shfl_down_sync(0xffffffff, mean, lane_mask, thread_group_width);
        Tdata b_m2 = __shfl_down_sync(0xffffffff, m2, lane_mask, thread_group_width);
        Tdata b_count = __shfl_down_sync(0xffffffff, count, lane_mask, thread_group_width);
        WelfordCombine(b_mean, b_m2, b_count, mean, m2, count);
    }
}
// Two-level block reduction of per-thread Welford states: first within each
// warp, then across warps via shared memory, finally broadcasting the block
// result to every thread. Assumes blockDim.x is a multiple of kWarpSize and
// at most kWarpSize * kWarpSize threads (shared arrays hold kWarpSize warp
// results) — TODO confirm callers always launch such block sizes.
template <typename Tdata, size_t kWarpSize = 32>
__inline__ __device__ void WelfordBlockAllReduce(
    Tdata thread_mean,
    Tdata thread_m2,
    Tdata thread_count,
    Tdata &result_mean,
    Tdata &result_m2,
    Tdata &result_count) {
    __shared__ Tdata mean_shared[kWarpSize];
    __shared__ Tdata m2_shared[kWarpSize];
    __shared__ Tdata count_shared[kWarpSize];
    __shared__ Tdata mean_result_broadcast;
    __shared__ Tdata m2_result_broadcast;
    __shared__ Tdata count_result_broadcast;
    const int lid = threadIdx.x % kWarpSize; // lane id within the warp
    const int wid = threadIdx.x / kWarpSize; // warp id within the block
    // Intra-warp reduction.
    Tdata warp_mean = 0.0;
    Tdata warp_m2 = 0.0;
    Tdata warp_count = 0;
    WelfordWarpReduce(thread_mean, thread_m2, thread_count, warp_mean, warp_m2, warp_count);
    __syncthreads();
    if (lid == 0) {
        // Lane 0 of each warp publishes the warp's result.
        mean_shared[wid] = warp_mean;
        m2_shared[wid] = warp_m2;
        count_shared[wid] = warp_count;
    }
    __syncthreads();
    // Inter-warp reduction, performed by warp 0.
    if (wid == 0) {
        if (threadIdx.x < blockDim.x / kWarpSize) {
            warp_mean = mean_shared[lid];
            warp_m2 = m2_shared[lid];
            warp_count = count_shared[lid];
        } else {
            // Lanes beyond the warp count contribute neutral elements.
            warp_mean = static_cast<Tdata>(0);
            warp_m2 = static_cast<Tdata>(0);
            warp_count = static_cast<Tdata>(0);
        }
        __syncwarp();
        Tdata block_mean = 0;
        Tdata block_m2 = 0;
        Tdata block_count = 0;
        WelfordWarpReduce(warp_mean, warp_m2, warp_count, block_mean, block_m2, block_count);
        if (lid == 0) {
            // Publish the block-wide result for the broadcast below.
            mean_result_broadcast = block_mean;
            m2_result_broadcast = block_m2;
            count_result_broadcast = block_count;
        }
    }
    __syncthreads();
    result_mean = mean_result_broadcast;
    result_m2 = m2_result_broadcast;
    result_count = count_result_broadcast;
}
}
// namespace cuda
}
// namespace device
// Global counter used by ComputeVarScalarOut to elect the last finishing
// block for the final cross-block reduction; reset to 0 by that block.
// NOTE(review): being a single device global, this assumes no two launches
// of ComputeVarScalarOut run concurrently — confirm callers serialize.
__device__ int32_t done_block_count = 0;
// Full reduction of the whole input to one variance and one mean value.
// Each block Welford-reduces its slice; when more than one block runs, blocks
// publish partials to tmp_buffer_ptr (layout: [means | m2s | counts], one
// entry per block) and the last block to finish — elected via atomicAdd on
// done_block_count — merges them and writes the final outputs.
template <typename Tdata, typename ComputeType>
__global__ void ComputeVarScalarOut(
    const Tdata *input_ptr,
    Tdata *var_output_ptr,
    Tdata *mean_output_ptr,
    ComputeType *tmp_buffer_ptr,
    size_t input_size,
    size_t input_ndim,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased,
    bool is_nan) {
    // Degenerate reduction: variance is NaN; mean is NaN for an empty input,
    // otherwise the single element.
    if (is_nan) {
        if (blockIdx.x == 0 && threadIdx.x == 0) {
            *var_output_ptr = device::cuda::Nan<Tdata>();
            mean_output_ptr[0] = (input_size == 0) ? device::cuda::Nan<Tdata>() : input_ptr[0];
        }
        return;
    }
    // Work partitioning per block and per thread.
    const size_t elems_per_block = input_size / gridDim.x;
    const size_t elems_per_thread = elems_per_block / blockDim.x;
    // Thread-local Welford accumulation.
    ComputeType thread_mean = 0.0, thread_m2 = 0.0, thread_count = 0;
    // Each thread handles its strided share of the block's regular elements.
    if (elems_per_thread > 0) {
        const size_t block_start = blockIdx.x * elems_per_block;
        const size_t regular_elems = elems_per_block - (elems_per_block % blockDim.x);
        device::cuda::WelfordReduce<Tdata, ComputeType>(
            input_ptr,
            thread_mean,
            thread_m2,
            thread_count,
            /*start=*/block_start + threadIdx.x,
            /*end=*/block_start + regular_elems,
            /*step=*/blockDim.x,
            /*ndim=*/input_ndim,
            /*shape=*/permuted_input_shape,
            /*strides=*/permuted_input_strides);
    }
    // Thread 0 sweeps this block's tail, plus the grid-level tail in the
    // last block (elements left over by the integer divisions above).
    if (threadIdx.x == 0) {
        size_t tail_count = elems_per_block % blockDim.x;
        // The last block also takes the remainder of input_size / gridDim.x.
        if (blockIdx.x == gridDim.x - 1) {
            tail_count += input_size % gridDim.x;
        }
        if (tail_count > 0) {
            const size_t tail_start = blockIdx.x * elems_per_block + blockDim.x * elems_per_thread;
            device::cuda::WelfordReduce<Tdata, ComputeType>(
                input_ptr,
                thread_mean,
                thread_m2,
                thread_count,
                /*start=*/tail_start,
                /*end=*/tail_start + tail_count,
                /*step=*/1,
                /*ndim=*/input_ndim,
                /*shape=*/permuted_input_shape,
                /*strides=*/permuted_input_strides);
        }
    }
    // Block-level reduction.
    ComputeType block_mean = 0.0, block_m2 = 0.0, block_count = 0;
    device::cuda::WelfordBlockAllReduce<ComputeType>(
        thread_mean, thread_m2, thread_count, block_mean, block_m2, block_count);
    // Single-block launch: write the result directly.
    if (gridDim.x == 1) {
        if (threadIdx.x == 0) {
            ComputeType divisor = unbiased ? block_count - 1 : block_count;
            var_output_ptr[0] = device::cuda::Div(block_m2, divisor);
            mean_output_ptr[0] = static_cast<Tdata>(block_mean);
        }
        return;
    }
    // Multi-block: partition the scratch buffer into the three partial arrays.
    ComputeType *tmp_mean_ptr = tmp_buffer_ptr;
    ComputeType *tmp_m2_ptr = tmp_mean_ptr + gridDim.x;
    ComputeType *tmp_count_ptr = tmp_m2_ptr + gridDim.x;
    // Publish this block's partial state.
    if (threadIdx.x == 0) {
        tmp_mean_ptr[blockIdx.x] = block_mean;
        tmp_m2_ptr[blockIdx.x] = block_m2;
        tmp_count_ptr[blockIdx.x] = block_count;
    }
    // Elect the last block to finish for the final reduction.
    // NOTE(review): there is no __threadfence() between the partial writes
    // above and the atomicAdd — verify the partials are guaranteed visible to
    // the elected block on all supported architectures.
    __shared__ bool is_last_block;
    if (threadIdx.x == 0) {
        is_last_block = (atomicAdd(&done_block_count, 1) == gridDim.x - 1);
    }
    __syncthreads();
    if (is_last_block) {
        // Each thread merges its strided share of the per-block partials.
        ComputeType final_thread_mean = 0.0, final_thread_m2 = 0.0, final_thread_count = 0;
        const size_t blocks_per_thread = gridDim.x / blockDim.x;
        const size_t regular_blocks = blocks_per_thread * blockDim.x;
        if (blocks_per_thread > 0) {
            device::cuda::WelfordCombineLoop(
                tmp_mean_ptr,
                tmp_m2_ptr,
                tmp_count_ptr,
                final_thread_mean,
                final_thread_m2,
                final_thread_count,
                /*start=*/threadIdx.x,
                /*end=*/regular_blocks,
                /*step=*/blockDim.x);
        }
        // Thread 0 merges the leftover blocks.
        if (threadIdx.x == 0 && regular_blocks < gridDim.x) {
            device::cuda::WelfordCombineLoop(
                &tmp_mean_ptr[regular_blocks],
                &tmp_m2_ptr[regular_blocks],
                &tmp_count_ptr[regular_blocks],
                final_thread_mean,
                final_thread_m2,
                final_thread_count,
                /*start=*/0,
                /*end=*/gridDim.x - regular_blocks,
                /*step=*/1);
        }
        // Final block-level reduction and output.
        ComputeType final_mean = 0, final_m2 = 0, final_count = 0;
        device::cuda::WelfordBlockAllReduce<ComputeType>(
            final_thread_mean, final_thread_m2, final_thread_count,
            final_mean, final_m2, final_count);
        if (threadIdx.x == 0) {
            ComputeType divisor = unbiased ? final_count - 1 : final_count;
            var_output_ptr[0] = device::cuda::Div(final_m2, divisor);
            mean_output_ptr[0] = static_cast<Tdata>(final_mean);
            done_block_count = 0; // reset counter for the next launch
        }
    }
}
// CUDA: grid stride looping
// Iterates i over [0, n) cooperatively: each thread starts at its global id
// and advances by the total thread count, so any grid size covers the range.
#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x * gridDim.x; i < (n); \
         i += step)
// Serial Welford pass over the `reduce_num` logical indices starting at
// `offset`, writing one variance and one mean value. Offsets are resolved
// through the permuted shape/strides. Callers guarantee count > 1 when
// unbiased (see IsNanOut), so the (count - 1) divisor cannot be zero.
template <typename Tdata, typename ComputeType>
__forceinline__ __device__ __host__ void ComputeVarMeanUsingWelford(
    const Tdata *input_ptr,
    size_t offset,
    Tdata &var_output,
    Tdata &mean_output,
    size_t reduce_num,
    size_t input_ndim,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased) {
    size_t count = 0;
    ComputeType mean = 0.0;
    ComputeType old_mean = 0.0;
    ComputeType m2 = 0.0;
    for (size_t i = 0; i < reduce_num; ++i) {
        size_t input_offset = indexToOffset(offset + i, input_ndim, permuted_input_shape, permuted_input_strides);
        count++;
        old_mean = mean;
        mean = old_mean + (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) / count;
        m2 += (static_cast<ComputeType>(input_ptr[input_offset]) - old_mean) * (static_cast<ComputeType>(input_ptr[input_offset]) - mean);
    }
    var_output = static_cast<Tdata>(m2 / (unbiased ? count - 1 : count));
    mean_output = static_cast<Tdata>(mean);
}
// Partial reduction kernel: one output element per grid-stride iteration,
// each reducing the contiguous logical range [i * reduce_num, (i+1) *
// reduce_num) of the permuted input. Degenerate reductions (is_nan) write
// NaN variance; the mean is NaN for reduce_num == 0, otherwise the lone
// contributing element.
template <typename Tdata, typename ComputeType>
__global__ void ComputeVarMeanUsingWelfordWrapper(
    const Tdata *input_ptr,
    Tdata *var_output_ptr,
    Tdata *mean_output_ptr,
    size_t input_ndim,
    size_t output_size,
    size_t reduce_num,
    size_t *permuted_input_shape,
    ptrdiff_t *permuted_input_strides,
    bool unbiased,
    bool is_nan) {
    if (is_nan) {
        if (reduce_num == 0) {
            CUDA_1D_KERNEL_LOOP(i, output_size) {
                var_output_ptr[i] = device::cuda::Nan<Tdata>();
                mean_output_ptr[i] = device::cuda::Nan<Tdata>();
            }
        } else {
            CUDA_1D_KERNEL_LOOP(i, output_size) {
                // reduce_num == 1: mean is the single contributing element.
                const size_t input_offset = indexToOffset(i * reduce_num, input_ndim, permuted_input_shape, permuted_input_strides);
                var_output_ptr[i] = device::cuda::Nan<Tdata>();
                mean_output_ptr[i] = input_ptr[input_offset];
            }
        }
    } else {
        CUDA_1D_KERNEL_LOOP(i, output_size) {
            ComputeVarMeanUsingWelford<Tdata, ComputeType>(
                input_ptr,
                i * reduce_num,
                var_output_ptr[i],
                mean_output_ptr[i],
                reduce_num,
                input_ndim,
                permuted_input_shape,
                permuted_input_strides,
                unbiased);
        }
    }
}
#endif // __VAR_MEAN_CUDA_H__
src/infiniop/ops/var_mean/info.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_MEAN_INFO_H__
#define __VAR_MEAN_INFO_H__
#include "../../../utils.h"
#include "../../tensor.h"
#include <algorithm>
#include <cstddef>
#include <vector>
namespace op::var_mean {

// Immutable metadata for a var/mean reduction, derived once at descriptor
// creation. The input's dimensions are permuted so all reduced dimensions
// come last, making each output element's source a contiguous logical range
// of length reduce_num.
class VarMeanInfo {
    VarMeanInfo() = default;

public:
    infiniDtype_t dtype;
    std::vector<size_t> permuted_input_shape;      // need to permute
    std::vector<size_t> output_shape;
    std::vector<ptrdiff_t> permuted_input_strides; // need to permute
    std::vector<ptrdiff_t> output_strides;
    size_t reduce_dim_size; // reduce dim size
    size_t reduce_num;      // number of elements to reduce for each output element
    size_t input_size;      // total number of input elements
    size_t output_size;     // total number of output elements
    bool unbiased_var;

    // Builds the info from the tensor descriptors and the reduction axes in
    // dim[0..dim_size). Output shape/strides are taken from var_output_desc.
    // NOTE(review): `keepdim` is accepted but unused here — presumably the
    // caller already shaped var_output_desc accordingly; confirm.
    static utils::Result<VarMeanInfo> create(
        infiniopTensorDescriptor_t var_output_desc,
        infiniopTensorDescriptor_t input_desc,
        size_t *dim,
        size_t dim_size,
        bool unbiased,
        bool keepdim) {
        auto input_shape = input_desc->shape();
        auto input_strides = input_desc->strides();
        size_t input_ndim = input_desc->ndim();
        // Product of the reduced dimensions' extents.
        size_t reduce_num = 1;
        for (size_t i = 0; i < dim_size; i++) {
            reduce_num *= input_shape[dim[i]];
        }
        // Kept dimensions first, reduced dimensions last.
        std::vector<size_t> permute_order;
        for (size_t i = 0; i < input_ndim; i++) {
            if (std::find(dim, dim + dim_size, i) == dim + dim_size) {
                permute_order.push_back(i);
            }
        }
        for (size_t i = 0; i < dim_size; i++) {
            permute_order.push_back(dim[i]);
        }
        std::vector<size_t> permuted_input_shape;
        std::vector<ptrdiff_t> permuted_input_strides;
        for (size_t i = 0; i < permute_order.size(); i++) {
            permuted_input_shape.push_back(input_shape[permute_order[i]]);
            permuted_input_strides.push_back(input_strides[permute_order[i]]);
        }
        return utils::Result<VarMeanInfo>(VarMeanInfo{
            input_desc->dtype(),
            permuted_input_shape,
            var_output_desc->shape(),
            permuted_input_strides,
            var_output_desc->strides(),
            dim_size,
            reduce_num,
            input_desc->numel(),
            var_output_desc->numel(),
            unbiased});
    }
};
} // namespace op::var_mean
#endif
src/infiniop/ops/var_mean/metax/var_mean_metax.h
deleted
100644 → 0
View file @
6ab911c3
#ifndef __VAR_MEAN_METAX_H__
#define __VAR_MEAN_METAX_H__

#include "../var_mean_desc.h"

// Declares op::var_mean::metax::Descriptor via the shared DESCRIPTOR macro;
// the member definitions live in var_mean_metax.maca.
DESCRIPTOR(metax);

#endif // __VAR_MEAN_METAX_H__
src/infiniop/ops/var_mean/metax/var_mean_metax.maca
deleted
100644 → 0
View file @
6ab911c3
#include "../../../devices/metax/metax_common.h"
#include "../../../devices/metax/metax_kernel_common.h"
#include "../cuda/kernel.cuh"
#include "var_mean_metax.h"
namespace op::var_mean::metax {
// Backend-private state for the METAX var/mean descriptor: keeps the shared
// handle internals alive for the lifetime of the descriptor.
struct Descriptor::Opaque {
    std::shared_ptr<device::metax::Handle::Internal> internal;
};
// Releases the backend-private state owned by this descriptor.
Descriptor::~Descriptor() {
    delete _opaque;
}
// Validates the tensor descriptors, builds the reduction metadata, computes
// the device workspace requirement (permuted shape + strides), and allocates
// the descriptor.
// NOTE(review): `mean_output_desc` is accepted but not passed to
// VarMeanInfo::create — the mean output is assumed to share the variance
// output's shape/strides; confirm against the caller.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t var_output_desc,
    infiniopTensorDescriptor_t mean_output_desc,
    infiniopTensorDescriptor_t input_desc,
    size_t *dim,
    size_t dim_size,
    bool unbiased,
    bool keepdim) {
    auto result = VarMeanInfo::create(var_output_desc, input_desc, dim, dim_size, unbiased, keepdim);
    CHECK_RESULT(result);
    auto info = result.take();
    size_t workspace_size = 0;
    workspace_size += input_desc->ndim() * (sizeof(size_t) + sizeof(ptrdiff_t)); // permuted_input_shape + permuted_input_strides
    *desc_ptr = new Descriptor(
        new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()},
        info, workspace_size, handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
namespace {
// True when the var/mean result is mathematically undefined and must be NaN:
// nothing to reduce, or a single element under Bessel's correction
// (unbiased divides by count - 1 == 0).
bool IsNanOut(const VarMeanInfo &info) {
    if (info.reduce_num == 0) {
        return true;
    }
    return info.reduce_num == 1 && info.unbiased_var;
}
// Copies the permuted shape/stride metadata into the caller-provided device
// workspace and launches the appropriate var/mean kernel:
//   - full reduction to a scalar -> ComputeVarScalarOut (multi-block Welford)
//   - partial reduction          -> ComputeVarMeanUsingWelfordWrapper
// `keepdim` is accepted for signature parity but not consulted here.
template <size_t BLOCK_SIZE, typename Tdata, typename ComputeType>
infiniStatus_t launchKernel(
    const VarMeanInfo &info,
    Tdata *var_output, Tdata *mean_output, const Tdata *input,
    bool unbiased, bool keepdim,
    hcStream_t stream, void *workspace, size_t workspace_size) {
    size_t input_ndim = info.permuted_input_shape.size();
    // Fix: removed the unused local `output_ndim` (the nvidia twin of this
    // function already has the equivalent line commented out).
    size_t input_size = info.input_size;
    size_t output_size = info.output_size;
    size_t reduce_num = info.reduce_num;
    // Carve shape then strides out of the workspace blob.
    unsigned char *workspace_ptr = reinterpret_cast<unsigned char *>(workspace);
    size_t workspace_offset = 0;
    size_t *permuted_input_shape_hc = reinterpret_cast<size_t *>(workspace_ptr + workspace_offset);
    workspace_offset += input_ndim * sizeof(size_t);
    ptrdiff_t *permuted_input_strides_hc = reinterpret_cast<ptrdiff_t *>(workspace_ptr + workspace_offset);
    workspace_offset += input_ndim * sizeof(ptrdiff_t);
    // Async H2D copies: source buffers live in `info`, which outlives the
    // launch because `_info` is a descriptor member.
    CHECK_METAX(hcMemcpyAsync(permuted_input_shape_hc, info.permuted_input_shape.data(), input_ndim * sizeof(size_t), hcMemcpyHostToDevice, stream));
    CHECK_METAX(hcMemcpyAsync(permuted_input_strides_hc, info.permuted_input_strides.data(), input_ndim * sizeof(ptrdiff_t), hcMemcpyHostToDevice, stream));
    bool is_nan = IsNanOut(info);
    if (info.reduce_num == input_size) { // scalar output
        // Temporary per-block partials: mean, m2, count (3 values per block).
        ComputeType *tmp_buffer;
        constexpr size_t MAX_GRID_SIZE = 128;
        size_t grid_size = std::min(MAX_GRID_SIZE,
                                    (input_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        CHECK_METAX(hcMalloc(&tmp_buffer, grid_size * 3 * sizeof(ComputeType)));
        ComputeVarScalarOut<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, mean_output, tmp_buffer, input_size, input_ndim,
            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
        CHECK_METAX(hcFree(tmp_buffer));
    } else {
        size_t grid_size = std::min(256UL, (info.output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
        grid_size = std::max(1UL, grid_size);
        ComputeVarMeanUsingWelfordWrapper<Tdata, ComputeType><<<grid_size, BLOCK_SIZE, 0, stream>>>(
            input, var_output, mean_output, input_ndim, output_size, reduce_num,
            permuted_input_shape_hc, permuted_input_strides_hc, unbiased, is_nan);
    }
    return INFINI_STATUS_SUCCESS;
}
} // namespace
// Dispatches the var_mean computation to the templated launcher, selecting
// the element type from the descriptor's dtype. Only a 256-thread block
// configuration is provided; devices that cannot run it are rejected.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *var_output,
    void *mean_output,
    const void *input,
    bool unbiased,
    bool keepdim,
    void *stream_) const {
    auto stream = static_cast<hcStream_t>(stream_);
    if (_opaque->internal->maxThreadsPerBlock() < 256) {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }
    constexpr size_t BLOCK_SIZE = 256;
    switch (_info.dtype) {
    case INFINI_DTYPE_BF16:
        return launchKernel<BLOCK_SIZE, __hpcc_bfloat16, double>(
            _info,
            static_cast<__hpcc_bfloat16 *>(var_output),
            static_cast<__hpcc_bfloat16 *>(mean_output),
            static_cast<const __hpcc_bfloat16 *>(input),
            unbiased, keepdim, stream, workspace, workspace_size);
    case INFINI_DTYPE_F16:
        return launchKernel<BLOCK_SIZE, half, double>(
            _info,
            static_cast<half *>(var_output),
            static_cast<half *>(mean_output),
            static_cast<const half *>(input),
            unbiased, keepdim, stream, workspace, workspace_size);
    case INFINI_DTYPE_F32:
        return launchKernel<BLOCK_SIZE, float, double>(
            _info,
            static_cast<float *>(var_output),
            static_cast<float *>(mean_output),
            static_cast<const float *>(input),
            unbiased, keepdim, stream, workspace, workspace_size);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
}
} // namespace op::var_mean::metax
src/infiniop/ops/var_mean/moore/var_mean_moore.h
deleted
100644 → 0
View file @
6ab911c3
// Declares the var_mean operator descriptor for the Moore backend.
#ifndef __VAR_MEAN_MOORE_H__
#define __VAR_MEAN_MOORE_H__
#include "../var_mean_desc.h"
// DESCRIPTOR is defined in var_mean_desc.h; presumably it expands to the
// op::var_mean::<backend>::Descriptor class declaration for the named
// backend — confirm against that header.
DESCRIPTOR
(
moore
);
#endif // __VAR_MEAN_MOORE_H__
Prev
1
…
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment