Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
24af0144
Unverified
Commit
24af0144
authored
Nov 12, 2022
by
Po Yen Chen
Committed by
GitHub
Nov 12, 2022
Browse files
Merge branch 'develop' into gemm_layernorm_welford
parents
961f5e9e
b79bbbc2
Changes
813
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1110 additions
and
923 deletions
+1110
-923
example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
...mm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
+159
-0
example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
...gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
+29
-313
example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
...tmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
+262
-0
example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
...tmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
+319
-0
example/33_multiple_reduce/dual_reduce_common.hpp
example/33_multiple_reduce/dual_reduce_common.hpp
+7
-6
example/33_multiple_reduce/dual_reduce_multiblock.cpp
example/33_multiple_reduce/dual_reduce_multiblock.cpp
+1
-1
example/33_multiple_reduce/dual_reduce_threadwise.cpp
example/33_multiple_reduce/dual_reduce_threadwise.cpp
+1
-1
example/34_batchnorm/batchnorm_common.hpp
example/34_batchnorm/batchnorm_common.hpp
+7
-120
example/34_batchnorm/batchnorm_forward_impl.hpp
example/34_batchnorm/batchnorm_forward_impl.hpp
+0
-295
example/34_batchnorm/batchnorm_forward_nhwc.cpp
example/34_batchnorm/batchnorm_forward_nhwc.cpp
+233
-112
example/34_batchnorm/batchnorm_infer_impl.hpp
example/34_batchnorm/batchnorm_infer_impl.hpp
+28
-16
example/34_batchnorm/batchnorm_infer_nhwc.cpp
example/34_batchnorm/batchnorm_infer_nhwc.cpp
+41
-30
example/35_splitK_gemm/run_splitK_gemm_example.inc
example/35_splitK_gemm/run_splitK_gemm_example.inc
+7
-10
example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
+1
-1
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
+1
-1
example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
+1
-1
example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
+1
-1
example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
+1
-1
example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
..._sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+4
-7
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
..._gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
+7
-7
No files found.
example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
/*
Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o
                                                                  |-----------------|
                                                                         Gemm0
                                                          |-------------------------------------|
                                                                           Gemm1
*/
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

// Scalar type shorthands.
using F16 = ck::half_t;
using F32 = float;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Element types of the operands, the accumulator and the output.
using ADataType        = F16;
using B0DataType       = F16;
using B1DataType       = F16;
using AccDataType      = F32;
using CShuffleDataType = F32;
using CDataType        = F16;
// Empty tuples: this example supplies no bias tensors.
using Acc0BiasDataType = ck::Tuple<>;
using Acc1BiasDataType = ck::Tuple<>;

// Number of dimensions in each of the G / M / N / K / O groups of the tensors.
static constexpr ck::index_t NumDimG = 2;
static constexpr ck::index_t NumDimM = 1;
static constexpr ck::index_t NumDimN = 1;
static constexpr ck::index_t NumDimK = 1;
static constexpr ck::index_t NumDimO = 1;

// Element-wise operators; only the Gemm0 accumulator is transformed (Scale).
using AElementOp    = PassThrough;
using B0ElementOp   = PassThrough;
using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
using B1ElementOp   = PassThrough;
using CElementOp    = PassThrough;

static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;

// This example masks out the upper triangle, matching the "lower_triangle" in the file name.
static constexpr auto MaskingSpec =
    ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle;

// All four tensors use the default tensor specialization.
static constexpr auto TensorSpecA  = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;
// Device operation instantiated by this example. Template arguments are listed one per line;
// the trailing labels are the ones given in the original source.
// NOTE(review): the two literals after TensorSpecC (1, 256) and the scalars following each
// *BlockTransfer sequence are unlabeled in the original - confirm their meaning against the
// template parameter list of DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle.
using DeviceGemmInstance =
    ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle<
        NumDimG,
        NumDimM,
        NumDimN,
        NumDimK,
        NumDimO,
        ADataType,
        B0DataType,
        B1DataType,
        CDataType,
        Acc0BiasDataType,
        Acc1BiasDataType,
        AccDataType,
        CShuffleDataType,
        AElementOp,
        B0ElementOp,
        Acc0ElementOp,
        B1ElementOp,
        CElementOp,
        GemmSpec,
        TensorSpecA,
        TensorSpecB0,
        TensorSpecB1,
        TensorSpecC,
        1,
        256,
        128,            // MPerBlock
        128,            // NPerBlock
        32,             // KPerBlock
        64,             // Gemm1NPerBlock
        32,             // Gemm1KPerBlock
        8,              // AK1
        8,              // BK1
        2,              // B1K1
        32,             // MPerXDL
        32,             // NPerXDL
        1,              // MXdlPerWave
        4,              // NXdlPerWave
        2,              // Gemm1NXdlPerWave
        S<4, 64, 1>,    // ABlockTransfer
        S<1, 0, 2>,
        S<1, 0, 2>,
        2,
        8,
        8,
        true,
        S<4, 64, 1>,    // BBlockTransfer
        S<1, 0, 2>,
        S<1, 0, 2>,
        2,
        8,
        8,
        true,
        S<16, 16, 1>,   // B1BlockTransfer
        S<0, 2, 1>,
        S<0, 2, 1>,
        1,
        4,
        2,
        false,
        1,              // CShuffleMXdlPerWavePerShuffle
        2,              // CShuffleNXdlPerWavePerShuffle
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
        MaskingSpec>;   // MaskingSpecialization
// Host-side reference operators.

// Ref Gemm0: fp16 in, fp32 out
using ReferenceGemm0Instance =
    ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                     B0DataType,
                                                     AccDataType,
                                                     AccDataType,
                                                     AElementOp,
                                                     B0ElementOp,
                                                     Acc0ElementOp>;

// Ref Softmax: fp32 in, fp16 out
using ReferenceSoftmaxInstance =
    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;

// Ref Gemm1: fp16 in, fp16 out
using ReferenceGemm1Instance =
    ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
                                                     B1DataType,
                                                     CDataType,
                                                     AccDataType,
                                                     AElementOp,
                                                     B1ElementOp,
                                                     CElementOp>;
#include "run_grouped_gemm_scale_softmax_gemm_permute.inc"
// Program entry point: forwards the command line to run(), which is brought in by the
// preceding #include of the .inc file, and uses its result as the process exit status.
int main(int argc, char* argv[])
{
    const int exit_status = run(argc, argv);
    return exit_status;
}
example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp
View file @
24af0144
...
...
@@ -24,6 +24,7 @@ Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
...
...
@@ -33,9 +34,6 @@ using S = ck::Sequence<Is...>;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
F16
;
...
...
@@ -44,13 +42,14 @@ using B1DataType = F16;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
CDataType
=
F16
;
using
Acc0BiasDataType
=
ck
::
Tuple
<>
;
using
Acc1BiasDataType
=
ck
::
Tuple
<>
;
using
ALayout
=
Row
;
using
B0Layout
=
Col
;
using
B1Layout
=
Row
;
using
CPermuteNumDims_G_M_O
=
S
<
1
,
1
,
1
>
;
// "using CLayout = Row" has been replaced by CPermuteNumDims_G_M_O
static
constexpr
ck
::
index_t
NumDimG
=
2
;
static
constexpr
ck
::
index_t
NumDimM
=
1
;
static
constexpr
ck
::
index_t
NumDimN
=
1
;
static
constexpr
ck
::
index_t
NumDimK
=
1
;
static
constexpr
ck
::
index_t
NumDimO
=
1
;
using
AElementOp
=
PassThrough
;
using
B0ElementOp
=
PassThrough
;
...
...
@@ -59,17 +58,27 @@ using B1ElementOp = PassThrough;
using
CElementOp
=
PassThrough
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKOPadding
;
static
constexpr
auto
MaskingSpec
=
ck
::
tensor_operation
::
device
::
MaskingSpecialization
::
MaskDisabled
;
static
constexpr
auto
TensorSpecA
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
static
constexpr
auto
TensorSpecB0
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
static
constexpr
auto
TensorSpecB1
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
static
constexpr
auto
TensorSpecC
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
using
DeviceGemmInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle
<
ALayout
,
B0Layout
,
B1Layout
,
CPermuteNumDims_G_M_O
,
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
NumDimO
,
ADataType
,
B0DataType
,
B1DataType
,
CDataType
,
Acc0BiasDataType
,
Acc1BiasDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
...
...
@@ -78,6 +87,10 @@ using DeviceGemmInstance =
B1ElementOp
,
CElementOp
,
GemmSpec
,
TensorSpecA
,
TensorSpecB0
,
TensorSpecB1
,
TensorSpecC
,
1
,
256
,
128
,
// MPerBlock
...
...
@@ -118,7 +131,7 @@ using DeviceGemmInstance =
2
,
// CShuffleNXdlPerWavePerShuffle
S
<
1
,
32
,
1
,
8
>
,
// CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8
,
// CShuffleBlockTransferScalarPerVector_NPerBlock
false
>
;
MaskingSpec
>
;
// MaskingSpecialization
// Ref Gemm0: fp16 in, fp32 out
using
ReferenceGemm0Instance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchedGemm
<
ADataType
,
...
...
@@ -142,303 +155,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<
B1ElementOp
,
CElementOp
>
;
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
false
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
exit
(
0
);
}
float
alpha
=
1
;
// scaling after 1st gemm
std
::
size_t
group_count
=
13
;
// Problem descs
std
::
vector
<
DeviceGemmInstance
::
ProblemDesc
>
problem_descs
;
std
::
vector
<
const
void
*>
p_a
;
std
::
vector
<
const
void
*>
p_b0
;
std
::
vector
<
const
void
*>
p_b1
;
std
::
vector
<
void
*>
p_c
;
for
(
std
::
size_t
i
=
0
;
i
<
group_count
;
i
++
)
{
int
M
=
128
*
(
rand
()
%
8
+
1
);
int
N
=
128
*
(
rand
()
%
8
+
1
);
int
K
=
40
;
int
O
=
40
*
(
rand
()
%
2
+
1
);
int
Batch
=
rand
()
%
8
+
1
;
const
int
StrideA
=
ck
::
is_same_v
<
ALayout
,
Row
>
?
K
:
M
;
const
int
StrideB0
=
ck
::
is_same_v
<
B0Layout
,
Row
>
?
N
:
K
;
const
int
StrideB1
=
ck
::
is_same_v
<
B1Layout
,
Row
>
?
O
:
N
;
const
int
BatchStrideA
=
(
ck
::
is_same_v
<
ALayout
,
Col
>
?
K
:
M
)
*
StrideA
;
const
int
BatchStrideB0
=
(
ck
::
is_same_v
<
B0Layout
,
Col
>
?
N
:
K
)
*
StrideB0
;
const
int
BatchStrideB1
=
(
ck
::
is_same_v
<
B1Layout
,
Col
>
?
O
:
N
)
*
StrideB1
;
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_lengths
{
Batch
,
M
,
O
};
std
::
vector
<
ck
::
index_t
>
c_gs_ms_os_strides
{
O
,
Batch
*
O
,
1
};
problem_descs
.
push_back
({
M
,
N
,
K
,
O
,
Batch
,
StrideA
,
StrideB0
,
StrideB1
,
BatchStrideA
,
BatchStrideB0
,
BatchStrideB1
,
c_gs_ms_os_lengths
,
c_gs_ms_os_strides
});
}
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
batch_count
,
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
Row
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
}
};
std
::
vector
<
Tensor
<
ADataType
>>
a_tensors
;
std
::
vector
<
Tensor
<
B0DataType
>>
b0_tensors
;
std
::
vector
<
Tensor
<
B1DataType
>>
b1_tensors
;
std
::
vector
<
Tensor
<
CDataType
>>
c_tensors
;
using
DeviceMemPtr
=
std
::
unique_ptr
<
DeviceMem
>
;
std
::
vector
<
DeviceMemPtr
>
a_tensors_device
;
std
::
vector
<
DeviceMemPtr
>
b0_tensors_device
;
std
::
vector
<
DeviceMemPtr
>
b1_tensors_device
;
std
::
vector
<
DeviceMemPtr
>
c_tensors_device
;
std
::
size_t
flop
=
0
,
num_byte
=
0
;
std
::
cout
<<
"group count "
<<
group_count
<<
". printing first 4 groups
\n
"
;
for
(
std
::
size_t
i
=
0
;
i
<
group_count
;
i
++
)
{
const
auto
&
M
=
problem_descs
[
i
].
M
;
const
auto
&
N
=
problem_descs
[
i
].
N
;
const
auto
&
K
=
problem_descs
[
i
].
K
;
const
auto
&
O
=
problem_descs
[
i
].
O
;
const
auto
&
Batch
=
problem_descs
[
i
].
Batch
;
const
auto
&
StrideA
=
problem_descs
[
i
].
StrideA
;
const
auto
&
StrideB0
=
problem_descs
[
i
].
StrideB0
;
const
auto
&
StrideB1
=
problem_descs
[
i
].
StrideB1
;
const
auto
&
BatchStrideA
=
problem_descs
[
i
].
BatchStrideA
;
const
auto
&
BatchStrideB0
=
problem_descs
[
i
].
BatchStrideB0
;
const
auto
&
BatchStrideB1
=
problem_descs
[
i
].
BatchStrideB1
;
const
auto
&
c_gs_ms_os_lengths
=
problem_descs
[
i
].
c_gs_ms_os_lengths
;
const
auto
&
c_gs_ms_os_strides
=
problem_descs
[
i
].
c_gs_ms_os_strides
;
// C_m_o = A_m_k * B0_k_n * B1_n_o
Tensor
<
ADataType
>
a_g_m_k
(
f_host_tensor_descriptor
(
Batch
,
M
,
K
,
StrideA
,
BatchStrideA
,
ALayout
{}));
Tensor
<
B0DataType
>
b0_g_k_n
(
f_host_tensor_descriptor
(
Batch
,
K
,
N
,
StrideB0
,
BatchStrideB0
,
B0Layout
{}));
Tensor
<
B1DataType
>
b1_g_n_o
(
f_host_tensor_descriptor
(
Batch
,
N
,
O
,
StrideB1
,
BatchStrideB1
,
B1Layout
{}));
Tensor
<
CDataType
>
c_gs_ms_os_device_result
(
std
::
vector
<
std
::
size_t
>
(
c_gs_ms_os_lengths
.
begin
(),
c_gs_ms_os_lengths
.
end
()),
std
::
vector
<
std
::
size_t
>
(
c_gs_ms_os_strides
.
begin
(),
c_gs_ms_os_strides
.
end
()));
flop
+=
(
size_t
(
M
)
*
N
*
K
*
2
+
size_t
(
M
)
*
N
*
O
*
2
)
*
Batch
;
num_byte
+=
(
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
B0DataType
)
*
K
*
N
+
sizeof
(
B1DataType
)
*
N
*
O
+
sizeof
(
CDataType
)
*
M
*
O
)
*
Batch
;
if
(
i
<
4
)
{
std
::
cout
<<
"a_g_m_k["
<<
i
<<
"]: "
<<
a_g_m_k
.
mDesc
<<
", "
<<
"b0_g_k_n["
<<
i
<<
"]: "
<<
b0_g_k_n
.
mDesc
<<
", "
<<
"b1_g_n_o["
<<
i
<<
"]: "
<<
b1_g_n_o
.
mDesc
<<
", "
<<
"c_gs_ms_os["
<<
i
<<
"]: "
<<
c_gs_ms_os_device_result
.
mDesc
<<
std
::
endl
;
}
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
a_g_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
2
,
2
});
b0_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
B0DataType
>
{
-
2
,
2
});
b1_g_n_o
.
GenerateTensorValue
(
GeneratorTensor_2
<
B1DataType
>
{
-
2
,
2
});
break
;
case
2
:
a_g_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b0_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
B0DataType
>
{
0.0
,
1.0
});
b1_g_n_o
.
GenerateTensorValue
(
GeneratorTensor_3
<
B1DataType
>
{
-
0.5
,
0.5
});
break
;
case
3
:
a_g_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
2
,
2
});
b0_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_Diagonal
<
B0DataType
>
{});
b1_g_n_o
.
GenerateTensorValue
(
GeneratorTensor_Diagonal
<
B1DataType
>
{});
break
;
default:
a_g_m_k
.
GenerateTensorValue
(
GeneratorTensor_1
<
ADataType
>
{
1
});
b0_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_Sequential
<
1
>
{});
b1_g_n_o
.
GenerateTensorValue
(
GeneratorTensor_Diagonal
<
B1DataType
>
{});
}
a_tensors
.
push_back
(
a_g_m_k
);
b0_tensors
.
push_back
(
b0_g_k_n
);
b1_tensors
.
push_back
(
b1_g_n_o
);
c_tensors
.
push_back
(
c_gs_ms_os_device_result
);
a_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
ADataType
)
*
a_g_m_k
.
mDesc
.
GetElementSpaceSize
()));
b0_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
B0DataType
)
*
b0_g_k_n
.
mDesc
.
GetElementSpaceSize
()));
b1_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
B1DataType
)
*
b1_g_n_o
.
mDesc
.
GetElementSpaceSize
()));
c_tensors_device
.
emplace_back
(
std
::
make_unique
<
DeviceMem
>
(
sizeof
(
CDataType
)
*
c_gs_ms_os_device_result
.
mDesc
.
GetElementSpaceSize
()));
a_tensors_device
[
i
]
->
ToDevice
(
a_g_m_k
.
mData
.
data
());
b0_tensors_device
[
i
]
->
ToDevice
(
b0_g_k_n
.
mData
.
data
());
b1_tensors_device
[
i
]
->
ToDevice
(
b1_g_n_o
.
mData
.
data
());
p_a
.
push_back
(
a_tensors_device
[
i
]
->
GetDeviceBuffer
());
p_b0
.
push_back
(
b0_tensors_device
[
i
]
->
GetDeviceBuffer
());
p_b1
.
push_back
(
b1_tensors_device
[
i
]
->
GetDeviceBuffer
());
p_c
.
push_back
(
c_tensors_device
[
i
]
->
GetDeviceBuffer
());
}
auto
a_element_op
=
AElementOp
{};
auto
b0_element_op
=
B0ElementOp
{};
auto
acc0_element_op
=
Acc0ElementOp
{
alpha
};
auto
b1_element_op
=
B1ElementOp
{};
auto
c_element_op
=
CElementOp
{};
// do GEMM
auto
gemm
=
DeviceGemmInstance
{};
auto
invoker
=
gemm
.
MakeInvoker
();
auto
argument
=
gemm
.
MakeArgument
(
p_a
,
p_b0
,
p_b1
,
p_c
,
problem_descs
,
a_element_op
,
b0_element_op
,
acc0_element_op
,
b1_element_op
,
c_element_op
);
// specify workspace for problem_desc
DeviceMem
problem_desc_workspace
(
gemm
.
GetWorkSpaceSize
(
&
argument
));
gemm
.
SetWorkSpacePointer
(
&
argument
,
problem_desc_workspace
.
GetDeviceBuffer
());
if
(
!
gemm
.
IsSupportedArgument
(
argument
))
{
std
::
cout
<<
gemm
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
0
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
gemm
.
GetTypeString
()
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
for
(
std
::
size_t
i
=
0
;
i
<
group_count
;
i
++
)
{
const
auto
&
M
=
problem_descs
[
i
].
M
;
const
auto
&
N
=
problem_descs
[
i
].
N
;
const
auto
&
O
=
problem_descs
[
i
].
O
;
const
auto
&
Batch
=
problem_descs
[
i
].
Batch
;
const
auto
&
c_gs_ms_os_lengths
=
problem_descs
[
i
].
c_gs_ms_os_lengths
;
const
auto
&
c_gs_ms_os_strides
=
problem_descs
[
i
].
c_gs_ms_os_strides
;
const
auto
&
a_g_m_k
=
a_tensors
[
i
];
const
auto
&
b0_g_k_n
=
b0_tensors
[
i
];
const
auto
&
b1_g_n_o
=
b1_tensors
[
i
];
auto
&
c_gs_ms_os_device_result
=
c_tensors
[
i
];
auto
&
c_gs_ms_os_device_buf
=
*
c_tensors_device
[
i
];
Tensor
<
CDataType
>
c_gs_ms_os_host_result
(
std
::
vector
<
std
::
size_t
>
(
c_gs_ms_os_lengths
.
begin
(),
c_gs_ms_os_lengths
.
end
()),
std
::
vector
<
std
::
size_t
>
(
c_gs_ms_os_strides
.
begin
(),
c_gs_ms_os_strides
.
end
()));
c_gs_ms_os_device_buf
.
FromDevice
(
c_gs_ms_os_device_result
.
mData
.
data
());
// Output of Gemm0 is input A of Gemm1
Tensor
<
AccDataType
>
acc0_m_n
(
f_host_tensor_descriptor
(
Batch
,
M
,
N
,
N
,
M
*
N
,
Row
{}));
Tensor
<
ADataType
>
a1_g_m_n
(
f_host_tensor_descriptor
(
Batch
,
M
,
N
,
N
,
M
*
N
,
Row
{}));
Tensor
<
CDataType
>
c_g_m_o_host_result
(
std
::
vector
<
int
>
{
Batch
,
M
,
O
},
std
::
vector
<
int
>
{
M
*
O
,
O
,
1
});
auto
ref_gemm0
=
ReferenceGemm0Instance
{};
auto
ref_gemm0_invoker
=
ref_gemm0
.
MakeInvoker
();
auto
ref_gemm0_argument
=
ref_gemm0
.
MakeArgument
(
a_g_m_k
,
b0_g_k_n
,
acc0_m_n
,
a_element_op
,
b0_element_op
,
acc0_element_op
);
ref_gemm0_invoker
.
Run
(
ref_gemm0_argument
);
auto
ref_softmax
=
ReferenceSoftmaxInstance
{};
auto
ref_softmax_invoker
=
ref_softmax
.
MakeInvoker
();
auto
ref_softmax_argument
=
ref_softmax
.
MakeArgument
(
acc0_m_n
,
a1_g_m_n
,
1
,
0
,
{
2
});
ref_softmax_invoker
.
Run
(
ref_softmax_argument
);
auto
ref_gemm1
=
ReferenceGemm1Instance
{};
auto
ref_gemm1_invoker
=
ref_gemm1
.
MakeInvoker
();
auto
ref_gemm1_argument
=
ref_gemm1
.
MakeArgument
(
a1_g_m_n
,
b1_g_n_o
,
c_g_m_o_host_result
,
PassThrough
{},
b1_element_op
,
c_element_op
);
ref_gemm1_invoker
.
Run
(
ref_gemm1_argument
);
// Note: in this example, we merely permute the dimensions by changing underlying
// strides so we simply access data as-is
c_gs_ms_os_host_result
.
ForEach
(
[
&
](
auto
&
self
,
auto
idx
)
{
self
(
idx
)
=
c_g_m_o_host_result
(
idx
);
});
bool
pass_
=
ck
::
utils
::
check_err
(
c_gs_ms_os_device_result
.
mData
,
c_gs_ms_os_host_result
.
mData
);
pass
&=
pass_
;
}
}
#include "run_grouped_gemm_scale_softmax_gemm_permute.inc"
return
pass
?
0
:
1
;
}
// Program entry point: hands the command line to run() and returns its result as the
// process exit status.
int main(int argc, char* argv[])
{
    const int exit_status = run(argc, argv);
    return exit_status;
}
example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Runs the batched Gemm + Scale + Softmax + Gemm (+ permute) example.
//
// Command line (argv[0] is the program name):
//   no arguments : use the defaults below
//   3 arguments  : verification, initialization method, time kernel
//   12 arguments : the three above, then M, N, K, O, G0, G1, alpha,
//                  input permute, output permute
// Any other argument count prints the usage text and exits with status 0.
//
// Returns 0 when verification passes, when verification is disabled, or when the device
// instance reports the problem as unsupported; returns 1 when the device result does not
// match the host reference.
int run(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;

    // GEMM shape for A/B0/B1/C
    // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o
    ck::index_t M = 120;
    ck::index_t N = 1000;
    ck::index_t K = 64;
    ck::index_t O = 128;

    // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape
    // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o])
    // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3])
    ck::index_t G0 = 7;
    ck::index_t G1 = 13;

    float alpha = 1;

    bool input_permute  = false;
    bool output_permute = true;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 13)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        M  = std::stoi(argv[4]);
        N  = std::stoi(argv[5]);
        K  = std::stoi(argv[6]);
        O  = std::stoi(argv[7]);
        G0 = std::stoi(argv[8]);
        G1 = std::stoi(argv[9]);

        alpha = std::stof(argv[10]);

        input_permute  = std::stoi(argv[11]);
        output_permute = std::stoi(argv[12]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        // The six shape values are read from argv[4]..argv[9]; argv[10] is alpha.
        printf("arg4 to 9: M, N, K, O, G0, G1\n");
        printf("arg10: scale (alpha)\n");
        printf("arg11 to 12: input / output permute\n");
        exit(0);
    }

    // Lengths are always given in [G0, G1, outer, inner] order; the permute flags only change
    // the strides, i.e. the layout in memory.
    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
    std::vector<ck::index_t> a_gs_ms_ks_strides =
        input_permute
            ? std::vector<ck::index_t>{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K]
            : std::vector<ck::index_t>{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K]

    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
    std::vector<ck::index_t> b0_gs_ns_ks_strides =
        input_permute
            ? std::vector<ck::index_t>{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K]
            : std::vector<ck::index_t>{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K]

    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
    std::vector<ck::index_t> b1_gs_os_ns_strides =
        input_permute
            ? std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O]
            : std::vector<ck::index_t>{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O]

    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
    std::vector<ck::index_t> c_gs_ms_os_strides =
        output_permute
            ? std::vector<ck::index_t>{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O]
            : std::vector<ck::index_t>{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O]

    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);

    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
    std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
    std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl;
    std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;

    // Fill the input tensors according to the requested initialization method.
    switch(init_method)
    {
    case 0: break;
    case 1:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
        break;
    case 2:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
        break;
    case 3:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
        break;
    default:
        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize());
    DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize());
    DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize());
    DeviceMem c_device_buf(sizeof(CDataType) *
                           c_gs_ms_os_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
    b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
    b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());

    auto a_element_op    = AElementOp{};
    auto b0_element_op   = B0ElementOp{};
    auto acc0_element_op = Acc0ElementOp{alpha};
    auto b1_element_op   = B1ElementOp{};
    auto c_element_op    = CElementOp{};

    // do GEMM
    // TODO ANT: replace array with vector?
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(
        static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
        static_cast<B0DataType*>(b0_device_buf.GetDeviceBuffer()),
        static_cast<B1DataType*>(b1_device_buf.GetDeviceBuffer()),
        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
        {}, // std::array<void*, 1> p_acc0_biases;
        {}, // std::array<void*, 1> p_acc1_biases;
        a_gs_ms_ks_lengths,
        a_gs_ms_ks_strides,
        b0_gs_ns_ks_lengths,
        b0_gs_ns_ks_strides,
        b1_gs_os_ns_lengths,
        b1_gs_os_ns_strides,
        c_gs_ms_os_lengths,
        c_gs_ms_os_strides,
        {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_lengths},
        {}, // std::array<std::vector<ck::index_t>, 1>{acc0_biases_gs_ms_ns_strides},
        {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_lengths},
        {}, // std::array<std::vector<ck::index_t>, 1>{acc1_biases_gs_ms_os_strides},
        a_element_op,
        b0_element_op,
        acc0_element_op,
        b1_element_op,
        c_element_op);

    if(!gemm.IsSupportedArgument(argument))
    {
        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;

        // An unsupported problem is reported but not treated as a failure.
        return 0;
    }

    ck::index_t BatchCount = G0 * G1;

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
                             sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
                            BatchCount;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << gemm.GetTypeString() << std::endl;

    if(do_verification)
    {
        c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());

        Tensor<ADataType> a_g_m_k({BatchCount, M, K});
        Tensor<B0DataType> b0_g_k_n({BatchCount, K, N});
        Tensor<B1DataType> b1_g_n_o({BatchCount, N, O});
        Tensor<AccDataType> acc0_g_m_n({BatchCount, M, N});        // scratch object after gemm0
        Tensor<ADataType> a1_g_m_n({BatchCount, M, N});            // scratch object after softmax
        Tensor<CDataType> c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1

        // permute: flatten (g0, g1) into one batch index g = g0 * G1 + g1; B0 and B1 also
        // have their last two indices swapped to match the reference operand shapes.
        a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
            a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
        });
        b0_gs_ns_ks.ForEach([&](auto& self, auto idx) {
            b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
        });
        b1_gs_os_ns.ForEach([&](auto& self, auto idx) {
            b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
        });

        // gemm 0
        auto ref_gemm0          = ReferenceGemm0Instance{};
        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);

        ref_gemm0_invoker.Run(ref_gemm0_argument);

        // masking: masked elements are set to -infinity before the softmax
        const auto mask = DeviceGemmInstance::C0MatrixMask(N);
        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
            if(mask.IsMaskedElement(idx[1], idx[2]))
                self(idx) = -ck::NumericLimits<float>::Infinity();
        });

        // softmax
        auto ref_softmax          = ReferenceSoftmaxInstance{};
        auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});

        ref_softmax_invoker.Run(ref_softmax_argument);

        // gemm1
        auto ref_gemm1          = ReferenceGemm1Instance{};
        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
        auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
                                                         b1_g_n_o,
                                                         c_g_m_o_host_result,
                                                         PassThrough{},
                                                         b1_element_op,
                                                         c_element_op);

        ref_gemm1_invoker.Run(ref_gemm1_argument);

        // permute: scatter the flat-batch reference result back into the 4D output tensor
        c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) {
            const size_t& g0 = idx[0];
            const size_t& g1 = idx[1];

            const size_t g = g0 * G1 + g1;

            self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]);
        });

        return ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData)
                   ? 0
                   : 1;
    }

    return 0;
}
example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc
0 → 100644
View file @
24af0144
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
// Example driver for the grouped "gemm + scale + softmax + gemm + permute" device operation.
//
// Builds `group_count` independent problems with randomly chosen sizes, runs them through
// DeviceGemmInstance in a single launch, prints the measured performance and, when
// verification is enabled, compares every group against a host reference
// (gemm0 -> mask -> softmax -> gemm1 -> permute).
//
// Command line:
//   (no args)                : defaults below
//   arg1 arg2 arg3           : verification, init method, time kernel
//   arg1 arg2 arg3 arg4 arg5 : as above plus input / output permute flags
//
// Returns 0 when verification passes (or is skipped, or the problem is unsupported),
// 1 when any group mismatches.
int run(int argc, char* argv[])
{
    // Defaults used when no command-line arguments are supplied.
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = false;
    bool input_permute   = false;
    bool output_permute  = true;

    if(argc == 4 || argc == 6)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        // The permute flags are only present in the 5-argument form.
        if(argc == 6)
        {
            input_permute  = std::stoi(argv[4]);
            output_permute = std::stoi(argv[5]);
        }
    }
    else if(argc != 1)
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 5: input / output permute\n");
        exit(0);
    }

    const float alpha = 1; // scaling after 1st gemm

    const std::size_t group_count = 7;

    // Per-group problem descriptions and raw device pointers handed to the device op.
    std::vector<DeviceGemmInstance::ProblemDesc> problem_descs;
    std::vector<const void*> p_a;
    std::vector<const void*> p_b0;
    std::vector<const void*> p_b1;
    std::vector<void*> p_c;

    // Sizes of every group, stored as {G0, G1, M, N, K, O}; re-read during verification.
    std::vector<std::vector<int>> g0_g1_m_n_k_o;

    // Host-side copies of the inputs and the (device) output of every group.
    std::vector<Tensor<ADataType>> a_tensors;
    std::vector<Tensor<B0DataType>> b0_tensors;
    std::vector<Tensor<B1DataType>> b1_tensors;
    std::vector<Tensor<CDataType>> c_tensors;

    // Device buffers, one per tensor per group.
    using DeviceMemPtr = std::unique_ptr<DeviceMem>;
    std::vector<DeviceMemPtr> a_tensors_device;
    std::vector<DeviceMemPtr> b0_tensors_device;
    std::vector<DeviceMemPtr> b1_tensors_device;
    std::vector<DeviceMemPtr> c_tensors_device;

    std::size_t flop = 0, num_byte = 0;

    std::cout << "group count " << group_count << ". printing first 4 groups\n";

    for(std::size_t i = 0; i < group_count; i++)
    {
        // NOTE: the order of the rand() calls determines the generated sizes; keep it.
        const int M  = 128 * (rand() % 8 + 1);
        const int N  = 128 * (rand() % 8 + 1);
        const int K  = 40;
        const int O  = 40 * (rand() % 2 + 1);
        const int G0 = rand() % 3 + 1;
        const int G1 = rand() % 5 + 1;

        g0_g1_m_n_k_o.push_back({G0, G1, M, N, K, O});

        // Strides of a [G0, G1, rows, cols] view. When `permuted`, the memory layout is
        // [G0, rows, G1, cols]; otherwise it is the packed [G0, G1, rows, cols].
        const auto gs_strides = [G1](bool permuted, int rows, int cols) {
            return permuted ? std::vector<ck::index_t>{rows * G1 * cols, cols, G1 * cols, 1}
                            : std::vector<ck::index_t>{G1 * rows * cols, rows * cols, cols, 1};
        };

        // A: lengths [G0, G1, M, K]
        std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
        std::vector<ck::index_t> a_gs_ms_ks_strides = gs_strides(input_permute, M, K);

        // B0: lengths [G0, G1, N, K]
        std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
        std::vector<ck::index_t> b0_gs_ns_ks_strides = gs_strides(input_permute, N, K);

        // B1: lengths [G0, G1, O, N]; O is the unit-stride dimension in both layouts.
        std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
        std::vector<ck::index_t> b1_gs_os_ns_strides;
        if(input_permute)
        {
            // B1 layout [G0, N, G1, O]
            b1_gs_os_ns_strides = std::vector<ck::index_t>{N * G1 * O, O, 1, G1 * O};
        }
        else
        {
            // B1 layout [G0, G1, N, O]
            b1_gs_os_ns_strides = std::vector<ck::index_t>{G1 * N * O, N * O, 1, O};
        }

        // C: lengths [G0, G1, M, O]
        std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
        std::vector<ck::index_t> c_gs_ms_os_strides = gs_strides(output_permute, M, O);

        problem_descs.push_back({a_gs_ms_ks_lengths,
                                 a_gs_ms_ks_strides,
                                 b0_gs_ns_ks_lengths,
                                 b0_gs_ns_ks_strides,
                                 b1_gs_os_ns_lengths,
                                 b1_gs_os_ns_strides,
                                 c_gs_ms_os_lengths,
                                 c_gs_ms_os_strides,
                                 {},   // acc0_biases_gs_ms_ns_lengths
                                 {},   // acc0_biases_gs_ms_ns_strides
                                 {},   // acc1_biases_gs_ms_os_lengths
                                 {}}); // acc1_biases_gs_ms_os_strides

        // C_m_o = A_m_k * B0_k_n * B1_n_o
        Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
        Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
        Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
        Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);

        // Accumulate the work and traffic of this group into the totals used for reporting.
        int Batch = G0 * G1;
        flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch;
        num_byte += (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
                    Batch;

        if(i < 4)
        {
            std::cout << "a_gs_ms_ks[" << i << "]: " << a_gs_ms_ks.mDesc << ", "
                      << "b0_gs_ns_ks[" << i << "]: " << b0_gs_ns_ks.mDesc << ", "
                      << "b1_gs_os_ns[" << i << "]: " << b1_gs_os_ns.mDesc << ", "
                      << "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc
                      << std::endl;
        }

        // Fill the host inputs according to the requested initialization method.
        switch(init_method)
        {
        case 0: break;
        case 1:
            a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
            b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
            break;
        case 2:
            a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
            b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
            break;
        case 3:
            a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
            b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
            break;
        default:
            a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
            b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
            b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
        }

        // Keep host copies around for the verification pass.
        a_tensors.push_back(a_gs_ms_ks);
        b0_tensors.push_back(b0_gs_ns_ks);
        b1_tensors.push_back(b1_gs_os_ns);
        c_tensors.push_back(c_gs_ms_os_device_result);

        // Allocate the device buffers of this group ...
        a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()));
        b0_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()));
        b1_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()));
        c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
            sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()));

        // ... upload the inputs ...
        a_tensors_device.back()->ToDevice(a_gs_ms_ks.mData.data());
        b0_tensors_device.back()->ToDevice(b0_gs_ns_ks.mData.data());
        b1_tensors_device.back()->ToDevice(b1_gs_os_ns.mData.data());

        // ... and record the raw pointers for the device op.
        p_a.push_back(a_tensors_device.back()->GetDeviceBuffer());
        p_b0.push_back(b0_tensors_device.back()->GetDeviceBuffer());
        p_b1.push_back(b1_tensors_device.back()->GetDeviceBuffer());
        p_c.push_back(c_tensors_device.back()->GetDeviceBuffer());
    }

    auto a_element_op    = AElementOp{};
    auto b0_element_op   = B0ElementOp{};
    auto acc0_element_op = Acc0ElementOp{alpha};
    auto b1_element_op   = B1ElementOp{};
    auto c_element_op    = CElementOp{};

    // do GEMM
    auto gemm     = DeviceGemmInstance{};
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(p_a,
                                      p_b0,
                                      p_b1,
                                      p_c,
                                      {}, // p_acc0_biases
                                      {}, // p_acc1_biases
                                      problem_descs,
                                      a_element_op,
                                      b0_element_op,
                                      acc0_element_op,
                                      b1_element_op,
                                      c_element_op);

    // specify workspace for problem_desc
    DeviceMem problem_desc_workspace(gemm.GetWorkSpaceSize(&argument));

    gemm.SetWorkSpacePointer(&argument, problem_desc_workspace.GetDeviceBuffer());

    if(!gemm.IsSupportedArgument(argument))
    {
        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;

        return 0;
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_byte / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
              << " GB/s, " << gemm.GetTypeString() << std::endl;

    // Nothing more to do when verification is disabled.
    if(!do_verification)
    {
        return 0;
    }

    bool pass = true;

    for(std::size_t i = 0; i < group_count; i++)
    {
        // Recover the sizes chosen for this group.
        const std::vector<int>& dims = g0_g1_m_n_k_o[i];

        const int G0 = dims[0];
        const int G1 = dims[1];
        const int M  = dims[2];
        const int N  = dims[3];
        const int K  = dims[4];
        const int O  = dims[5];

        const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths;
        const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides;

        const auto& a_gs_ms_ks         = a_tensors[i];
        const auto& b0_gs_ns_ks        = b0_tensors[i];
        const auto& b1_gs_os_ns        = b1_tensors[i];
        auto& c_gs_ms_os_device_result = c_tensors[i];
        auto& c_gs_ms_os_device_buf    = *c_tensors_device[i];

        // Download the device result of this group.
        c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());

        // Host-side tensors with the two group dimensions flattened into one batch dimension.
        Tensor<ADataType> a_g_m_k({G0 * G1, M, K});
        Tensor<B0DataType> b0_g_k_n({G0 * G1, K, N});
        Tensor<B1DataType> b1_g_n_o({G0 * G1, N, O});
        Tensor<AccDataType> acc0_g_m_n({G0 * G1, M, N});        // scratch object after gemm0
        Tensor<ADataType> a1_g_m_n({G0 * G1, M, N});            // scratch object after softmax
        Tensor<CDataType> c_g_m_o_host_result({G0 * G1, M, O}); // scratch object after gemm1
        Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);

        // permute: copy the inputs into the flattened-batch tensors
        // (B0 and B1 additionally swap their last two dimensions)
        a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
            a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
        });
        b0_gs_ns_ks.ForEach([&](auto& self, auto idx) {
            b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
        });
        b1_gs_os_ns.ForEach([&](auto& self, auto idx) {
            b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
        });

        // gemm 0
        auto ref_gemm0          = ReferenceGemm0Instance{};
        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);

        ref_gemm0_invoker.Run(ref_gemm0_argument);

        // masking: masked elements are set to -infinity before the softmax
        const auto mask = DeviceGemmInstance::C0MatrixMask(N);
        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
            if(mask.IsMaskedElement(idx[1], idx[2]))
                self(idx) = -ck::NumericLimits<float>::Infinity();
        });

        // softmax
        auto ref_softmax          = ReferenceSoftmaxInstance{};
        auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});

        ref_softmax_invoker.Run(ref_softmax_argument);

        // gemm 1
        auto ref_gemm1          = ReferenceGemm1Instance{};
        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
        auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n,
                                                         b1_g_n_o,
                                                         c_g_m_o_host_result,
                                                         PassThrough{},
                                                         b1_element_op,
                                                         c_element_op);

        ref_gemm1_invoker.Run(ref_gemm1_argument);

        // permute: scatter the flattened-batch reference back into the [G0, G1, M, O] view
        c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) {
            const size_t& g0 = idx[0];
            const size_t& g1 = idx[1];

            const size_t g = g0 * G1 + g1;

            self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]);
        });

        // Every group is checked even after a failure so that all mismatches get reported.
        pass &= ck::utils::check_err(c_gs_ms_os_device_result.mData,
                                     c_gs_ms_os_host_result.mData);
    }

    return pass ? 0 : 1;
}
example/33_multiple_reduce/dual_reduce_common.hpp
View file @
24af0144
...
...
@@ -12,6 +12,7 @@
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
...
...
@@ -253,10 +254,10 @@ int mean_meansquare_dual_reduce_test(size_t n,
std
::
array
<
ck
::
index_t
,
NumOutputDim
>
i_outLengths
;
std
::
array
<
ck
::
index_t
,
NumOutputDim
>
i_outStrides
;
std
::
copy
(
inLengths
.
begin
(),
inLengths
.
end
()
,
i_inLengths
.
begin
());
std
::
copy
(
inStrides
.
begin
(),
inStrides
.
end
()
,
i_inStrides
.
begin
());
std
::
copy
(
outLengths
.
begin
(),
outLengths
.
end
()
,
i_outLengths
.
begin
());
std
::
copy
(
outStrides
.
begin
(),
outStrides
.
end
()
,
i_outStrides
.
begin
());
ck
::
ranges
::
copy
(
inLengths
,
i_inLengths
.
begin
());
ck
::
ranges
::
copy
(
inStrides
,
i_inStrides
.
begin
());
ck
::
ranges
::
copy
(
outLengths
,
i_outLengths
.
begin
());
ck
::
ranges
::
copy
(
outStrides
,
i_outStrides
.
begin
());
auto
dual_reduce_op
=
DeviceDualReduce
{};
...
...
@@ -305,8 +306,8 @@ int mean_meansquare_dual_reduce_test(size_t n,
{
mean_dev
.
FromDevice
(
mean
.
mData
.
data
());
meansquare_dev
.
FromDevice
(
meansquare
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
mean
.
mData
,
mean_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
meansquare
.
mData
,
meansquare_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
mean
,
mean_ref
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
meansquare
,
meansquare_ref
);
};
return
(
pass
?
0
:
1
);
...
...
example/33_multiple_reduce/dual_reduce_multiblock.cpp
View file @
24af0144
...
...
@@ -13,7 +13,7 @@
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "dual_reduce_common.hpp"
...
...
example/33_multiple_reduce/dual_reduce_threadwise.cpp
View file @
24af0144
...
...
@@ -13,7 +13,7 @@
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_multiple_reduce_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "dual_reduce_common.hpp"
...
...
example/34_batchnorm/batchnorm_common.hpp
View file @
24af0144
...
...
@@ -10,131 +10,17 @@
#include "ck/utility/data_type.hpp"
// Binary element-wise functor computing invVariance from mean and meansquare:
//   y = 1 / sqrt(epsilon + (meansquare - mean * mean))
struct InvVariance
{
    InvVariance(double epsilon) : epsilon_(epsilon) {}

    // y          : output, receives 1 / sqrt(epsilon + variance)
    // mean       : input mean
    // meansquare : input mean of the squared values
    // T must be float or double (enforced at compile time).
    template <typename T>
    __host__ __device__ constexpr void operator()(T& y, const T& mean, const T& meansquare) const
    {
        static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        using ck::type_convert;
        using ck::math::sqrt;

        const T eps      = type_convert<T>(epsilon_);
        const T variance = meansquare - mean * mean;

        y = 1.0f / sqrt(eps + variance);
    }

    // Added to the variance before taking the square root.
    double epsilon_;
};
// (4-in, 2-out) element-wise functor that blends the freshly computed mean / variance into the
// running statistics:
//   y0 = runningMean     * (1 - factor) + mean     * factor
//   y1 = runningVariance * (1 - factor) + variance * factor
// with variance = meansquare - mean * mean.
struct MovingAverage
{
    MovingAverage(double factor) : factor_(factor) {}

    // y0, y1          : outputs (updated running mean, updated running variance)
    // mean            : newly computed mean
    // runningMean     : previous running mean
    // meansquare      : newly computed mean of the squared values
    // runningVariance : previous running variance
    // T must be float or double (enforced at compile time).
    template <typename T>
    __host__ __device__ constexpr void operator()(T& y0,
                                                  T& y1,
                                                  const T& mean,
                                                  const T& runningMean,
                                                  const T& meansquare,
                                                  const T& runningVariance) const
    {
        static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        using ck::type_convert;

        const T new_weight = type_convert<T>(factor_);
        const T old_weight = type_convert<T>(1.0f) - new_weight;
        const T variance   = meansquare - mean * mean;

        y0 = runningMean * old_weight + mean * new_weight;
        y1 = runningVariance * old_weight + variance * new_weight;
    }

    // Weight given to the new statistics; the old ones are weighted by (1 - factor_).
    double factor_;
};
// (4-in, 3-out) element-wise functor that updates the running mean / variance and also produces
// the inverse standard deviation in a single pass:
//   y0 = runningMean     * (1 - factor) + mean     * factor
//   y1 = runningVariance * (1 - factor) + variance * factor
//   y2 = 1 / sqrt(epsilon + variance)
// with variance = meansquare - mean * mean.
struct MovingAverageAndInvVariance
{
    MovingAverageAndInvVariance(double epsilon, double factor)
        : epsilon_(epsilon), factor_(factor)
    {
    }

    // T must be float or double (enforced at compile time).
    template <typename T>
    __host__ __device__ constexpr void operator()(T& y0, // resultRunningMean
                                                  T& y1, // resultRunningVariance
                                                  T& y2, // saveInvVariance
                                                  const T& mean,
                                                  const T& runningMean,
                                                  const T& meansquare,
                                                  const T& runningVariance) const
    {
        static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        using ck::type_convert;
        using ck::math::sqrt;

        const T eps        = type_convert<T>(epsilon_);
        const T new_weight = type_convert<T>(factor_);
        const T old_weight = type_convert<T>(1.0f) - new_weight;
        const T variance   = meansquare - mean * mean;

        y0 = runningMean * old_weight + mean * new_weight;
        y1 = runningVariance * old_weight + variance * new_weight;

        y2 = 1.0f / sqrt(eps + variance);
    }

    // Added to the variance before taking the square root.
    double epsilon_;
    // Weight given to the new statistics; the old ones are weighted by (1 - factor_).
    double factor_;
};
struct
NormalizeInInfer
{
NormalizeInInfer
(
double
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
template
<
typename
T1
,
typename
T2
>
template
<
typename
T1
,
typename
T2
,
typename
T3
,
typename
T4
>
__host__
__device__
constexpr
void
operator
()(
T1
&
y
,
const
T1
&
x
,
const
T2
&
mean
,
const
T2
&
variance
,
const
T2
&
gamma
,
const
T2
&
beta
)
const
{
static_assert
(
std
::
is_same
<
T2
,
float
>::
value
||
std
::
is_same
<
T2
,
double
>::
value
,
"Data type is not supported by this operation!"
);
using
ck
::
type_convert
;
using
ck
::
math
::
sqrt
;
T2
tmp_x
,
tmp_y
;
tmp_x
=
type_convert
<
T2
>
(
x
);
tmp_y
=
((
tmp_x
-
mean
)
/
sqrt
(
variance
+
type_convert
<
T2
>
(
epsilon_
)))
*
gamma
+
beta
;
y
=
type_convert
<
T1
>
(
tmp_y
);
};
double
epsilon_
;
};
struct
NormalizeInForward
{
NormalizeInForward
(
double
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
template
<
typename
T1
,
typename
T2
>
__host__
__device__
constexpr
void
operator
()(
T1
&
y
,
const
T1
&
x
,
const
T2
&
mean
,
const
T2
&
meansquare
,
const
T2
&
gamma
,
const
T2
&
beta
)
const
const
T3
&
gamma
,
const
T4
&
beta
)
const
{
static_assert
(
std
::
is_same
<
T2
,
float
>::
value
||
std
::
is_same
<
T2
,
double
>::
value
,
"Data type is not supported by this operation!"
);
...
...
@@ -143,11 +29,12 @@ struct NormalizeInForward
using
ck
::
math
::
sqrt
;
T2
tmp_x
,
tmp_y
;
T2
variance
=
meansquare
-
mean
*
mean
;
tmp_x
=
type_convert
<
T2
>
(
x
);
tmp_y
=
((
tmp_x
-
mean
)
/
sqrt
(
variance
+
type_convert
<
T2
>
(
epsilon_
)))
*
gamma
+
beta
;
tmp_y
=
((
tmp_x
-
mean
)
/
sqrt
(
variance
+
type_convert
<
T2
>
(
epsilon_
)))
*
type_convert
<
T2
>
(
gamma
)
+
type_convert
<
T2
>
(
beta
);
y
=
type_convert
<
T1
>
(
tmp_y
);
};
...
...
example/34_batchnorm/batchnorm_forward_impl.hpp
deleted
100644 → 0
View file @
961f5e9e
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cassert>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_multiple_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "batchnorm_common.hpp"
// Batch-normalization forward pass assembled from generic device operations:
//   1. a dual reduction producing mean and meansquare over `reduceDims`,
//   2. an element-wise normalization of x into y (NormalizeInForward),
//   3. optionally one more element-wise kernel that updates the running statistics and/or
//      stores the inverse variance, selected by `updateMovingAverage` and
//      `saveMeanAndInvVariance`.
//
// The mean is written to `p_saveMean` when `saveMeanAndInvVariance` is set, otherwise to
// `p_tmp_mean`; the meansquare always goes to `p_tmp_meansquare`.
//
// Returns 0 on success and -1 when one of the device operations rejects its arguments.
// When `time_kernel` is set, the accumulated kernel time and bandwidth are printed.
template <typename InOutDataType,
          typename AccDataType,
          ck::index_t Rank,
          ck::index_t NumBatchNormReduceDim,
          bool fastest_dim_is_reduced = false>
int bnorm_fwd(bool time_kernel,
              bool updateMovingAverage,
              bool saveMeanAndInvVariance,
              const std::array<int, NumBatchNormReduceDim> reduceDims,
              const std::array<ck::index_t, Rank> xyLengths,
              const std::array<ck::index_t, Rank> xStrides,
              const std::array<ck::index_t, Rank> yStrides,
              const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarLengths,
              const std::array<ck::index_t, Rank - NumBatchNormReduceDim> bnScaleBiasMeanVarStrides,
              const void* p_x,
              const void* p_scale,
              const void* p_bias,
              void* p_y,
              double exponentialAverageFactor,
              void* p_runningMean,
              void* p_runningVariance,
              double epsilon,
              void* p_saveMean,
              void* p_saveInvVariance,
              void* p_tmp_mean,
              void* p_tmp_meansquare)
{
    static_assert(NumBatchNormReduceDim < Rank,
                  "Invalid number of reduced dimensions for batchnorm!");

    constexpr ck::index_t NumScaleBiasMeanVarDim = Rank - NumBatchNormReduceDim;

    // Element-wise operations applied before / after the two reductions.
    using InElementwiseOperation_Mean  = ck::tensor_operation::element_wise::PassThrough;
    using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide;

    using InElementwiseOperation_Meansquare  = ck::tensor_operation::element_wise::UnarySquare;
    using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide;

    // Dual reduction: x -> (mean, meansquare)
    using DeviceMeanAndMeansquareInstance =
        ck::tensor_operation::device::DeviceMultipleReduceMultiBlock<
            2,
            InOutDataType,
            AccDataType,
            ck::Tuple<AccDataType, AccDataType>,
            Rank,
            NumBatchNormReduceDim,
            ck::reduce::Add,
            ck::Tuple<InElementwiseOperation_Mean, InElementwiseOperation_Meansquare>,
            ck::Tuple<AccElementwiseOperation_Mean, AccElementwiseOperation_Meansquare>,
            ck::InMemoryDataOperationEnum::Set,
            false, // PropagateNan
            256,
            16,
            16,
            1,
            1,
            fastest_dim_is_reduced ? 1 : 0,
            1,
            ck::Sequence<1, 1>>;

    // Normalization: (x, mean, meansquare, scale, bias) -> y
    using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<InOutDataType,
                  AccDataType,
                  AccDataType,
                  AccDataType,
                  AccDataType>,     // x, mean, meansquare, scale, bias
        ck::Tuple<InOutDataType>,   // y
        NormalizeInForward,
        Rank,
        2,                          // MPerthread
        ck::Sequence<1, 1, 1, 1, 1>, // scalarPerVector: x, mean, meansquare, scale, bias
        ck::Sequence<1>>;           // scalarPerVector: y

    // (mean, meansquare) -> invVariance
    using DeviceInvVarianceInstance = ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<AccDataType, AccDataType>, // mean, meansquare
        ck::Tuple<AccDataType>,              // invVariance
        InvVariance,
        NumScaleBiasMeanVarDim,
        2,                   // MPerthread
        ck::Sequence<1, 1>,  // scalarPerVector: mean, meansquare
        ck::Sequence<1>>;    // scalarPerVector: invVariance

    // (new mean, old running mean, new meansquare, old running variance)
    //   -> (updated running mean, updated running variance)
    // The input order follows the pointer list passed to MakeArgumentPointer below.
    using DeviceMovingAverageInstance = ck::tensor_operation::device::DeviceElementwise<
        ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>,
        ck::Tuple<AccDataType, AccDataType>,
        MovingAverage,
        NumScaleBiasMeanVarDim,
        4,                        // MPerthread
        ck::Sequence<1, 1, 1, 1>, // scalarPerVector: one entry per input
        ck::Sequence<1, 1>>;      // scalarPerVector: one entry per output

    // (new mean, old running mean, new meansquare, old running variance)
    //   -> (updated running mean, updated running variance, invVariance)
    // The input order follows the pointer list passed to MakeArgumentPointer below.
    using DeviceMovingAverageAndInvVarianceInstance =
        ck::tensor_operation::device::DeviceElementwise<
            ck::Tuple<AccDataType, AccDataType, AccDataType, AccDataType>,
            ck::Tuple<AccDataType, AccDataType, AccDataType>,
            MovingAverageAndInvVariance,
            NumScaleBiasMeanVarDim,
            4,                        // MPerthread
            ck::Sequence<1, 1, 1, 1>, // scalarPerVector: one entry per input
            ck::Sequence<1, 1, 1>>;   // scalarPerVector: one entry per output

    auto invariantDims = get_invariant_dims<Rank, NumBatchNormReduceDim>(reduceDims);

    // Per-channel strides expanded to the full rank of x / y: the invariant dimensions take
    // the scale/bias/mean/var stride, every reduced dimension keeps stride 0.
    std::array<ck::index_t, Rank> aligned_scaleBiasMeanVarStrides{0};

    int32_t invariantLength = 1;

    int invariantPos = 0;
    for(auto dim : invariantDims)
    {
        assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[invariantPos]);

        aligned_scaleBiasMeanVarStrides[dim] = bnScaleBiasMeanVarStrides[invariantPos];
        invariantLength *= xyLengths[dim];

        invariantPos++;
    }

    int32_t reduceLength = 1;
    for(auto dim : reduceDims)
    {
        reduceLength *= xyLengths[dim];
    }

    size_t total_length = static_cast<size_t>(invariantLength) * reduceLength;

    float avg_time        = 0.0f;
    std::size_t num_bytes = 0;

    // The mean goes straight into the user's save buffer when it is requested.
    void* p_mean = saveMeanAndInvVariance ? p_saveMean : p_tmp_mean;

    const AccDataType alpha = ck::type_convert<AccDataType>(1.0f);
    const AccDataType beta  = ck::type_convert<AccDataType>(0.0f);

    // Step 1: mean and meansquare (both reductions divide the sum by reduceLength).
    auto dev_mean_and_meansquare = DeviceMeanAndMeansquareInstance{};

    auto reduce_argument = dev_mean_and_meansquare.MakeArgumentPointer(
        xyLengths,
        xStrides,
        bnScaleBiasMeanVarLengths,
        {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
        reduceDims,
        {&alpha, &alpha},
        {&beta, &beta},
        p_x,
        {p_mean, p_tmp_meansquare},
        ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}),
        ck::make_tuple(AccElementwiseOperation_Mean{reduceLength},
                       AccElementwiseOperation_Meansquare{reduceLength}));

    // Step 2: normalization of x into y.
    auto dev_normalize = DeviceNormalizeInstance{};

    auto normalize_argument =
        dev_normalize.MakeArgumentPointer(xyLengths,
                                          {xStrides,
                                           aligned_scaleBiasMeanVarStrides,
                                           aligned_scaleBiasMeanVarStrides,
                                           aligned_scaleBiasMeanVarStrides,
                                           aligned_scaleBiasMeanVarStrides},
                                          {yStrides},
                                          {p_x, p_mean, p_tmp_meansquare, p_scale, p_bias},
                                          {p_y},
                                          NormalizeInForward{epsilon});

    if(!(dev_mean_and_meansquare.IsSupportedArgument(reduce_argument.get()) &&
         dev_normalize.IsSupportedArgument(normalize_argument.get())))
    {
        std::cout << "The runtime parameters seems not supported by the Devic, exiting!"
                  << std::endl;

        return (-1);
    }

    auto reduce_invoker    = dev_mean_and_meansquare.MakeInvokerPointer();
    auto normalize_invoker = dev_normalize.MakeInvokerPointer();

    avg_time += reduce_invoker->Run(reduce_argument.get(), StreamConfig{nullptr, time_kernel});
    avg_time +=
        normalize_invoker->Run(normalize_argument.get(), StreamConfig{nullptr, time_kernel});

    num_bytes +=
        (total_length * sizeof(InOutDataType) + invariantLength * 2 * sizeof(AccDataType)) + // No.1
        (total_length * (1 * sizeof(InOutDataType) + 4 * sizeof(AccDataType)) +
         total_length * sizeof(InOutDataType)); // No.2

    // Step 3 (optional): running statistics and/or inverse variance.
    if(saveMeanAndInvVariance)
    {
        if(updateMovingAverage)
        {
            // Both requested: one fused kernel updates the running statistics in place and
            // writes the inverse variance.
            auto dev_moving_average_inv_variance = DeviceMovingAverageAndInvVarianceInstance{};

            auto post_argument = dev_moving_average_inv_variance.MakeArgumentPointer(
                bnScaleBiasMeanVarLengths,
                {bnScaleBiasMeanVarStrides,
                 bnScaleBiasMeanVarStrides,
                 bnScaleBiasMeanVarStrides,
                 bnScaleBiasMeanVarStrides},
                {bnScaleBiasMeanVarStrides,
                 bnScaleBiasMeanVarStrides,
                 bnScaleBiasMeanVarStrides},
                {p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
                {p_runningMean, p_runningVariance, p_saveInvVariance},
                MovingAverageAndInvVariance{epsilon, exponentialAverageFactor});

            if(!dev_moving_average_inv_variance.IsSupportedArgument(post_argument.get()))
            {
                std::cout << "Runtime parameters not supported by the Device, exiting!"
                          << std::endl;

                return (-1);
            }

            auto post_invoker = dev_moving_average_inv_variance.MakeInvokerPointer();

            avg_time += post_invoker->Run(post_argument.get(), StreamConfig{nullptr, time_kernel});

            num_bytes += invariantLength * (4 + 3) * sizeof(AccDataType) * 2; // No.5
        }
        else
        {
            // Only the inverse variance is requested.
            auto dev_inv_variance = DeviceInvVarianceInstance{};

            auto post_argument = dev_inv_variance.MakeArgumentPointer(
                bnScaleBiasMeanVarLengths,
                {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
                {bnScaleBiasMeanVarStrides},
                {p_mean, p_tmp_meansquare},
                {p_saveInvVariance},
                InvVariance{epsilon});

            if(!dev_inv_variance.IsSupportedArgument(post_argument.get()))
            {
                std::cout << "Runtime parameters not supported by the Device, exiting!"
                          << std::endl;

                return (-1);
            }

            auto post_invoker = dev_inv_variance.MakeInvokerPointer();

            avg_time += post_invoker->Run(post_argument.get(), StreamConfig{nullptr, time_kernel});

            num_bytes += invariantLength * (2 + 1) * sizeof(AccDataType);
        }
    }
    else if(updateMovingAverage)
    {
        // Only the running statistics are updated (in place).
        auto dev_moving_average = DeviceMovingAverageInstance{};

        auto post_argument = dev_moving_average.MakeArgumentPointer(
            bnScaleBiasMeanVarLengths,
            {bnScaleBiasMeanVarStrides,
             bnScaleBiasMeanVarStrides,
             bnScaleBiasMeanVarStrides,
             bnScaleBiasMeanVarStrides},
            {bnScaleBiasMeanVarStrides, bnScaleBiasMeanVarStrides},
            {p_mean, p_runningMean, p_tmp_meansquare, p_runningVariance},
            {p_runningMean, p_runningVariance},
            MovingAverage{exponentialAverageFactor});

        if(!dev_moving_average.IsSupportedArgument(post_argument.get()))
        {
            std::cout << "Runtime parameters not supported by the Device, exiting!" << std::endl;

            return (-1);
        }

        auto post_invoker = dev_moving_average.MakeInvokerPointer();

        avg_time += post_invoker->Run(post_argument.get(), StreamConfig{nullptr, time_kernel});

        num_bytes += invariantLength * (4 + 2) * sizeof(AccDataType) * 2; // No.5
    }

    if(time_kernel)
    {
        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
    }

    return (0);
}
example/34_batchnorm/batchnorm_forward_nhwc.cpp
View file @
24af0144
...
...
@@ -9,19 +9,16 @@
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward_nhwc_c.hpp"
#include "batchnorm_forward_impl.hpp"
template
<
typename
InOutDataType
,
typename
AccDataType
>
using
ReferenceBatchNormFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchNormFwd_Input_N_H_W_C_Output_C
<
InOutDataType
,
AccDataType
>
;
#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
static
struct
option
long_options
[]
=
{{
"inOutLengths"
,
required_argument
,
nullptr
,
'D'
},
{
"verify"
,
required_argument
,
nullptr
,
'v'
},
...
...
@@ -44,6 +41,7 @@ class BatchNormFwdArg
int
data_type
=
0
;
int
init_method
=
2
;
bool
time_kernel
=
false
;
bool
use_multiblock_welford
=
false
;
public:
void
show_usage
(
const
char
*
cmd
)
...
...
@@ -68,6 +66,7 @@ class BatchNormFwdArg
"value, 3=decimal value)"
<<
std
::
endl
;
std
::
cout
<<
"Arg5: time kernel (0=no, 1=yes)"
<<
std
::
endl
;
std
::
cout
<<
"Arg6: use multi-block welford (0=n0, 1=yes)"
<<
std
::
endl
;
};
int
processArgs
(
int
argc
,
char
*
argv
[])
...
...
@@ -110,14 +109,15 @@ class BatchNormFwdArg
};
};
if
(
optind
+
5
>
argc
)
if
(
optind
+
6
>
argc
)
throw
std
::
runtime_error
(
"Invalid cmd-line arguments, more argumetns are needed!"
);
data_type
=
std
::
atoi
(
argv
[
optind
++
]);
updateMovingAverage
=
std
::
atoi
(
argv
[
optind
++
]);
saveMeanAndInvVariance
=
std
::
atoi
(
argv
[
optind
++
]);
init_method
=
std
::
atoi
(
argv
[
optind
++
]);
time_kernel
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
]));
time_kernel
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
++
]));
use_multiblock_welford
=
static_cast
<
bool
>
(
std
::
atoi
(
argv
[
optind
]));
if
(
data_type
!=
0
&&
data_type
!=
1
&&
data_type
!=
3
&&
data_type
!=
5
&&
data_type
!=
6
)
return
(
-
1
);
...
...
@@ -128,7 +128,7 @@ class BatchNormFwdArg
using
namespace
ck
;
template
<
typename
InOutDataType
,
typename
AccDataType
>
template
<
typename
InOutDataType
,
typename
AccDataType
,
bool
UseMultiblockInK
>
bool
bnorm_fwd_nhwc_test
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
...
...
@@ -264,82 +264,145 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
std
::
array
<
index_t
,
Rank
-
NumReduceDim
>
i_scaleBiasMeanVarLengths
;
std
::
array
<
index_t
,
Rank
-
NumReduceDim
>
i_scaleBiasMeanVarStrides
;
std
::
copy
(
inOutLengths
.
begin
(),
inOutLengths
.
end
(),
i_inOutLengths
.
begin
());
std
::
copy
(
inOutStrides
.
begin
(),
inOutStrides
.
end
(),
i_inOutStrides
.
begin
());
std
::
copy
(
scaleBiasMeanVarLengths
.
begin
(),
scaleBiasMeanVarLengths
.
end
(),
i_scaleBiasMeanVarLengths
.
begin
());
std
::
copy
(
scaleBiasMeanVarStrides
.
begin
(),
scaleBiasMeanVarStrides
.
end
(),
i_scaleBiasMeanVarStrides
.
begin
());
int
result
=
0
;
// used for saving meansquare
DeviceMem
workspace
(
sizeof
(
AccDataType
)
*
2
*
resultSaveMean_ref
.
mDesc
.
GetElementSpaceSize
()
+
128
);
void
*
p_tmp_mean
=
workspace
.
GetDeviceBuffer
();
void
*
p_tmp_meansquare
=
static_cast
<
char
*>
(
p_tmp_mean
)
+
(
sizeof
(
AccDataType
)
*
resultSaveMean_ref
.
mDesc
.
GetElementSpaceSize
()
+
63
)
/
64
*
64
;
result
=
bnorm_fwd
<
InOutDataType
,
AccDataType
,
Rank
,
NumReduceDim
,
false
>
(
time_kernel
,
updateMovingAverage
,
saveMeanAndInvVariance
,
{
0
,
1
,
2
},
ck
::
ranges
::
copy
(
inOutLengths
,
i_inOutLengths
.
begin
());
ck
::
ranges
::
copy
(
inOutStrides
,
i_inOutStrides
.
begin
());
ck
::
ranges
::
copy
(
scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarLengths
.
begin
());
ck
::
ranges
::
copy
(
scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
.
begin
());
using
PassThroughOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceBatchNormFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceBatchNormFwdImpl
<
InOutDataType
,
InOutDataType
,
AccDataType
,
AccDataType
,
// ScaleDataType
AccDataType
,
// BiasDataType
AccDataType
,
// MeanVarDataType
PassThroughOp
,
// YElementwiseOp
Rank
,
NumReduceDim
,
UseMultiblockInK
,
256
,
16
,
16
,
1
,
2
,
0
,
1
,
1
,
1
,
1
,
1
>
;
auto
batchnorm_fwd
=
DeviceBatchNormFwdInstance
{};
auto
argument_ptr
=
batchnorm_fwd
.
MakeArgumentPointer
(
i_inOutLengths
,
i_inOutStrides
,
i_inOutStrides
,
{
0
,
1
,
2
},
i_scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
x_dev
.
GetDeviceBuffer
(),
bnScale_dev
.
GetDeviceBuffer
(),
bnBias_dev
.
GetDeviceBuffer
(),
y_dev
.
GetDeviceBuffer
(),
averageFactor
,
updateMovingAverage
?
resultRunningMean_dev
.
GetDeviceBuffer
()
:
nullptr
,
updateMovingAverage
?
resultRunningVariance_dev
.
GetDeviceBuffer
()
:
nullptr
,
epsilon
,
PassThroughOp
{},
y_dev
.
GetDeviceBuffer
(),
saveMeanAndInvVariance
?
resultSaveMean_dev
.
GetDeviceBuffer
()
:
nullptr
,
saveMeanAndInvVariance
?
resultSaveInvVariance_dev
.
GetDeviceBuffer
()
:
nullptr
,
p_tmp_mean
,
p_tmp_meansquare
);
averageFactor
,
updateMovingAverage
?
resultRunningMean_dev
.
GetDeviceBuffer
()
:
nullptr
,
updateMovingAverage
?
resultRunningVariance_dev
.
GetDeviceBuffer
()
:
nullptr
);
if
(
result
<
0
)
if
(
!
batchnorm_fwd
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
std
::
cout
<<
"The runtime parameters seems not supported by the BatchNorm device instance, "
"exiting!"
<<
std
::
endl
;
return
(
false
);
};
size_t
workspace_sz
=
batchnorm_fwd
.
GetWorkSpaceSize
(
argument_ptr
.
get
());
DeviceMem
workspace_dev
(
workspace_sz
);
batchnorm_fwd
.
SetWorkSpacePointer
(
argument_ptr
.
get
(),
workspace_dev
.
GetDeviceBuffer
());
auto
invoker_ptr
=
batchnorm_fwd
.
MakeInvokerPointer
();
if
(
time_kernel
)
{
float
avg_time
=
0.0
f
;
size_t
num_bytes
=
0
;
size_t
total_length
=
inOutLengths
[
0
]
*
inOutLengths
[
1
]
*
inOutLengths
[
2
]
*
inOutLengths
[
3
];
size_t
invariant_length
=
inOutLengths
[
3
];
avg_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
// inputing of x, scale, bias, outputing of y
num_bytes
+=
total_length
*
sizeof
(
InOutDataType
)
*
2
+
invariant_length
*
sizeof
(
AccDataType
)
*
2
;
// outputing of mean, inv-variance
num_bytes
+=
saveMeanAndInvVariance
?
invariant_length
*
sizeof
(
AccDataType
)
*
2
:
0
;
// updating of moving mean, variance
num_bytes
+=
updateMovingAverage
?
invariant_length
*
sizeof
(
AccDataType
)
*
4
:
0
;
float
gb_per_sec
=
num_bytes
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
}
else
(
void
)
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
bool
pass
=
true
;
if
(
do_verification
)
{
auto
batchNormFwd_ref
=
ReferenceBatchNormFwdInstance
<
InOutDataType
,
AccDataType
>
{};
using
ReferenceBatchNormFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchNormFwd_Input_N_H_W_C_Output_C
<
InOutDataType
,
InOutDataType
,
AccDataType
,
AccDataType
,
AccDataType
,
AccDataType
,
PassThroughOp
>
;
auto
batchNormFwd_ref
=
ReferenceBatchNormFwdInstance
{};
auto
argument_ptr_ref
=
batchNormFwd_ref
.
MakeArgumentPointer
(
i_inOutLengths
,
i_inOutStrides
,
i_inOutStrides
,
{
0
,
1
,
2
},
i_scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
x
.
mData
.
data
(),
bnScale
.
mData
.
data
(),
bnBias
.
mData
.
data
(),
y_ref
.
mData
.
data
(),
0.1
,
// exponentialAverageFactor
updateMovingAverage
?
resultRunningMean_ref
.
mData
.
data
()
:
nullptr
,
// resultRunningMean
updateMovingAverage
?
resultRunningVariance_ref
.
mData
.
data
()
:
nullptr
,
// resultRunningVariance
epsilon
,
PassThroughOp
{},
y_ref
.
mData
.
data
(),
saveMeanAndInvVariance
?
resultSaveMean_ref
.
mData
.
data
()
:
nullptr
,
saveMeanAndInvVariance
?
resultSaveInvVariance_ref
.
mData
.
data
()
:
nullptr
);
saveMeanAndInvVariance
?
resultSaveInvVariance_ref
.
mData
.
data
()
:
nullptr
,
averageFactor
,
updateMovingAverage
?
resultRunningMean_ref
.
mData
.
data
()
:
nullptr
,
updateMovingAverage
?
resultRunningVariance_ref
.
mData
.
data
()
:
nullptr
);
if
(
!
batchNormFwd_ref
.
IsSupportedArgument
(
argument_ptr_ref
.
get
()))
{
std
::
cout
<<
"The runtime parameters seems not supported by the BatchNorm
instance, exiting!"
std
::
cout
<<
"The runtime parameters seems not supported by the BatchNorm reference "
"
instance, exiting!"
<<
std
::
endl
;
return
(
-
2
);
return
(
false
);
};
auto
invoker_ptr_ref
=
batchNormFwd_ref
.
MakeInvokerPointer
();
...
...
@@ -347,7 +410,7 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
(
void
)
invoker_ptr_ref
->
Run
(
argument_ptr_ref
.
get
());
y_dev
.
FromDevice
(
y
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
y
.
mData
,
y_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
y
,
y_ref
);
if
(
updateMovingAverage
)
{
...
...
@@ -357,23 +420,22 @@ bool bnorm_fwd_nhwc_test(bool do_verification,
resultRunningMean_dev
.
FromDevice
(
resultRunningMean
.
mData
.
data
());
resultRunningVariance_dev
.
FromDevice
(
resultRunningVariance
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultRunningMean
.
mData
,
resultRunningMean_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultRunningVariance
.
mData
,
resultRunningVariance_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultRunningMean
,
resultRunningMean_ref
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultRunningVariance
,
resultRunningVariance_ref
);
};
if
(
saveMeanAndInvVariance
)
{
using
ck
::
host_common
::
dumpBufferToFile
;
Tensor
<
AccDataType
>
resultSaveMean
(
scaleBiasMeanVarLengths
);
Tensor
<
AccDataType
>
resultSaveInvVariance
(
scaleBiasMeanVarLengths
);
resultSaveMean_dev
.
FromDevice
(
resultSaveMean
.
mData
.
data
());
resultSaveInvVariance_dev
.
FromDevice
(
resultSaveInvVariance
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultSaveMean
.
mData
,
resultSaveMean_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultSaveInvVariance
.
mData
,
resultSaveInvVariance_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultSaveMean
,
resultSaveMean_ref
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
resultSaveInvVariance
,
resultSaveInvVariance_ref
);
};
};
...
...
@@ -396,7 +458,17 @@ int main(int argc, char* argv[])
if
(
arg
.
data_type
==
0
)
{
pass
=
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
>
(
arg
.
do_verification
,
if
(
arg
.
use_multiblock_welford
)
pass
=
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
,
true
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
arg
.
updateMovingAverage
,
arg
.
saveMeanAndInvVariance
,
averageFactor
,
epsilon
);
else
pass
=
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
,
false
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
...
...
@@ -407,7 +479,17 @@ int main(int argc, char* argv[])
}
else
if
(
arg
.
data_type
==
1
)
{
pass
=
bnorm_fwd_nhwc_test
<
float
,
float
>
(
arg
.
do_verification
,
if
(
arg
.
use_multiblock_welford
)
pass
=
bnorm_fwd_nhwc_test
<
float
,
float
,
true
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
arg
.
updateMovingAverage
,
arg
.
saveMeanAndInvVariance
,
averageFactor
,
epsilon
);
else
pass
=
bnorm_fwd_nhwc_test
<
float
,
float
,
false
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
...
...
@@ -418,7 +500,17 @@ int main(int argc, char* argv[])
}
else
if
(
arg
.
data_type
==
3
)
{
pass
=
bnorm_fwd_nhwc_test
<
int8_t
,
float
>
(
arg
.
do_verification
,
if
(
arg
.
use_multiblock_welford
)
pass
=
bnorm_fwd_nhwc_test
<
int8_t
,
float
,
true
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
arg
.
updateMovingAverage
,
arg
.
saveMeanAndInvVariance
,
averageFactor
,
epsilon
);
else
pass
=
bnorm_fwd_nhwc_test
<
int8_t
,
float
,
false
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
...
...
@@ -429,7 +521,17 @@ int main(int argc, char* argv[])
}
else
if
(
arg
.
data_type
==
5
)
{
pass
=
bnorm_fwd_nhwc_test
<
ck
::
bhalf_t
,
float
>
(
arg
.
do_verification
,
if
(
arg
.
use_multiblock_welford
)
pass
=
bnorm_fwd_nhwc_test
<
ck
::
bhalf_t
,
float
,
true
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
arg
.
updateMovingAverage
,
arg
.
saveMeanAndInvVariance
,
averageFactor
,
epsilon
);
else
pass
=
bnorm_fwd_nhwc_test
<
ck
::
bhalf_t
,
float
,
false
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
...
...
@@ -440,7 +542,17 @@ int main(int argc, char* argv[])
}
else
if
(
arg
.
data_type
==
6
)
{
pass
=
bnorm_fwd_nhwc_test
<
double
,
double
>
(
arg
.
do_verification
,
if
(
arg
.
use_multiblock_welford
)
pass
=
bnorm_fwd_nhwc_test
<
double
,
double
,
true
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
arg
.
updateMovingAverage
,
arg
.
saveMeanAndInvVariance
,
averageFactor
,
epsilon
);
else
pass
=
bnorm_fwd_nhwc_test
<
double
,
double
,
false
>
(
arg
.
do_verification
,
arg
.
init_method
,
arg
.
time_kernel
,
arg
.
inOutLengths
,
...
...
@@ -452,12 +564,21 @@ int main(int argc, char* argv[])
}
else
{
pass
=
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
>
(
true
,
pass
=
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
,
true
>
(
true
,
2
,
false
,
// don't time kernel
{
128
,
16
,
16
,
1024
},
{
128
,
16
,
6
,
512
},
true
,
true
,
averageFactor
,
epsilon
);
pass
=
pass
&&
bnorm_fwd_nhwc_test
<
ck
::
half_t
,
float
,
false
>
(
true
,
2
,
false
,
// don't time kernel
{
128
,
16
,
3
,
1024
},
true
,
true
,
false
,
averageFactor
,
epsilon
);
};
...
...
example/34_batchnorm/batchnorm_infer_impl.hpp
View file @
24af0144
...
...
@@ -10,12 +10,16 @@
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_elementwise.hpp"
#include "batchnorm_common.hpp"
template
<
typename
InOutDataType
,
template
<
typename
XDataType
,
typename
YDataType
,
typename
AccDataType
,
typename
ScaleDataType
,
typename
BiasDataType
,
typename
MeanVarDataType
,
ck
::
index_t
Rank
,
ck
::
index_t
NumBatchNormReduceDim
,
bool
fastest_dim_is_reduced
=
false
>
...
...
@@ -26,7 +30,9 @@ int bnorm_infer(
const
std
::
array
<
ck
::
index_t
,
Rank
>
xStrides
,
const
std
::
array
<
ck
::
index_t
,
Rank
>
yStrides
,
const
std
::
array
<
ck
::
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarLengths
,
const
std
::
array
<
ck
::
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleBiasMeanVarStrides
,
const
std
::
array
<
ck
::
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnScaleStrides
,
const
std
::
array
<
ck
::
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnBiasStrides
,
const
std
::
array
<
ck
::
index_t
,
Rank
-
NumBatchNormReduceDim
>
bnMeanVarStrides
,
const
void
*
p_x
,
const
void
*
p_scale
,
const
void
*
p_bias
,
...
...
@@ -41,11 +47,11 @@ int bnorm_infer(
"Invalid number of reduced dimensions for batchnorm!"
);
using
DeviceNormalizeInstance
=
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
InOut
DataType
,
AccDataType
,
AccDataType
,
AccDataType
,
AccDataType
>
,
// x, mean,
ck
::
Tuple
<
X
DataType
,
AccDataType
,
AccDataType
,
AccDataType
,
AccDataType
>
,
// x, mean,
// variance,
// scale,
// bias,
ck
::
Tuple
<
InOut
DataType
>
,
// y
ck
::
Tuple
<
Y
DataType
>
,
// y
NormalizeInInfer
,
Rank
,
2
,
// MPerthread
...
...
@@ -53,14 +59,18 @@ int bnorm_infer(
ck
::
Sequence
<
1
>>
;
// scalarPerVector: y
auto
invariantDims
=
get_invariant_dims
<
Rank
,
NumBatchNormReduceDim
>
(
reduceDims
);
std
::
array
<
ck
::
index_t
,
Rank
>
aligned_scaleBiasMeanVarStrides
{
0
};
std
::
array
<
ck
::
index_t
,
Rank
>
aligned_bnScaleStrides
{
0
};
std
::
array
<
ck
::
index_t
,
Rank
>
aligned_bnBiasStrides
{
0
};
std
::
array
<
ck
::
index_t
,
Rank
>
aligned_bnMeanVarStrides
{
0
};
int
i
=
0
;
for
(
auto
dim
:
invariantDims
)
{
assert
(
xyLengths
[
dim
]
==
bnScaleBiasMeanVarLengths
[
i
]);
aligned_scaleBiasMeanVarStrides
[
dim
]
=
bnScaleBiasMeanVarStrides
[
i
];
aligned_bnScaleStrides
[
dim
]
=
bnScaleStrides
[
i
];
aligned_bnBiasStrides
[
dim
]
=
bnBiasStrides
[
i
];
aligned_bnMeanVarStrides
[
dim
]
=
bnMeanVarStrides
[
i
];
i
++
;
};
...
...
@@ -84,10 +94,10 @@ int bnorm_infer(
auto
argument_ptr1
=
dev_normalize
.
MakeArgumentPointer
(
xyLengths
,
{
xStrides
,
aligned_
scaleBias
MeanVarStrides
,
aligned_
scaleBias
MeanVarStrides
,
aligned_
s
cale
BiasMeanVar
Strides
,
aligned_
scaleBiasMeanVar
Strides
},
aligned_
bn
MeanVarStrides
,
aligned_
bn
MeanVarStrides
,
aligned_
bnS
caleStrides
,
aligned_
bnBias
Strides
},
{
yStrides
},
{
p_x
,
p_estimatedMean
,
p_estimatedVariance
,
p_scale
,
p_bias
},
{
p_y
},
...
...
@@ -105,8 +115,10 @@ int bnorm_infer(
avg_time
+=
invoker_ptr1
->
Run
(
argument_ptr1
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
num_bytes
+=
(
total_length
*
(
1
*
sizeof
(
InOutDataType
)
+
4
*
sizeof
(
AccDataType
))
+
total_length
*
sizeof
(
InOutDataType
));
num_bytes
+=
total_length
*
sizeof
(
XDataType
)
+
invariantLength
*
(
sizeof
(
ScaleDataType
)
+
sizeof
(
BiasDataType
)
+
2
*
sizeof
(
MeanVarDataType
))
+
total_length
*
sizeof
(
YDataType
);
if
(
time_kernel
)
{
...
...
example/34_batchnorm/batchnorm_infer_nhwc.cpp
View file @
24af0144
...
...
@@ -9,6 +9,7 @@
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
...
...
@@ -18,11 +19,6 @@
#include "batchnorm_infer_impl.hpp"
template
<
typename
InOutDataType
,
typename
AccDataType
>
using
ReferenceBatchNormInferInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchNormInfer_Input_N_H_W_C_Output_C
<
InOutDataType
,
AccDataType
>
;
static
struct
option
long_options
[]
=
{{
"inOutLengths"
,
required_argument
,
nullptr
,
'D'
},
{
"verify"
,
required_argument
,
nullptr
,
'v'
},
{
"help"
,
no_argument
,
nullptr
,
'?'
},
...
...
@@ -225,25 +221,30 @@ bool bnorm_infer_nhwc_test(bool do_verification,
std
::
array
<
index_t
,
Rank
-
NumReduceDim
>
i_scaleBiasMeanVarLengths
;
std
::
array
<
index_t
,
Rank
-
NumReduceDim
>
i_scaleBiasMeanVarStrides
;
std
::
copy
(
inOutLengths
.
begin
(),
inOutLengths
.
end
(),
i_inOutLengths
.
begin
());
std
::
copy
(
inOutStrides
.
begin
(),
inOutStrides
.
end
(),
i_inOutStrides
.
begin
());
std
::
copy
(
scaleBiasMeanVarLengths
.
begin
(),
scaleBiasMeanVarLengths
.
end
(),
i_scaleBiasMeanVarLengths
.
begin
());
std
::
copy
(
scaleBiasMeanVarStrides
.
begin
(),
scaleBiasMeanVarStrides
.
end
(),
i_scaleBiasMeanVarStrides
.
begin
());
ck
::
ranges
::
copy
(
inOutLengths
,
i_inOutLengths
.
begin
());
ck
::
ranges
::
copy
(
inOutStrides
,
i_inOutStrides
.
begin
());
ck
::
ranges
::
copy
(
scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarLengths
.
begin
());
ck
::
ranges
::
copy
(
scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
.
begin
());
int
result
=
0
;
result
=
bnorm_infer
<
InOutDataType
,
AccDataType
,
Rank
,
NumReduceDim
,
false
>
(
time_kernel
,
result
=
bnorm_infer
<
InOutDataType
,
InOutDataType
,
AccDataType
,
AccDataType
,
AccDataType
,
AccDataType
,
Rank
,
NumReduceDim
,
false
>
(
time_kernel
,
{
0
,
1
,
2
},
i_inOutLengths
,
i_inOutStrides
,
i_inOutStrides
,
i_scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
x_dev
.
GetDeviceBuffer
(),
bnScale_dev
.
GetDeviceBuffer
(),
bnBias_dev
.
GetDeviceBuffer
(),
...
...
@@ -259,7 +260,15 @@ bool bnorm_infer_nhwc_test(bool do_verification,
if
(
do_verification
)
{
auto
batchNormInfer_ref
=
ReferenceBatchNormInferInstance
<
InOutDataType
,
AccDataType
>
{};
using
ReferenceBatchNormInferInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchNormInfer_Input_N_H_W_C_Output_C
<
InOutDataType
,
InOutDataType
,
AccDataType
,
AccDataType
,
AccDataType
,
AccDataType
>
;
auto
batchNormInfer_ref
=
ReferenceBatchNormInferInstance
{};
auto
argument_ptr_ref
=
batchNormInfer_ref
.
MakeArgumentPointer
(
i_inOutLengths
,
...
...
@@ -267,6 +276,8 @@ bool bnorm_infer_nhwc_test(bool do_verification,
i_inOutStrides
,
i_scaleBiasMeanVarLengths
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
i_scaleBiasMeanVarStrides
,
x
.
mData
.
data
(),
bnScale
.
mData
.
data
(),
bnBias
.
mData
.
data
(),
...
...
@@ -288,7 +299,7 @@ bool bnorm_infer_nhwc_test(bool do_verification,
(
void
)
invoker_ptr_ref
->
Run
(
argument_ptr_ref
.
get
());
y_dev
.
FromDevice
(
y
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
y
.
mData
,
y_ref
.
mData
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
y
,
y_ref
);
};
return
(
pass
);
...
...
example/35_splitK_gemm/run_splitK_gemm_example.inc
View file @
24af0144
...
...
@@ -34,15 +34,15 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
stride
,
1
}));
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1_
uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
1
,
stride
}));
return
HostTensorDescriptor
({
row
,
col
},
{
1_
uz
,
stride
});
}
};
...
...
@@ -146,15 +146,12 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
if
(
std
::
is_same
<
CDataType
,
ck
::
half_t
>::
value
)
{
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
,
"fp16 incorrect result"
,
3
e
-
3
,
1
e
-
3
);
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
,
"fp16 incorrect result"
,
3
e
-
3
,
1
e
-
3
);
}
else
{
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
.
mData
,
c_m_n_host_result
.
mData
);
pass
&=
ck
::
utils
::
check_err
(
c_m_n_device_result
,
c_m_n_host_result
);
}
}
...
...
example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp
View file @
24af0144
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp
View file @
24af0144
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp
View file @
24af0144
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp
View file @
24af0144
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp
View file @
24af0144
...
...
@@ -8,7 +8,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_gemm_xdl_splitk_c_shuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
...
...
example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
View file @
24af0144
...
...
@@ -9,7 +9,7 @@
#include <ctime>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_sparse_embedding3_forward_layernorm.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_sparse_embedding3_forward_layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
@@ -86,12 +86,10 @@ int main()
constexpr
auto
index_length
=
2048
;
constexpr
AccDataType
epsilon
=
1e-4
;
auto
f_host_tensor_desc_1d
=
[](
std
::
size_t
len_
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
len_
}));
};
auto
f_host_tensor_desc_1d
=
[](
std
::
size_t
len_
)
{
return
HostTensorDescriptor
({
len_
});
};
auto
f_host_tensor_desc_2d
=
[](
std
::
size_t
rows_
,
std
::
size_t
cols_
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
(
{
rows_
,
cols_
})
)
;
return
HostTensorDescriptor
({
rows_
,
cols_
});
};
using
ReferenceInstance
=
...
...
@@ -203,8 +201,7 @@ int main()
ref_invoker
.
Run
(
ref_argument
);
out_dev
.
FromDevice
(
out_from_dev
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
out_from_dev
.
mData
,
out
.
mData
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
pass
&=
ck
::
utils
::
check_err
(
out_from_dev
,
out
,
"Error: Incorrect results"
,
1e-3
,
1e-3
);
}
double
total_read
=
current_dim
*
index_length
*
3
*
sizeof
(
EmbType
)
+
...
...
example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp
View file @
24af0144
...
...
@@ -12,13 +12,14 @@ Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/
impl/
device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
template
<
ck
::
index_t
...
Is
>
...
...
@@ -314,15 +315,15 @@ int main(int argc, char* argv[])
std
::
size_t
stride
,
std
::
size_t
batch_stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
Row
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
stride
,
1
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
batch_stride
,
1
,
stride
}));
return
HostTensorDescriptor
({
batch_count
,
row
,
col
},
{
batch_stride
,
1
_uz
,
stride
});
}
};
...
...
@@ -511,8 +512,7 @@ int main(int argc, char* argv[])
cde1_element_op
(
e1_g_m_o_host_result
(
idx
),
c1_g_m_o
(
idx
),
d1_g_m_o
(
idx
));
});
return
ck
::
utils
::
check_err
(
e1_g_m_o_device_result
.
mData
,
e1_g_m_o_host_result
.
mData
)
?
0
:
1
;
return
ck
::
utils
::
check_err
(
e1_g_m_o_device_result
,
e1_g_m_o_host_result
)
?
0
:
1
;
}
return
0
;
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
41
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment