Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
d20c20a6
Commit
d20c20a6
authored
Oct 16, 2024
by
Mirza Halilcevic
Browse files
Merge remote-tracking branch 'upstream/develop' into gemm_elementwise_gemm
parents
250a89f3
10158b0f
Changes
95
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1873 additions
and
151 deletions
+1873
-151
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
...e/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+3
-3
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp
...line/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp
+4
-5
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+10
-7
include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
...gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+424
-0
include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
+27
-0
include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
...ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
+247
-86
include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
...ps/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
+13
-9
library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
...library/reference_tensor_operation/gpu/reference_gemm.hpp
+245
-0
library/src/tensor_operation_instance/gpu/CMakeLists.txt
library/src/tensor_operation_instance/gpu/CMakeLists.txt
+7
-21
profiler/src/CMakeLists.txt
profiler/src/CMakeLists.txt
+6
-6
script/cmake-ck-dev.sh
script/cmake-ck-dev.sh
+2
-1
script/cmake-ck-release.sh
script/cmake-ck-release.sh
+2
-1
test/CMakeLists.txt
test/CMakeLists.txt
+4
-12
test/data_type/CMakeLists.txt
test/data_type/CMakeLists.txt
+5
-0
test/data_type/test_custom_type.cpp
test/data_type/test_custom_type.cpp
+874
-0
No files found.
include/ck_tile/ops/gemm/pipeline/
block_
gemm_pipeline_agmem_bgmem_creg_v2.hpp
→
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
View file @
d20c20a6
...
...
@@ -4,15 +4,15 @@
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/pipeline/
block_
gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
namespace
ck_tile
{
// A Tile Window: global memory
// B Tile Window: global memory
// C Distributed tensor: register
template
<
typename
Problem
,
typename
Policy
=
Block
GemmPipelineAGmemBGmemCRegV2DefaultPolicy
>
struct
Block
GemmPipelineAGmemBGmemCRegV2
template
<
typename
Problem
,
typename
Policy
=
GemmPipelineAGmemBGmemCRegV2DefaultPolicy
>
struct
GemmPipelineAGmemBGmemCRegV2
{
using
ADataType
=
remove_cvref_t
<
typename
Problem
::
ADataType
>
;
using
BDataType
=
remove_cvref_t
<
typename
Problem
::
BDataType
>
;
...
...
include/ck_tile/ops/gemm/pipeline/
block_
gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp
→
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp
View file @
d20c20a6
...
...
@@ -7,12 +7,11 @@
namespace
ck_tile
{
// Default policy for
Block
GemmPipelineAGmemBGmemCRegV2
// Default policy for GemmPipelineAGmemBGmemCRegV2
// Default policy class should not be templated, put template on member functions instead
// NOTE: policy should be binded to its corresponding operation. It's just a coincidence that
// BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy
using
BlockGemmPipelineAGmemBGmemCRegV2DefaultPolicy
=
BlockGemmPipelineAGmemBGmemCRegV1DefaultPolicy
;
// GemmPipelineAGmemBGmemCRegV2DefaultPolicy is the same as
// GemmPipelineAGmemBGmemCRegV1DefaultPolicy
using
GemmPipelineAGmemBGmemCRegV2DefaultPolicy
=
GemmPipelineAGmemBGmemCRegV1DefaultPolicy
;
}
// namespace ck_tile
include/ck_tile/ops/gemm/pipeline/
block_
gemm_pipeline_problem.hpp
→
include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
View file @
d20c20a6
...
...
@@ -13,20 +13,23 @@ template <typename ADataType_,
typename
BDataType_
,
typename
CDataType_
,
typename
BlockGemmShape_
,
bool
kPadA_
=
false
,
bool
kPadB_
=
false
,
bool
kPadC_
=
false
>
struct
BlockGemmPipelineProblem
typename
TileGemmTraits_
>
struct
GemmPipelineProblem
{
using
ADataType
=
remove_cvref_t
<
ADataType_
>
;
using
BDataType
=
remove_cvref_t
<
BDataType_
>
;
using
CDataType
=
remove_cvref_t
<
CDataType_
>
;
using
BlockGemmShape
=
remove_cvref_t
<
BlockGemmShape_
>
;
using
GemmTraits
=
remove_cvref_t
<
TileGemmTraits_
>
;
static
constexpr
index_t
kBlockSize
=
BlockGemmShape
::
NumWarps
*
get_warp_size
();
static
constexpr
bool
kPadA
=
kPadA_
;
static
constexpr
bool
kPadB
=
kPadB_
;
static
constexpr
bool
kPadC
=
kPadC_
;
static
constexpr
bool
kPadA
=
GemmTraits
::
kPadA
;
static
constexpr
bool
kPadB
=
GemmTraits
::
kPadB
;
static
constexpr
bool
kPadC
=
GemmTraits
::
kPadC
;
using
LayoutA
=
remove_cvref_t
<
typename
GemmTraits
::
LayoutA
>
;
using
LayoutB
=
remove_cvref_t
<
typename
GemmTraits
::
LayoutB
>
;
using
LayoutC
=
remove_cvref_t
<
typename
GemmTraits
::
LayoutC
>
;
static
constexpr
index_t
AlignmentA
=
kPadA
?
1
:
VectorLoadSize
/
sizeof
(
ADataType
);
static
constexpr
index_t
AlignmentB
=
kPadB
?
1
:
VectorLoadSize
/
sizeof
(
BDataType
);
...
...
include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
0 → 100644
View file @
d20c20a6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
namespace
ck_tile
{
// UniversalGemm Policy
template
<
typename
LayoutA_
,
typename
LayoutB_
,
typename
LayoutC_
>
struct
UniversalGemmPipelineAgBgCrPolicy
{
using
LayoutA
=
remove_cvref_t
<
LayoutA_
>
;
using
LayoutB
=
remove_cvref_t
<
LayoutB_
>
;
using
LayoutC
=
remove_cvref_t
<
LayoutC_
>
;
static
constexpr
auto
I0
=
number
<
0
>
{};
static
constexpr
auto
I1
=
number
<
1
>
{};
static
constexpr
auto
I2
=
number
<
2
>
{};
static
constexpr
bool
TransposeC
=
true
;
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeALdsBlockDescriptor
()
{
using
WarpGemm
=
WarpGemmMfmaDispatcher
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
typename
Problem
::
CDataType
,
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I0
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I1
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I2
),
TransposeC
>
;
using
ADataType
=
remove_cvref_t
<
typename
Problem
::
ADataType
>
;
constexpr
index_t
MPerBlock
=
Problem
::
BlockGemmShape
::
kM
;
constexpr
index_t
KPerBlock
=
Problem
::
BlockGemmShape
::
kK
;
constexpr
index_t
K1
=
WarpGemm
::
kK
;
constexpr
index_t
K0
=
KPerBlock
/
K1
;
if
constexpr
(
std
::
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
LayoutA
>::
value
)
{
constexpr
auto
MLdsLayer
=
32
*
4
/
KPerBlock
/
sizeof
(
ADataType
)
<
1
?
1
:
32
*
4
/
KPerBlock
/
sizeof
(
ADataType
);
constexpr
auto
a_lds_block_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
*
number
<
MLdsLayer
>
{},
number
<
MPerBlock
/
MLdsLayer
>
{},
K1
),
make_tuple
(
K1
,
number
<
KPerBlock
*
MLdsLayer
>
{},
I1
));
constexpr
auto
a_lds_block_desc_permuted
=
transform_tensor_descriptor
(
a_lds_block_desc
,
make_tuple
(
make_xor_transform
(
make_tuple
(
number
<
MPerBlock
/
MLdsLayer
>
{},
number
<
K0
*
MLdsLayer
>
{})),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
1
,
0
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
1
,
0
>
{},
sequence
<
2
>
{}));
constexpr
auto
a_lds_block_desc_ak0_kMLdsLayer_m_ak1
=
transform_tensor_descriptor
(
a_lds_block_desc_permuted
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
number
<
MLdsLayer
>
{})),
make_pass_through_transform
(
number
<
MPerBlock
/
MLdsLayer
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
0
,
2
>
{},
sequence
<
1
>
{},
sequence
<
3
>
{}));
constexpr
auto
a_lds_block_desc_m_k
=
transform_tensor_descriptor
(
a_lds_block_desc_ak0_kMLdsLayer_m_ak1
,
make_tuple
(
make_merge_transform_v3_division_mod
(
make_tuple
(
K0
,
K1
)),
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
MPerBlock
/
MLdsLayer
>
{},
number
<
MLdsLayer
>
{}))),
make_tuple
(
sequence
<
0
,
3
>
{},
sequence
<
1
,
2
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
>
{}));
return
a_lds_block_desc_m_k
;
}
else
// ColumnMajor A
{
// kfold and mpair dimension is not always required.
// more dimension in merge_transform increase the difficulty of generating immarg offset
// for compiler.
constexpr
auto
M0
=
get_warp_size
()
*
Problem
::
BlockGemmShape
::
BlockWarps
::
at
(
I0
);
constexpr
auto
M1
=
MPerBlock
/
M0
;
constexpr
auto
KThreadWrite
=
Problem
::
kBlockSize
/
M0
;
constexpr
auto
K0PerThreadWrite
=
K0
/
KThreadWrite
;
constexpr
auto
KThreadRead
=
64
/
WarpGemm
::
kM
;
constexpr
auto
K0PerThreadRead
=
K0
/
KThreadRead
;
constexpr
auto
kfold
=
(
K1
*
M0
*
sizeof
(
ADataType
)
>
128
)
?
1
:
128
/
(
K1
*
M0
*
sizeof
(
ADataType
));
constexpr
auto
KThreadReadPerm
=
(
kfold
*
K0PerThreadWrite
/
K0PerThreadRead
)
>
1
?
KThreadRead
/
(
kfold
*
K0PerThreadWrite
/
K0PerThreadRead
)
:
KThreadRead
;
// 1<=mpair<=kN0
constexpr
auto
mpair
=
(
K1
*
WarpGemm
::
kM
*
sizeof
(
ADataType
)
>
128
)
?
1
:
((
128
/
(
K1
*
WarpGemm
::
kM
*
sizeof
(
ADataType
)))
>
M0
?
M0
:
128
/
(
K1
*
WarpGemm
::
kM
*
sizeof
(
ADataType
)));
constexpr
auto
a_lds_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{},
number
<
K0PerThreadWrite
>
{},
number
<
KThreadReadPerm
*
M1
>
{},
number
<
kfold
*
M0
/
mpair
>
{},
number
<
mpair
>
{},
K1
));
constexpr
auto
a_lds_block_desc_permuted
=
transform_tensor_descriptor
(
a_lds_block_desc
,
make_tuple
(
make_pass_through_transform
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{}),
make_pass_through_transform
(
number
<
K0PerThreadWrite
>
{}),
make_xor_transform
(
make_tuple
(
number
<
KThreadReadPerm
*
M1
>
{},
number
<
kfold
*
M0
/
mpair
>
{})),
make_pass_through_transform
(
number
<
mpair
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
,
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
,
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}));
constexpr
auto
a_lds_block_desc_unmerged
=
transform_tensor_descriptor
(
a_lds_block_desc_permuted
,
make_tuple
(
make_pass_through_transform
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{}),
make_pass_through_transform
(
number
<
K0PerThreadWrite
>
{}),
make_unmerge_transform
(
make_tuple
(
number
<
KThreadReadPerm
>
{},
number
<
M1
>
{})),
make_unmerge_transform
(
make_tuple
(
number
<
kfold
>
{},
number
<
M0
/
mpair
>
{})),
make_pass_through_transform
(
number
<
mpair
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{},
sequence
<
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
2
>
{},
sequence
<
0
,
3
>
{},
sequence
<
4
,
5
>
{},
sequence
<
6
>
{},
sequence
<
7
>
{}));
constexpr
auto
a_lds_block_desc_m_k
=
transform_tensor_descriptor
(
a_lds_block_desc_unmerged
,
make_tuple
(
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
KThreadReadPerm
>
{},
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{},
number
<
kfold
>
{},
number
<
K0PerThreadWrite
>
{},
K1
)),
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
M0
/
mpair
>
{},
number
<
mpair
>
{},
number
<
M1
>
{}))),
make_tuple
(
sequence
<
0
,
1
,
4
,
2
,
7
>
{},
sequence
<
5
,
6
,
3
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
>
{}));
return
a_lds_block_desc_m_k
;
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBLdsBlockDescriptor
()
{
using
WarpGemm
=
WarpGemmMfmaDispatcher
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
typename
Problem
::
CDataType
,
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I0
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I1
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I2
),
TransposeC
>
;
using
BDataType
=
remove_cvref_t
<
typename
Problem
::
BDataType
>
;
constexpr
index_t
NPerBlock
=
Problem
::
BlockGemmShape
::
kN
;
constexpr
index_t
KPerBlock
=
Problem
::
BlockGemmShape
::
kK
;
constexpr
index_t
K1
=
WarpGemm
::
kK
;
constexpr
index_t
K0
=
KPerBlock
/
K1
;
if
constexpr
(
std
::
is_same
<
tensor_layout
::
gemm
::
ColumnMajor
,
LayoutB
>::
value
)
{
// NLdsLayer * K0 as logical Bank
constexpr
auto
NLdsLayer
=
32
*
4
/
KPerBlock
/
sizeof
(
BDataType
)
<
1
?
1
:
32
*
4
/
KPerBlock
/
sizeof
(
BDataType
);
;
constexpr
auto
b_lds_block_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
K0
*
number
<
NLdsLayer
>
{},
number
<
NPerBlock
/
NLdsLayer
>
{},
K1
),
make_tuple
(
K1
,
number
<
KPerBlock
*
NLdsLayer
>
{},
I1
));
constexpr
auto
b_lds_block_desc_permuted
=
transform_tensor_descriptor
(
b_lds_block_desc
,
make_tuple
(
make_xor_transform
(
make_tuple
(
number
<
NPerBlock
/
NLdsLayer
>
{},
number
<
K0
*
NLdsLayer
>
{})),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
1
,
0
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
1
,
0
>
{},
sequence
<
2
>
{}));
constexpr
auto
b_lds_block_desc_bk0_kNLdsLayer_n_bk1
=
transform_tensor_descriptor
(
b_lds_block_desc_permuted
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
K0
,
number
<
NLdsLayer
>
{})),
make_pass_through_transform
(
number
<
NPerBlock
/
NLdsLayer
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{}),
make_tuple
(
sequence
<
0
,
2
>
{},
sequence
<
1
>
{},
sequence
<
3
>
{}));
constexpr
auto
b_lds_block_desc_n_k
=
transform_tensor_descriptor
(
b_lds_block_desc_bk0_kNLdsLayer_n_bk1
,
make_tuple
(
make_merge_transform_v3_division_mod
(
make_tuple
(
K0
,
K1
)),
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
NPerBlock
/
NLdsLayer
>
{},
number
<
NLdsLayer
>
{}))),
make_tuple
(
sequence
<
0
,
3
>
{},
sequence
<
1
,
2
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
>
{}));
return
b_lds_block_desc_n_k
;
}
else
// RowMajor B
{
constexpr
auto
N0
=
get_warp_size
()
*
Problem
::
BlockGemmShape
::
BlockWarps
::
at
(
I1
);
constexpr
auto
N1
=
NPerBlock
/
N0
;
constexpr
auto
KThreadWrite
=
Problem
::
kBlockSize
/
N0
;
constexpr
auto
K0PerThreadWrite
=
K0
/
KThreadWrite
;
constexpr
auto
KThreadRead
=
64
/
WarpGemm
::
kN
;
constexpr
auto
K0PerThreadRead
=
K0
/
KThreadRead
;
constexpr
auto
kfold
=
(
K1
*
N0
*
sizeof
(
BDataType
)
>
128
)
?
1
:
128
/
(
K1
*
N0
*
sizeof
(
BDataType
));
constexpr
auto
KThreadReadPerm
=
(
kfold
*
K0PerThreadWrite
/
K0PerThreadRead
)
>
1
?
KThreadRead
/
(
kfold
*
K0PerThreadWrite
/
K0PerThreadRead
)
:
KThreadRead
;
// 1<=npair<=kN0
constexpr
auto
npair
=
(
K1
*
WarpGemm
::
kN
*
sizeof
(
BDataType
)
>
128
)
?
1
:
((
128
/
(
K1
*
WarpGemm
::
kN
*
sizeof
(
BDataType
)))
>
N0
?
N0
:
128
/
(
K1
*
WarpGemm
::
kN
*
sizeof
(
BDataType
)));
constexpr
auto
b_lds_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{},
number
<
K0PerThreadWrite
>
{},
number
<
KThreadReadPerm
*
N1
>
{},
number
<
kfold
*
N0
/
npair
>
{},
number
<
npair
>
{},
K1
));
constexpr
auto
b_lds_block_desc_permuted
=
transform_tensor_descriptor
(
b_lds_block_desc
,
make_tuple
(
make_pass_through_transform
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{}),
make_pass_through_transform
(
number
<
K0PerThreadWrite
>
{}),
make_xor_transform
(
make_tuple
(
number
<
KThreadReadPerm
*
N1
>
{},
number
<
kfold
*
N0
/
npair
>
{})),
make_pass_through_transform
(
number
<
npair
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
,
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
,
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}));
constexpr
auto
b_lds_block_desc_unmerged
=
transform_tensor_descriptor
(
b_lds_block_desc_permuted
,
make_tuple
(
make_pass_through_transform
(
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{}),
make_pass_through_transform
(
number
<
K0PerThreadWrite
>
{}),
make_unmerge_transform
(
make_tuple
(
number
<
KThreadReadPerm
>
{},
number
<
N1
>
{})),
make_unmerge_transform
(
make_tuple
(
number
<
kfold
>
{},
number
<
N0
/
npair
>
{})),
make_pass_through_transform
(
number
<
npair
>
{}),
make_pass_through_transform
(
K1
)),
make_tuple
(
sequence
<
0
>
{},
sequence
<
1
>
{},
sequence
<
2
>
{},
sequence
<
3
>
{},
sequence
<
4
>
{},
sequence
<
5
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
2
>
{},
sequence
<
0
,
3
>
{},
sequence
<
4
,
5
>
{},
sequence
<
6
>
{},
sequence
<
7
>
{}));
constexpr
auto
b_lds_block_desc_n_k
=
transform_tensor_descriptor
(
b_lds_block_desc_unmerged
,
make_tuple
(
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
KThreadReadPerm
>
{},
number
<
KThreadWrite
/
kfold
/
KThreadReadPerm
>
{},
number
<
kfold
>
{},
number
<
K0PerThreadWrite
>
{},
K1
)),
make_merge_transform_v3_division_mod
(
make_tuple
(
number
<
N0
/
npair
>
{},
number
<
npair
>
{},
number
<
N1
>
{}))),
make_tuple
(
sequence
<
0
,
1
,
4
,
2
,
7
>
{},
sequence
<
5
,
6
,
3
>
{}),
make_tuple
(
sequence
<
1
>
{},
sequence
<
0
>
{}));
return
b_lds_block_desc_n_k
;
}
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSizeA
()
{
constexpr
index_t
smem_size_a
=
sizeof
(
typename
Problem
::
ADataType
)
*
MakeALdsBlockDescriptor
<
Problem
>
().
get_element_space_size
();
return
smem_size_a
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSizeB
()
{
constexpr
index_t
smem_size_b
=
sizeof
(
typename
Problem
::
BDataType
)
*
MakeBLdsBlockDescriptor
<
Problem
>
().
get_element_space_size
();
return
smem_size_b
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
{
constexpr
index_t
smem_size_a
=
GetSmemSizeA
<
Problem
>
();
constexpr
index_t
smem_size_b
=
GetSmemSizeB
<
Problem
>
();
index_t
smem_size
=
0
;
smem_size
+=
smem_size_a
+
smem_size_b
;
return
smem_size
;
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeADramTileDistribution
()
{
using
WarpGemm
=
WarpGemmMfmaDispatcher
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
typename
Problem
::
CDataType
,
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I0
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I1
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I2
),
TransposeC
>
;
constexpr
index_t
BlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
MPerBlock
=
Problem
::
BlockGemmShape
::
kM
;
constexpr
index_t
KPerBlock
=
Problem
::
BlockGemmShape
::
kK
;
constexpr
index_t
K1
=
WarpGemm
::
kK
;
constexpr
index_t
K0
=
KPerBlock
/
K1
;
constexpr
index_t
M2
=
get_warp_size
()
/
K0
;
constexpr
index_t
M1
=
BlockSize
/
get_warp_size
();
static_assert
(
M2
!=
0
,
"M2 is zero, which will lead to a division by zero error."
);
static_assert
(
M1
!=
0
,
"M1 is zero, which will lead to a division by zero error."
);
constexpr
index_t
M0
=
MPerBlock
/
(
M2
*
M1
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
M0
,
M1
,
M2
>
,
sequence
<
K0
,
K1
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
MakeBDramTileDistribution
()
{
using
WarpGemm
=
WarpGemmMfmaDispatcher
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
typename
Problem
::
CDataType
,
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I0
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I1
),
Problem
::
BlockGemmShape
::
WarpTile
::
at
(
I2
),
TransposeC
>
;
constexpr
index_t
BlockSize
=
Problem
::
kBlockSize
;
constexpr
index_t
NPerBlock
=
Problem
::
BlockGemmShape
::
kN
;
constexpr
index_t
KPerBlock
=
Problem
::
BlockGemmShape
::
kK
;
constexpr
index_t
K1
=
WarpGemm
::
kK
;
constexpr
index_t
K0
=
KPerBlock
/
K1
;
constexpr
index_t
N2
=
get_warp_size
()
/
K0
;
constexpr
index_t
N1
=
BlockSize
/
get_warp_size
();
static_assert
(
N2
!=
0
,
"M2 is zero, which will lead to a division by zero error."
);
static_assert
(
N1
!=
0
,
"M1 is zero, which will lead to a division by zero error."
);
constexpr
index_t
N0
=
NPerBlock
/
(
N2
*
N1
);
return
make_static_tile_distribution
(
tile_distribution_encoding
<
sequence
<
1
>
,
tuple
<
sequence
<
N0
,
N1
,
N2
>
,
sequence
<
K0
,
K1
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
1
,
2
>>
,
tuple
<
sequence
<
1
>
,
sequence
<
2
,
0
>>
,
sequence
<
1
,
2
>
,
sequence
<
0
,
1
>>
{});
}
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
auto
GetBlockGemm
()
{
using
AccDataType
=
float
;
using
BlockWarps
=
typename
Problem
::
BlockGemmShape
::
BlockWarps
;
using
WarpTile
=
typename
Problem
::
BlockGemmShape
::
WarpTile
;
using
WarpGemm
=
WarpGemmMfmaDispatcher
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
AccDataType
,
WarpTile
::
at
(
I0
),
WarpTile
::
at
(
I1
),
WarpTile
::
at
(
I2
),
TransposeC
>
;
using
BlockGemmPolicy
=
BlockGemmASmemBSmemCRegV1CustomPolicy
<
typename
Problem
::
ADataType
,
typename
Problem
::
BDataType
,
typename
Problem
::
CDataType
,
BlockWarps
,
WarpGemm
>
;
return
BlockGemmASmemBSmemCRegV1
<
Problem
,
BlockGemmPolicy
>
{};
}
};
}
// namespace ck_tile
include/ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp
0 → 100644
View file @
d20c20a6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
namespace
ck_tile
{
template
<
bool
kPadA_
,
bool
kPadB_
,
bool
kPadC_
,
typename
LayoutA_
,
typename
LayoutB_
,
typename
LayoutC_
>
struct
TileGemmTraits
{
static
constexpr
bool
kPadA
=
kPadA_
;
static
constexpr
bool
kPadB
=
kPadB_
;
static
constexpr
bool
kPadC
=
kPadC_
;
using
LayoutA
=
LayoutA_
;
using
LayoutB
=
LayoutB_
;
using
LayoutC
=
LayoutC_
;
};
}
// namespace ck_tile
include/ck_tile/ops/layernorm2d/kernel/layernorm2d_fwd_kernel.hpp
View file @
d20c20a6
...
...
@@ -31,8 +31,14 @@ struct Layernorm2dFwd
static
constexpr
ck_tile
::
index_t
kMPerBlock
=
Problem
::
BlockShape
::
kMPerBlock
;
static
constexpr
ck_tile
::
index_t
kNPerBlock
=
Problem
::
BlockShape
::
kNPerBlock
;
static
constexpr
bool
kPadM
=
Problem
::
kPadM
;
static
constexpr
bool
kPadN
=
Problem
::
kPadN
;
static
constexpr
ck_tile
::
index_t
kNThreadPerWarp
=
Problem
::
BlockShape
::
kNThreadPerWarp
;
static
constexpr
ck_tile
::
index_t
kNPerThread
=
Problem
::
BlockShape
::
kNPerThread
;
static
constexpr
auto
I0
=
number
<
0
>
{};
static
constexpr
auto
I1
=
number
<
1
>
{};
struct
Kargs
{
...
...
@@ -96,19 +102,25 @@ struct Layernorm2dFwd
sequence
<
2
>>
{});
}
template
<
typename
Dstr
>
CK_TILE_DEVICE
static
constexpr
auto
GetNPerThread
(
Dstr
)
CK_TILE_DEVICE
static
int
GetWelfordMaxCount
(
int
N
)
{
constexpr
auto
nDstrSpan
=
Dstr
::
get_distributed_spans
().
template
at
<
1
>();
using
Lengths
=
decltype
(
nDstrSpan
.
impl_
);
constexpr
ck_tile
::
index_t
kNThreadPerBlock
=
kNPerBlock
/
kNPerThread
;
ck_tile
::
index_t
ret
=
1
;
int
thread_id_n
=
get_thread_id
()
%
kNThreadPerBlock
;
int
max_count
=
__builtin_amdgcn_readfirstlane
(
N
<
kNPerBlock
?
0
:
kNPerThread
*
(
N
/
kNPerBlock
));
int
n_per_block_tail_loop
=
__builtin_amdgcn_readfirstlane
(
N
-
max_count
*
kNThreadPerBlock
);
ck_tile
::
static_for
<
0
,
Lengths
::
size
(),
1
>
{}(
[
&
](
auto
idx
)
{
ret
*=
Lengths
::
template
at
(
idx
);
});
if
(
n_per_block_tail_loop
>
0
)
{
int
thread_max_n
=
(
thread_id_n
+
1
)
*
kNPerThread
;
int
delta
=
thread_max_n
-
n_per_block_tail_loop
;
delta
=
clamp
(
thread_max_n
-
n_per_block_tail_loop
,
0
,
kNPerThread
);
max_count
+=
kNPerThread
-
delta
;
}
return
re
t
;
return
max_coun
t
;
}
template
<
typename
DistributedTensor
>
...
...
@@ -129,42 +141,29 @@ struct Layernorm2dFwd
return
out_dstr_tensor
;
}
template
<
bool
Cond
=
(
kHasGamma
&&
kHasBeta
)>
CK_TILE_DEVICE
std
::
enable_if_t
<
Cond
>
TwoPassLayernorm2dFwd
(
const
XDataType
*
p_x
,
const
GammaDataType
*
p_gamma
,
const
BetaDataType
*
p_beta
,
YDataType
*
p_y
,
MeanDataType
*
p_mean
,
InvStdDataType
*
p_invStd
,
const
ComputeDataType
epsilon
,
ck_tile
::
index_t
M
,
ck_tile
::
index_t
N
)
const
template
<
typename
XBlockWindow
,
typename
GammaBlockWindow
,
typename
BetaBlockWindow
,
typename
YBlockWindow
,
typename
MeanBlockWindow
,
typename
InvStdBlockWindow
,
bool
Cond
=
(
kHasGamma
&&
kHasBeta
)>
CK_TILE_DEVICE
std
::
enable_if_t
<
Cond
>
TwoPassLayernorm2dFwd
(
XBlockWindow
&
x_block_window
,
GammaBlockWindow
&
gamma_block_window
,
BetaBlockWindow
&
beta_block_window
,
YBlockWindow
&
y_block_window
,
MeanBlockWindow
&
mean_block_window
,
InvStdBlockWindow
&
inv_std_block_window
,
ComputeDataType
epsilon
,
ck_tile
::
index_t
N
)
const
{
constexpr
auto
I0
=
number
<
0
>
{};
constexpr
auto
I1
=
number
<
1
>
{};
const
auto
x_m_n
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
p_x
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
number
<
32
>
{},
number
<
1
>
{});
const
auto
gamma_n
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
p_gamma
,
make_tuple
(
N
),
make_tuple
(
1
),
number
<
32
>
{},
number
<
1
>
{});
// TODO - Optimize tail loop to reduce move_tile_window()
index_t
num_n_tile_iteration
=
__builtin_amdgcn_readfirstlane
(
integer_divide_ceil
(
N
,
kNPerBlock
));
const
auto
beta_n
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
p_beta
,
make_tuple
(
N
),
make_tuple
(
1
),
number
<
32
>
{},
number
<
1
>
{});
const
auto
iM
=
get_block_id
()
*
kMPerBlock
;
constexpr
auto
xDstr
=
MakeXBlockTileDistribution
();
auto
x_block_window
=
make_tile_window
(
x_m_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
iM
,
0
},
xDstr
);
index_t
num_n_tile_iteration
=
__builtin_amdgcn_readfirstlane
(
N
/
kNPerBlock
);
// TODO: padding - handle max_count if N % kNPerBlock != 0
constexpr
auto
NPerThread
=
GetNPerThread
(
xDstr
);
ThreadWelford
<
ComputeDataType
,
XDataType
>
thread_welford
{
type_convert
<
int
>
(
NPerThread
*
N
/
kNPerBlock
)};
int
welford_max_count
=
GetWelfordMaxCount
(
N
);
ThreadWelford
<
ComputeDataType
,
XDataType
>
thread_welford
{
welford_max_count
};
using
XTensorType
=
decltype
(
load_tile
(
x_block_window
));
auto
mean_compute_block_tensor
=
...
...
@@ -190,44 +189,14 @@ struct Layernorm2dFwd
auto
inv_std_compute_block_tensor
=
InvSqrt
(
var_compute_block_tensor
,
epsilon
);
if
constexpr
(
kSaveMean
)
{
const
auto
mean_m
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_mean
,
make_tuple
(
M
),
number
<
32
>
{});
auto
mean_block_window
=
make_tile_window
(
mean_m
,
make_tuple
(
number
<
kMPerBlock
>
{}),
{
iM
});
store_tile
(
mean_block_window
,
cast_tile
<
MeanDataType
>
(
mean_compute_block_tensor
));
}
if
constexpr
(
kSaveInvStd
)
{
const
auto
inv_std_m
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
p_invStd
,
make_tuple
(
M
),
number
<
32
>
{});
auto
inv_std_block_window
=
make_tile_window
(
inv_std_m
,
make_tuple
(
number
<
kMPerBlock
>
{}),
{
iM
});
store_tile
(
inv_std_block_window
,
cast_tile
<
MeanDataType
>
(
inv_std_compute_block_tensor
));
}
// TODO: Extract normalize pipeline
const
auto
y_m_n
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
p_y
,
make_tuple
(
M
,
N
),
make_tuple
(
N
,
1
),
number
<
32
>
{},
number
<
1
>
{});
auto
y_block_window
=
make_tile_window
(
y_m_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
iM
,
0
});
constexpr
auto
gammaDstr
=
MakeGammaBetaBlockTileDistribution
();
constexpr
auto
betaDstr
=
gammaDstr
;
auto
gamma_block_window
=
make_tile_window
(
gamma_n
,
make_tuple
(
number
<
kNPerBlock
>
{}),
{
0
},
gammaDstr
);
auto
beta_block_window
=
make_tile_window
(
beta_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
0
},
betaDstr
);
store_tile
(
inv_std_block_window
,
cast_tile
<
InvStdDataType
>
(
inv_std_compute_block_tensor
));
// reverse read x to reuse cache
ck_tile
::
index_t
stride_to_right_most_window
=
N
-
kNPerBlock
;
ck_tile
::
index_t
stride_to_right_most_window
=
N
%
kNPerBlock
==
0
?
N
-
kNPerBlock
:
N
-
N
%
kNPerBlock
;
move_tile_window
(
x_block_window
,
{
0
,
-
kNPerBlock
});
move_tile_window
(
gamma_block_window
,
{
stride_to_right_most_window
});
...
...
@@ -274,17 +243,209 @@ struct Layernorm2dFwd
}
}
template
<
typename
XBlockWindow
,
typename
GammaBlockWindow
,
typename
BetaBlockWindow
,
typename
YBlockWindow
,
typename
MeanBlockWindow
,
typename
InvStdBlockWindow
,
bool
Cond
=
(
kHasGamma
&&
kHasBeta
)>
CK_TILE_DEVICE
std
::
enable_if_t
<
Cond
>
OnePassLayernorm2dFwd
(
XBlockWindow
&
x_block_window
,
GammaBlockWindow
&
gamma_block_window
,
BetaBlockWindow
&
beta_block_window
,
YBlockWindow
&
y_block_window
,
MeanBlockWindow
&
mean_block_window
,
InvStdBlockWindow
&
inv_std_block_window
,
ComputeDataType
epsilon
,
ck_tile
::
index_t
N
)
const
{
int
welford_max_count
=
GetWelfordMaxCount
(
N
);
ThreadWelford
<
ComputeDataType
,
XDataType
>
thread_welford
{
welford_max_count
};
using
XTensorType
=
decltype
(
load_tile
(
x_block_window
));
auto
mean_compute_block_tensor
=
thread_welford
.
template
MakeInitialMeanVarDistributedTensor
<
XTensorType
>();
auto
var_compute_block_tensor
=
thread_welford
.
template
MakeInitialMeanVarDistributedTensor
<
XTensorType
>();
clear_tile
(
mean_compute_block_tensor
);
clear_tile
(
var_compute_block_tensor
);
const
auto
x_block_tensor
=
load_tile
(
x_block_window
);
thread_welford
(
x_block_tensor
,
mean_compute_block_tensor
,
var_compute_block_tensor
);
// TODO: support cross warp Welford
WarpMergeWelford
<
ComputeDataType
,
true
>
{}(
mean_compute_block_tensor
,
var_compute_block_tensor
,
thread_welford
.
cur_count_
);
auto
inv_std_compute_block_tensor
=
InvSqrt
(
var_compute_block_tensor
,
epsilon
);
if
constexpr
(
kSaveMean
)
store_tile
(
mean_block_window
,
cast_tile
<
MeanDataType
>
(
mean_compute_block_tensor
));
if
constexpr
(
kSaveInvStd
)
store_tile
(
inv_std_block_window
,
cast_tile
<
InvStdDataType
>
(
inv_std_compute_block_tensor
));
// normalize
const
auto
gamma_block_tensor
=
load_tile
(
gamma_block_window
);
const
auto
beta_block_tensor
=
load_tile
(
beta_block_window
);
constexpr
auto
x_spans
=
decltype
(
x_block_tensor
)
::
get_distributed_spans
();
auto
y_block_tensor
=
make_static_distributed_tensor
<
YDataType
>
(
x_block_tensor
.
get_tile_distribution
());
sweep_tile_span
(
x_spans
[
I1
],
[
&
](
auto
idx1
)
{
constexpr
auto
j_idx
=
make_tuple
(
idx1
);
const
auto
gamma
=
type_convert
<
ComputeDataType
>
(
gamma_block_tensor
[
j_idx
]);
const
auto
beta
=
type_convert
<
ComputeDataType
>
(
beta_block_tensor
[
j_idx
]);
sweep_tile_span
(
x_spans
[
I0
],
[
&
](
auto
idx0
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx0
);
constexpr
auto
i_j_idx
=
make_tuple
(
idx0
,
idx1
);
const
auto
mean
=
mean_compute_block_tensor
[
i_idx
];
const
auto
inv_std
=
inv_std_compute_block_tensor
[
i_idx
];
const
auto
x
=
type_convert
<
ComputeDataType
>
(
x_block_tensor
[
i_j_idx
]);
auto
y
=
(
x
-
mean
)
*
inv_std
*
gamma
+
beta
;
y_block_tensor
(
i_j_idx
)
=
type_convert
<
YDataType
>
(
y
);
});
});
store_tile
(
y_block_window
,
y_block_tensor
);
}
CK_TILE_DEVICE
void
operator
()(
Kargs
kargs
)
const
{
TwoPassLayernorm2dFwd
(
static_cast
<
const
XDataType
*>
(
kargs
.
p_x
),
static_cast
<
const
GammaDataType
*>
(
kargs
.
p_gamma
),
static_cast
<
const
BetaDataType
*>
(
kargs
.
p_beta
),
static_cast
<
YDataType
*>
(
kargs
.
p_y
),
static_cast
<
MeanDataType
*>
(
kargs
.
p_mean
),
static_cast
<
InvStdDataType
*>
(
kargs
.
p_invStd
),
static_cast
<
const
ComputeDataType
>
(
kargs
.
epsilon
),
kargs
.
M
,
kargs
.
N
);
const
auto
x_m_n
=
[
&
]()
{
const
auto
x_dram_naive
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
const
XDataType
*>
(
kargs
.
p_x
),
make_tuple
(
kargs
.
M
,
kargs
.
N
),
make_tuple
(
kargs
.
N
,
1
),
number
<
kNPerThread
>
{},
number
<
1
>
{});
return
pad_tensor_view
(
x_dram_naive
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
sequence
<
kPadM
,
kPadN
>
{});
}();
const
auto
gamma_n
=
[
&
]()
{
const
auto
gamma_dram_naive
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
const
GammaDataType
*>
(
kargs
.
p_gamma
),
make_tuple
(
kargs
.
N
),
make_tuple
(
1
),
number
<
kNPerThread
>
{},
number
<
1
>
{});
return
pad_tensor_view
(
gamma_dram_naive
,
make_tuple
(
number
<
kNPerBlock
>
{}),
sequence
<
kPadN
>
{});
}();
const
auto
beta_n
=
[
&
]()
{
const
auto
gamma_dram_naive
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
const
BetaDataType
*>
(
kargs
.
p_beta
),
make_tuple
(
kargs
.
N
),
make_tuple
(
1
),
number
<
kNPerThread
>
{},
number
<
1
>
{});
return
pad_tensor_view
(
gamma_dram_naive
,
make_tuple
(
number
<
kNPerBlock
>
{}),
sequence
<
kPadN
>
{});
}();
const
auto
iM
=
get_block_id
()
*
kMPerBlock
;
constexpr
auto
xDstr
=
MakeXBlockTileDistribution
();
auto
x_block_window
=
make_tile_window
(
x_m_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
iM
,
0
},
xDstr
);
const
auto
y_m_n
=
[
&
]()
{
const
auto
y_dram_naive
=
make_naive_tensor_view
<
address_space_enum
::
global
>
(
static_cast
<
YDataType
*>
(
kargs
.
p_y
),
make_tuple
(
kargs
.
M
,
kargs
.
N
),
make_tuple
(
kargs
.
N
,
1
),
number
<
kNPerThread
>
{},
number
<
1
>
{});
return
pad_tensor_view
(
y_dram_naive
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
sequence
<
kPadM
,
kPadN
>
{});
}();
auto
y_block_window
=
make_tile_window
(
y_m_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
iM
,
0
});
constexpr
auto
gammaDstr
=
MakeGammaBetaBlockTileDistribution
();
constexpr
auto
betaDstr
=
gammaDstr
;
auto
gamma_block_window
=
make_tile_window
(
gamma_n
,
make_tuple
(
number
<
kNPerBlock
>
{}),
{
0
},
gammaDstr
);
auto
beta_block_window
=
make_tile_window
(
beta_n
,
make_tuple
(
number
<
kMPerBlock
>
{},
number
<
kNPerBlock
>
{}),
{
0
},
betaDstr
);
auto
mean_block_window
=
[
&
]()
{
if
constexpr
(
kSaveMean
)
{
const
auto
mean_m
=
[
&
]()
{
const
auto
mean_dram_naive
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
static_cast
<
MeanDataType
*>
(
kargs
.
p_mean
),
make_tuple
(
kargs
.
M
),
number
<
1
>
{});
return
pad_tensor_view
(
mean_dram_naive
,
make_tuple
(
number
<
kMPerBlock
>
{}),
sequence
<
kPadM
>
{});
}();
return
make_tile_window
(
mean_m
,
make_tuple
(
number
<
kMPerBlock
>
{}),
{
iM
});
}
else
return
make_null_tile_window
(
make_tuple
(
number
<
kMPerBlock
>
{}));
}();
auto
inv_std_block_window
=
[
&
]()
{
if
constexpr
(
kSaveInvStd
)
{
const
auto
inv_std_m
=
[
&
]()
{
const
auto
inv_std_dram_naive
=
make_naive_tensor_view_packed
<
address_space_enum
::
global
>
(
static_cast
<
InvStdDataType
*>
(
kargs
.
p_invStd
),
make_tuple
(
kargs
.
M
),
number
<
1
>
{});
return
pad_tensor_view
(
inv_std_dram_naive
,
make_tuple
(
number
<
kMPerBlock
>
{}),
sequence
<
kPadM
>
{});
}();
return
make_tile_window
(
inv_std_m
,
make_tuple
(
number
<
kMPerBlock
>
{}),
{
iM
});
}
else
return
make_null_tile_window
(
make_tuple
(
number
<
kMPerBlock
>
{}));
}();
if
(
kargs
.
N
<=
kNPerBlock
)
OnePassLayernorm2dFwd
(
x_block_window
,
gamma_block_window
,
beta_block_window
,
y_block_window
,
mean_block_window
,
inv_std_block_window
,
static_cast
<
const
ComputeDataType
>
(
kargs
.
epsilon
),
kargs
.
N
);
else
TwoPassLayernorm2dFwd
(
x_block_window
,
gamma_block_window
,
beta_block_window
,
y_block_window
,
mean_block_window
,
inv_std_block_window
,
static_cast
<
const
ComputeDataType
>
(
kargs
.
epsilon
),
kargs
.
N
);
}
};
...
...
include/ck_tile/ops/layernorm2d/pipeline/block_layernorm2d_fwd_problem.hpp
View file @
d20c20a6
...
...
@@ -14,17 +14,21 @@ template <typename XDataType_,
typename
YDataType_
,
typename
MeanDataType_
,
typename
InvStdDataType_
,
typename
BlockShape_
>
typename
BlockShape_
,
bool
kPadM_
,
bool
kPadN_
>
struct
BlockLayernorm2dFwdProblem
{
using
XDataType
=
remove_cvref_t
<
XDataType_
>
;
using
GammaDataType
=
remove_cvref_t
<
GammaDataType_
>
;
using
BetaDataType
=
remove_cvref_t
<
BetaDataType_
>
;
using
ComputeDataType
=
remove_cvref_t
<
ComputeDataType_
>
;
using
YDataType
=
remove_cvref_t
<
YDataType_
>
;
using
MeanDataType
=
remove_cvref_t
<
MeanDataType_
>
;
using
InvStdDataType
=
remove_cvref_t
<
InvStdDataType_
>
;
using
BlockShape
=
remove_cvref_t
<
BlockShape_
>
;
using
XDataType
=
remove_cvref_t
<
XDataType_
>
;
using
GammaDataType
=
remove_cvref_t
<
GammaDataType_
>
;
using
BetaDataType
=
remove_cvref_t
<
BetaDataType_
>
;
using
ComputeDataType
=
remove_cvref_t
<
ComputeDataType_
>
;
using
YDataType
=
remove_cvref_t
<
YDataType_
>
;
using
MeanDataType
=
remove_cvref_t
<
MeanDataType_
>
;
using
InvStdDataType
=
remove_cvref_t
<
InvStdDataType_
>
;
using
BlockShape
=
remove_cvref_t
<
BlockShape_
>
;
static
constexpr
bool
kPadM
=
kPadM_
;
static
constexpr
bool
kPadN
=
kPadN_
;
};
}
// namespace ck_tile
library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
0 → 100644
View file @
d20c20a6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
template
<
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
typename
ComputeTypeA
,
typename
ComputeTypeB
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
naive_gemm_kernel
(
const
ADataType
*
__restrict__
p_a_grid
,
const
BDataType
*
__restrict__
p_b_grid
,
CDataType
*
__restrict__
p_c_grid
,
index_t
m
,
index_t
n
,
index_t
k
,
const
AElementwiseOperation
a_element_op
,
const
BElementwiseOperation
b_element_op
,
const
CDEElementwiseOperation
c_element_op
)
{
using
RowMajor
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
const
int
row_idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
const
int
col_idx
=
blockIdx
.
y
*
blockDim
.
y
+
threadIdx
.
y
;
if
(
row_idx
<
m
&&
col_idx
<
n
)
{
AccDataType
v_acc
=
static_cast
<
AccDataType
>
(
0.0
);
ComputeTypeA
v_a
=
static_cast
<
ComputeTypeA
>
(
0.0
);
ComputeTypeB
v_b
=
static_cast
<
ComputeTypeB
>
(
0.0
);
CDataType
v_c
=
static_cast
<
CDataType
>
(
0.0
);
for
(
int
k_idx
=
0
;
k_idx
<
k
;
++
k_idx
)
{
// check input matrices layout
int
element_idx_a
=
0
;
int
element_idx_b
=
0
;
if
constexpr
(
std
::
is_same_v
<
ALayout
,
RowMajor
>
)
{
element_idx_a
=
row_idx
*
k
+
k_idx
;
}
else
{
element_idx_a
=
row_idx
+
m
*
k_idx
;
}
if
constexpr
(
std
::
is_same_v
<
BLayout
,
RowMajor
>
)
{
element_idx_b
=
k_idx
*
n
+
col_idx
;
}
else
{
element_idx_b
=
k_idx
+
k
*
col_idx
;
}
// apply a_element_op
a_element_op
(
v_a
,
p_a_grid
[
element_idx_a
]);
// apply b_element_op
b_element_op
(
v_b
,
p_b_grid
[
element_idx_b
]);
// multiply and accumulate
v_acc
+=
static_cast
<
AccDataType
>
(
v_a
)
*
static_cast
<
AccDataType
>
(
v_b
);
}
// apply c_element_op
c_element_op
(
v_c
,
v_acc
);
// check output matrix layout
int
element_idx_c
=
0
;
if
constexpr
(
std
::
is_same_v
<
CLayout
,
RowMajor
>
)
{
element_idx_c
=
row_idx
*
n
+
col_idx
;
}
else
{
element_idx_c
=
row_idx
+
m
*
col_idx
;
}
// prepare output
p_c_grid
[
element_idx_c
]
=
v_c
;
}
}
}
// namespace ck
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
typename
ComputeTypeA
=
CDataType
,
typename
ComputeTypeB
=
ComputeTypeA
>
struct
ReferenceGemm
:
public
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
device
::
BaseArgument
{
Argument
(
const
void
*
p_a_grid
,
const
void
*
p_b_grid
,
void
*
p_c_grid
,
index_t
m
,
index_t
n
,
index_t
k
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
:
p_a_grid_
{
static_cast
<
const
ADataType
*>
(
p_a_grid
)},
p_b_grid_
{
static_cast
<
const
BDataType
*>
(
p_b_grid
)},
p_c_grid_
{
static_cast
<
CDataType
*>
(
p_c_grid
)},
m_
{
m
},
n_
{
n
},
k_
{
k
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
c_element_op_
{
c_element_op
}
{
}
const
ADataType
*
p_a_grid_
;
const
BDataType
*
p_b_grid_
;
CDataType
*
p_c_grid_
;
index_t
m_
;
index_t
n_
;
index_t
k_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CElementwiseOperation
c_element_op_
;
};
// Invoker
struct
Invoker
:
public
device
::
BaseInvoker
{
using
Argument
=
ReferenceGemm
::
Argument
;
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
int
block_size
=
16
;
dim3
block_dim
(
block_size
,
block_size
,
1
);
dim3
grid_dim
(
(
arg
.
m_
+
block_size
-
1
)
/
block_size
,
(
arg
.
n_
+
block_size
-
1
)
/
block_size
,
1
);
auto
launch_kernel
=
[
&
]()
{
const
auto
kernel
=
naive_gemm_kernel
<
ALayout
,
BLayout
,
CLayout
,
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
ComputeTypeA
,
ComputeTypeB
>
;
return
launch_and_time_kernel
(
stream_config
,
kernel
,
grid_dim
,
block_dim
,
0
,
arg
.
p_a_grid_
,
arg
.
p_b_grid_
,
arg
.
p_c_grid_
,
arg
.
m_
,
arg
.
n_
,
arg
.
k_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
);
};
return
launch_kernel
();
}
float
Run
(
const
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
bool
IsSupportedArgument
(
const
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
void
*
p_a_grid
,
const
void
*
p_b_grid
,
void
*
p_c_grid
,
index_t
m
,
index_t
n
,
index_t
k
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CElementwiseOperation
c_element_op
)
{
return
Argument
{
p_a_grid
,
p_b_grid
,
p_c_grid
,
m
,
n
,
k
,
a_element_op
,
b_element_op
,
c_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"Device Reference Gemm"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/CMakeLists.txt
View file @
d20c20a6
...
...
@@ -37,11 +37,7 @@ function(add_instance_library INSTANCE_NAME)
endforeach
()
endif
()
if
(
INSTANCES_ONLY
)
set
(
INST_TARGETS
${
DEFAULT_GPU_TARGETS
}
)
else
()
set
(
INST_TARGETS
${
GPU_TARGETS
}
)
endif
()
set
(
INST_TARGETS
${
SUPPORTED_GPU_TARGETS
}
)
# Do not build DL instances if DL_KERNELS macro is not set
foreach
(
source IN LISTS ARGN
)
...
...
@@ -64,9 +60,9 @@ function(add_instance_library INSTANCE_NAME)
list
(
REMOVE_ITEM ARGN
"
${
source
}
"
)
endif
()
endforeach
()
# Do not build mha instances if gfx94 targets are not on the target list
# Do not build mha instances if gfx94
or gfx90a
targets are not on the target list
foreach
(
source IN LISTS ARGN
)
if
(
NOT INST_TARGETS MATCHES
"gfx94"
AND source MATCHES
"mha"
)
if
(
NOT INST_TARGETS MATCHES
"gfx94"
AND
NOT INST_TARGETS MATCHES
"gfx90a"
AND
source MATCHES
"mha"
)
message
(
"removing mha instance
${
source
}
"
)
list
(
REMOVE_ITEM ARGN
"
${
source
}
"
)
endif
()
...
...
@@ -75,17 +71,13 @@ function(add_instance_library INSTANCE_NAME)
if
(
ARGN
)
set
(
INST_OBJ
)
foreach
(
source IN LISTS ARGN
)
if
(
INSTANCES_ONLY
)
set
(
INST_TARGETS
${
DEFAULT_GPU_TARGETS
}
)
else
()
set
(
INST_TARGETS
${
GPU_TARGETS
}
)
endif
()
set
(
INST_TARGETS
${
SUPPORTED_GPU_TARGETS
}
)
if
(
source MATCHES
"_xdl"
)
list
(
REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
)
elseif
(
ARGN MATCHES
"_wmma"
)
list
(
REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030
)
elseif
(
ARGN MATCHES
"mha"
)
list
(
REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908
gfx90a
gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
)
list
(
REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201
)
endif
()
set
(
offload_targets
)
foreach
(
target IN LISTS INST_TARGETS
)
...
...
@@ -191,12 +183,7 @@ FOREACH(subdir_path ${dir_list})
set
(
add_inst 1
)
endif
()
if
(
INSTANCES_ONLY
)
set
(
INST_TARGETS
${
DEFAULT_GPU_TARGETS
}
)
else
()
set
(
INST_TARGETS
${
GPU_TARGETS
}
)
endif
()
set
(
INST_TARGETS
${
SUPPORTED_GPU_TARGETS
}
)
if
((
"
${
cmake_instance
}
"
MATCHES
"quantization"
)
AND
(
DEFINED DTYPES
)
AND
(
NOT DTYPES MATCHES
"int8"
))
message
(
"quantization instances will not be built!"
)
...
...
@@ -320,8 +307,7 @@ if(CK_DEVICE_CONV_INSTANCES)
endif
()
if
(
CK_DEVICE_MHA_INSTANCES
)
set
(
gpu_list
${
INST_TARGETS
}
)
list
(
FILTER gpu_list INCLUDE REGEX
"^gfx94"
)
if
(
gpu_list
)
if
(
gpu_list MATCHES
"gfx94"
OR gpu_list MATCHES
"gfx90a"
)
add_library
(
device_mha_operations STATIC
${
CK_DEVICE_MHA_INSTANCES
}
)
add_library
(
composablekernels::device_mha_operations ALIAS device_mha_operations
)
target_compile_features
(
device_mha_operations PUBLIC
)
...
...
profiler/src/CMakeLists.txt
View file @
d20c20a6
...
...
@@ -24,7 +24,7 @@ set(PROFILER_SOURCES
profile_permute_scale.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx9"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx9"
)
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
list
(
APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp
)
list
(
APPEND PROFILER_SOURCES profile_contraction_scale.cpp
)
...
...
@@ -49,7 +49,7 @@ if(GPU_TARGETS MATCHES "gfx9")
list
(
APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp
)
endif
()
list
(
APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx94"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx94"
)
list
(
APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp
)
list
(
APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp
)
endif
()
...
...
@@ -69,7 +69,7 @@ if(GPU_TARGETS MATCHES "gfx9")
endif
()
if
(
GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx12"
OR GPU_TARGETS MATCHES
"gfx9"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx11"
OR
SUPPORTED_
GPU_TARGETS MATCHES
"gfx12"
OR
SUPPORTED_
GPU_TARGETS MATCHES
"gfx9"
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
list
(
APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp
)
endif
()
...
...
@@ -111,7 +111,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_inst
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_transpose_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_permute_scale_instance
)
if
(
GPU_TARGETS MATCHES
"gfx9"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx9"
)
if
(
DTYPES MATCHES
"fp32"
OR DTYPES MATCHES
"fp64"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_bilinear_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_contraction_scale_instance
)
...
...
@@ -135,7 +135,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_batched_gemm_reduce_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_add_instance
)
if
(
GPU_TARGETS MATCHES
"gfx94"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx94"
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_multiply_multiply_instance
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_ab_scale_instance
)
endif
()
...
...
@@ -159,7 +159,7 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_grouped_conv3d_fwd_convinvscale_instance
)
endif
()
if
(
GPU_TARGETS MATCHES
"gfx9"
OR GPU_TARGETS MATCHES
"gfx11"
OR GPU_TARGETS MATCHES
"gfx12"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx9"
OR
SUPPORTED_
GPU_TARGETS MATCHES
"gfx11"
OR
SUPPORTED_
GPU_TARGETS MATCHES
"gfx12"
)
if
(
DTYPES MATCHES
"fp16"
OR NOT DEFINED DTYPES
)
target_link_libraries
(
${
PROFILER_EXECUTABLE
}
PRIVATE device_gemm_bilinear_instance
)
endif
()
...
...
script/cmake-ck-dev.sh
View file @
d20c20a6
...
...
@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if
[
$#
-ge
2
]
;
then
GPU_TARGETS
=
$2
REST_ARGS
=
${
@
:3
}
shift
2
REST_ARGS
=
$@
else
GPU_TARGETS
=
"gfx908;gfx90a;gfx940"
REST_ARGS
=
...
...
script/cmake-ck-release.sh
View file @
d20c20a6
...
...
@@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1
if
[
$#
-ge
2
]
;
then
GPU_TARGETS
=
$2
REST_ARGS
=
${
@
:3
}
shift
2
REST_ARGS
=
$@
else
GPU_TARGETS
=
"gfx908;gfx90a;gfx940"
REST_ARGS
=
...
...
test/CMakeLists.txt
View file @
d20c20a6
...
...
@@ -41,11 +41,7 @@ function(add_test_executable TEST_NAME)
endforeach
()
endif
()
if
(
INSTANCES_ONLY
)
set
(
TEST_TARGETS
${
DEFAULT_GPU_TARGETS
}
)
else
()
set
(
TEST_TARGETS
${
GPU_TARGETS
}
)
endif
()
set
(
TEST_TARGETS
${
SUPPORTED_GPU_TARGETS
}
)
foreach
(
source IN LISTS ARGN
)
if
(
NOT DEFINED DL_KERNELS AND source MATCHES
"_dl"
)
...
...
@@ -122,11 +118,7 @@ function(add_gtest_executable TEST_NAME)
endforeach
()
endif
()
if
(
INSTANCES_ONLY
)
set
(
TEST_TARGETS
${
DEFAULT_GPU_TARGETS
}
)
else
()
set
(
TEST_TARGETS
${
GPU_TARGETS
}
)
endif
()
set
(
TEST_TARGETS
${
SUPPORTED_GPU_TARGETS
}
)
foreach
(
source IN LISTS ARGN
)
if
(
NOT DEFINED DL_KERNELS AND source MATCHES
"_dl"
)
...
...
@@ -211,10 +203,10 @@ add_subdirectory(conv_tensor_rearrange)
add_subdirectory
(
transpose
)
add_subdirectory
(
permute_scale
)
add_subdirectory
(
wrapper
)
if
(
GPU_TARGETS MATCHES
"gfx11"
)
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx11"
)
add_subdirectory
(
wmma_op
)
endif
()
if
(
GPU_TARGETS MATCHES
"gfx942"
AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2
)
# smfmac needs ROCm6.2
if
(
SUPPORTED_
GPU_TARGETS MATCHES
"gfx942"
AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2
)
# smfmac needs ROCm6.2
add_subdirectory
(
smfmac_op
)
endif
()
add_subdirectory
(
position_embedding
)
test/data_type/CMakeLists.txt
View file @
d20c20a6
...
...
@@ -18,4 +18,9 @@ if(result EQUAL 0)
target_link_libraries
(
test_bf8 PRIVATE utility
)
endif
()
add_gtest_executable
(
test_custom_type test_custom_type.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
test_custom_type PRIVATE utility
)
endif
()
add_gtest_executable
(
test_type_convert_const type_convert_const.cpp
)
test/data_type/test_custom_type.cpp
0 → 100644
View file @
d20c20a6
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using
ck
::
bf8_t
;
using
ck
::
bhalf_t
;
using
ck
::
f8_t
;
using
ck
::
half_t
;
using
ck
::
Number
;
using
ck
::
type_convert
;
using
ck
::
vector_type
;
TEST
(
Custom_bool
,
TestSize
)
{
struct
custom_bool_t
{
bool
data
;
};
ASSERT_EQ
(
sizeof
(
custom_bool_t
),
sizeof
(
bool
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
2
>
),
sizeof
(
vector_type
<
bool
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
4
>
),
sizeof
(
vector_type
<
bool
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
8
>
),
sizeof
(
vector_type
<
bool
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
16
>
),
sizeof
(
vector_type
<
bool
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
32
>
),
sizeof
(
vector_type
<
bool
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bool_t
,
64
>
),
sizeof
(
vector_type
<
bool
,
64
>
));
}
TEST
(
Custom_bool
,
TestAsType
)
{
struct
custom_bool_t
{
using
type
=
bool
;
type
data
;
custom_bool_t
()
:
data
{
type
{}}
{}
custom_bool_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
bool
>
test_vec
=
{
false
,
true
,
false
,
true
};
// reference vector
vector_type
<
custom_bool_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{}).
data
,
false
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{})
=
custom_bool_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_bool_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_bool
,
TestAsTypeReshape
)
{
struct
custom_bool_t
{
using
type
=
bool
;
type
data
;
custom_bool_t
()
:
data
{
type
{}}
{}
custom_bool_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
bool
>
test_vec
=
{
false
,
true
,
false
,
true
};
// reference vector
vector_type
<
custom_bool_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{}).
data
,
false
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{})
=
custom_bool_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_bool_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_bool_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bool_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_int8
,
TestSize
)
{
struct
custom_int8_t
{
int8_t
data
;
};
ASSERT_EQ
(
sizeof
(
custom_int8_t
),
sizeof
(
int8_t
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
2
>
),
sizeof
(
vector_type
<
int8_t
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
4
>
),
sizeof
(
vector_type
<
int8_t
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
8
>
),
sizeof
(
vector_type
<
int8_t
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
16
>
),
sizeof
(
vector_type
<
int8_t
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
32
>
),
sizeof
(
vector_type
<
int8_t
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_int8_t
,
64
>
),
sizeof
(
vector_type
<
int8_t
,
64
>
));
}
TEST
(
Custom_int8
,
TestAsType
)
{
struct
custom_int8_t
{
using
type
=
int8_t
;
type
data
;
custom_int8_t
()
:
data
{
type
{}}
{}
custom_int8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
int8_t
>
test_vec
=
{
3
,
-
6
,
8
,
-
2
};
// reference vector
vector_type
<
custom_int8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{})
=
custom_int8_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_int8_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_int8
,
TestAsTypeReshape
)
{
struct
custom_int8_t
{
using
type
=
int8_t
;
type
data
;
custom_int8_t
()
:
data
{
type
{}}
{}
custom_int8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
int8_t
>
test_vec
=
{
3
,
-
6
,
8
,
-
2
};
// reference vector
vector_type
<
custom_int8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{})
=
custom_int8_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_int8_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_int8_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_int8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_uint8
,
TestSize
)
{
struct
custom_uint8_t
{
uint8_t
data
;
};
ASSERT_EQ
(
sizeof
(
custom_uint8_t
),
sizeof
(
uint8_t
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
2
>
),
sizeof
(
vector_type
<
uint8_t
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
4
>
),
sizeof
(
vector_type
<
uint8_t
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
8
>
),
sizeof
(
vector_type
<
uint8_t
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
16
>
),
sizeof
(
vector_type
<
uint8_t
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
32
>
),
sizeof
(
vector_type
<
uint8_t
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_uint8_t
,
64
>
),
sizeof
(
vector_type
<
uint8_t
,
64
>
));
}
TEST
(
Custom_uint8
,
TestAsType
)
{
struct
custom_uint8_t
{
using
type
=
uint8_t
;
type
data
;
custom_uint8_t
()
:
data
{
type
{}}
{}
custom_uint8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
uint8_t
>
test_vec
=
{
3
,
6
,
8
,
2
};
// reference vector
vector_type
<
custom_uint8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{})
=
custom_uint8_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_uint8_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_uint8
,
TestAsTypeReshape
)
{
struct
custom_uint8_t
{
using
type
=
uint8_t
;
type
data
;
custom_uint8_t
()
:
data
{
type
{}}
{}
custom_uint8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
uint8_t
>
test_vec
=
{
3
,
6
,
8
,
2
};
// reference vector
vector_type
<
custom_uint8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{})
=
custom_uint8_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_uint8_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_uint8_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_uint8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_f8
,
TestSize
)
{
struct
custom_f8_t
{
_BitInt
(
8
)
data
;
};
ASSERT_EQ
(
sizeof
(
custom_f8_t
),
sizeof
(
_BitInt
(
8
)));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
2
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
4
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
8
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
16
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
32
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_f8_t
,
64
>
),
sizeof
(
vector_type
<
_BitInt
(
8
),
64
>
));
}
TEST
(
Custom_f8
,
TestAsType
)
{
struct
custom_f8_t
{
using
type
=
_BitInt
(
8
);
type
data
;
custom_f8_t
()
:
data
{
type
{}}
{}
custom_f8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
_BitInt
(
8
)
>
test_vec
=
{
type_convert
<
_BitInt
(
8
)
>
(
0.3
f
),
type_convert
<
_BitInt
(
8
)
>
(
-
0.6
f
),
type_convert
<
_BitInt
(
8
)
>
(
0.8
f
),
type_convert
<
_BitInt
(
8
)
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_f8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}(
[
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{})
=
custom_f8_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_f8_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_f8
,
TestAsTypeReshape
)
{
struct
custom_f8_t
{
using
type
=
_BitInt
(
8
);
type
data
;
custom_f8_t
()
:
data
{
type
{}}
{}
custom_f8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
_BitInt
(
8
)
>
test_vec
=
{
type_convert
<
_BitInt
(
8
)
>
(
0.3
f
),
type_convert
<
_BitInt
(
8
)
>
(
-
0.6
f
),
type_convert
<
_BitInt
(
8
)
>
(
0.8
f
),
type_convert
<
_BitInt
(
8
)
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_f8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}(
[
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{})
=
custom_f8_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_f8_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_f8_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_f8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_bf8
,
TestSize
)
{
struct
custom_bf8_t
{
unsigned
_BitInt
(
8
)
data
;
};
ASSERT_EQ
(
sizeof
(
custom_bf8_t
),
sizeof
(
unsigned
_BitInt
(
8
)));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
2
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
4
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
8
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
16
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
32
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bf8_t
,
64
>
),
sizeof
(
vector_type
<
unsigned
_BitInt
(
8
),
64
>
));
}
TEST
(
Custom_bf8
,
TestAsType
)
{
struct
custom_bf8_t
{
using
type
=
unsigned
_BitInt
(
8
);
type
data
;
custom_bf8_t
()
:
data
{
type
{}}
{}
custom_bf8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
unsigned
_BitInt
(
8
)
>
test_vec
=
{
type_convert
<
unsigned
_BitInt
(
8
)
>
(
0.3
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
-
0.6
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
0.8
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_bf8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}(
[
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{})
=
custom_bf8_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_bf8_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_bf8
,
TestAsTypeReshape
)
{
struct
custom_bf8_t
{
using
type
=
unsigned
_BitInt
(
8
);
type
data
;
custom_bf8_t
()
:
data
{
type
{}}
{}
custom_bf8_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
unsigned
_BitInt
(
8
)
>
test_vec
=
{
type_convert
<
unsigned
_BitInt
(
8
)
>
(
0.3
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
-
0.6
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
0.8
f
),
type_convert
<
unsigned
_BitInt
(
8
)
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_bf8_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}(
[
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{}).
data
,
0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{})
=
custom_bf8_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_bf8_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_bf8_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bf8_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_half
,
TestSize
)
{
struct
custom_half_t
{
half_t
data
;
};
ASSERT_EQ
(
sizeof
(
custom_half_t
),
sizeof
(
half_t
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
2
>
),
sizeof
(
vector_type
<
half_t
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
4
>
),
sizeof
(
vector_type
<
half_t
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
8
>
),
sizeof
(
vector_type
<
half_t
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
16
>
),
sizeof
(
vector_type
<
half_t
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
32
>
),
sizeof
(
vector_type
<
half_t
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_half_t
,
64
>
),
sizeof
(
vector_type
<
half_t
,
64
>
));
}
TEST
(
Custom_half
,
TestAsType
)
{
struct
custom_half_t
{
using
type
=
half_t
;
type
data
;
custom_half_t
()
:
data
{
type
{}}
{}
custom_half_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
half_t
>
test_vec
=
{
half_t
{
0.3
f
},
half_t
{
-
0.6
f
},
half_t
{
0.8
f
},
half_t
{
-
0.2
f
}};
// reference vector
vector_type
<
custom_half_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{}).
data
,
type_convert
<
half_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{})
=
custom_half_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_half_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_half
,
TestAsTypeReshape
)
{
struct
custom_half_t
{
using
type
=
half_t
;
type
data
;
custom_half_t
()
:
data
{
type
{}}
{}
custom_half_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
half_t
>
test_vec
=
{
half_t
{
0.3
f
},
half_t
{
-
0.6
f
},
half_t
{
0.8
f
},
half_t
{
-
0.2
f
}};
// reference vector
vector_type
<
custom_half_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{}).
data
,
type_convert
<
half_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{})
=
custom_half_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_half_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_half_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_half_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_bhalf
,
TestSize
)
{
struct
custom_bhalf_t
{
bhalf_t
data
;
};
ASSERT_EQ
(
sizeof
(
custom_bhalf_t
),
sizeof
(
bhalf_t
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
2
>
),
sizeof
(
vector_type
<
bhalf_t
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
4
>
),
sizeof
(
vector_type
<
bhalf_t
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
8
>
),
sizeof
(
vector_type
<
bhalf_t
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
16
>
),
sizeof
(
vector_type
<
bhalf_t
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
32
>
),
sizeof
(
vector_type
<
bhalf_t
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_bhalf_t
,
64
>
),
sizeof
(
vector_type
<
bhalf_t
,
64
>
));
}
TEST
(
Custom_bhalf
,
TestAsType
)
{
struct
custom_bhalf_t
{
using
type
=
bhalf_t
;
type
data
;
custom_bhalf_t
()
:
data
{
type
{}}
{}
custom_bhalf_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
bhalf_t
>
test_vec
=
{
type_convert
<
bhalf_t
>
(
0.3
f
),
type_convert
<
bhalf_t
>
(
-
0.6
f
),
type_convert
<
bhalf_t
>
(
0.8
f
),
type_convert
<
bhalf_t
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_bhalf_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{}).
data
,
type_convert
<
bhalf_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{})
=
custom_bhalf_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_bhalf_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_bhalf
,
TestAsTypeReshape
)
{
struct
custom_bhalf_t
{
using
type
=
bhalf_t
;
type
data
;
custom_bhalf_t
()
:
data
{
type
{}}
{}
custom_bhalf_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
bhalf_t
>
test_vec
=
{
type_convert
<
bhalf_t
>
(
0.3
f
),
type_convert
<
bhalf_t
>
(
-
0.6
f
),
type_convert
<
bhalf_t
>
(
0.8
f
),
type_convert
<
bhalf_t
>
(
-
0.2
f
)};
// reference vector
vector_type
<
custom_bhalf_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{}).
data
,
type_convert
<
bhalf_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{})
=
custom_bhalf_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_bhalf_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_bhalf_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_bhalf_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_float
,
TestSize
)
{
struct
custom_float_t
{
float
data
;
};
ASSERT_EQ
(
sizeof
(
custom_float_t
),
sizeof
(
float
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
2
>
),
sizeof
(
vector_type
<
float
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
4
>
),
sizeof
(
vector_type
<
float
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
8
>
),
sizeof
(
vector_type
<
float
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
16
>
),
sizeof
(
vector_type
<
float
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
32
>
),
sizeof
(
vector_type
<
float
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_float_t
,
64
>
),
sizeof
(
vector_type
<
float
,
64
>
));
}
TEST
(
Custom_float
,
TestAsType
)
{
struct
custom_float_t
{
using
type
=
float
;
type
data
;
custom_float_t
()
:
data
{
type
{}}
{}
custom_float_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
float
>
test_vec
=
{
0.3
f
,
-
0.6
f
,
0.8
f
,
-
0.2
f
};
// reference vector
vector_type
<
custom_float_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{}).
data
,
0.0
f
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{})
=
custom_float_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_float_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_float
,
TestAsTypeReshape
)
{
struct
custom_float_t
{
using
type
=
float
;
type
data
;
custom_float_t
()
:
data
{
type
{}}
{}
custom_float_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
float
>
test_vec
=
{
0.3
f
,
-
0.6
f
,
0.8
f
,
-
0.2
f
};
// reference vector
vector_type
<
custom_float_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{}).
data
,
0.0
f
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{})
=
custom_float_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_float_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_float_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_float_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_double
,
TestSize
)
{
struct
custom_double_t
{
double
data
;
};
ASSERT_EQ
(
sizeof
(
custom_double_t
),
sizeof
(
double
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
2
>
),
sizeof
(
vector_type
<
double
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
4
>
),
sizeof
(
vector_type
<
double
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
8
>
),
sizeof
(
vector_type
<
double
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
16
>
),
sizeof
(
vector_type
<
double
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
32
>
),
sizeof
(
vector_type
<
double
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
custom_double_t
,
64
>
),
sizeof
(
vector_type
<
double
,
64
>
));
}
TEST
(
Custom_double
,
TestAsType
)
{
struct
custom_double_t
{
using
type
=
double
;
type
data
;
custom_double_t
()
:
data
{
type
{}}
{}
custom_double_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
double
>
test_vec
=
{
0.3
,
0.6
,
0.8
,
0.2
};
// reference vector
vector_type
<
custom_double_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{}).
data
,
0.0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{})
=
custom_double_t
{
test_vec
.
at
(
i
)};
});
// copy the vector
vector_type
<
custom_double_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Custom_double
,
TestAsTypeReshape
)
{
struct
custom_double_t
{
using
type
=
double
;
type
data
;
custom_double_t
()
:
data
{
type
{}}
{}
custom_double_t
(
type
init
)
:
data
{
init
}
{}
};
// test size
const
int
size
=
4
;
std
::
vector
<
double
>
test_vec
=
{
0.3
,
0.6
,
0.8
,
0.2
};
// reference vector
vector_type
<
custom_double_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{}).
data
,
0.0
);
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{})
=
custom_double_t
{
test_vec
.
at
(
i
)};
});
// copy the first half of a vector
vector_type
<
custom_double_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
custom_double_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
custom_double_t
>()(
Number
<
i
>
{}).
data
,
test_vec
.
at
(
i
));
});
}
TEST
(
Complex_half
,
TestSize
)
{
struct
complex_half_t
{
half_t
real
;
half_t
img
;
};
ASSERT_EQ
(
sizeof
(
complex_half_t
),
sizeof
(
half_t
)
+
sizeof
(
half_t
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
2
>
),
sizeof
(
vector_type
<
half_t
,
2
>
)
+
sizeof
(
vector_type
<
half_t
,
2
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
4
>
),
sizeof
(
vector_type
<
half_t
,
4
>
)
+
sizeof
(
vector_type
<
half_t
,
4
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
8
>
),
sizeof
(
vector_type
<
half_t
,
8
>
)
+
sizeof
(
vector_type
<
half_t
,
8
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
16
>
),
sizeof
(
vector_type
<
half_t
,
16
>
)
+
sizeof
(
vector_type
<
half_t
,
16
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
32
>
),
sizeof
(
vector_type
<
half_t
,
32
>
)
+
sizeof
(
vector_type
<
half_t
,
32
>
));
ASSERT_EQ
(
sizeof
(
vector_type
<
complex_half_t
,
64
>
),
sizeof
(
vector_type
<
half_t
,
64
>
)
+
sizeof
(
vector_type
<
half_t
,
64
>
));
}
TEST
(
Complex_half
,
TestAlignment
)
{
struct
complex_half_t
{
half_t
real
;
half_t
img
;
};
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
2
>
),
alignof
(
vector_type
<
half_t
,
2
>
)
+
alignof
(
vector_type
<
half_t
,
2
>
));
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
4
>
),
alignof
(
vector_type
<
half_t
,
4
>
)
+
alignof
(
vector_type
<
half_t
,
4
>
));
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
8
>
),
alignof
(
vector_type
<
half_t
,
8
>
)
+
alignof
(
vector_type
<
half_t
,
8
>
));
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
16
>
),
alignof
(
vector_type
<
half_t
,
16
>
)
+
alignof
(
vector_type
<
half_t
,
16
>
));
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
32
>
),
alignof
(
vector_type
<
half_t
,
32
>
)
+
alignof
(
vector_type
<
half_t
,
32
>
));
ASSERT_EQ
(
alignof
(
vector_type
<
complex_half_t
,
64
>
),
alignof
(
vector_type
<
half_t
,
64
>
)
+
alignof
(
vector_type
<
half_t
,
64
>
));
}
TEST
(
Complex_half
,
TestAsType
)
{
struct
complex_half_t
{
using
type
=
half_t
;
type
real
;
type
img
;
complex_half_t
()
:
real
{
type
{}},
img
{
type
{}}
{}
complex_half_t
(
type
real_init
,
type
img_init
)
:
real
{
real_init
},
img
{
img_init
}
{}
};
// test size
const
int
size
=
4
;
// custom type number of elements
const
int
num_elem
=
sizeof
(
complex_half_t
)
/
sizeof
(
complex_half_t
::
type
);
std
::
vector
<
half_t
>
test_vec
=
{
half_t
{
0.3
f
},
half_t
{
-
0.6
f
},
half_t
{
0.8
f
},
half_t
{
-
0.2
f
},
half_t
{
0.5
f
},
half_t
{
-
0.7
f
},
half_t
{
0.9
f
},
half_t
{
-
0.3
f
}};
// reference vector
vector_type
<
complex_half_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
real
,
type_convert
<
half_t
>
(
0.0
f
));
ASSERT_EQ
(
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
img
,
type_convert
<
half_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{})
=
complex_half_t
{
test_vec
.
at
(
num_elem
*
i
),
test_vec
.
at
(
num_elem
*
i
+
1
)};
});
// copy the vector
vector_type
<
complex_half_t
,
size
>
left_vec
{
right_vec
};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
real
,
test_vec
.
at
(
num_elem
*
i
));
ASSERT_EQ
(
left_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
img
,
test_vec
.
at
(
num_elem
*
i
+
1
));
});
}
TEST
(
Complex_half
,
TestAsTypeReshape
)
{
struct
complex_half_t
{
using
type
=
half_t
;
type
real
;
type
img
;
complex_half_t
()
:
real
{
type
{}},
img
{
type
{}}
{}
complex_half_t
(
type
real_init
,
type
img_init
)
:
real
{
real_init
},
img
{
img_init
}
{}
};
// test size
const
int
size
=
4
;
// custom type number of elements
const
int
num_elem
=
sizeof
(
complex_half_t
)
/
sizeof
(
complex_half_t
::
type
);
std
::
vector
<
half_t
>
test_vec
=
{
half_t
{
0.3
f
},
half_t
{
-
0.6
f
},
half_t
{
0.8
f
},
half_t
{
-
0.2
f
},
half_t
{
0.5
f
},
half_t
{
-
0.7
f
},
half_t
{
0.9
f
},
half_t
{
-
0.3
f
}};
// reference vector
vector_type
<
complex_half_t
,
size
>
right_vec
;
// check default CTOR
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
real
,
type_convert
<
half_t
>
(
0.0
f
));
ASSERT_EQ
(
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
img
,
type_convert
<
half_t
>
(
0.0
f
));
});
// assign test values to the vector
ck
::
static_for
<
0
,
size
,
1
>
{}([
&
](
auto
i
)
{
right_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{})
=
complex_half_t
{
test_vec
.
at
(
num_elem
*
i
),
test_vec
.
at
(
num_elem
*
i
+
1
)};
});
// copy the first half of a vector
vector_type
<
complex_half_t
,
size
/
2
>
left_vec
{
right_vec
.
template
AsType
<
vector_type
<
complex_half_t
,
size
/
2
>
::
type
>
()(
Number
<
0
>
{})};
// check if values were copied correctly
ck
::
static_for
<
0
,
size
/
2
,
1
>
{}([
&
](
auto
i
)
{
ASSERT_EQ
(
left_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
real
,
test_vec
.
at
(
num_elem
*
i
));
ASSERT_EQ
(
left_vec
.
template
AsType
<
complex_half_t
>()(
Number
<
i
>
{}).
img
,
test_vec
.
at
(
num_elem
*
i
+
1
));
});
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment