gaoqiong / composable_kernel — Commits

Commit bc641634, authored Nov 18, 2023 by Jun Liu
Merge branch 'develop-tmp' into amd-develop
Parents: f30e5975, a3d9a2cd

Changes: 235. Showing 15 changed files with 288 additions and 112 deletions (+288, -112).
Changed files shown on this page:

script/redis-cli.conf                                                 +10   -0
script/sccache_wrapper.sh                                             +56   -0
test/CMakeLists.txt                                                    +1   -1
test/contraction/test_contraction.cpp                                 +96  -55
test/contraction/test_contraction_interface.cpp                        +5   -5
test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp             +26  -20
test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp    +8   -6
test/grouped_gemm/test_grouped_gemm_interface.cpp                      +4   -0
test/normalization/CMakeLists.txt                                      +0  -21
test/normalization_fwd/CMakeLists.txt                                 +30   -0
test/normalization_fwd/test_groupnorm_fwd_fp16.cpp                     +1   -1
test/normalization_fwd/test_groupnorm_fwd_fp32.cpp                     +1   -1
test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp                   +1   -1
test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp                   +1   -1
test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp                  +48   -0
script/redis-cli.conf  (new file, 0 → 100644)

fips = no
setuid = root
setgid = root
pid = /var/run/stunnel.pid
debug = 7
options = NO_SSLv2
options = NO_SSLv3
[redis-cli]
client = yes
accept = 127.0.0.1:6379
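Despite its name, this file appears to be an stunnel service definition rather than a Redis configuration (note the stunnel pid path and the client/accept directives): it exposes a plaintext listener on 127.0.0.1:6379 that redis-cli can talk to, while the TLS peer has to be supplied by a connect = line that is not part of this excerpt. A minimal usage sketch under that assumption:

# Hypothetical usage -- assumes a 'connect = <remote-tls-endpoint>' line has been
# added to the [redis-cli] section; that address is not part of this commit.
stunnel script/redis-cli.conf          # start the tunnel defined above
redis-cli -h 127.0.0.1 -p 6379 ping    # plaintext to the local accept port; stunnel
                                       # forwards it over TLS and should answer PONG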
script/sccache_wrapper.sh  (new file, 0 → 100755)

#!/bin/bash

set -e

COMPILERS_HASH_DIR=${COMPILERS_HASH_DIR:-"/tmp/.sccache"}
SCCACHE_EXTRAFILES=${SCCACHE_EXTRAFILES:-"${COMPILERS_HASH_DIR}/rocm_compilers_hash_file"}
SCCACHE_BIN=${SCCACHE_BIN:-"${SCCACHE_INSTALL_LOCATION}/sccache"}
ENFORCE_REDIS="false"

while [ "$1" != "" ]; do
    case $1 in
        --enforce_redis)
            shift; ENFORCE_REDIS="true"
            ;;
        --no-hipcc)
            shift
            ;;
        *)
            break
            ;;
    esac
done

setup_rocm_compilers_hash_file()
{
    mkdir -p "$COMPILERS_HASH_DIR"

    HIPCC_MD5="$(md5sum "${ROCM_PATH}/bin/hipcc")"

    pushd "${ROCM_PATH}/amdgcn/bitcode"
    DEVICELIBS_BITCODES_MD5="$(find . -type f -exec md5sum {} \; | sort | md5sum)"
    popd

    HIPCC_HASH_VALUE="${HIPCC_MD5%% *}"
    DEVICELIBS_BITCODES_HASH_VALUE="${DEVICELIBS_BITCODES_MD5%% *}"

    # MD5 checksums of clang and clang-offload-bundler cannot be used since they will keep changing
    # if the ROCM_PATH changes, ie; for every mainline build.
    # This is because ROCM_PATH gets encoded into the clang/clang-offload-bundler binaries as part
    # of RPATH.
    # The versions themselves contain the commit hash of the compiler repo at the time of building.
    # Hence, this should be a viable alternative to using the binary checksum itself.
    CLANG_VERSION="$("${ROCM_PATH}/llvm/bin/clang" --version | head -n 1)"
    CLANG_OFFLOAD_BUNDLER_VERSION="$("${ROCM_PATH}/llvm/bin/clang-offload-bundler" --version | head -n 1)"

    printf '%s: %s\n' 'clang version' "${CLANG_VERSION}" | tee -a "$SCCACHE_EXTRAFILES"
    printf '%s: %s\n' 'clang-offload-bundler version' "${CLANG_OFFLOAD_BUNDLER_VERSION}" | tee -a "$SCCACHE_EXTRAFILES"
    printf '%s: %s\n' 'hipcc md5sum' "${HIPCC_HASH_VALUE}" | tee -a "$SCCACHE_EXTRAFILES"
    printf '%s: %s\n' 'devicelibs bitcode md5sum' "${DEVICELIBS_BITCODES_HASH_VALUE}" | tee -a "$SCCACHE_EXTRAFILES"

    echo "sccache-wrapper: compilers hash file set up at ${SCCACHE_EXTRAFILES}"
    cat "$SCCACHE_EXTRAFILES"
}

if [ "${ENFORCE_REDIS}" == "true" ]; then
    if [ -z "${SCCACHE_REDIS}" ]; then
        echo "SCCACHE_REDIS not set. Not wrapping compilers with sccache."
        exit 10
    else
        response=$(redis-cli -u ${SCCACHE_REDIS} ping) || true
        if [ "${response}" != "PONG" ]; then
            echo "Redis server unreachable. Not wrapping compilers with sccache."
            exit 20
        fi
    fi
fi

setup_rocm_compilers_hash_file

$SCCACHE_BIN --version
$SCCACHE_BIN --start-server
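The wrapper only validates the shared-cache environment and records the compiler fingerprints; it does not itself rewrite any compiler invocations. A hypothetical CI usage sketch follows; the install path, the Redis URL, and the CMake launcher wiring are assumptions for illustration, not something this script sets up:

# Hypothetical CI usage sketch -- paths and launcher wiring are assumptions.
export SCCACHE_INSTALL_LOCATION=/opt/sccache
export SCCACHE_REDIS=redis://127.0.0.1:6379     # e.g. the local stunnel endpoint above
./script/sccache_wrapper.sh --enforce_redis     # exits 10/20 if Redis is unset/unreachable

# One common way to route compilations through sccache afterwards:
cmake -DCMAKE_C_COMPILER_LAUNCHER=/opt/sccache/sccache \
      -DCMAKE_CXX_COMPILER_LAUNCHER=/opt/sccache/sccache ..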
test/CMakeLists.txt

@@ -139,7 +139,7 @@ add_subdirectory(grouped_convnd_fwd)
 add_subdirectory(grouped_convnd_bwd_weight)
 add_subdirectory(block_to_ctile_map)
 add_subdirectory(softmax)
-add_subdirectory(normalization)
+add_subdirectory(normalization_fwd)
 add_subdirectory(data_type)
 add_subdirectory(elementwise_normalization)
 add_subdirectory(batchnorm)
test/contraction/test_contraction.cpp

@@ -10,9 +10,12 @@
 #include <gtest/gtest.h>
 #include "profiler/profile_contraction_impl.hpp"
 #include "profiler/profile_contraction_utils.hpp"

-using F32 = float;
-using F64 = double;
+using F16  = ck::half_t;
+using BF16 = ck::bhalf_t;
+using F32  = float;
+using F64  = double;

 using Row = ck::tensor_layout::gemm::RowMajor;
 using Col = ck::tensor_layout::gemm::ColumnMajor;

@@ -20,49 +23,49 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
 using Bilinear = ck::tensor_operation::element_wise::Bilinear;
 using Scale    = ck::tensor_operation::element_wise::Scale;

-struct MemoryParams
+struct Dimensions
 {
     std::vector<ck::index_t> M;
     std::vector<ck::index_t> N;
     std::vector<ck::index_t> K;
-    std::vector<ck::index_t> StridesA;
-    std::vector<ck::index_t> StridesB;
-    std::vector<ck::index_t> StridesC;
-    std::vector<ck::index_t> StridesD;
 };

 template <typename Tuple>
 class TestContraction : public ::testing::Test
 {
     protected:
-    using ALayout        = std::tuple_element_t<0, Tuple>;
-    using BLayout        = std::tuple_element_t<1, Tuple>;
-    using CDLayout       = std::tuple_element_t<2, Tuple>;
-    using DataType       = std::tuple_element_t<3, Tuple>;
-    using DTupleDataType = std::tuple_element_t<4, Tuple>;
-    using CDElementOp    = std::tuple_element_t<5, Tuple>;
-
-    std::vector<MemoryParams> list_of_memory_params =
-        {{{32, 32}, {32, 32}, {32, 32},
-          {32768, 1024, 32, 1}, {32768, 1024, 32, 1}, {32768, 1024, 32, 1}, {32768, 1024, 32, 1}},
-         {{16, 16}, {32, 32}, {16, 16},
-          {4096, 256, 16, 1}, {16, 1, 8192, 256}, {16384, 1024, 32, 1}, {16384, 1024, 32, 1}}};
-
-    std::vector<ck::index_t> init_methods = {0, 1, 2};
+    using ALayout         = std::tuple_element_t<0, Tuple>;
+    using BLayout         = std::tuple_element_t<1, Tuple>;
+    using CDLayout        = std::tuple_element_t<2, Tuple>;
+    using DataType        = std::tuple_element_t<3, Tuple>;
+    using DTupleDataType  = std::tuple_element_t<4, Tuple>;
+    using ComputeDataType = std::tuple_element_t<5, Tuple>;
+    using CDElementOp     = std::tuple_element_t<6, Tuple>;
+
+    std::vector<Dimensions> dimension_list = {{{32, 32}, {32, 32}, {32, 32}},
+                                              {{16, 16}, {32, 32}, {16, 16}}};
+
+    std::vector<ck::index_t> init_methods = {1, 2};

     std::unique_ptr<CDElementOp> p_cd_element_op;

     void Run()
     {
-        for(auto& memory_params : list_of_memory_params)
+        for(auto& dimension_params : dimension_list)
         {
+            std::vector<ck::index_t> StridesA;
+            std::vector<ck::index_t> StridesB;
+            std::vector<ck::index_t> StridesC;
+            std::vector<ck::index_t> StridesD;
+
+            const auto& M = dimension_params.M;
+            const auto& N = dimension_params.N;
+            const auto& K = dimension_params.K;
+
+            assign_default_strides(ALayout{}, StridesA, {M[0], M[1], K[0], K[1]});
+            assign_default_strides(BLayout{}, StridesB, {N[0], N[1], K[0], K[1]});
+            assign_default_strides(CDLayout{}, StridesC, {M[0], M[1], N[0], N[1]});
+            assign_default_strides(CDLayout{}, StridesD, {M[0], M[1], N[0], N[1]});
+
             for(const ck::index_t init_method : init_methods)
             {
                 bool pass =

@@ -70,19 +73,20 @@ class TestContraction : public ::testing::Test
                                                    BLayout,
                                                    CDLayout,
                                                    DataType,
+                                                   ComputeDataType,
                                                    DTupleDataType,
                                                    CDElementOp>(true /*do_verification*/,
                                                                 init_method,
                                                                 false /*do_logs*/,
                                                                 false /*time_kernel*/,
                                                                 *p_cd_element_op,
-                                                                memory_params.M,
-                                                                memory_params.N,
-                                                                memory_params.K,
-                                                                memory_params.StridesA,
-                                                                memory_params.StridesB,
-                                                                memory_params.StridesC,
-                                                                memory_params.StridesD);
+                                                                dimension_params.M,
+                                                                dimension_params.N,
+                                                                dimension_params.K,
+                                                                StridesA,
+                                                                StridesB,
+                                                                StridesC,
+                                                                StridesD);
                 EXPECT_TRUE(pass);
             }
         }

@@ -99,24 +103,18 @@ class TestContractionBilinear : public TestContraction<Tuple>
 {
 };

+#define ALL_LAYOUT_COMBINATIONS(dt, tuple_dt, compute_dt, op)      \
+    std::tuple<Row, Row, Row, dt, tuple_dt, compute_dt, op>,       \
+    std::tuple<Row, Col, Row, dt, tuple_dt, compute_dt, op>,       \
+    std::tuple<Col, Row, Row, dt, tuple_dt, compute_dt, op>,       \
+    std::tuple<Col, Col, Row, dt, tuple_dt, compute_dt, op>
+
 using BilinearKernelTypes =
-    ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Row, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Col, Row, F32, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Row, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Row, Col, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Row, Row, F64, ck::Tuple<F32>, Bilinear>,
-                     std::tuple<Col, Col, Row, F64, ck::Tuple<F32>, Bilinear>>;
-
-using ScaleKernelTypes =
-    ::testing::Types<std::tuple<Row, Row, Row, F32, ck::Tuple<>, Scale>,
-                     std::tuple<Row, Col, Row, F32, ck::Tuple<>, Scale>,
-                     std::tuple<Col, Row, Row, F32, ck::Tuple<>, Scale>,
-                     std::tuple<Col, Col, Row, F32, ck::Tuple<>, Scale>,
-                     std::tuple<Row, Row, Row, F64, ck::Tuple<>, Scale>,
-                     std::tuple<Row, Col, Row, F64, ck::Tuple<>, Scale>,
-                     std::tuple<Col, Row, Row, F64, ck::Tuple<>, Scale>,
-                     std::tuple<Col, Col, Row, F64, ck::Tuple<>, Scale>>;
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<F64>, F64, Bilinear)>;
+
+using ScaleKernelTypes =
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, F32, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<>, F64, Scale)>;

 TYPED_TEST_SUITE(TestContractionBilinear, BilinearKernelTypes);
 TYPED_TEST_SUITE(TestContractionScale, ScaleKernelTypes);

@@ -136,3 +134,46 @@ TYPED_TEST(TestContractionScale, scale)
     this->p_cd_element_op = std::make_unique<Scale>(0.5f);
     this->Run();
 }
+
+template <typename Tuple>
+class TestContractionScaleMixedPrecision : public TestContraction<Tuple>
+{
+};
+
+template <typename Tuple>
+class TestContractionBilinearMixedPrecision : public TestContraction<Tuple>
+{
+};
+
+using BilinearKernelTypesMixedPrecision =
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, F16, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<F32>, BF16, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<F64>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(F16, ck::Tuple<F16>, F32, Bilinear),
+                     ALL_LAYOUT_COMBINATIONS(BF16, ck::Tuple<BF16>, F32, Bilinear)>;
+
+using ScaleKernelTypesMixedPrecision =
+    ::testing::Types<ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, F16, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F32, ck::Tuple<>, BF16, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F64, ck::Tuple<>, F32, Scale),
+                     ALL_LAYOUT_COMBINATIONS(F16, ck::Tuple<>, F32, Scale),
+                     ALL_LAYOUT_COMBINATIONS(BF16, ck::Tuple<>, F32, Scale)>;
+
+TYPED_TEST_SUITE(TestContractionBilinearMixedPrecision, BilinearKernelTypesMixedPrecision);
+TYPED_TEST_SUITE(TestContractionScaleMixedPrecision, ScaleKernelTypesMixedPrecision);
+
+TYPED_TEST(TestContractionBilinearMixedPrecision, bilinear)
+{
+    this->p_cd_element_op = std::make_unique<Bilinear>(1.f, 1.f);
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Bilinear>(-0.5f, 0.5f);
+    this->Run();
+}
+
+TYPED_TEST(TestContractionScaleMixedPrecision, scale)
+{
+    this->p_cd_element_op = std::make_unique<Scale>(1.f);
+    this->Run();
+    this->p_cd_element_op = std::make_unique<Scale>(0.5f);
+    this->Run();
+}
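The refactoring above replaces sixteen hand-written std::tuple entries with an ALL_LAYOUT_COMBINATIONS macro that splices four tuples (one per A/B layout combination) into each ::testing::Types list, and threads a new ComputeDataType slot through the test tuple. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the same mechanism; the names (LAYOUTS, TestFixture, ParamList, OpA) are illustrative only and not part of this commit:

#include <gtest/gtest.h>
#include <tuple>

// Hypothetical stand-ins for the layout/operation tags used by the real test.
struct Row {};
struct Col {};
struct OpA {};

// Expands to several std::tuple types separated by commas, exactly like
// ALL_LAYOUT_COMBINATIONS above, so one macro call can be dropped straight
// into a ::testing::Types<...> list.
#define LAYOUTS(dt, op)            \
    std::tuple<Row, Row, dt, op>,  \
    std::tuple<Row, Col, dt, op>,  \
    std::tuple<Col, Col, dt, op>

template <typename Tuple>
class TestFixture : public ::testing::Test
{
    protected:
    using ALayout  = std::tuple_element_t<0, Tuple>;
    using BLayout  = std::tuple_element_t<1, Tuple>;
    using DataType = std::tuple_element_t<2, Tuple>;
};

using ParamList = ::testing::Types<LAYOUTS(float, OpA), LAYOUTS(double, OpA)>;

TYPED_TEST_SUITE(TestFixture, ParamList);

TYPED_TEST(TestFixture, CompilesForEveryTuple)
{
    // gtest instantiates this body once per tuple in ParamList (6 in this sketch).
    SUCCEED();
}

Because gtest instantiates the fixture once per tuple, adding a single macro invocation adds a whole family of layout cases at once, which is what keeps the mixed-precision suites above compact.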
test/contraction/test_contraction_interface.cpp

The only change in this file is that the DeviceContractionMultipleD_Xdl_CShuffle instance gains a trailing ComputeDataType template parameter (F32 here), together with a matching "Compute Data Type" column at the end of the parameter-name comment; the previous version of the excerpt is identical apart from that column and the final F32 argument. The updated excerpt:

@@ -34,11 +34,11 @@ class ContractionInstanceWrapper
    static constexpr ck::index_t NumDim = 2;

    // clang-format off
    using ContractionDeviceInstance = ck::tensor_operation::device::
        //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute|
        //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Data|
        //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Type|
        //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
        DeviceContractionMultipleD_Xdl_CShuffle< NumDim, NumDim, NumDim, F32, F32, F32, F32, ck::Tuple<F32>, F32, Pass, Pass, Bilinear, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, ABlockTransferSrcVectorDim, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, BBlockTransferSrcVectorDim, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, CDEBlockTransferScalarPerVector, F32>;
    // clang-format on

    bool isSupported(std::vector<ck::index_t>& ADims, ...
test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp

@@ -45,14 +45,20 @@ class TestConvTensorRearrange : public ::testing::Test
 using namespace ck::tensor_layout::convolution;
 using namespace ck::conv_tensor_rearrange_op;

-using KernelTypes1d = ::testing::Types<std::tuple<GNWC, ImageToColumn>,
-                                       std::tuple<GNWC, ColumnToImage>>;
+using KernelTypes1d = ::testing::Types<std::tuple<GNWC, ImageToColumn>,
+                                       std::tuple<GNWC, ColumnToImage>,
+                                       std::tuple<NWGC, ImageToColumn>,
+                                       std::tuple<NWGC, ColumnToImage>>;

-using KernelTypes2d = ::testing::Types<std::tuple<GNHWC, ImageToColumn>,
-                                       std::tuple<GNHWC, ColumnToImage>>;
+using KernelTypes2d = ::testing::Types<std::tuple<GNHWC, ImageToColumn>,
+                                       std::tuple<GNHWC, ColumnToImage>,
+                                       std::tuple<NHWGC, ImageToColumn>,
+                                       std::tuple<NHWGC, ColumnToImage>>;

-using KernelTypes3d = ::testing::Types<std::tuple<GNDHWC, ImageToColumn>,
-                                       std::tuple<GNDHWC, ColumnToImage>>;
+using KernelTypes3d = ::testing::Types<std::tuple<GNDHWC, ImageToColumn>,
+                                       std::tuple<GNDHWC, ColumnToImage>,
+                                       std::tuple<NDHWGC, ImageToColumn>,
+                                       std::tuple<NDHWGC, ColumnToImage>>;

 template <typename Tuple>
 class TestConvTensorRearrange1d : public TestConvTensorRearrange<Tuple>

@@ -77,16 +83,16 @@ TYPED_TEST(TestConvTensorRearrange1d, Test1D)
 {
     this->conv_params.clear();
-    this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
-    this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 2, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}});
+    this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}});
     // ScalarPerVector should be 1
-    this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}});
     // stride != 1
-    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}});
     // dilation != 1
-    this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
+    this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}});
 #ifdef CK_ENABLE_FP32
     this->template Run<1, float, float>();
 #endif

@@ -106,13 +112,13 @@ TYPED_TEST(TestConvTensorRearrange2d, Test2D)
     this->conv_params.clear();
-    this->conv_params.push_back({2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
-    this->conv_params.push_back({2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
     this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
     this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
-    this->conv_params.push_back({2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
+    this->conv_params.push_back({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
 #ifdef CK_ENABLE_FP32
     this->template Run<2, float, float>();
 #endif

@@ -131,13 +137,13 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D)
 {
     this->conv_params.clear();
-    this->conv_params.push_back({3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back({3, 2, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}});
-    this->conv_params.push_back({3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back({3, 2, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}});
-    this->conv_params.push_back({3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
+    this->conv_params.push_back({3, 2, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}});
-    this->conv_params.push_back({3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
+    this->conv_params.push_back({3, 2, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}});
 #ifdef CK_ENABLE_FP32
     this->template Run<3, float, float>();
 #endif
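Functionally, these cases change in one way: each updated convolution parameter set now uses two groups instead of one, which matches the newly covered group-last layouts (NWGC, NHWGC, NDHWGC). The positional initializer list is easy to misread, so a hedged, standalone reading of the first updated 1-D case is sketched below; the struct and its field names are an illustrative stand-in, not the actual ck::utils::conv parameter type:

#include <vector>

// Illustrative stand-in for the conv parameter type used above -- the member
// names are an assumption made for readability, not taken from this diff.
struct ConvParamSketch
{
    int ndim;                    // number of spatial dimensions (1, 2 or 3 above)
    int G;                       // group count -- the value raised from 1 to 2 in this commit
    int N;                       // batch size
    int K;                       // output channels per group
    int C;                       // input channels per group
    std::vector<int> filter;     // filter spatial lengths
    std::vector<int> input;      // input spatial lengths
    std::vector<int> strides;    // convolution strides
    std::vector<int> dilations;  // convolution dilations
    std::vector<int> left_pads;  // input left pads
    std::vector<int> right_pads; // input right pads
};

// The first updated 1-D case above, re-expressed with the sketch type:
const ConvParamSketch example{1, 2, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}};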
test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp

@@ -53,7 +53,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
    template <typename ConvTensorRearrangeOp>
    bool Run()
    {
        const auto G     = conv_param.G_;
        const auto N     = conv_param.N_;
        const auto C     = conv_param.C_;
        const auto FakeC = ...

@@ -71,13 +71,13 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
        const auto image_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_param);
-       const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX});
+       const auto gemm_desc = HostTensorDescriptor({G, NDoHoWo, CZYX});

        std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
        std::array<ck::index_t, NDimSpatial + 3> input_g_n_c_wis_strides{};
-       std::array<ck::index_t, 2> output_m_k_strides{};
+       std::array<ck::index_t, 3> output_g_m_k_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
        std::array<ck::index_t, NDimSpatial> input_left_pads{};

@@ -89,7 +89,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
        copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths);
        copy(conv_param.output_spatial_lengths_, output_spatial_lengths);
        copy(image_desc.GetStrides(), input_g_n_c_wis_strides);
-       copy(gemm_desc.GetStrides(), output_m_k_strides);
+       copy(gemm_desc.GetStrides(), output_g_m_k_strides);
        copy(conv_param.conv_filter_strides_, conv_filter_strides);
        copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
        copy(conv_param.input_left_pads_, input_left_pads);

@@ -100,13 +100,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
        auto img2col  = DeviceImgToColInstance{};
        auto argument = img2col.MakeArgument(nullptr,
                                             nullptr,
+                                            G,
                                             N,
                                             IsCPacked ? C : FakeC,
                                             input_spatial_lengths,
                                             filter_spatial_lengths,
                                             output_spatial_lengths,
                                             input_g_n_c_wis_strides,
-                                            output_m_k_strides,
+                                            output_g_m_k_strides,
                                             conv_filter_strides,
                                             conv_filter_dilations,
                                             input_left_pads,

@@ -119,13 +120,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test
        auto col2img  = DeviceColToimgInstance{};
        auto argument = col2img.MakeArgument(nullptr,
                                             nullptr,
+                                            G,
                                             N,
                                             IsCPacked ? C : FakeC,
                                             input_spatial_lengths,
                                             filter_spatial_lengths,
                                             output_spatial_lengths,
                                             input_g_n_c_wis_strides,
-                                            output_m_k_strides,
+                                            output_g_m_k_strides,
                                             conv_filter_strides,
                                             conv_filter_dilations,
                                             input_left_pads,
test/grouped_gemm/test_grouped_gemm_interface.cpp

@@ -108,6 +108,10 @@ TEST_F(TestGGemmSplitKInterface_MKNKMN, KLoops)
     // kloops % 2
     Ks = std::vector<int>{256, 512, 320, 768};
     EXPECT_FALSE(
         DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
+    Ks = std::vector<int>{256, 512, 384, 768};
+    EXPECT_TRUE(
+        DefaultGGemmInstance{}.IsSupported(Ms, Ns, Ks, StrideAs, StrideBs, StrideCs, kbatch));
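The added pair is the positive counterpart to the existing negative case: the "kloops % 2" comment suggests the instance requires an even number of K loops, so a K of 320 is rejected while 384 is accepted. With a hypothetical K tile of 64, 320 / 64 = 5 loops (odd) versus 384 / 64 = 6 loops (even); the actual tile size is fixed by DefaultGGemmInstance, which is not shown on this page, so those numbers are illustrative only.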
test/normalization/CMakeLists.txt  (deleted, 100644 → 0, shown at parent f30e5975)

add_custom_target(test_normalization)
add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_layernorm2d_fp32)
endif()
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_groupnorm_fp32)
endif()
add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_layernorm2d_fp16)
endif()
add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
    add_dependencies(test_normalization test_groupnorm_fp16)
endif()
test/normalization_fwd/CMakeLists.txt  (new file, 0 → 100644)

add_custom_target(test_normalization_fwd)

add_gtest_executable(test_layernorm2d_fwd_fp32 test_layernorm2d_fwd_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp32)
endif()

add_gtest_executable(test_groupnorm_fwd_fp32 test_groupnorm_fwd_fp32.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fwd_fp32 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp32)
endif()

add_gtest_executable(test_layernorm2d_fwd_fp16 test_layernorm2d_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm2d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm2d_fwd_fp16)
endif()

add_gtest_executable(test_layernorm4d_fwd_fp16 test_layernorm4d_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_layernorm4d_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_layernorm4d_fwd_fp16)
endif()

add_gtest_executable(test_groupnorm_fwd_fp16 test_groupnorm_fwd_fp16.cpp)
if(result EQUAL 0)
    target_link_libraries(test_groupnorm_fwd_fp16 PRIVATE utility device_normalization_fwd_instance)
    add_dependencies(test_normalization_fwd test_groupnorm_fwd_fp16)
endif()
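Each block is guarded by if(result EQUAL 0), so a test is only linked and wired into the umbrella test_normalization_fwd target when the preceding add_gtest_executable call reported success. add_gtest_executable is a project-local helper defined elsewhere in the repository; a hedged sketch of how such a helper might report success through a caller-visible result variable is shown below (this illustrates the pattern only and is not the actual macro):

# Hypothetical sketch of an add_gtest_executable-style helper; assumes
# find_package(GTest) has already been called at a higher level.
macro(add_gtest_executable_sketch TEST_NAME TEST_SRC)
    set(result 1)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${TEST_SRC}")
        add_executable(${TEST_NAME} ${TEST_SRC})
        target_link_libraries(${TEST_NAME} PRIVATE GTest::gtest_main)
        add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
        set(result 0)   # signal to the caller that the target was created
    endif()
endmacro()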
test/normalization/test_groupnorm_fp16.cpp → test/normalization_fwd/test_groupnorm_fwd_fp16.cpp

@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
test/normalization/test_groupnorm_fp32.cpp → test/normalization_fwd/test_groupnorm_fwd_fp32.cpp

@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_groupnorm_impl.hpp"
+#include "profiler/profile_groupnorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
test/normalization/test_layernorm2d_fp16.cpp → test/normalization_fwd/test_layernorm2d_fwd_fp16.cpp

@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
test/normalization/test_layernorm2d_fp32.cpp → test/normalization_fwd/test_layernorm2d_fwd_fp32.cpp

@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "gtest/gtest.h"
-#include "profiler/profile_layernorm_impl.hpp"
+#include "profiler/profile_layernorm_fwd_impl.hpp"

 using F16 = ck::half_t;
 using F32 = float;
test/normalization_fwd/test_layernorm4d_fwd_fp16.cpp  (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/profile_layernorm_fwd_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestLayernorm4d : public ::testing::Test
{
    protected:
    using XDataType              = std::tuple_element_t<0, Tuple>;
    using GammaDataType          = std::tuple_element_t<1, Tuple>;
    using BetaDataType           = std::tuple_element_t<2, Tuple>;
    using ComputeDataType        = std::tuple_element_t<3, Tuple>;
    using YDataType              = std::tuple_element_t<4, Tuple>;
    using SaveMeanInvStdDataType = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // [N, D], reduce D
        std::vector<std::vector<ck::index_t>> lengths = {
            {1, 1, 1, 1}, {7, 7, 7, 7}, {256, 16, 16, 8}};

        for(auto length : lengths)
        {
            bool success = ck::profiler::profile_layernorm_impl<XDataType,
                                                                GammaDataType,
                                                                BetaDataType,
                                                                ComputeDataType,
                                                                YDataType,
                                                                SaveMeanInvStdDataType,
                                                                true,
                                                                4>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType>
    std::tuple<F16, F16, F16, F32, F16, F32>>;

TYPED_TEST_SUITE(TestLayernorm4d, KernelTypes);
TYPED_TEST(TestLayernorm4d, Test_FP16) { this->Run(); }
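The profiler call packs several boolean knobs into positional arguments. A hedged reading, inferred from the analogous annotated call in test_contraction.cpp in this same commit rather than from anything stated in this file:

// Hedged annotation of the call above -- the parameter names are an interpretation
// (cf. the do_verification/do_logs/time_kernel comments in test_contraction.cpp);
// they are not spelled out here.
//
// profile_layernorm_impl<XDataType, GammaDataType, BetaDataType,
//                        ComputeDataType, YDataType, SaveMeanInvStdDataType,
//                        /*SaveMeanInvStd=*/true, /*Rank=*/4>(
//     /*do_verification=*/true,
//     /*init_method=*/2,
//     /*do_logs=*/false,
//     /*time_kernel=*/false,
//     length);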