Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
50b6b104
Commit
50b6b104
authored
Sep 23, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into groupnorm_check
parents
d7bb21c2
e9d4e893
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
160 additions
and
31 deletions
+160
-31
Jenkinsfile
Jenkinsfile
+6
-6
client_example/CMakeLists.txt
client_example/CMakeLists.txt
+1
-1
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
...ration/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
+89
-1
library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
...ar_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+45
-2
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+19
-21
No files found.
Jenkinsfile
View file @
50b6b104
...
...
@@ -663,8 +663,8 @@ pipeline {
{
agent
{
label
rocmnode
(
"gfx908 || gfx90a"
)
}
environment
{
setup_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
-
DBUILD_DEV
=
Off
-
DCMAKE_INSTALL_PREFIX
=..
/install -D
CMAKE_CXX_FLAGS="--offload-arch=gfx908 --offload-arch=gfx90a
-O3 -Xclang -mlink-builtin-bitcode -Xclang /
opt
/rocm/
amdgcn
/bitcode/
oclc_abi_version_400
.
bc
" """
:
""" -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D
CMAKE_CXX_FLAGS="--offload-arch=gfx908 --offload-arch=gfx90a
-O3 " """
}
"
execute_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
cd
..
/client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/
install
;
/opt/
rocm
" -D
CMAKE_CXX_FLAGS="
--
offload
-
arch
=
gfx908
--
offload
-
arch
=
gfx90a
-
O3
-
Xclang
-
mlink
-
builtin
-
bitcode
-
Xclang
/opt/
rocm
/amdgcn/
bitcode
/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../
client_example
&&
rm
-
rf
build
&&
mkdir
build
&&
cd
build
&&
cmake
-
D
CMAKE_PREFIX_PATH
=
"${env.WORKSPACE}/install;/opt/rocm"
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908 --offload-arch=gfx90a
-O3"
-
D
CMAKE_CXX_COMPILER
=
"${build_compiler()}"
..
&&
make
-
j
""" }"
setup_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
-
DBUILD_DEV
=
Off
-
DCMAKE_INSTALL_PREFIX
=..
/install -D
GPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="
-O3 -Xclang -mlink-builtin-bitcode -Xclang /
opt
/rocm/
amdgcn
/bitcode/
oclc_abi_version_400
.
bc
" """
:
""" -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D
GPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="
-O3 " """
}
"
execute_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
cd
..
/client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/
install
;
/opt/
rocm
" -D
GPU_TARGETS="
gfx908
;
gfx90a
" -DCMAKE_CXX_FLAGS="
-
O3
-
Xclang
-
mlink
-
builtin
-
bitcode
-
Xclang
/opt/
rocm
/amdgcn/
bitcode
/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../
client_example
&&
rm
-
rf
build
&&
mkdir
build
&&
cd
build
&&
cmake
-
D
CMAKE_PREFIX_PATH
=
"${env.WORKSPACE}/install;/opt/rocm"
-
D
GPU_TARGETS
=
"gfx908,gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3"
-
D
CMAKE_CXX_COMPILER
=
"${build_compiler()}"
..
&&
make
-
j
""" }"
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
...
...
@@ -689,8 +689,8 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
DBUILD_DEV
=
Off
-
DCMAKE_INSTALL_PREFIX
=..
/install -D
CMAKE_CXX_FLAGS="--offload-arch=gfx908
-O3 -Xclang -mlink-builtin-bitcode -Xclang /
opt
/rocm/
amdgcn
/bitcode/
oclc_abi_version_400
.
bc
" """
:
""" -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D
CMAKE_CXX_FLAGS="--offload-arch=gfx908
-O3 " """
}
"
execute_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
cd
..
/client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/
install
;
/opt/
rocm
" -D
CMAKE_CXX_FLAGS="
--
offload
-
arch
=
gfx908
-
O3
-
Xclang
-
mlink
-
builtin
-
bitcode
-
Xclang
/opt/
rocm
/amdgcn/
bitcode
/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../
client_example
&&
rm
-
rf
build
&&
mkdir
build
&&
cd
build
&&
cmake
-
D
CMAKE_PREFIX_PATH
=
"${env.WORKSPACE}/install;/opt/rocm"
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908
-O3"
-
D
CMAKE_CXX_COMPILER
=
"${build_compiler()}"
..
&&
make
-
j
""" }"
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
DBUILD_DEV
=
Off
-
DCMAKE_INSTALL_PREFIX
=..
/install -D
GPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="
-O3 -Xclang -mlink-builtin-bitcode -Xclang /
opt
/rocm/
amdgcn
/bitcode/
oclc_abi_version_400
.
bc
" """
:
""" -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -D
GPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="
-O3 " """
}
"
execute_args
=
"${params.COMPILER_VERSION == "
ck
-
9110
" ? """
cd
..
/client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/
install
;
/opt/
rocm
" -D
GPU_TARGETS="
gfx908
;
gfx90a
" -DCMAKE_CXX_FLAGS="
-
O3
-
Xclang
-
mlink
-
builtin
-
bitcode
-
Xclang
/opt/
rocm
/amdgcn/
bitcode
/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../
client_example
&&
rm
-
rf
build
&&
mkdir
build
&&
cd
build
&&
cmake
-
D
CMAKE_PREFIX_PATH
=
"${env.WORKSPACE}/install;/opt/rocm"
-
D
GPU_TARGETS
=
"gfx908;gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3"
-
D
CMAKE_CXX_COMPILER
=
"${build_compiler()}"
..
&&
make
-
j
""" }"
}
steps{
buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
...
...
@@ -712,7 +712,7 @@ pipeline {
options { retry(2) }
agent{ label rocmnode("gfx908 || gfx90a")}
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908
-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc"
-
DBUILD_DEV
=
On
""" : """
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx908
-O3 "
-
DBUILD_DEV
=
On
"""}"
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
D
GPU_TARGETS
=
"gfx908;gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc"
-
DBUILD_DEV
=
On
""" : """
-
D
GPU_TARGETS
=
"gfx908;gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3 "
-
DBUILD_DEV
=
On
"""}"
}
steps{
runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
...
...
@@ -727,7 +727,7 @@ pipeline {
options { retry(2) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx90a
-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc"
-
DBUILD_DEV
=
On
""" : """
-
D
CMAKE_CXX_FLAGS
=
" --offload-arch=gfx90a
-O3 "
-
DBUILD_DEV
=
On
"""}"
setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """
-
D
GPU_TARGETS
=
"gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc"
-
DBUILD_DEV
=
On
""" : """
-
D
GPU_TARGETS
=
"gfx90a"
-
DCMAKE_CXX_FLAGS
=
"
-O3 "
-
DBUILD_DEV
=
On
"""}"
}
steps
{
runPerfTest
(
setup_args:
setup_args
,
config_targets:
"ckProfiler"
,
no_reboot:
true
,
build_type:
'Release'
)
...
...
client_example/CMakeLists.txt
View file @
50b6b104
...
...
@@ -9,7 +9,7 @@ message(STATUS "Build with HIP ${hip_VERSION}")
# add all example subdir
file
(
GLOB dir_list LIST_DIRECTORIES true *
)
FOREACH
(
subdir
${
dir_list
}
)
IF
(
IS_DIRECTORY
"
${
subdir
}
"
)
IF
(
IS_DIRECTORY
"
${
subdir
}
"
AND
(
NOT
"
${
subdir
}
"
MATCHES
"build"
)
)
add_subdirectory
(
${
subdir
}
)
ENDIF
()
ENDFOREACH
()
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp
View file @
50b6b104
...
...
@@ -332,7 +332,10 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
block_2_etile_map_
{
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
e_grid_desc_m_n_
)},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
cde_element_op_
{
cde_element_op
},
MRaw_
{
MRaw
},
NRaw_
{
NRaw
},
KRaw_
{
KRaw
}
{
// populate pointer, desc for Ds
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
...
...
@@ -400,6 +403,11 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
// for checking vector load/store
index_t
MRaw_
;
index_t
NRaw_
;
index_t
KRaw_
;
};
// Invoker
...
...
@@ -486,6 +494,86 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return
false
;
}
// check vector load/store
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
// check vector load of A
if
constexpr
(
is_same_v
<
ALayout
,
Row
>
&&
ABlockTransferSrcVectorDim
==
2
)
{
if
(
arg
.
KRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
if
constexpr
(
is_same_v
<
ALayout
,
Col
>
&&
ABlockTransferSrcVectorDim
==
1
)
{
// FIXME: not rigorous
if
(
arg
.
MRaw_
%
ABlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
// check vector laod of B
if
constexpr
(
is_same_v
<
BLayout
,
Col
>
&&
BBlockTransferSrcVectorDim
==
2
)
{
if
(
arg
.
KRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
if
constexpr
(
is_same_v
<
BLayout
,
Row
>
&&
BBlockTransferSrcVectorDim
==
1
)
{
// FIXME: not rigorous
if
(
arg
.
NRaw_
%
BBlockTransferSrcScalarPerVector
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
// check vector load of Ds
// only support RowMajor for now
bool
all_valid
=
true
;
static_for
<
0
,
NumDTensor
,
1
>
{}([
&
](
auto
i
)
{
using
DLayout
=
remove_cvref_t
<
tuple_element_t
<
i
.
value
,
DsLayout
>>
;
if
constexpr
(
!
is_same_v
<
DLayout
,
Row
>
)
{
all_valid
=
false
;
}
});
if
(
!
all_valid
)
{
return
false
;
}
// check vector store of E
// only support RowMajor for now
if
constexpr
(
is_same_v
<
ELayout
,
Row
>
)
{
if
(
arg
.
NRaw_
%
CDEBlockTransferScalarPerVector_NPerBlock
!=
0
)
{
return
false
;
}
}
else
{
return
false
;
}
}
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_m_k_
,
arg
.
b_grid_desc_n_k_
,
arg
.
ds_grid_desc_m_n_
,
...
...
library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
View file @
50b6b104
This diff is collapsed.
Click to expand it.
profiler/src/profiler.cpp
View file @
50b6b104
...
...
@@ -3,27 +3,27 @@
#include <cstring>
//
int profile_gemm(int, char*[]);
//
int profile_gemm_splitk(int, char*[]);
//
int profile_gemm_bilinear(int, char*[]);
//
int profile_gemm_add_add_fastgelu(int, char*[]);
//
int profile_gemm_reduce(int, char*[]);
//
int profile_gemm_bias_add_reduce(int, char*[]);
//
int profile_batched_gemm(int, char*[]);
//
int profile_batched_gemm_gemm(int, char*[]);
//
int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
//
int profile_batched_gemm_reduce(int, char*[]);
//
int profile_grouped_gemm(int, char*[]);
//
int profile_conv_fwd(int, char*[]);
//
int profile_conv_fwd_bias_relu(int, char*[]);
//
int profile_conv_fwd_bias_relu_add(int, char*[]);
//
int profile_conv_bwd_data(int, char*[]);
//
int profile_conv_bwd_weight(int, char*[]);
//
int profile_grouped_conv_fwd(int, char*[]);
//
int profile_normalization(int, char*[]);
int
profile_gemm
(
int
,
char
*
[]);
int
profile_gemm_splitk
(
int
,
char
*
[]);
int
profile_gemm_bilinear
(
int
,
char
*
[]);
int
profile_gemm_add_add_fastgelu
(
int
,
char
*
[]);
int
profile_gemm_reduce
(
int
,
char
*
[]);
int
profile_gemm_bias_add_reduce
(
int
,
char
*
[]);
int
profile_batched_gemm
(
int
,
char
*
[]);
int
profile_batched_gemm_gemm
(
int
,
char
*
[]);
int
profile_batched_gemm_add_relu_gemm_add
(
int
,
char
*
[]);
int
profile_batched_gemm_reduce
(
int
,
char
*
[]);
int
profile_grouped_gemm
(
int
,
char
*
[]);
int
profile_conv_fwd
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu
(
int
,
char
*
[]);
int
profile_conv_fwd_bias_relu_add
(
int
,
char
*
[]);
int
profile_conv_bwd_data
(
int
,
char
*
[]);
int
profile_conv_bwd_weight
(
int
,
char
*
[]);
int
profile_grouped_conv_fwd
(
int
,
char
*
[]);
int
profile_normalization
(
int
,
char
*
[]);
int
profile_layernorm
(
int
,
char
*
[]);
int
profile_groupnorm
(
int
,
char
*
[]);
//
int profile_reduce(int, char*[]);
int
profile_reduce
(
int
,
char
*
[]);
static
void
print_helper_message
()
{
...
...
@@ -57,7 +57,6 @@ int main(int argc, char* argv[])
return
0
;
}
#if 0
else
if
(
strcmp
(
argv
[
1
],
"gemm"
)
==
0
)
{
return
profile_gemm
(
argc
,
argv
);
...
...
@@ -134,7 +133,6 @@ int main(int argc, char* argv[])
{
return
profile_normalization
(
argc
,
argv
);
}
#endif
else
if
(
strcmp
(
argv
[
1
],
"layernorm"
)
==
0
)
{
return
profile_layernorm
(
argc
,
argv
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment