Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
dc1c2bf8
Commit
dc1c2bf8
authored
Oct 20, 2024
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into letaoqin/update_layernorm
parents
5cfd751b
a285d6f9
Changes
27
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1084 additions
and
135 deletions
+1084
-135
CMakeLists.txt
CMakeLists.txt
+7
-0
Jenkinsfile
Jenkinsfile
+38
-11
codegen/CMakeLists.txt
codegen/CMakeLists.txt
+29
-31
codegen/test/CMakeLists.txt
codegen/test/CMakeLists.txt
+20
-18
codegen/test/include/common.hpp
codegen/test/include/common.hpp
+0
-0
codegen/test/rtc/CMakeLists.txt
codegen/test/rtc/CMakeLists.txt
+2
-0
codegen/test/rtc/include/rtc/compile_kernel.hpp
codegen/test/rtc/include/rtc/compile_kernel.hpp
+2
-2
codegen/test/rtc/include/rtc/filesystem.hpp
codegen/test/rtc/include/rtc/filesystem.hpp
+60
-0
codegen/test/rtc/include/rtc/tmp_dir.hpp
codegen/test/rtc/include/rtc/tmp_dir.hpp
+2
-2
codegen/test/rtc/src/compile_kernel.cpp
codegen/test/rtc/src/compile_kernel.cpp
+5
-5
codegen/test/rtc/src/tmp_dir.cpp
codegen/test/rtc/src/tmp_dir.cpp
+3
-3
example/44_elementwise_permute/CMakeLists.txt
example/44_elementwise_permute/CMakeLists.txt
+1
-0
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
...se_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
+247
-0
example/ck_tile/03_gemm/gemm_basic.cpp
example/ck_tile/03_gemm/gemm_basic.cpp
+5
-2
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+6
-0
include/ck/utility/data_type.hpp
include/ck/utility/data_type.hpp
+625
-30
include/ck/utility/math_v2.hpp
include/ck/utility/math_v2.hpp
+4
-0
include/ck_tile/core/config.hpp
include/ck_tile/core/config.hpp
+4
-1
include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
...a/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
+12
-15
include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
...eline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
+12
-15
No files found.
CMakeLists.txt
View file @
dc1c2bf8
...
@@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
...
@@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
add_compile_options
(
-fno-offload-uniform-block
)
add_compile_options
(
-fno-offload-uniform-block
)
endif
()
endif
()
endif
()
endif
()
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 500500000
)
check_cxx_compiler_flag
(
"-mllvm --lsr-drop-solution=1"
HAS_LSR_DROP_SOLUTION
)
if
(
HAS_LSR_DROP_SOLUTION
)
message
(
"Adding the lsr-drop-solution=1 compiler flag"
)
add_compile_options
(
"SHELL: -mllvm --lsr-drop-solution=1"
)
endif
()
endif
()
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 600140090
)
if
(
NOT WIN32 AND
${
hip_VERSION_FLAT
}
GREATER 600140090
)
check_cxx_compiler_flag
(
"-mllvm -enable-post-misched=0"
HAS_ENABLE_POST_MISCHED
)
check_cxx_compiler_flag
(
"-mllvm -enable-post-misched=0"
HAS_ENABLE_POST_MISCHED
)
if
(
HAS_ENABLE_POST_MISCHED
)
if
(
HAS_ENABLE_POST_MISCHED
)
...
...
Jenkinsfile
View file @
dc1c2bf8
...
@@ -735,11 +735,11 @@ def process_results(Map conf=[:]){
...
@@ -735,11 +735,11 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
CRON_SETTINGS
=
BRANCH_NAME
==
"develop"
?
'''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
;RUN_CODEGEN_TESTS=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_
CODEGEN_TESTS=false;RUN_
PERFORMANCE_TESTS=false;USE_SCCACHE=false
0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
0 13 * * * % BUILD_LEGACY_OS=true
'''
:
""
0 13 * * * % BUILD_LEGACY_OS=true'''
:
""
pipeline
{
pipeline
{
agent
none
agent
none
...
@@ -806,6 +806,10 @@ pipeline {
...
@@ -806,6 +806,10 @@ pipeline {
name:
"RUN_GROUPED_CONV_LARGE_CASES_TESTS"
,
name:
"RUN_GROUPED_CONV_LARGE_CASES_TESTS"
,
defaultValue:
false
,
defaultValue:
false
,
description:
"Run the grouped conv large cases tests (default: OFF)"
)
description:
"Run the grouped conv large cases tests (default: OFF)"
)
booleanParam
(
name:
"RUN_CODEGEN_TESTS"
,
defaultValue:
false
,
description:
"Run codegen tests (default: OFF)"
)
booleanParam
(
booleanParam
(
name:
"RUN_CK_TILE_FMHA_TESTS"
,
name:
"RUN_CK_TILE_FMHA_TESTS"
,
defaultValue:
false
,
defaultValue:
false
,
...
@@ -926,7 +930,30 @@ pipeline {
...
@@ -926,7 +930,30 @@ pipeline {
execute_args
=
""" ../script/cmake-ck-dev.sh ../ gfx90a && \
execute_args
=
""" ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
./bin/test_grouped_convnd_fwd_large_cases_xdl"""
./bin/test_grouped_convnd_fwd_large_cases_xdl"""
}
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
}
}
}
}
stage
(
"Run Codegen Tests"
)
{
parallel
{
stage
(
"Run Codegen Tests on gfx90a"
)
{
when
{
beforeAgent
true
expression
{
params
.
RUN_CODEGEN_TESTS
.
toBoolean
()
}
}
agent
{
label
rocmnode
(
"gfx90a"
)}
environment
{
setup_args
=
"NO_CK_BUILD"
execute_args
=
""" CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
make -j64 check"""
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
@@ -951,7 +978,7 @@ pipeline {
...
@@ -951,7 +978,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
@@ -970,7 +997,7 @@ pipeline {
...
@@ -970,7 +997,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
@@ -995,7 +1022,7 @@ pipeline {
...
@@ -995,7 +1022,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
@@ -1014,7 +1041,7 @@ pipeline {
...
@@ -1014,7 +1041,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \
make -j64 tile_example_gemm_basic && \
cd ../ &&
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
@@ -1040,7 +1067,7 @@ pipeline {
...
@@ -1040,7 +1067,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args
=
" "
execute_args
=
" "
}
}
steps
{
steps
{
Build_CK_and_Reboot
(
setup_args:
setup_args
,
config_targets:
" "
,
no_reboot:
true
,
build_type:
'Release'
,
docker_name:
docker_name
)
Build_CK_and_Reboot
(
setup_args:
setup_args
,
config_targets:
" "
,
no_reboot:
true
,
build_type:
'Release'
,
docker_name:
docker_name
)
cleanWs
()
cleanWs
()
...
@@ -1059,7 +1086,7 @@ pipeline {
...
@@ -1059,7 +1086,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args
=
" "
execute_args
=
" "
}
}
steps
{
steps
{
Build_CK_and_Reboot
(
setup_args:
setup_args
,
config_targets:
" "
,
no_reboot:
true
,
build_type:
'Release'
,
docker_name:
docker_name
)
Build_CK_and_Reboot
(
setup_args:
setup_args
,
config_targets:
" "
,
no_reboot:
true
,
build_type:
'Release'
,
docker_name:
docker_name
)
cleanWs
()
cleanWs
()
...
@@ -1140,7 +1167,7 @@ pipeline {
...
@@ -1140,7 +1167,7 @@ pipeline {
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
}
}
steps
{
steps
{
buildHipClangJobAndReboot
(
setup_cmd:
""
,
build_cmd:
""
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
buildHipClangJobAndReboot
(
setup_cmd:
""
,
build_cmd:
""
,
no_reboot:
true
,
build_type:
'Release'
,
execute_cmd:
execute_args
)
cleanWs
()
cleanWs
()
...
...
codegen/CMakeLists.txt
View file @
dc1c2bf8
cmake_minimum_required
(
VERSION 3.16
)
project
(
composable_kernel_host
)
set
(
CMAKE_EXPORT_COMPILE_COMMANDS ON
)
set
(
CMAKE_EXPORT_COMPILE_COMMANDS ON
)
set
(
CMAKE_LIBRARY_OUTPUT_DIRECTORY
${
CMAKE_BINARY_DIR
}
/lib
)
set
(
CMAKE_LIBRARY_OUTPUT_DIRECTORY
${
CMAKE_BINARY_DIR
}
/lib
)
...
@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
...
@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_BINARY_DIR
}
/bin
)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_BINARY_DIR
}
/bin
)
set
(
CK_ROOT
${
CMAKE_CURRENT_SOURCE_DIR
}
/..
)
set
(
CK_ROOT
${
CMAKE_CURRENT_SOURCE_DIR
}
/..
)
add_compile_options
(
-std=c++17
)
find_package
(
ROCM
)
f
in
d_package
(
hip
)
in
clude
(
ROCMInstallTargets
)
add_custom_target
(
codegen
)
include
(
ROCMTest
)
# add include directories
rocm_setup_version
(
VERSION 1.0
)
include_directories
(
BEFORE
${
PROJECT_BINARY_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/library/include
${
HIP_INCLUDE_DIRS
}
)
list
(
APPEND CMAKE_MODULE_PATH
${
CK_ROOT
}
/cmake
)
list
(
APPEND CMAKE_MODULE_PATH
${
CK_ROOT
}
/cmake
)
include
(
Embed
)
include
(
Embed
)
file
(
GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
file
(
GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${
CK_ROOT
}
/include/ck/*.hpp
)
${
CK_ROOT
}
/include/ck/*.hpp
)
#printouts fot debug purposes
#
printouts fot debug purposes
#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#message(STATUS "RELATIVE: ${CK_ROOT}/include")
#
message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library
(
ck_headers
${
KERNEL_FILES
}
RELATIVE
${
CK_ROOT
}
/include
)
add_embed_library
(
ck_headers
${
KERNEL_FILES
}
RELATIVE
${
CK_ROOT
}
/include
)
file
(
GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp
)
add_compile_options
(
-std=c++17
)
##message(STATUS "SOURCE_FILES: ${SOURCES}"
)
file
(
GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp
)
# TODO: Use object library
# TODO: Use object library
add_library
(
ck_host STATIC
${
SOURCES
}
)
add_library
(
ck_host STATIC
${
SOURCES
}
)
target_link_libraries
(
ck_host PRIVATE ck_headers
)
target_link_libraries
(
ck_host PRIVATE ck_headers
)
set_target_properties
(
ck_host PROPERTIES
set_target_properties
(
ck_host PROPERTIES
LINKER_LANGUAGE CXX
LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON
)
POSITION_INDEPENDENT_CODE ON
)
target_include_directories
(
ck_host PUBLIC
# target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:
${
CMAKE_CURRENT_SOURCE_DIR
}
/include>
# $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
# )
)
add_executable
(
ck-template-driver driver/main.cpp
)
add_executable
(
ck-template-driver driver/main.cpp
)
target_link_libraries
(
ck-template-driver ck_host
)
target_link_libraries
(
ck-template-driver ck_host
)
rocm_install
(
rocm_install
_targets
(
TARGETS ck_host ck_headers
TARGETS ck_host ck_headers
EXPORT ck_hostTargets
EXPORT ck_host_targets
INCLUDE include
PRIVATE
)
rocm_export_targets
(
EXPORT ck_host_targets
NAMESPACE composable_kernel::
)
)
rocm_install
(
EXPORT ck_hostTargets
FILE composable_kernelck_hostTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
rocm_install
(
DIRECTORY include/ck DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
)
if
(
BUILD_TESTING
)
if
(
BUILD_TESTING
)
add_subdirectory
(
test
)
add_subdirectory
(
test
)
endif
()
endif
()
codegen/test/CMakeLists.txt
View file @
dc1c2bf8
list
(
APPEND CMAKE_PREFIX_PATH /opt/rocm
)
list
(
APPEND CMAKE_PREFIX_PATH /opt/rocm
)
add_subdirectory
(
rtc
)
add_subdirectory
(
rtc
)
file
(
GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp
)
file
(
GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp
)
# do not build the tests when we build the library for various targets
if
(
NOT GPU_ARCHS
)
# TODO: These tests need to be refactored to remove dependency on main ck
foreach
(
TEST_SRC
${
TEST_SRCS
}
)
# headers and device compilation.
set_source_files_properties
(
${
TEST_SRC
}
PROPERTIES LANGUAGE HIP
)
set
(
TESTS_REQUIRE_DEVICE_COMPILE
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
grouped_conv_fwd_multiple_d_v1
add_executable
(
codegen_test_
${
BASE_NAME
}
${
TEST_SRC
}
)
grouped_conv_fwd_multiple_d_v2
if
(
CK_USE_ALTERNATIVE_PYTHON
)
grouped_conv_fwd_multiple_d_v3
target_link_options
(
codegen_test_
${
BASE_NAME
}
PRIVATE -lstdc++fs
)
grouped_conv_fwd_multiple_d_v4
endif
()
)
add_dependencies
(
codegen codegen_test_
${
BASE_NAME
}
)
find_package
(
hip
)
add_dependencies
(
tests codegen_test_
${
BASE_NAME
}
)
add_dependencies
(
check codegen_test_
${
BASE_NAME
}
)
foreach
(
TEST_SRC
${
TEST_SRCS
}
)
add_test
(
NAME codegen_test_
${
BASE_NAME
}
COMMAND codegen_test_
${
BASE_NAME
}
)
get_filename_component
(
BASE_NAME
${
TEST_SRC
}
NAME_WE
)
message
(
"adding test codegen_test_
${
BASE_NAME
}
"
)
rocm_add_test_executable
(
codegen_test_
${
BASE_NAME
}
${
TEST_SRC
}
)
target_link_libraries
(
codegen_test_
${
BASE_NAME
}
ck_rtc ck_host
)
target_link_libraries
(
codegen_test_
${
BASE_NAME
}
ck_rtc ck_host
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/codegen/test/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC include
)
if
(
BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE
)
target_link_libraries
(
codegen_test_
${
BASE_NAME
}
hip::device
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/library/include
)
target_include_directories
(
codegen_test_
${
BASE_NAME
}
PUBLIC
${
CK_ROOT
}
/library/include
)
endf
oreach
()
end
i
f
()
end
i
f
()
endf
oreach
()
codegen/test/common.hpp
→
codegen/test/
include/
common.hpp
View file @
dc1c2bf8
File moved
codegen/test/rtc/CMakeLists.txt
View file @
dc1c2bf8
find_package
(
hip
)
file
(
GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp
)
file
(
GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp
)
add_library
(
ck_rtc
${
RTC_SOURCES
}
)
add_library
(
ck_rtc
${
RTC_SOURCES
}
)
target_include_directories
(
ck_rtc PUBLIC include
)
target_include_directories
(
ck_rtc PUBLIC include
)
target_link_libraries
(
ck_rtc PUBLIC hip::host
)
target_link_libraries
(
ck_rtc PUBLIC hip::host
)
target_link_libraries
(
ck_rtc PUBLIC -lstdc++fs
)
codegen/test/rtc/include/rtc/compile_kernel.hpp
View file @
dc1c2bf8
...
@@ -2,14 +2,14 @@
...
@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <rtc/kernel.hpp>
#include <c
k
/filesystem.hpp>
#include <
rt
c/filesystem.hpp>
#include <string>
#include <string>
namespace
rtc
{
namespace
rtc
{
struct
src_file
struct
src_file
{
{
CK
::
fs
::
path
path
;
fs
::
path
path
;
std
::
string_view
content
;
std
::
string_view
content
;
};
};
...
...
codegen/test/rtc/include/rtc/filesystem.hpp
0 → 100644
View file @
dc1c2bf8
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace
rtc
{
#if RTC_HAS_FILESYSTEM
namespace
fs
=
::
std
::
filesystem
;
#elif RTC_HAS_FILESYSTEM_TS
namespace
fs
=
::
std
::
experimental
::
filesystem
;
#endif
}
// namespace rtc
#endif // GUARD_RTC_FILESYSTEM_HPP_
codegen/test/rtc/include/rtc/tmp_dir.hpp
View file @
dc1c2bf8
...
@@ -2,13 +2,13 @@
...
@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <string>
#include <c
k
/filesystem.hpp>
#include <
rt
c/filesystem.hpp>
namespace
rtc
{
namespace
rtc
{
struct
tmp_dir
struct
tmp_dir
{
{
CK
::
fs
::
path
path
;
fs
::
path
path
;
tmp_dir
(
const
std
::
string
&
prefix
=
""
);
tmp_dir
(
const
std
::
string
&
prefix
=
""
);
void
execute
(
const
std
::
string
&
cmd
)
const
;
void
execute
(
const
std
::
string
&
cmd
)
const
;
...
...
codegen/test/rtc/src/compile_kernel.cpp
View file @
dc1c2bf8
#include
"
rtc/hip.hpp
"
#include
<
rtc/hip.hpp
>
#include <rtc/compile_kernel.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
#include <stdexcept>
...
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
...
@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for
(
const
auto
&
src
:
srcs
)
for
(
const
auto
&
src
:
srcs
)
{
{
CK
::
fs
::
path
full_path
=
td
.
path
/
src
.
path
;
fs
::
path
full_path
=
td
.
path
/
src
.
path
;
CK
::
fs
::
path
parent_path
=
full_path
.
parent_path
();
fs
::
path
parent_path
=
full_path
.
parent_path
();
CK
::
fs
::
create_directories
(
parent_path
);
fs
::
create_directories
(
parent_path
);
write_string
(
full_path
.
string
(),
src
.
content
);
write_string
(
full_path
.
string
(),
src
.
content
);
if
(
src
.
path
.
extension
().
string
()
==
".cpp"
)
if
(
src
.
path
.
extension
().
string
()
==
".cpp"
)
{
{
...
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
...
@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td
.
execute
(
compiler
()
+
options
.
flags
);
td
.
execute
(
compiler
()
+
options
.
flags
);
auto
out_path
=
td
.
path
/
out
;
auto
out_path
=
td
.
path
/
out
;
if
(
not
CK
::
fs
::
exists
(
out_path
))
if
(
not
fs
::
exists
(
out_path
))
throw
std
::
runtime_error
(
"Output file missing: "
+
out
);
throw
std
::
runtime_error
(
"Output file missing: "
+
out
);
auto
obj
=
read_buffer
(
out_path
.
string
());
auto
obj
=
read_buffer
(
out_path
.
string
());
...
...
codegen/test/rtc/src/tmp_dir.cpp
View file @
dc1c2bf8
...
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
...
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
}
tmp_dir
::
tmp_dir
(
const
std
::
string
&
prefix
)
tmp_dir
::
tmp_dir
(
const
std
::
string
&
prefix
)
:
path
(
CK
::
fs
::
temp_directory_path
()
/
:
path
(
fs
::
temp_directory_path
()
/
unique_string
(
prefix
.
empty
()
?
"ck-rtc"
:
"ck-rtc-"
+
prefix
))
unique_string
(
prefix
.
empty
()
?
"ck-rtc"
:
"ck-rtc-"
+
prefix
))
{
{
CK
::
fs
::
create_directories
(
this
->
path
);
fs
::
create_directories
(
this
->
path
);
}
}
void
tmp_dir
::
execute
(
const
std
::
string
&
cmd
)
const
void
tmp_dir
::
execute
(
const
std
::
string
&
cmd
)
const
...
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
...
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std
::
system
(
s
.
c_str
());
std
::
system
(
s
.
c_str
());
}
}
tmp_dir
::~
tmp_dir
()
{
CK
::
fs
::
remove_all
(
this
->
path
);
}
tmp_dir
::~
tmp_dir
()
{
fs
::
remove_all
(
this
->
path
);
}
}
// namespace rtc
}
// namespace rtc
example/44_elementwise_permute/CMakeLists.txt
View file @
dc1c2bf8
...
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
...
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
add_example_executable
(
example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp
)
add_example_executable
(
example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp
)
add_example_executable
(
example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp
)
add_example_executable
(
example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp
)
add_example_executable
(
elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp
)
example/44_elementwise_permute/elementwise_scale_permute_amax_2D_fp16_fp8.cpp
0 → 100644
View file @
dc1c2bf8
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/utility/reduction_enums.hpp"
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
F8
=
ck
::
f8_t
;
using
InputDataType
=
F16
;
using
ScaleDataType
=
F32
;
using
OutputDataType
=
F8
;
static
constexpr
ck
::
index_t
NumDim
=
2
;
constexpr
ck
::
ReduceTensorOp
ReduceOpId
=
ck
::
ReduceTensorOp
::
MAX
;
constexpr
bool
PropagateNan
=
true
;
constexpr
bool
OutputIndex
=
false
;
using
ReduceOperation
=
typename
ck
::
reduce_binary_operator
<
ReduceOpId
>::
opType
;
struct
ScalePassThrough
{
ScalePassThrough
(
const
float
alpha
=
1.
f
)
:
alpha_
(
alpha
)
{}
__host__
__device__
constexpr
void
operator
()(
OutputDataType
&
y0
,
OutputDataType
&
y1
,
const
InputDataType
&
x0
)
const
{
y0
=
ck
::
type_convert
<
OutputDataType
>
(
ck
::
type_convert
<
ScaleDataType
>
(
x0
)
*
alpha_
);
y1
=
y0
;
}
const
ScaleDataType
alpha_
;
};
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
UnaryAbs
=
ck
::
tensor_operation
::
element_wise
::
UnaryAbs
;
using
DeviceElementwisePermuteInstance
=
ck
::
tensor_operation
::
device
::
DeviceElementwiseImpl
<
ck
::
Tuple
<
InputDataType
>
,
// InDataTypeTuple
ck
::
Tuple
<
OutputDataType
,
OutputDataType
>
,
// OutDataTypeTuple
ScalePassThrough
,
// Elementwise
NumDim
,
// NumDim
256
,
// BlockSize
128
,
// M0PerBlock
128
,
// M1PerBlock
8
,
// M0PerThread
8
,
// M1PerThread
ck
::
Sequence
<
1
,
0
>
,
// ThreadClusterArrangeOrder
ck
::
Sequence
<
8
>
,
// InScalarPerVectorSeq
ck
::
Sequence
<
8
,
1
>>
;
// OutScalarPerVectorSeq
using
DeviceReduceInstance
=
ck
::
tensor_operation
::
device
::
DeviceReduceMultiBlock
<
OutputDataType
,
OutputDataType
,
OutputDataType
,
NumDim
,
NumDim
,
ReduceOperation
,
UnaryAbs
,
PassThrough
,
ck
::
InMemoryDataOperationEnum
::
Set
,
PropagateNan
,
OutputIndex
,
false
,
// HaveIndexInputIfOutputIndex
1024
,
// BlockSize
1
,
// MThreadClusterSize
1024
,
// KThreadClusterSize
1
,
// MThreadSliceSize
16
,
// KThreadSliceSize
1
,
// InSrcVectorDim
16
,
// InSrceVectorSize
1
>
;
// OutDstVectorSize
void
reference_scale_permute_amax
(
Tensor
<
InputDataType
>&
input
,
Tensor
<
OutputDataType
>&
host_output_scaled_casted_transposed
,
Tensor
<
OutputDataType
>&
host_output_scaled_casted
,
Tensor
<
OutputDataType
>&
host_output_amax
,
const
float
scale
)
{
ScalePassThrough
out_element_op
(
scale
);
const
ck
::
index_t
M
=
input
.
GetLengths
()[
0
];
const
ck
::
index_t
K
=
input
.
GetLengths
()[
1
];
for
(
ck
::
index_t
m
=
0
;
m
<
M
;
m
++
)
{
for
(
ck
::
index_t
k
=
0
;
k
<
K
;
k
++
)
{
OutputDataType
y0
,
y1
;
out_element_op
(
y0
,
y1
,
input
(
m
,
k
));
host_output_scaled_casted
(
m
,
k
)
=
y0
;
host_output_scaled_casted_transposed
(
m
,
k
)
=
y1
;
const
OutputDataType
y_fabs
=
ck
::
type_convert
<
OutputDataType
>
(
ck
::
math
::
abs
(
ck
::
type_convert
<
float
>
(
y0
)));
host_output_amax
(
0
)
=
ck
::
math
::
max
(
y_fabs
,
host_output_amax
(
0
));
}
}
}
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
true
;
bool
time_kernel
=
true
;
const
float
scale
=
2.
f
;
ck
::
index_t
M
=
1024
;
ck
::
index_t
K
=
1024
;
if
(
argc
==
3
)
{
M
=
std
::
stoi
(
argv
[
1
]);
K
=
std
::
stoi
(
argv
[
2
]);
}
std
::
array
<
ck
::
index_t
,
2
>
dims
=
{
M
,
K
};
std
::
array
<
ck
::
index_t
,
2
>
in_strides
=
{
K
,
1
};
std
::
array
<
ck
::
index_t
,
2
>
out_strides
=
{
1
,
M
};
Tensor
<
InputDataType
>
input
(
dims
,
in_strides
);
Tensor
<
OutputDataType
>
output_scaled_casted_transposed
(
dims
,
out_strides
);
Tensor
<
OutputDataType
>
output_scaled_casted
(
dims
,
in_strides
);
Tensor
<
OutputDataType
>
output_amax
({
1
});
input
.
GenerateTensorValue
(
GeneratorTensor_3
<
InputDataType
>
{
0.0
,
1.0
});
DeviceMem
input_dev_buf
(
sizeof
(
InputDataType
)
*
input
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
output_scaled_casted_transposed_dev_buf
(
sizeof
(
OutputDataType
)
*
output_scaled_casted_transposed
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
output_scaled_casted_dev_buf
(
sizeof
(
OutputDataType
)
*
output_scaled_casted
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
output_amax_dev_buf
(
sizeof
(
OutputDataType
)
*
output_amax
.
mDesc
.
GetElementSpaceSize
());
input_dev_buf
.
ToDevice
(
input
.
mData
.
data
());
std
::
array
<
const
void
*
,
1
>
inputs
=
{
input_dev_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
2
>
outputs
=
{
output_scaled_casted_transposed_dev_buf
.
GetDeviceBuffer
(),
output_scaled_casted_dev_buf
.
GetDeviceBuffer
()};
std
::
cout
<<
"Input: "
<<
input
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"Scale: "
<<
scale
<<
std
::
endl
;
std
::
cout
<<
"Output scaled casted transposed: "
<<
output_scaled_casted_transposed
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"Output scaled casted: "
<<
output_scaled_casted
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"Output amax: "
<<
output_amax
.
mDesc
<<
std
::
endl
;
auto
launch_transpose_scale
=
[
&
]()
{
auto
transposeScale
=
DeviceElementwisePermuteInstance
{};
auto
argument
=
transposeScale
.
MakeArgumentPointer
(
dims
,
{
in_strides
},
{
out_strides
,
in_strides
},
inputs
,
outputs
,
ScalePassThrough
{
scale
});
if
(
!
transposeScale
.
IsSupportedArgument
(
argument
.
get
()))
{
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the device instance, exiting!"
);
};
auto
transposeScale_invoker_ptr
=
transposeScale
.
MakeInvokerPointer
();
return
transposeScale_invoker_ptr
->
Run
(
argument
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
};
auto
launch_reduce
=
[
&
]()
{
auto
reduce
=
DeviceReduceInstance
{};
auto
reduce_argument_ptr
=
reduce
.
MakeArgumentPointer
(
dims
,
in_strides
,
{
1
},
// Output Lengths
{
1
},
// Output Strides
{
0
,
1
},
// Reduce Dims
static_cast
<
double
>
(
1.
f
),
static_cast
<
double
>
(
0.
f
),
output_scaled_casted_dev_buf
.
GetDeviceBuffer
(),
nullptr
,
output_amax_dev_buf
.
GetDeviceBuffer
(),
nullptr
,
UnaryAbs
{},
PassThrough
{});
if
(
!
reduce
.
IsSupportedArgument
(
reduce_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"The runtime parameters seems not supported by the device instance, exiting!"
);
};
auto
invoker_ptr
=
reduce
.
MakeInvokerPointer
();
return
invoker_ptr
->
Run
(
reduce_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
};
float
ave_time
=
launch_transpose_scale
();
ave_time
+=
launch_reduce
();
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
Tensor
<
OutputDataType
>
host_output_scaled_casted_transposed
(
dims
,
out_strides
);
Tensor
<
OutputDataType
>
host_output_scaled_casted
(
dims
,
in_strides
);
Tensor
<
OutputDataType
>
host_output_amax
({
1
});
reference_scale_permute_amax
(
input
,
host_output_scaled_casted_transposed
,
host_output_scaled_casted
,
host_output_amax
,
scale
);
output_scaled_casted_transposed_dev_buf
.
FromDevice
(
output_scaled_casted_transposed
.
mData
.
data
());
output_scaled_casted_dev_buf
.
FromDevice
(
output_scaled_casted
.
mData
.
data
());
output_amax_dev_buf
.
FromDevice
(
output_amax
.
mData
.
data
());
pass
&=
ck
::
utils
::
check_err
(
output_scaled_casted_transposed
.
mData
,
host_output_scaled_casted_transposed
.
mData
,
"Error: Incorrect results scaled transposed"
,
1e-3
,
1e-3
);
pass
&=
ck
::
utils
::
check_err
(
output_scaled_casted
.
mData
,
host_output_scaled_casted
.
mData
,
"Error: Incorrect results scaled"
,
1e-3
,
1e-3
);
pass
&=
ck
::
utils
::
check_err
(
output_amax
.
mData
,
host_output_amax
.
mData
,
"Error: Incorrect results amax"
,
1e-3
,
1e-3
);
}
return
pass
?
0
:
1
;
}
example/ck_tile/03_gemm/gemm_basic.cpp
View file @
dc1c2bf8
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
...
@@ -282,7 +281,11 @@ int main(int argc, char* argv[])
...
@@ -282,7 +281,11 @@ int main(int argc, char* argv[])
using
CodegenPipelineProblem
=
ck_tile
::
using
CodegenPipelineProblem
=
ck_tile
::
GemmPipelineProblem
<
ADataType
,
BDataType
,
AccDataType
,
CodegenGemmShape
,
CodegenGemmTraits
>
;
GemmPipelineProblem
<
ADataType
,
BDataType
,
AccDataType
,
CodegenGemmShape
,
CodegenGemmTraits
>
;
using
CodegenGemmPipeline
=
ck_tile
::
GemmPipelineAGmemBGmemCRegV1
<
CodegenPipelineProblem
>
;
using
CodegenGemmPolicy
=
ck_tile
::
UniversalGemmPipelineAgBgCrPolicy
<
matrix_a_layout
,
matrix_b_layout
,
matrix_c_layout
>
;
using
CodegenGemmPipeline
=
ck_tile
::
GemmPipelineAGmemBGmemCRegV1
<
CodegenPipelineProblem
,
CodegenGemmPolicy
>
;
invoke_gemm
<
ck_tile
::
half_t
,
invoke_gemm
<
ck_tile
::
half_t
,
matrix_a_layout
,
matrix_a_layout
,
...
...
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
View file @
dc1c2bf8
...
@@ -419,6 +419,12 @@ struct UnaryAbs
...
@@ -419,6 +419,12 @@ struct UnaryAbs
y
=
ck
::
math
::
abs
(
x
);
y
=
ck
::
math
::
abs
(
x
);
};
};
template
<
>
__host__
__device__
void
operator
()(
f8_t
&
y
,
const
f8_t
&
x
)
const
{
y
=
ck
::
type_convert
<
f8_t
>
(
ck
::
math
::
abs
(
ck
::
type_convert
<
float
>
(
x
)));
};
};
};
struct
UnarySqrt
struct
UnarySqrt
...
...
include/ck/utility/data_type.hpp
View file @
dc1c2bf8
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -13,8 +13,24 @@ using int4_t = _BitInt(4);
...
@@ -13,8 +13,24 @@ using int4_t = _BitInt(4);
using
f8_t
=
_BitInt
(
8
);
using
f8_t
=
_BitInt
(
8
);
using
bf8_t
=
unsigned
_BitInt
(
8
);
using
bf8_t
=
unsigned
_BitInt
(
8
);
inline
constexpr
auto
next_pow2
(
uint32_t
x
)
{
// Precondition: x > 1.
return
x
>
1u
?
(
1u
<<
(
32u
-
__builtin_clz
(
x
-
1u
)))
:
x
;
}
// native types: double, float, _Float16, ushort, int32_t, int8_t, uint8_t, f8_t, bf8_t, bool
template
<
typename
T
>
inline
constexpr
bool
is_native_type
()
{
return
is_same
<
T
,
double
>::
value
||
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
half_t
>::
value
||
is_same
<
T
,
bhalf_t
>::
value
||
is_same
<
T
,
int32_t
>::
value
||
is_same
<
T
,
int8_t
>::
value
||
is_same
<
T
,
uint8_t
>::
value
||
is_same
<
T
,
f8_t
>::
value
||
is_same
<
T
,
bf8_t
>::
value
||
is_same
<
T
,
bool
>::
value
;
}
// vector_type
// vector_type
template
<
typename
T
,
index_t
N
>
template
<
typename
T
,
index_t
N
,
typename
Enable
=
void
>
struct
vector_type
;
struct
vector_type
;
// Caution: DO NOT REMOVE
// Caution: DO NOT REMOVE
...
@@ -171,7 +187,7 @@ struct scalar_type<bool>
...
@@ -171,7 +187,7 @@ struct scalar_type<bool>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
1
>
struct
vector_type
<
T
,
1
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
using
type
=
d1_t
;
using
type
=
d1_t
;
...
@@ -189,7 +205,8 @@ struct vector_type<T, 1>
...
@@ -189,7 +205,8 @@ struct vector_type<T, 1>
template
<
typename
X
>
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"wrong!"
);
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"Something went wrong, please check src and dst types."
);
return
data_
.
d1x1_
;
return
data_
.
d1x1_
;
}
}
...
@@ -197,7 +214,8 @@ struct vector_type<T, 1>
...
@@ -197,7 +214,8 @@ struct vector_type<T, 1>
template
<
typename
X
>
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
__host__
__device__
constexpr
auto
&
AsType
()
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"wrong!"
);
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"Something went wrong, please check src and dst types."
);
return
data_
.
d1x1_
;
return
data_
.
d1x1_
;
}
}
...
@@ -205,7 +223,7 @@ struct vector_type<T, 1>
...
@@ -205,7 +223,7 @@ struct vector_type<T, 1>
__device__
int
static
err
=
0
;
__device__
int
static
err
=
0
;
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
2
>
struct
vector_type
<
T
,
2
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -226,7 +244,8 @@ struct vector_type<T, 2>
...
@@ -226,7 +244,8 @@ struct vector_type<T, 2>
template
<
typename
X
>
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"wrong!"
);
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -245,7 +264,8 @@ struct vector_type<T, 2>
...
@@ -245,7 +264,8 @@ struct vector_type<T, 2>
template
<
typename
X
>
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
__host__
__device__
constexpr
auto
&
AsType
()
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"wrong!"
);
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -263,7 +283,7 @@ struct vector_type<T, 2>
...
@@ -263,7 +283,7 @@ struct vector_type<T, 2>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
4
>
struct
vector_type
<
T
,
4
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -287,7 +307,7 @@ struct vector_type<T, 4>
...
@@ -287,7 +307,7 @@ struct vector_type<T, 4>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -311,7 +331,7 @@ struct vector_type<T, 4>
...
@@ -311,7 +331,7 @@ struct vector_type<T, 4>
__host__
__device__
constexpr
auto
&
AsType
()
__host__
__device__
constexpr
auto
&
AsType
()
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -333,7 +353,7 @@ struct vector_type<T, 4>
...
@@ -333,7 +353,7 @@ struct vector_type<T, 4>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
8
>
struct
vector_type
<
T
,
8
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -360,7 +380,7 @@ struct vector_type<T, 8>
...
@@ -360,7 +380,7 @@ struct vector_type<T, 8>
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -389,7 +409,7 @@ struct vector_type<T, 8>
...
@@ -389,7 +409,7 @@ struct vector_type<T, 8>
{
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -415,7 +435,7 @@ struct vector_type<T, 8>
...
@@ -415,7 +435,7 @@ struct vector_type<T, 8>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
16
>
struct
vector_type
<
T
,
16
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -445,7 +465,7 @@ struct vector_type<T, 16>
...
@@ -445,7 +465,7 @@ struct vector_type<T, 16>
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
,
is_same
<
X
,
d16_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -479,7 +499,7 @@ struct vector_type<T, 16>
...
@@ -479,7 +499,7 @@ struct vector_type<T, 16>
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
,
is_same
<
X
,
d16_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -509,7 +529,7 @@ struct vector_type<T, 16>
...
@@ -509,7 +529,7 @@ struct vector_type<T, 16>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
32
>
struct
vector_type
<
T
,
32
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -541,7 +561,7 @@ struct vector_type<T, 32>
...
@@ -541,7 +561,7 @@ struct vector_type<T, 32>
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -579,7 +599,7 @@ struct vector_type<T, 32>
...
@@ -579,7 +599,7 @@ struct vector_type<T, 32>
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -613,7 +633,7 @@ struct vector_type<T, 32>
...
@@ -613,7 +633,7 @@ struct vector_type<T, 32>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
64
>
struct
vector_type
<
T
,
64
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -648,7 +668,7 @@ struct vector_type<T, 64>
...
@@ -648,7 +668,7 @@ struct vector_type<T, 64>
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -691,7 +711,7 @@ struct vector_type<T, 64>
...
@@ -691,7 +711,7 @@ struct vector_type<T, 64>
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -729,7 +749,7 @@ struct vector_type<T, 64>
...
@@ -729,7 +749,7 @@ struct vector_type<T, 64>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
128
>
struct
vector_type
<
T
,
128
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -766,7 +786,7 @@ struct vector_type<T, 128>
...
@@ -766,7 +786,7 @@ struct vector_type<T, 128>
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -813,7 +833,7 @@ struct vector_type<T, 128>
...
@@ -813,7 +833,7 @@ struct vector_type<T, 128>
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -855,7 +875,7 @@ struct vector_type<T, 128>
...
@@ -855,7 +875,7 @@ struct vector_type<T, 128>
};
};
template
<
typename
T
>
template
<
typename
T
>
struct
vector_type
<
T
,
256
>
struct
vector_type
<
T
,
256
,
typename
std
::
enable_if_t
<
is_native_type
<
T
>
()
>
>
{
{
using
d1_t
=
T
;
using
d1_t
=
T
;
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
typedef
T
d2_t
__attribute__
((
ext_vector_type
(
2
)));
...
@@ -894,7 +914,7 @@ struct vector_type<T, 256>
...
@@ -894,7 +914,7 @@ struct vector_type<T, 256>
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
||
is_same
<
X
,
d256_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
||
is_same
<
X
,
d256_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -945,7 +965,7 @@ struct vector_type<T, 256>
...
@@ -945,7 +965,7 @@ struct vector_type<T, 256>
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
||
is_same
<
X
,
d256_t
>::
value
,
is_same
<
X
,
d64_t
>::
value
||
is_same
<
X
,
d128_t
>::
value
||
is_same
<
X
,
d256_t
>::
value
,
"
wrong!
"
);
"
Something went wrong, please check src and dst types.
"
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
{
...
@@ -990,6 +1010,581 @@ struct vector_type<T, 256>
...
@@ -990,6 +1010,581 @@ struct vector_type<T, 256>
}
}
};
};
template
<
typename
T
,
index_t
N
>
struct
non_native_vector_base
{
using
type
=
non_native_vector_base
<
T
,
N
>
;
__host__
__device__
non_native_vector_base
()
=
default
;
__host__
__device__
non_native_vector_base
(
const
type
&
)
=
default
;
__host__
__device__
non_native_vector_base
(
type
&&
)
=
default
;
__host__
__device__
~
non_native_vector_base
()
=
default
;
T
d
[
N
];
};
// non-native vector_type implementation
template
<
typename
T
>
struct
vector_type
<
T
,
1
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
type
=
d1_t
;
union
alignas
(
next_pow2
(
1
*
sizeof
(
T
)))
{
d1_t
d1_
;
StaticallyIndexedArray
<
d1_t
,
1
>
d1x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"Something went wrong, please check src and dst types."
);
return
data_
.
d1x1_
;
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
,
"Something went wrong, please check src and dst types."
);
return
data_
.
d1x1_
;
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
2
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
type
=
d2_t
;
union
alignas
(
next_pow2
(
2
*
sizeof
(
T
)))
{
d2_t
d2_
;
StaticallyIndexedArray
<
d1_t
,
2
>
d1x2_
;
StaticallyIndexedArray
<
d2_t
,
1
>
d2x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x1_
;
}
else
{
return
err
;
}
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
4
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
d4_t
=
non_native_vector_base
<
T
,
4
>
;
using
type
=
d4_t
;
union
alignas
(
next_pow2
(
4
*
sizeof
(
T
)))
{
d4_t
d4_
;
StaticallyIndexedArray
<
d1_t
,
4
>
d1x4_
;
StaticallyIndexedArray
<
d2_t
,
2
>
d2x2_
;
StaticallyIndexedArray
<
d4_t
,
1
>
d4x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x1_
;
}
else
{
return
err
;
}
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
8
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
d4_t
=
non_native_vector_base
<
T
,
4
>
;
using
d8_t
=
non_native_vector_base
<
T
,
8
>
;
using
type
=
d8_t
;
union
alignas
(
next_pow2
(
8
*
sizeof
(
T
)))
{
d8_t
d8_
;
StaticallyIndexedArray
<
d1_t
,
8
>
d1x8_
;
StaticallyIndexedArray
<
d2_t
,
4
>
d2x4_
;
StaticallyIndexedArray
<
d4_t
,
2
>
d4x2_
;
StaticallyIndexedArray
<
d8_t
,
1
>
d8x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x1_
;
}
else
{
return
err
;
}
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
16
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
d4_t
=
non_native_vector_base
<
T
,
4
>
;
using
d8_t
=
non_native_vector_base
<
T
,
8
>
;
using
d16_t
=
non_native_vector_base
<
T
,
16
>
;
using
type
=
d16_t
;
union
alignas
(
next_pow2
(
16
*
sizeof
(
T
)))
{
d16_t
d16_
;
StaticallyIndexedArray
<
d1_t
,
16
>
d1x16_
;
StaticallyIndexedArray
<
d2_t
,
8
>
d2x8_
;
StaticallyIndexedArray
<
d4_t
,
4
>
d4x4_
;
StaticallyIndexedArray
<
d8_t
,
2
>
d8x2_
;
StaticallyIndexedArray
<
d16_t
,
1
>
d16x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x1_
;
}
else
{
return
err
;
}
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
32
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
d4_t
=
non_native_vector_base
<
T
,
4
>
;
using
d8_t
=
non_native_vector_base
<
T
,
8
>
;
using
d16_t
=
non_native_vector_base
<
T
,
16
>
;
using
d32_t
=
non_native_vector_base
<
T
,
32
>
;
using
type
=
d32_t
;
union
alignas
(
next_pow2
(
32
*
sizeof
(
T
)))
{
d32_t
d32_
;
StaticallyIndexedArray
<
d1_t
,
32
>
d1x32_
;
StaticallyIndexedArray
<
d2_t
,
16
>
d2x16_
;
StaticallyIndexedArray
<
d4_t
,
8
>
d4x8_
;
StaticallyIndexedArray
<
d8_t
,
4
>
d8x4_
;
StaticallyIndexedArray
<
d16_t
,
2
>
d16x2_
;
StaticallyIndexedArray
<
d32_t
,
1
>
d32x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x32_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d32_t
>::
value
)
{
return
data_
.
d32x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x32_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d32_t
>::
value
)
{
return
data_
.
d32x1_
;
}
else
{
return
err
;
}
}
};
template
<
typename
T
>
struct
vector_type
<
T
,
64
,
typename
std
::
enable_if_t
<!
is_native_type
<
T
>
()
>>
{
using
d1_t
=
T
;
using
d2_t
=
non_native_vector_base
<
T
,
2
>
;
using
d4_t
=
non_native_vector_base
<
T
,
4
>
;
using
d8_t
=
non_native_vector_base
<
T
,
8
>
;
using
d16_t
=
non_native_vector_base
<
T
,
16
>
;
using
d32_t
=
non_native_vector_base
<
T
,
32
>
;
using
d64_t
=
non_native_vector_base
<
T
,
64
>
;
using
type
=
d64_t
;
union
alignas
(
next_pow2
(
64
*
sizeof
(
T
)))
{
d64_t
d64_
;
StaticallyIndexedArray
<
d1_t
,
64
>
d1x64_
;
StaticallyIndexedArray
<
d2_t
,
32
>
d2x32_
;
StaticallyIndexedArray
<
d4_t
,
16
>
d4x16_
;
StaticallyIndexedArray
<
d8_t
,
8
>
d8x8_
;
StaticallyIndexedArray
<
d16_t
,
4
>
d16x4_
;
StaticallyIndexedArray
<
d32_t
,
2
>
d32x2_
;
StaticallyIndexedArray
<
d64_t
,
1
>
d64x1_
;
}
data_
;
__host__
__device__
constexpr
vector_type
()
:
data_
{
type
{}}
{}
__host__
__device__
constexpr
vector_type
(
type
v
)
:
data_
{
v
}
{}
template
<
typename
X
>
__host__
__device__
constexpr
const
auto
&
AsType
()
const
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x64_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x32_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d32_t
>::
value
)
{
return
data_
.
d32x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d64_t
>::
value
)
{
return
data_
.
d64x1_
;
}
else
{
return
err
;
}
}
template
<
typename
X
>
__host__
__device__
constexpr
auto
&
AsType
()
{
static_assert
(
is_same
<
X
,
d1_t
>::
value
||
is_same
<
X
,
d2_t
>::
value
||
is_same
<
X
,
d4_t
>::
value
||
is_same
<
X
,
d8_t
>::
value
||
is_same
<
X
,
d16_t
>::
value
||
is_same
<
X
,
d32_t
>::
value
||
is_same
<
X
,
d64_t
>::
value
,
"Something went wrong, please check src and dst types."
);
if
constexpr
(
is_same
<
X
,
d1_t
>::
value
)
{
return
data_
.
d1x64_
;
}
else
if
constexpr
(
is_same
<
X
,
d2_t
>::
value
)
{
return
data_
.
d2x32_
;
}
else
if
constexpr
(
is_same
<
X
,
d4_t
>::
value
)
{
return
data_
.
d4x16_
;
}
else
if
constexpr
(
is_same
<
X
,
d8_t
>::
value
)
{
return
data_
.
d8x8_
;
}
else
if
constexpr
(
is_same
<
X
,
d16_t
>::
value
)
{
return
data_
.
d16x4_
;
}
else
if
constexpr
(
is_same
<
X
,
d32_t
>::
value
)
{
return
data_
.
d32x2_
;
}
else
if
constexpr
(
is_same
<
X
,
d64_t
>::
value
)
{
return
data_
.
d64x1_
;
}
else
{
return
err
;
}
}
};
using
int64_t
=
long
;
using
int64_t
=
long
;
// fp64
// fp64
...
@@ -1051,8 +1646,8 @@ using bf8x8_t = typename vector_type<bf8_t, 8>::type;
...
@@ -1051,8 +1646,8 @@ using bf8x8_t = typename vector_type<bf8_t, 8>::type;
using
bf8x16_t
=
typename
vector_type
<
bf8_t
,
16
>::
type
;
using
bf8x16_t
=
typename
vector_type
<
bf8_t
,
16
>::
type
;
using
bf8x32_t
=
typename
vector_type
<
bf8_t
,
32
>::
type
;
using
bf8x32_t
=
typename
vector_type
<
bf8_t
,
32
>::
type
;
using
bf8x64_t
=
typename
vector_type
<
bf8_t
,
64
>::
type
;
using
bf8x64_t
=
typename
vector_type
<
bf8_t
,
64
>::
type
;
// u8
// u8
// i8
using
uint8x2_t
=
typename
vector_type
<
uint8_t
,
2
>::
type
;
using
uint8x2_t
=
typename
vector_type
<
uint8_t
,
2
>::
type
;
using
uint8x4_t
=
typename
vector_type
<
uint8_t
,
4
>::
type
;
using
uint8x4_t
=
typename
vector_type
<
uint8_t
,
4
>::
type
;
using
uint8x8_t
=
typename
vector_type
<
uint8_t
,
8
>::
type
;
using
uint8x8_t
=
typename
vector_type
<
uint8_t
,
8
>::
type
;
...
...
include/ck/utility/math_v2.hpp
View file @
dc1c2bf8
...
@@ -80,6 +80,8 @@ static inline __host__ bool isnan(half_t x)
...
@@ -80,6 +80,8 @@ static inline __host__ bool isnan(half_t x)
return
(
xx
&
0x7FFF
)
>
0x7C00
;
return
(
xx
&
0x7FFF
)
>
0x7C00
;
};
};
static
inline
__host__
bool
isnan
(
f8_t
x
)
{
return
(
x
&
0x80
);
};
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
static
inline
__host__
bool
isnan
(
int4_t
x
)
static
inline
__host__
bool
isnan
(
int4_t
x
)
{
{
...
@@ -529,6 +531,8 @@ static inline __device__ bool isnan(half_t x)
...
@@ -529,6 +531,8 @@ static inline __device__ bool isnan(half_t x)
return
(
xx
&
0x7FFF
)
>
0x7C00
;
return
(
xx
&
0x7FFF
)
>
0x7C00
;
};
};
static
inline
__device__
bool
isnan
(
f8_t
x
)
{
return
(
x
&
0x80
);
};
static
inline
__device__
half_t
sqrt
(
half_t
x
)
static
inline
__device__
half_t
sqrt
(
half_t
x
)
{
{
return
static_cast
<
half_t
>
(
__builtin_amdgcn_sqrtf
(
static_cast
<
float
>
(
x
)));
return
static_cast
<
half_t
>
(
__builtin_amdgcn_sqrtf
(
static_cast
<
float
>
(
x
)));
...
...
include/ck_tile/core/config.hpp
View file @
dc1c2bf8
...
@@ -159,8 +159,11 @@
...
@@ -159,8 +159,11 @@
#endif
#endif
#endif
#endif
// workaround for ROCm 6.2 and later
#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133
#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
#else
#else
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp
View file @
dc1c2bf8
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
k_lds_ptr
,
Policy
::
template
MakeKLdsWriteBlockDescriptor
<
Problem
>());
k_lds_ptr
,
Policy
::
template
MakeKLdsWriteBlockDescriptor
<
Problem
>());
auto
k_lds_write_window
=
auto
k_lds_write_window
=
make_tile_window
(
k_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
make_tile_window
(
k_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
k_lds_read_window
=
auto
k_lds_read_window
=
make_tile_window
(
k_lds_write_window
.
get_bottom_tensor_view
(),
make_tile_window
(
k_lds_write_window
.
get_bottom_tensor_view
(),
make_tuple
(
number
<
kN0
>
{},
number
<
kK0
>
{}),
make_tuple
(
number
<
kN0
>
{},
number
<
kK0
>
{}),
k_lds_write_window
.
get_window_origin
(),
k_lds_write_window
.
get_window_origin
(),
Policy
::
template
MakeKReg
Slice
BlockDescriptor
<
Problem
>());
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
auto
k_reg_tensor
=
make_static_distributed_tensor
<
KDataType
>
(
auto
k_reg_tensor
=
make_static_distributed_tensor
<
KDataType
>
(
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
...
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
v_lds_ptr
,
Policy
::
template
MakeVLdsWriteBlockDescriptor
<
Problem
>());
v_lds_ptr
,
Policy
::
template
MakeVLdsWriteBlockDescriptor
<
Problem
>());
auto
v_lds_write_window
=
auto
v_lds_write_window
=
make_tile_window
(
v_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
make_tile_window
(
v_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
v_lds_read_window
=
auto
v_lds_read_window
=
make_tile_window
(
v_lds_write_window
.
get_bottom_tensor_view
(),
make_tile_window
(
v_lds_write_window
.
get_bottom_tensor_view
(),
make_tuple
(
number
<
kN0
>
{},
number
<
kK2
>
{}),
make_tuple
(
number
<
kN0
>
{},
number
<
kK2
>
{}),
v_lds_write_window
.
get_window_origin
(),
v_lds_write_window
.
get_window_origin
(),
Policy
::
template
MakeVRegSliceBlockDescriptor
<
Problem
>());
Policy
::
template
MakeVRegBlockDescriptor
<
Problem
>());
auto
v_reg_tensor
=
make_static_distributed_tensor
<
VDataType
>
(
Policy
::
template
MakeVRegBlockDescriptor
<
Problem
>());
//------------------------------------------------------------------
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
// KT, Reg ->LDS ->Reg
...
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
kt_lds_ptr
,
Policy
::
template
MakeShuffledKLdsWriteBlockDescriptor
<
Problem
>());
kt_lds_ptr
,
Policy
::
template
MakeShuffledKLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_k_lds_write_window
=
make_tile_window
(
auto
shuffled_k_lds_write_window
=
make_tile_window
(
shuffled_k_lds_write
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
shuffled_k_lds_write
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
kt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
kt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
kt_lds_ptr
,
Policy
::
template
MakeKTLdsReadBlockDescriptor
<
Problem
>());
kt_lds_ptr
,
Policy
::
template
MakeKTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
block_sync_lds
();
block_sync_lds
();
v_reg_tensor
=
load_tile
(
v_lds_read_window
);
auto
v_reg_tensor
=
load_tile
(
v_lds_read_window
);
block_sync_lds
();
block_sync_lds
();
//---------------------------- Loop Load in ----------------------------//
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
// Q: HBM ->Reg ->LDS
...
@@ -276,7 +273,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -276,7 +273,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
auto
q_lds_window
=
auto
q_lds_window
=
make_tile_window
(
q_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
make_tile_window
(
q_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
q_lds_read_window
=
auto
q_lds_read_window
=
make_tile_window
(
q_lds_window
.
get_bottom_tensor_view
(),
make_tile_window
(
q_lds_window
.
get_bottom_tensor_view
(),
...
@@ -297,7 +294,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -297,7 +294,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
qt_lds_ptr
,
Policy
::
template
MakeShuffledQLdsWriteBlockDescriptor
<
Problem
>());
qt_lds_ptr
,
Policy
::
template
MakeShuffledQLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_q_lds_write_window
=
make_tile_window
(
auto
shuffled_q_lds_write_window
=
make_tile_window
(
shuffled_q_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
shuffled_q_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
qt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
qt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
qt_lds_ptr
,
Policy
::
template
MakeQTLdsReadBlockDescriptor
<
Problem
>());
qt_lds_ptr
,
Policy
::
template
MakeQTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -322,7 +319,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -322,7 +319,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
do_lds_ptr
,
Policy
::
template
MakeOGradLdsBlockDescriptor
<
Problem
>());
do_lds_ptr
,
Policy
::
template
MakeOGradLdsBlockDescriptor
<
Problem
>());
auto
do_lds_window
=
auto
do_lds_window
=
make_tile_window
(
do_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
make_tile_window
(
do_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
do_lds_read_window
=
auto
do_lds_read_window
=
make_tile_window
(
do_lds_window
.
get_bottom_tensor_view
(),
make_tile_window
(
do_lds_window
.
get_bottom_tensor_view
(),
...
@@ -341,7 +338,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -341,7 +338,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
dot_lds_ptr
,
Policy
::
template
MakeShuffledOGradLdsWriteBlockDescriptor
<
Problem
>());
dot_lds_ptr
,
Policy
::
template
MakeShuffledOGradLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_do_lds_write_window
=
make_tile_window
(
auto
shuffled_do_lds_write_window
=
make_tile_window
(
shuffled_do_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
shuffled_do_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
dot_read_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
dot_read_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
dot_lds_ptr
,
Policy
::
template
MakeOGradTLdsReadBlockDescriptor
<
Problem
>());
dot_lds_ptr
,
Policy
::
template
MakeOGradTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -483,9 +480,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
...
@@ -483,9 +480,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
index_t
i_total_loops
=
0
;
index_t
i_total_loops
=
0
;
index_t
seqlen_q_step
=
seqlen_q_start
;
index_t
seqlen_q_step
=
seqlen_q_start
;
static_assert
(
kQKHeaddim
=
=
kK0
,
"kQKHeaddim should equal
t
o kK0"
);
static_assert
(
kQKHeaddim
>
=
kK0
,
"kQKHeaddim should
be
equal o
r greater than
kK0"
);
static_assert
(
kM0
==
kK1
,
"kM0 should equal to kK1"
);
static_assert
(
kM0
==
kK1
,
"kM0 should equal to kK1"
);
static_assert
(
kVHeaddim
=
=
kK2
,
"kVHeaddim should equal
t
o kK2"
);
static_assert
(
kVHeaddim
>
=
kK2
,
"kVHeaddim should
be
equal o
r greater than
kK2"
);
static_assert
(
kM0
==
kK3
,
"kM0 should equal to kK3"
);
static_assert
(
kM0
==
kK3
,
"kM0 should equal to kK3"
);
constexpr
index_t
k4_loops
=
kN0
/
kK4
;
constexpr
index_t
k4_loops
=
kN0
/
kK4
;
...
...
include/ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr_iglp.hpp
View file @
dc1c2bf8
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
k_lds_ptr
,
Policy
::
template
MakeKLdsWriteBlockDescriptor
<
Problem
>());
k_lds_ptr
,
Policy
::
template
MakeKLdsWriteBlockDescriptor
<
Problem
>());
auto
k_lds_write_window
=
auto
k_lds_write_window
=
make_tile_window
(
k_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
make_tile_window
(
k_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
k_lds_read_window
=
auto
k_lds_read_window
=
make_tile_window
(
k_lds_write_window
.
get_bottom_tensor_view
(),
make_tile_window
(
k_lds_write_window
.
get_bottom_tensor_view
(),
make_tuple
(
number
<
kN0
>
{},
number
<
kK0
>
{}),
make_tuple
(
number
<
kN0
>
{},
number
<
kK0
>
{}),
k_lds_write_window
.
get_window_origin
(),
k_lds_write_window
.
get_window_origin
(),
Policy
::
template
MakeKReg
Slice
BlockDescriptor
<
Problem
>());
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
auto
k_reg_tensor
=
make_static_distributed_tensor
<
KDataType
>
(
auto
k_reg_tensor
=
make_static_distributed_tensor
<
KDataType
>
(
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
Policy
::
template
MakeKRegBlockDescriptor
<
Problem
>());
...
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
v_lds_ptr
,
Policy
::
template
MakeVLdsWriteBlockDescriptor
<
Problem
>());
v_lds_ptr
,
Policy
::
template
MakeVLdsWriteBlockDescriptor
<
Problem
>());
auto
v_lds_write_window
=
auto
v_lds_write_window
=
make_tile_window
(
v_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
make_tile_window
(
v_lds
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
v_lds_read_window
=
auto
v_lds_read_window
=
make_tile_window
(
v_lds_write_window
.
get_bottom_tensor_view
(),
make_tile_window
(
v_lds_write_window
.
get_bottom_tensor_view
(),
make_tuple
(
number
<
kN0
>
{},
number
<
kK2
>
{}),
make_tuple
(
number
<
kN0
>
{},
number
<
kK2
>
{}),
v_lds_write_window
.
get_window_origin
(),
v_lds_write_window
.
get_window_origin
(),
Policy
::
template
MakeVRegSliceBlockDescriptor
<
Problem
>());
Policy
::
template
MakeVRegBlockDescriptor
<
Problem
>());
auto
v_reg_tensor
=
make_static_distributed_tensor
<
VDataType
>
(
Policy
::
template
MakeVRegBlockDescriptor
<
Problem
>());
//------------------------------------------------------------------
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
// KT, Reg ->LDS ->Reg
...
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
kt_lds_ptr
,
Policy
::
template
MakeShuffledKLdsWriteBlockDescriptor
<
Problem
>());
kt_lds_ptr
,
Policy
::
template
MakeShuffledKLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_k_lds_write_window
=
make_tile_window
(
auto
shuffled_k_lds_write_window
=
make_tile_window
(
shuffled_k_lds_write
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
shuffled_k_lds_write
,
make_tuple
(
number
<
kN0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
kt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
kt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
kt_lds_ptr
,
Policy
::
template
MakeKTLdsReadBlockDescriptor
<
Problem
>());
kt_lds_ptr
,
Policy
::
template
MakeKTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
block_sync_lds
();
block_sync_lds
();
v_reg_tensor
=
load_tile
(
v_lds_read_window
);
auto
v_reg_tensor
=
load_tile
(
v_lds_read_window
);
//---------------------------- Loop Load in ----------------------------//
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
// Q: HBM ->Reg ->LDS
auto
q_dram_window
=
auto
q_dram_window
=
...
@@ -275,7 +272,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -275,7 +272,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
q_lds_ptr
,
Policy
::
template
MakeQLdsBlockDescriptor
<
Problem
>());
auto
q_lds_window
=
auto
q_lds_window
=
make_tile_window
(
q_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
make_tile_window
(
q_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
q_lds_read_window
=
auto
q_lds_read_window
=
make_tile_window
(
q_lds_window
.
get_bottom_tensor_view
(),
make_tile_window
(
q_lds_window
.
get_bottom_tensor_view
(),
...
@@ -296,7 +293,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -296,7 +293,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
qt_lds_ptr
,
Policy
::
template
MakeShuffledQLdsWriteBlockDescriptor
<
Problem
>());
qt_lds_ptr
,
Policy
::
template
MakeShuffledQLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_q_lds_write_window
=
make_tile_window
(
auto
shuffled_q_lds_write_window
=
make_tile_window
(
shuffled_q_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K0
>
{}),
{
0
,
0
});
shuffled_q_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
QKHeaddim
>
{}),
{
0
,
0
});
auto
qt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
qt_lds_read
=
make_tensor_view
<
address_space_enum
::
lds
>
(
qt_lds_ptr
,
Policy
::
template
MakeQTLdsReadBlockDescriptor
<
Problem
>());
qt_lds_ptr
,
Policy
::
template
MakeQTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -321,7 +318,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -321,7 +318,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
do_lds_ptr
,
Policy
::
template
MakeOGradLdsBlockDescriptor
<
Problem
>());
do_lds_ptr
,
Policy
::
template
MakeOGradLdsBlockDescriptor
<
Problem
>());
auto
do_lds_window
=
auto
do_lds_window
=
make_tile_window
(
do_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
make_tile_window
(
do_lds
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
do_lds_read_window
=
auto
do_lds_read_window
=
make_tile_window
(
do_lds_window
.
get_bottom_tensor_view
(),
make_tile_window
(
do_lds_window
.
get_bottom_tensor_view
(),
...
@@ -340,7 +337,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -340,7 +337,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
dot_lds_ptr
,
Policy
::
template
MakeShuffledOGradLdsWriteBlockDescriptor
<
Problem
>());
dot_lds_ptr
,
Policy
::
template
MakeShuffledOGradLdsWriteBlockDescriptor
<
Problem
>());
auto
shuffled_do_lds_write_window
=
make_tile_window
(
auto
shuffled_do_lds_write_window
=
make_tile_window
(
shuffled_do_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
K2
>
{}),
{
0
,
0
});
shuffled_do_lds_write
,
make_tuple
(
number
<
kM0
>
{},
number
<
k
VHeaddim
>
{}),
{
0
,
0
});
auto
dot_read_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
auto
dot_read_lds
=
make_tensor_view
<
address_space_enum
::
lds
>
(
dot_lds_ptr
,
Policy
::
template
MakeOGradTLdsReadBlockDescriptor
<
Problem
>());
dot_lds_ptr
,
Policy
::
template
MakeOGradTLdsReadBlockDescriptor
<
Problem
>());
...
@@ -482,9 +479,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
...
@@ -482,9 +479,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
index_t
i_total_loops
=
0
;
index_t
i_total_loops
=
0
;
index_t
seqlen_q_step
=
seqlen_q_start
;
index_t
seqlen_q_step
=
seqlen_q_start
;
static_assert
(
kQKHeaddim
=
=
kK0
,
"kQKHeaddim should equal
t
o kK0"
);
static_assert
(
kQKHeaddim
>
=
kK0
,
"kQKHeaddim should
be
equal o
r greater than
kK0"
);
static_assert
(
kM0
==
kK1
,
"kM0 should equal to kK1"
);
static_assert
(
kM0
==
kK1
,
"kM0 should equal to kK1"
);
static_assert
(
kVHeaddim
=
=
kK2
,
"kVHeaddim should equal
t
o kK2"
);
static_assert
(
kVHeaddim
>
=
kK2
,
"kVHeaddim should
be
equal o
r greater than
kK2"
);
static_assert
(
kM0
==
kK3
,
"kM0 should equal to kK3"
);
static_assert
(
kM0
==
kK3
,
"kM0 should equal to kK3"
);
constexpr
index_t
k4_loops
=
kN0
/
kK4
;
constexpr
index_t
k4_loops
=
kN0
/
kK4
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment