Commit dc1c2bf8 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into letaoqin/update_layernorm

parents 5cfd751b a285d6f9
@@ -202,6 +202,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
        add_compile_options(-fno-offload-uniform-block)
    endif()
endif()
+if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
+    check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
+    if(HAS_LSR_DROP_SOLUTION)
+        message("Adding the lsr-drop-solution=1 compiler flag")
+        add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
+    endif()
+endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
    check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
    if(HAS_ENABLE_POST_MISCHED)
...
@@ -735,11 +735,11 @@ def process_results(Map conf=[:]){
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.2;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true
-0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true
+0 21 * * * % ROCMVERSION=6.2;hipTensor_test=true;RUN_CODEGEN_TESTS=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;BUILD_COMPILER=/llvm-project/build/bin/clang++;BUILD_GFX12=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true
-0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_CODEGEN_TESTS=false;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
+0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false
-0 13 * * * % BUILD_LEGACY_OS=true ''' : ""
+0 13 * * * % BUILD_LEGACY_OS=true''' : ""
pipeline {
    agent none
@@ -806,6 +806,10 @@ pipeline {
        name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
        defaultValue: false,
        description: "Run the grouped conv large cases tests (default: OFF)")
+    booleanParam(
+        name: "RUN_CODEGEN_TESTS",
+        defaultValue: false,
+        description: "Run codegen tests (default: OFF)")
    booleanParam(
        name: "RUN_CK_TILE_FMHA_TESTS",
        defaultValue: false,
@@ -926,7 +930,30 @@ pipeline {
    execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
                       make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
                       ./bin/test_grouped_convnd_fwd_large_cases_xdl"""
}
+steps{
+    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
+    cleanWs()
+}
+}
+}
+}
+stage("Run Codegen Tests")
+{
+    parallel
+    {
+        stage("Run Codegen Tests on gfx90a")
+        {
+            when {
+                beforeAgent true
+                expression { params.RUN_CODEGEN_TESTS.toBoolean() }
+            }
+            agent{ label rocmnode("gfx90a")}
+            environment{
+                setup_args = "NO_CK_BUILD"
+                execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
+                                   make -j64 check"""
+            }
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -951,7 +978,7 @@ pipeline {
    make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
    cd ../ &&
    example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -970,7 +997,7 @@ pipeline {
    make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
    cd ../ &&
    example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -995,7 +1022,7 @@ pipeline {
    make -j64 tile_example_gemm_basic && \
    cd ../ &&
    example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -1014,7 +1041,7 @@ pipeline {
    make -j64 tile_example_gemm_basic && \
    cd ../ &&
    example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
steps{
    buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
@@ -1040,7 +1067,7 @@ pipeline {
    -DCMAKE_CXX_FLAGS=" -O3 " \
    -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
    execute_args = " "
}
steps{
    Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
    cleanWs()
@@ -1059,7 +1086,7 @@ pipeline {
    -DCMAKE_CXX_FLAGS=" -O3 " \
    -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
    execute_args = " "
}
steps{
    Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
    cleanWs()
@@ -1140,7 +1167,7 @@ pipeline {
    -D CMAKE_BUILD_TYPE=Release \
    -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
    -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
}
steps{
    buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
    cleanWs()
...
+cmake_minimum_required(VERSION 3.16)
+project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
-add_compile_options(-std=c++17)
-find_package(hip)
-add_custom_target(codegen)
-# add include directories
-include_directories(BEFORE
-    ${PROJECT_BINARY_DIR}/include
-    ${PROJECT_SOURCE_DIR}/include
-    ${PROJECT_SOURCE_DIR}/library/include
-    ${HIP_INCLUDE_DIRS}
-)
+find_package(ROCM)
+include(ROCMInstallTargets)
+include(ROCMTest)
+rocm_setup_version(VERSION 1.0)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
    ${CK_ROOT}/include/ck/*.hpp)
-#printouts fot debug purposes
-#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
-#message(STATUS "RELATIVE: ${CK_ROOT}/include")
+# printouts for debug purposes
+# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
+# message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
-file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
-##message(STATUS "SOURCE_FILES: ${SOURCES}")
+add_compile_options(-std=c++17)
+file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
    LINKER_LANGUAGE CXX
    POSITION_INDEPENDENT_CODE ON)
-target_include_directories(ck_host PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
-    $<INSTALL_INTERFACE:include>
-)
+# target_include_directories(ck_host PUBLIC
+#     $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
# )
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)
-rocm_install(
-    TARGETS ck_host ck_headers
-    EXPORT ck_hostTargets
-    PRIVATE
-)
-rocm_install(EXPORT ck_hostTargets
-    FILE composable_kernelck_hostTargets.cmake
-    NAMESPACE composable_kernel::
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
-rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+rocm_install_targets(
+    TARGETS ck_host ck_headers
+    EXPORT ck_host_targets
+    INCLUDE include
+)
+rocm_export_targets(
+    EXPORT ck_host_targets
+    NAMESPACE composable_kernel::
+)
if(BUILD_TESTING)
    add_subdirectory(test)
endif()
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
...
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
-# do not build the tests when we build the library for various targets
-if(NOT GPU_ARCHS)
-    foreach(TEST_SRC ${TEST_SRCS})
-        set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
-        get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
-        add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
-        if(CK_USE_ALTERNATIVE_PYTHON)
-            target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
-        endif()
-        add_dependencies(codegen codegen_test_${BASE_NAME})
-        add_dependencies(tests codegen_test_${BASE_NAME})
-        add_dependencies(check codegen_test_${BASE_NAME})
-        add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
-        message("adding test codegen_test_${BASE_NAME}")
+# TODO: These tests need to be refactored to remove dependency on main ck
+# headers and device compilation.
+set(TESTS_REQUIRE_DEVICE_COMPILE
+    grouped_conv_fwd_multiple_d_v1
+    grouped_conv_fwd_multiple_d_v2
+    grouped_conv_fwd_multiple_d_v3
+    grouped_conv_fwd_multiple_d_v4
+)
+find_package(hip)
+foreach(TEST_SRC ${TEST_SRCS})
+    get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
+    rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
    target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
-    target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include)
+    target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
+    if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
+        target_link_libraries(codegen_test_${BASE_NAME} hip::device)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
        target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
-    endforeach()
-endif()
-find_package(hip)
+    endif()
+endforeach()
...
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
+target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
...
@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
#include <string>
namespace rtc {
struct src_file
{
-    CK::fs::path path;
+    fs::path path;
    std::string_view content;
};
...
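Taken together, the header and namespace renames above make the `rtc` API self-contained. For reference, a minimal caller sketch, assuming only what is visible in this diff — the `src_file` members, the `compile_kernel(const std::vector<src_file>&, compile_options)` signature shown in the hunk below, and the `options.flags` field; the source string and flags are illustrative:

```cpp
// Hypothetical caller of rtc::compile_kernel; src_file's members,
// compile_options::flags, and the compile_kernel signature come from the
// diff, everything else is an assumption.
#include <rtc/compile_kernel.hpp>
#include <vector>

int main()
{
    rtc::src_file src;
    src.path    = "kernels/noop.cpp";             // recreated inside the tmp_dir
    src.content = "__global__ void noop() {}";    // source text to write out

    rtc::compile_options options;
    options.flags = " -std=c++17 --offload-arch=gfx90a"; // appended to the compiler command

    // Writes the sources into a temp dir, invokes the compiler, loads the code object.
    rtc::kernel k = rtc::compile_kernel({src}, options);
    (void)k;
}
```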
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace rtc {
#if RTC_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif RTC_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace rtc
#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
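The header above is a classic feature-detection shim: it prefers `<filesystem>` under C++17, falls back to `<experimental/filesystem>` on older toolchains (which is why `ck_rtc` now links `-lstdc++fs` in the CMake change above), and fails the build if neither exists. A small self-contained sketch exercising the same `rtc::fs` calls this diff itself uses:

```cpp
#include <rtc/filesystem.hpp> // the shim above; resolves rtc::fs
#include <iostream>

int main()
{
    // Same calls tmp_dir uses below: create, inspect, and remove a directory.
    rtc::fs::path dir = rtc::fs::temp_directory_path() / "ck-rtc-demo";
    rtc::fs::create_directories(dir);
    std::cout << dir.string() << " exists: " << rtc::fs::exists(dir) << '\n';
    rtc::fs::remove_all(dir);
}
```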
@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
-#include <ck/filesystem.hpp>
+#include <rtc/filesystem.hpp>
namespace rtc {
struct tmp_dir
{
-    CK::fs::path path;
+    fs::path path;
    tmp_dir(const std::string& prefix = "");
    void execute(const std::string& cmd) const;
...
#include "rtc/hip.hpp" #include <rtc/hip.hpp>
#include <rtc/compile_kernel.hpp> #include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp> #include <rtc/tmp_dir.hpp>
#include <stdexcept> #include <stdexcept>
...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs) for(const auto& src : srcs)
{ {
CK::fs::path full_path = td.path / src.path; fs::path full_path = td.path / src.path;
CK::fs::path parent_path = full_path.parent_path(); fs::path parent_path = full_path.parent_path();
CK::fs::create_directories(parent_path); fs::create_directories(parent_path);
write_string(full_path.string(), src.content); write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp") if(src.path.extension().string() == ".cpp")
{ {
...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags); td.execute(compiler() + options.flags);
auto out_path = td.path / out; auto out_path = td.path / out;
if(not CK::fs::exists(out_path)) if(not fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out); throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string()); auto obj = read_buffer(out_path.string());
......
@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
tmp_dir::tmp_dir(const std::string& prefix)
-    : path(CK::fs::temp_directory_path() /
+    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
-    CK::fs::create_directories(this->path);
+    fs::create_directories(this->path);
}
void tmp_dir::execute(const std::string& cmd) const
@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str());
}
-tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
+tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
} // namespace rtc
@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
+add_example_executable(elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_elementwise.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/utility/reduction_enums.hpp"
using F16 = ck::half_t;
using F32 = float;
using F8 = ck::f8_t;
using InputDataType = F16;
using ScaleDataType = F32;
using OutputDataType = F8;
static constexpr ck::index_t NumDim = 2;
constexpr ck::ReduceTensorOp ReduceOpId = ck::ReduceTensorOp::MAX;
constexpr bool PropagateNan = true;
constexpr bool OutputIndex = false;
using ReduceOperation = typename ck::reduce_binary_operator<ReduceOpId>::opType;
struct ScalePassThrough
{
ScalePassThrough(const float alpha = 1.f) : alpha_(alpha) {}
__host__ __device__ constexpr void
operator()(OutputDataType& y0, OutputDataType& y1, const InputDataType& x0) const
{
y0 = ck::type_convert<OutputDataType>(ck::type_convert<ScaleDataType>(x0) * alpha_);
y1 = y0;
}
const ScaleDataType alpha_;
};
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using UnaryAbs = ck::tensor_operation::element_wise::UnaryAbs;
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::DeviceElementwiseImpl<
ck::Tuple<InputDataType>, // InDataTypeTuple
ck::Tuple<OutputDataType, OutputDataType>, // OutDataTypeTuple
ScalePassThrough, // Elementwise
NumDim, // NumDim
256, // BlockSize
128, // M0PerBlock
128, // M1PerBlock
8, // M0PerThread
8, // M1PerThread
ck::Sequence<1, 0>, // ThreadClusterArrangeOrder
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<8, 1>>; // OutScalarPerVectorSeq
using DeviceReduceInstance =
ck::tensor_operation::device::DeviceReduceMultiBlock<OutputDataType,
OutputDataType,
OutputDataType,
NumDim,
NumDim,
ReduceOperation,
UnaryAbs,
PassThrough,
ck::InMemoryDataOperationEnum::Set,
PropagateNan,
OutputIndex,
false, // HaveIndexInputIfOutputIndex
1024, // BlockSize
1, // MThreadClusterSize
1024, // KThreadClusterSize
1, // MThreadSliceSize
16, // KThreadSliceSize
1, // InSrcVectorDim
16, // InSrcVectorSize
1>; // OutDstVectorSize
void reference_scale_permute_amax(Tensor<InputDataType>& input,
Tensor<OutputDataType>& host_output_scaled_casted_transposed,
Tensor<OutputDataType>& host_output_scaled_casted,
Tensor<OutputDataType>& host_output_amax,
const float scale)
{
ScalePassThrough out_element_op(scale);
const ck::index_t M = input.GetLengths()[0];
const ck::index_t K = input.GetLengths()[1];
for(ck::index_t m = 0; m < M; m++)
{
for(ck::index_t k = 0; k < K; k++)
{
OutputDataType y0, y1;
out_element_op(y0, y1, input(m, k));
host_output_scaled_casted(m, k) = y0;
host_output_scaled_casted_transposed(m, k) = y1;
const OutputDataType y_fabs =
ck::type_convert<OutputDataType>(ck::math::abs(ck::type_convert<float>(y0)));
host_output_amax(0) = ck::math::max(y_fabs, host_output_amax(0));
}
}
}
int main(int argc, char* argv[])
{
bool do_verification = true;
bool time_kernel = true;
const float scale = 2.f;
ck::index_t M = 1024;
ck::index_t K = 1024;
if(argc == 3)
{
M = std::stoi(argv[1]);
K = std::stoi(argv[2]);
}
std::array<ck::index_t, 2> dims = {M, K};
std::array<ck::index_t, 2> in_strides = {K, 1};
std::array<ck::index_t, 2> out_strides = {1, M};
Tensor<InputDataType> input(dims, in_strides);
Tensor<OutputDataType> output_scaled_casted_transposed(dims, out_strides);
Tensor<OutputDataType> output_scaled_casted(dims, in_strides);
Tensor<OutputDataType> output_amax({1});
input.GenerateTensorValue(GeneratorTensor_3<InputDataType>{0.0, 1.0});
DeviceMem input_dev_buf(sizeof(InputDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem output_scaled_casted_transposed_dev_buf(
sizeof(OutputDataType) * output_scaled_casted_transposed.mDesc.GetElementSpaceSize());
DeviceMem output_scaled_casted_dev_buf(sizeof(OutputDataType) *
output_scaled_casted.mDesc.GetElementSpaceSize());
DeviceMem output_amax_dev_buf(sizeof(OutputDataType) * output_amax.mDesc.GetElementSpaceSize());
input_dev_buf.ToDevice(input.mData.data());
std::array<const void*, 1> inputs = {input_dev_buf.GetDeviceBuffer()};
std::array<void*, 2> outputs = {output_scaled_casted_transposed_dev_buf.GetDeviceBuffer(),
output_scaled_casted_dev_buf.GetDeviceBuffer()};
std::cout << "Input: " << input.mDesc << std::endl;
std::cout << "Scale: " << scale << std::endl;
std::cout << "Output scaled casted transposed: " << output_scaled_casted_transposed.mDesc
<< std::endl;
std::cout << "Output scaled casted: " << output_scaled_casted.mDesc << std::endl;
std::cout << "Output amax: " << output_amax.mDesc << std::endl;
auto launch_transpose_scale = [&]() {
auto transposeScale = DeviceElementwisePermuteInstance{};
auto argument = transposeScale.MakeArgumentPointer(dims,
{in_strides},
{out_strides, in_strides},
inputs,
outputs,
ScalePassThrough{scale});
if(!transposeScale.IsSupportedArgument(argument.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto transposeScale_invoker_ptr = transposeScale.MakeInvokerPointer();
return transposeScale_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});
};
auto launch_reduce = [&]() {
auto reduce = DeviceReduceInstance{};
auto reduce_argument_ptr =
reduce.MakeArgumentPointer(dims,
in_strides,
{1}, // Output Lengths
{1}, // Output Strides
{0, 1}, // Reduce Dims
static_cast<double>(1.f),
static_cast<double>(0.f),
output_scaled_casted_dev_buf.GetDeviceBuffer(),
nullptr,
output_amax_dev_buf.GetDeviceBuffer(),
nullptr,
UnaryAbs{},
PassThrough{});
if(!reduce.IsSupportedArgument(reduce_argument_ptr.get()))
{
throw std::runtime_error(
"The runtime parameters seems not supported by the device instance, exiting!");
};
auto invoker_ptr = reduce.MakeInvokerPointer();
return invoker_ptr->Run(reduce_argument_ptr.get(), StreamConfig{nullptr, time_kernel});
};
float ave_time = launch_transpose_scale();
ave_time += launch_reduce();
std::cout << "Perf: " << ave_time << " ms" << std::endl;
bool pass = true;
if(do_verification)
{
Tensor<OutputDataType> host_output_scaled_casted_transposed(dims, out_strides);
Tensor<OutputDataType> host_output_scaled_casted(dims, in_strides);
Tensor<OutputDataType> host_output_amax({1});
reference_scale_permute_amax(input,
host_output_scaled_casted_transposed,
host_output_scaled_casted,
host_output_amax,
scale);
output_scaled_casted_transposed_dev_buf.FromDevice(
output_scaled_casted_transposed.mData.data());
output_scaled_casted_dev_buf.FromDevice(output_scaled_casted.mData.data());
output_amax_dev_buf.FromDevice(output_amax.mData.data());
pass &= ck::utils::check_err(output_scaled_casted_transposed.mData,
host_output_scaled_casted_transposed.mData,
"Error: Incorrect results scaled transposed",
1e-3,
1e-3);
pass &= ck::utils::check_err(output_scaled_casted.mData,
host_output_scaled_casted.mData,
"Error: Incorrect results scaled",
1e-3,
1e-3);
pass &= ck::utils::check_err(
output_amax.mData, host_output_amax.mData, "Error: Incorrect results amax", 1e-3, 1e-3);
}
return pass ? 0 : 1;
}
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
@@ -282,7 +281,11 @@ int main(int argc, char* argv[])
using CodegenPipelineProblem = ck_tile::
    GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
-using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+using CodegenGemmPolicy = ck_tile::
+    UniversalGemmPipelineAgBgCrPolicy<matrix_a_layout, matrix_b_layout, matrix_c_layout>;
+using CodegenGemmPipeline =
+    ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem, CodegenGemmPolicy>;
invoke_gemm<ck_tile::half_t,
            matrix_a_layout,
...
@@ -419,6 +419,12 @@ struct UnaryAbs
        y = ck::math::abs(x);
    };
+    template <>
+    __host__ __device__ void operator()(f8_t& y, const f8_t& x) const
+    {
+        y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
+    };
};
struct UnarySqrt
...
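The new `UnaryAbs` overload does not touch the fp8 sign bit directly; it promotes to `float`, takes the absolute value there, and converts back, which stays correct regardless of how the 8-bit type encodes sign and NaN. A self-contained sketch of that promote-abs-demote pattern; the 8-bit codec here is a toy sign-magnitude fixed-point stand-in, not the real fp8 encoding:

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Toy 8-bit sign-magnitude code standing in for ck::f8_t (NOT the real fp8
// encoding): bit 7 = sign, bits 0-6 = magnitude scaled by 1/16.
using f8_mock = std::uint8_t;

float to_float(f8_mock x)
{
    float mag = (x & 0x7F) / 16.0f;
    return (x & 0x80) ? -mag : mag;
}

f8_mock from_float(float x)
{
    float   clamped = std::fmin(std::fabs(x), 127.0f / 16.0f);
    f8_mock mag     = static_cast<f8_mock>(std::lround(clamped * 16.0f));
    return std::signbit(x) ? (0x80 | mag) : mag;
}

// The pattern the diff adds to UnaryAbs: promote to float, abs, demote.
f8_mock abs_f8(f8_mock x) { return from_float(std::fabs(to_float(x))); }

int main()
{
    f8_mock neg = from_float(-2.5f);
    std::cout << to_float(abs_f8(neg)) << '\n'; // prints 2.5
}
```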
@@ -80,6 +80,8 @@ static inline __host__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
};
+static inline __host__ bool isnan(f8_t x) { return (x & 0x80); };
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
static inline __host__ bool isnan(int4_t x)
{
@@ -529,6 +531,8 @@ static inline __device__ bool isnan(half_t x)
    return (xx & 0x7FFF) > 0x7C00;
};
+static inline __device__ bool isnan(f8_t x) { return (x & 0x80); };
static inline __device__ half_t sqrt(half_t x)
{
    return static_cast<half_t>(__builtin_amdgcn_sqrtf(static_cast<float>(x)));
...
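A note on the new `isnan(f8_t)`: in an fnuz-style fp8 encoding the single NaN is the bit pattern 0x80, so a bare `x & 0x80` also fires on every ordinary negative value. That is harmless in the amax reduction above, where inputs have already passed through `UnaryAbs`, but an exact comparison is the conservative form. A sketch under that fnuz assumption:

```cpp
#include <cstdint>
#include <iostream>

// Sketch of the check the diff adds. Assumption: fnuz-style f8_t, whose
// only NaN is 0x80 (the would-be negative zero). The sign-bit test is
// only a safe NaN test when values are known non-negative, as after abs.
using f8_bits = std::uint8_t;

bool isnan_signbit(f8_bits x) { return (x & 0x80) != 0; } // matches the diff
bool isnan_exact(f8_bits x) { return x == 0x80; }         // exact-pattern form

int main()
{
    f8_bits nan = 0x80, negative = 0xB8; // 0xB8: some negative value
    std::cout << isnan_signbit(nan) << isnan_exact(nan) << '\n';           // 11
    std::cout << isnan_signbit(negative) << isnan_exact(negative) << '\n'; // 10
}
```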
@@ -159,8 +159,11 @@
#endif
#endif
+// workaround for ROCm 6.2 and later
#ifndef CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE
-#if HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133
+#if(HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 2 && HIP_VERSION_PATCH >= 41133) || \
+    (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR == 3 && HIP_VERSION_PATCH >= 42131) || \
+    (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR > 3)
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 1
#else
#define CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE 0
...
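The widened guard keeps the scratch-memory workaround enabled for ROCm 6.2 (from the given patch level), 6.3 (likewise), and everything after 6.3, while the outer `#ifndef` still lets a build define the macro explicitly to override detection. A self-contained sketch of that pattern with made-up `MYLIB_*` version macros standing in for the HIP ones:

```cpp
// Same gating pattern with hypothetical MYLIB_* macros so it can be tried
// off-line; CK's real knob is CK_TILE_WORKAROUND_ROCM_6_2_SCRATCH_MEMORY_ISSUE.
#include <iostream>

#define MYLIB_VER_MAJOR 6
#define MYLIB_VER_MINOR 3
#define MYLIB_VER_PATCH 42131

// Leave the knob overridable: -DMYLIB_WORKAROUND=0 wins over the detection.
#ifndef MYLIB_WORKAROUND
#if (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR == 2 && MYLIB_VER_PATCH >= 41133) || \
    (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR == 3 && MYLIB_VER_PATCH >= 42131) || \
    (MYLIB_VER_MAJOR == 6 && MYLIB_VER_MINOR > 3)
#define MYLIB_WORKAROUND 1
#else
#define MYLIB_WORKAROUND 0
#endif
#endif

int main() { std::cout << "workaround enabled: " << MYLIB_WORKAROUND << '\n'; } // 1 here
```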
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    k_lds_ptr, Policy::template MakeKLdsWriteBlockDescriptor<Problem>());
auto k_lds_write_window =
-    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto k_lds_read_window =
    make_tile_window(k_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK0>{}),
                     k_lds_write_window.get_window_origin(),
-                     Policy::template MakeKRegSliceBlockDescriptor<Problem>());
+                     Policy::template MakeKRegBlockDescriptor<Problem>());
auto k_reg_tensor = make_static_distributed_tensor<KDataType>(
    Policy::template MakeKRegBlockDescriptor<Problem>());
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    v_lds_ptr, Policy::template MakeVLdsWriteBlockDescriptor<Problem>());
auto v_lds_write_window =
-    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kVHeaddim>{}), {0, 0});
auto v_lds_read_window =
    make_tile_window(v_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK2>{}),
                     v_lds_write_window.get_window_origin(),
-                     Policy::template MakeVRegSliceBlockDescriptor<Problem>());
-auto v_reg_tensor = make_static_distributed_tensor<VDataType>(
-    Policy::template MakeVRegBlockDescriptor<Problem>());
+                     Policy::template MakeVRegBlockDescriptor<Problem>());
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    kt_lds_ptr, Policy::template MakeShuffledKLdsWriteBlockDescriptor<Problem>());
auto shuffled_k_lds_write_window = make_tile_window(
-    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto kt_lds_read = make_tensor_view<address_space_enum::lds>(
    kt_lds_ptr, Policy::template MakeKTLdsReadBlockDescriptor<Problem>());
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
block_sync_lds();
-v_reg_tensor = load_tile(v_lds_read_window);
+auto v_reg_tensor = load_tile(v_lds_read_window);
block_sync_lds();
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
@@ -276,7 +273,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    q_lds_ptr, Policy::template MakeQLdsBlockDescriptor<Problem>());
auto q_lds_window =
-    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto q_lds_read_window =
    make_tile_window(q_lds_window.get_bottom_tensor_view(),
@@ -297,7 +294,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    qt_lds_ptr, Policy::template MakeShuffledQLdsWriteBlockDescriptor<Problem>());
auto shuffled_q_lds_write_window = make_tile_window(
-    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto qt_lds_read = make_tensor_view<address_space_enum::lds>(
    qt_lds_ptr, Policy::template MakeQTLdsReadBlockDescriptor<Problem>());
@@ -322,7 +319,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    do_lds_ptr, Policy::template MakeOGradLdsBlockDescriptor<Problem>());
auto do_lds_window =
-    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto do_lds_read_window =
    make_tile_window(do_lds_window.get_bottom_tensor_view(),
@@ -341,7 +338,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
    dot_lds_ptr, Policy::template MakeShuffledOGradLdsWriteBlockDescriptor<Problem>());
auto shuffled_do_lds_write_window = make_tile_window(
-    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto dot_read_lds = make_tensor_view<address_space_enum::lds>(
    dot_lds_ptr, Policy::template MakeOGradTLdsReadBlockDescriptor<Problem>());
@@ -483,9 +480,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVR
index_t i_total_loops = 0;
index_t seqlen_q_step = seqlen_q_start;
-static_assert(kQKHeaddim == kK0, "kQKHeaddim should equal to kK0");
+static_assert(kQKHeaddim >= kK0, "kQKHeaddim should be equal or greater than kK0");
static_assert(kM0 == kK1, "kM0 should equal to kK1");
-static_assert(kVHeaddim == kK2, "kVHeaddim should equal to kK2");
+static_assert(kVHeaddim >= kK2, "kVHeaddim should be equal or greater than kK2");
static_assert(kM0 == kK3, "kM0 should equal to kK3");
constexpr index_t k4_loops = kN0 / kK4;
...
@@ -178,13 +178,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    k_lds_ptr, Policy::template MakeKLdsWriteBlockDescriptor<Problem>());
auto k_lds_write_window =
-    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(k_lds, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto k_lds_read_window =
    make_tile_window(k_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK0>{}),
                     k_lds_write_window.get_window_origin(),
-                     Policy::template MakeKRegSliceBlockDescriptor<Problem>());
+                     Policy::template MakeKRegBlockDescriptor<Problem>());
auto k_reg_tensor = make_static_distributed_tensor<KDataType>(
    Policy::template MakeKRegBlockDescriptor<Problem>());
@@ -204,16 +204,13 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    v_lds_ptr, Policy::template MakeVLdsWriteBlockDescriptor<Problem>());
auto v_lds_write_window =
-    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(v_lds, make_tuple(number<kN0>{}, number<kVHeaddim>{}), {0, 0});
auto v_lds_read_window =
    make_tile_window(v_lds_write_window.get_bottom_tensor_view(),
                     make_tuple(number<kN0>{}, number<kK2>{}),
                     v_lds_write_window.get_window_origin(),
-                     Policy::template MakeVRegSliceBlockDescriptor<Problem>());
-auto v_reg_tensor = make_static_distributed_tensor<VDataType>(
-    Policy::template MakeVRegBlockDescriptor<Problem>());
+                     Policy::template MakeVRegBlockDescriptor<Problem>());
//------------------------------------------------------------------
// KT, Reg ->LDS ->Reg
@@ -227,7 +224,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    kt_lds_ptr, Policy::template MakeShuffledKLdsWriteBlockDescriptor<Problem>());
auto shuffled_k_lds_write_window = make_tile_window(
-    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kK0>{}), {0, 0});
+    shuffled_k_lds_write, make_tuple(number<kN0>{}, number<kQKHeaddim>{}), {0, 0});
auto kt_lds_read = make_tensor_view<address_space_enum::lds>(
    kt_lds_ptr, Policy::template MakeKTLdsReadBlockDescriptor<Problem>());
@@ -257,7 +254,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
block_sync_lds();
-v_reg_tensor = load_tile(v_lds_read_window);
+auto v_reg_tensor = load_tile(v_lds_read_window);
//---------------------------- Loop Load in ----------------------------//
// Q: HBM ->Reg ->LDS
auto q_dram_window =
@@ -275,7 +272,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    q_lds_ptr, Policy::template MakeQLdsBlockDescriptor<Problem>());
auto q_lds_window =
-    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    make_tile_window(q_lds, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto q_lds_read_window =
    make_tile_window(q_lds_window.get_bottom_tensor_view(),
@@ -296,7 +293,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    qt_lds_ptr, Policy::template MakeShuffledQLdsWriteBlockDescriptor<Problem>());
auto shuffled_q_lds_write_window = make_tile_window(
-    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kK0>{}), {0, 0});
+    shuffled_q_lds_write, make_tuple(number<kM0>{}, number<kQKHeaddim>{}), {0, 0});
auto qt_lds_read = make_tensor_view<address_space_enum::lds>(
    qt_lds_ptr, Policy::template MakeQTLdsReadBlockDescriptor<Problem>());
@@ -321,7 +318,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    do_lds_ptr, Policy::template MakeOGradLdsBlockDescriptor<Problem>());
auto do_lds_window =
-    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    make_tile_window(do_lds, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto do_lds_read_window =
    make_tile_window(do_lds_window.get_bottom_tensor_view(),
@@ -340,7 +337,7 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
    dot_lds_ptr, Policy::template MakeShuffledOGradLdsWriteBlockDescriptor<Problem>());
auto shuffled_do_lds_write_window = make_tile_window(
-    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kK2>{}), {0, 0});
+    shuffled_do_lds_write, make_tuple(number<kM0>{}, number<kVHeaddim>{}), {0, 0});
auto dot_read_lds = make_tensor_view<address_space_enum::lds>(
    dot_lds_ptr, Policy::template MakeOGradTLdsReadBlockDescriptor<Problem>());
@@ -482,9 +479,9 @@ struct BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP
index_t i_total_loops = 0;
index_t seqlen_q_step = seqlen_q_start;
-static_assert(kQKHeaddim == kK0, "kQKHeaddim should equal to kK0");
+static_assert(kQKHeaddim >= kK0, "kQKHeaddim should be equal or greater than kK0");
static_assert(kM0 == kK1, "kM0 should equal to kK1");
-static_assert(kVHeaddim == kK2, "kVHeaddim should equal to kK2");
+static_assert(kVHeaddim >= kK2, "kVHeaddim should be equal or greater than kK2");
static_assert(kM0 == kK3, "kM0 should equal to kK3");
constexpr index_t k4_loops = kN0 / kK4;
...