Commit e941f59f authored by Andriy Roshchenko's avatar Andriy Roshchenko
Browse files

Merge branch gfx950 into andriy/lwpck-2413

parents fe9d9812 7da48908
...@@ -206,6 +206,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302) ...@@ -206,6 +206,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
add_compile_options(-fno-offload-uniform-block) add_compile_options(-fno-offload-uniform-block)
endif() endif()
endif() endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
if(HAS_LSR_DROP_SOLUTION)
message("Adding the lsr-drop-solution=1 compiler flag")
add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090) if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED) check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
if(HAS_ENABLE_POST_MISCHED) if(HAS_ENABLE_POST_MISCHED)
......
...@@ -795,6 +795,10 @@ pipeline { ...@@ -795,6 +795,10 @@ pipeline {
name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS", name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
defaultValue: false, defaultValue: false,
description: "Run the grouped conv large cases tests (default: OFF)") description: "Run the grouped conv large cases tests (default: OFF)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: false,
description: "Run codegen tests (default: OFF)")
booleanParam( booleanParam(
name: "RUN_CK_TILE_FMHA_TESTS", name: "RUN_CK_TILE_FMHA_TESTS",
defaultValue: false, defaultValue: false,
...@@ -915,7 +919,30 @@ pipeline { ...@@ -915,7 +919,30 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_large_cases_xdl && \ make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
./bin/test_grouped_convnd_fwd_large_cases_xdl""" ./bin/test_grouped_convnd_fwd_large_cases_xdl"""
} }
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
make -j64 check"""
}
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
...@@ -940,7 +967,7 @@ pipeline { ...@@ -940,7 +967,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ && cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
...@@ -959,7 +986,7 @@ pipeline { ...@@ -959,7 +986,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \ make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ && cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
...@@ -984,7 +1011,7 @@ pipeline { ...@@ -984,7 +1011,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \ make -j64 tile_example_gemm_basic && \
cd ../ && cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
...@@ -1003,7 +1030,7 @@ pipeline { ...@@ -1003,7 +1030,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \ make -j64 tile_example_gemm_basic && \
cd ../ && cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
} }
steps{ steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
...@@ -1029,7 +1056,7 @@ pipeline { ...@@ -1029,7 +1056,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \ -DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " " execute_args = " "
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name) Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs() cleanWs()
...@@ -1048,7 +1075,7 @@ pipeline { ...@@ -1048,7 +1075,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \ -DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """ -DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " " execute_args = " "
} }
steps{ steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name) Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs() cleanWs()
...@@ -1129,7 +1156,7 @@ pipeline { ...@@ -1129,7 +1156,7 @@ pipeline {
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """ -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
} }
steps{ steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args) buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs() cleanWs()
......
# Composable Kernel # Composable Kernel
> [!NOTE]
> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
The Composable Kernel (CK) library provides a programming model for writing performance-critical The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++. uses general purpose kernel languages, such as HIP C++.
......
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
...@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) ...@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..) set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
add_compile_options(-std=c++17) find_package(ROCM)
find_package(hip) include(ROCMInstallTargets)
add_custom_target(codegen) include(ROCMTest)
# add include directories rocm_setup_version(VERSION 1.0)
include_directories(BEFORE
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake) list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed) include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp) ${CK_ROOT}/include/ck/*.hpp)
#printouts fot debug purposes # printouts fot debug purposes
#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}") # message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#message(STATUS "RELATIVE: ${CK_ROOT}/include") # message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include) add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp) add_compile_options(-std=c++17)
##message(STATUS "SOURCE_FILES: ${SOURCES}") file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library # TODO: Use object library
add_library(ck_host STATIC ${SOURCES}) add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers) target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON) POSITION_INDEPENDENT_CODE ON)
target_include_directories(ck_host PUBLIC # target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> # $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include> # )
)
add_executable(ck-template-driver driver/main.cpp) add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host) target_link_libraries(ck-template-driver ck_host)
rocm_install( rocm_install_targets(
TARGETS ck_host ck_headers TARGETS ck_host ck_headers
EXPORT ck_hostTargets EXPORT ck_host_targets
INCLUDE include
PRIVATE
)
rocm_export_targets(
EXPORT ck_host_targets
NAMESPACE composable_kernel::
) )
rocm_install(EXPORT ck_hostTargets
FILE composable_kernelck_hostTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(BUILD_TESTING) if(BUILD_TESTING)
add_subdirectory(test) add_subdirectory(test)
endif() endif()
list(APPEND CMAKE_PREFIX_PATH /opt/rocm) list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc) add_subdirectory(rtc)
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp) file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
# do not build the tests when we build the library for various targets
if(NOT GPU_ARCHS) # TODO: These tests need to be refactored to remove dependency on main ck
foreach(TEST_SRC ${TEST_SRCS}) # headers and device compilation.
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP) set(TESTS_REQUIRE_DEVICE_COMPILE
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) grouped_conv_fwd_multiple_d_v1
add_executable(codegen_test_${BASE_NAME} ${TEST_SRC}) grouped_conv_fwd_multiple_d_v2
if(CK_USE_ALTERNATIVE_PYTHON) grouped_conv_fwd_multiple_d_v3
target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs) grouped_conv_fwd_multiple_d_v4
endif() )
add_dependencies(codegen codegen_test_${BASE_NAME}) find_package(hip)
add_dependencies(tests codegen_test_${BASE_NAME})
add_dependencies(check codegen_test_${BASE_NAME}) foreach(TEST_SRC ${TEST_SRCS})
add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME}) get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
message("adding test codegen_test_${BASE_NAME}") rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host) target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include) target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
target_link_libraries(codegen_test_${BASE_NAME} hip::device)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include) target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include) target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
endforeach() endif()
endif() endforeach()
find_package(hip)
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp) file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES}) add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include) target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host) target_link_libraries(ck_rtc PUBLIC hip::host)
target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
...@@ -2,14 +2,14 @@ ...@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp> #include <rtc/kernel.hpp>
#include <ck/filesystem.hpp> #include <rtc/filesystem.hpp>
#include <string> #include <string>
namespace rtc { namespace rtc {
struct src_file struct src_file
{ {
CK::fs::path path; fs::path path;
std::string_view content; std::string_view content;
}; };
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace rtc {
#if RTC_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif RTC_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace rtc
#endif // GUARD_RTC_FILESYSTEM_HPP_
...@@ -2,13 +2,13 @@ ...@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR #define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string> #include <string>
#include <ck/filesystem.hpp> #include <rtc/filesystem.hpp>
namespace rtc { namespace rtc {
struct tmp_dir struct tmp_dir
{ {
CK::fs::path path; fs::path path;
tmp_dir(const std::string& prefix = ""); tmp_dir(const std::string& prefix = "");
void execute(const std::string& cmd) const; void execute(const std::string& cmd) const;
......
#include "rtc/hip.hpp" #include <rtc/hip.hpp>
#include <rtc/compile_kernel.hpp> #include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp> #include <rtc/tmp_dir.hpp>
#include <stdexcept> #include <stdexcept>
...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs) for(const auto& src : srcs)
{ {
CK::fs::path full_path = td.path / src.path; fs::path full_path = td.path / src.path;
CK::fs::path parent_path = full_path.parent_path(); fs::path parent_path = full_path.parent_path();
CK::fs::create_directories(parent_path); fs::create_directories(parent_path);
write_string(full_path.string(), src.content); write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp") if(src.path.extension().string() == ".cpp")
{ {
...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options ...@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags); td.execute(compiler() + options.flags);
auto out_path = td.path / out; auto out_path = td.path / out;
if(not CK::fs::exists(out_path)) if(not fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out); throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string()); auto obj = read_buffer(out_path.string());
......
...@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix) ...@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
} }
tmp_dir::tmp_dir(const std::string& prefix) tmp_dir::tmp_dir(const std::string& prefix)
: path(CK::fs::temp_directory_path() / : path(fs::temp_directory_path() /
unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix)) unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{ {
CK::fs::create_directories(this->path); fs::create_directories(this->path);
} }
void tmp_dir::execute(const std::string& cmd) const void tmp_dir::execute(const std::string& cmd) const
...@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const ...@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str()); std::system(s.c_str());
} }
tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); } tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
} // namespace rtc } // namespace rtc
rocm-docs-core==1.8.2 rocm-docs-core==1.8.3
sphinxcontrib-bibtex==2.6.3 sphinxcontrib-bibtex==2.6.3
...@@ -103,7 +103,7 @@ requests==2.32.3 ...@@ -103,7 +103,7 @@ requests==2.32.3
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==1.8.2 rocm-docs-core==1.8.3
# via -r requirements.in # via -r requirements.in
six==1.16.0 six==1.16.0
# via pybtex # via pybtex
......
...@@ -29,9 +29,9 @@ struct ProblemSize final ...@@ -29,9 +29,9 @@ struct ProblemSize final
ck::index_t N = 4096; ck::index_t N = 4096;
ck::index_t K = 4096; ck::index_t K = 4096;
ck::index_t StrideA = 0; ck::index_t StrideA = -1;
ck::index_t StrideB = 0; ck::index_t StrideB = -1;
ck::index_t StrideC = 0; ck::index_t StrideC = -1;
}; };
struct ProblemSizeStreamK final struct ProblemSizeStreamK final
...@@ -40,9 +40,9 @@ struct ProblemSizeStreamK final ...@@ -40,9 +40,9 @@ struct ProblemSizeStreamK final
ck::index_t N = 4096; ck::index_t N = 4096;
ck::index_t K = 4096; ck::index_t K = 4096;
ck::index_t StrideA = 0; ck::index_t StrideA = -1;
ck::index_t StrideB = 0; ck::index_t StrideB = -1;
ck::index_t StrideC = 0; ck::index_t StrideC = -1;
ck::index_t NumSKBlocks = -1; ck::index_t NumSKBlocks = -1;
}; };
...@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final ...@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final
ck::index_t N = 4096; ck::index_t N = 4096;
ck::index_t K = 4096; ck::index_t K = 4096;
ck::index_t StrideA = 0; ck::index_t StrideA = -1;
ck::index_t StrideB = 0; ck::index_t StrideB = -1;
ck::index_t StrideC = 0; ck::index_t StrideC = -1;
ck::index_t Grid_size = -1; // defaults to max occupancy ck::index_t Grid_size = -1; // defaults to max occupancy
ck::index_t Streamk_sel = 1; // defaults to 1-tile SK ck::index_t Streamk_sel = 1; // defaults to 1-tile SK
...@@ -66,9 +66,9 @@ struct ProblemSizeSplitK final ...@@ -66,9 +66,9 @@ struct ProblemSizeSplitK final
ck::index_t N = 4096; ck::index_t N = 4096;
ck::index_t K = 4096; ck::index_t K = 4096;
ck::index_t StrideA = 0; ck::index_t StrideA = -1;
ck::index_t StrideB = 0; ck::index_t StrideB = -1;
ck::index_t StrideC = 0; ck::index_t StrideC = -1;
ck::index_t KBatch = 1; ck::index_t KBatch = 1;
}; };
......
...@@ -53,4 +53,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa ...@@ -53,4 +53,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc" #include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? -1 : 0); } int main(int argc, char* argv[]) { return run_gemm_example(argc, argv) ? 0 : -1; }
...@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
}; };
auto f_get_default_stride = auto f_get_default_stride =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == 0) if(stride == -1)
{ {
// give a chance if stride is zero, return a default packed stride // give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return col; return static_cast<std::size_t>(col);
} }
else else
{ {
return row; return static_cast<std::size_t>(row);
} }
} }
else else
return stride; return static_cast<std::size_t>(stride);
}; };
StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
...@@ -248,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -248,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{ {
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true; return false;
} }
ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
...@@ -281,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -281,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{ {
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return true; return false;
} }
std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument); std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
...@@ -314,7 +314,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -314,7 +314,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config." std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config."
<< std::endl; << std::endl;
return true; return false;
} }
if(config.time_kernel) if(config.time_kernel)
...@@ -402,7 +402,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -402,7 +402,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cout << "Verification on GPU: PASS" << std::endl; std::cout << "Verification on GPU: PASS" << std::endl;
} }
return !pass; return pass;
} }
bool run_gemm_example(int argc, char* argv[]) bool run_gemm_example(int argc, char* argv[])
...@@ -410,7 +410,7 @@ bool run_gemm_example(int argc, char* argv[]) ...@@ -410,7 +410,7 @@ bool run_gemm_example(int argc, char* argv[])
ProblemSize problem_size; ProblemSize problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
} }
bool run_gemm_streamk_example(int argc, char* argv[]) bool run_gemm_streamk_example(int argc, char* argv[])
...@@ -418,5 +418,5 @@ bool run_gemm_streamk_example(int argc, char* argv[]) ...@@ -418,5 +418,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
ProblemSizeStreamK problem_size; ProblemSizeStreamK problem_size;
ExecutionConfig config; ExecutionConfig config;
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
} }
...@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride = auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) { [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == 0) if(stride == -1)
{ {
// give a chance if stride is 0, return a default packed stride // give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return static_cast<std::size_t>(col); return static_cast<std::size_t>(col);
......
...@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) ...@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
}; };
auto f_get_default_stride = auto f_get_default_stride =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == 0) if(stride == -1)
{ {
// give a chance if stride is zero, return a default packed stride // give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return col; return static_cast<std::size_t>(col);
} }
else else
{ {
return row; return static_cast<std::size_t>(row);
} }
} }
else else
return stride; return static_cast<std::size_t>(stride);
}; };
StrideA = f_get_default_stride(M, K, StrideA, ALayout{}); StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
/* /*
Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o] Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
...@@ -60,14 +60,14 @@ struct AddAddRelu ...@@ -60,14 +60,14 @@ struct AddAddRelu
{ {
const ck::half_t x = c + d0 + d1; const ck::half_t x = c + d0 + d1;
ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x); ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
} }
__host__ __device__ void __host__ __device__ void
operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
{ {
const float x = c + (d0 + d1); const float x = c + (d0 + d1);
ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x); ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
} }
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment