Commit 7da48908 authored by Andriy Roshchenko's avatar Andriy Roshchenko
Browse files

Merge remote-tracking branch 'origin/develop' into gfx950

parents 1f127242 7d50244e
......@@ -178,14 +178,14 @@ rocm_check_target_ids(SUPPORTED_GPU_TARGETS
message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}")
if (SUPPORTED_GPU_TARGETS MATCHES "gfx9")
message("Enabling XDL instances")
add_definitions(-DCK_USE_XDL)
set(CK_USE_XDL "ON")
message("Enabling XDL instances")
add_definitions(-DCK_USE_XDL)
set(CK_USE_XDL "ON")
endif()
if (SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
message("Enabling WMMA instances")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON")
message("Enabling WMMA instances")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON")
endif()
# CK config file to record supported datatypes, etc.
......@@ -198,6 +198,13 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500723302)
add_compile_options(-fno-offload-uniform-block)
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 500500000)
check_cxx_compiler_flag("-mllvm --lsr-drop-solution=1" HAS_LSR_DROP_SOLUTION)
if(HAS_LSR_DROP_SOLUTION)
message("Adding the lsr-drop-solution=1 compiler flag")
add_compile_options("SHELL: -mllvm --lsr-drop-solution=1")
endif()
endif()
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600140090)
check_cxx_compiler_flag("-mllvm -enable-post-misched=0" HAS_ENABLE_POST_MISCHED)
if(HAS_ENABLE_POST_MISCHED)
......
......@@ -795,6 +795,10 @@ pipeline {
name: "RUN_GROUPED_CONV_LARGE_CASES_TESTS",
defaultValue: false,
description: "Run the grouped conv large cases tests (default: OFF)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: false,
description: "Run codegen tests (default: OFF)")
booleanParam(
name: "RUN_CK_TILE_FMHA_TESTS",
defaultValue: false,
......@@ -915,7 +919,30 @@ pipeline {
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_large_cases_xdl && \
./bin/test_grouped_convnd_fwd_large_cases_xdl"""
}
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen && \
make -j64 check"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......@@ -940,7 +967,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......@@ -959,7 +986,7 @@ pipeline {
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......@@ -984,7 +1011,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """
}
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......@@ -1003,7 +1030,7 @@ pipeline {
make -j64 tile_example_gemm_basic && \
cd ../ &&
example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """
}
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......@@ -1029,7 +1056,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " "
}
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs()
......@@ -1048,7 +1075,7 @@ pipeline {
-DCMAKE_CXX_FLAGS=" -O3 " \
-DCK_USE_ALTERNATIVE_PYTHON=/opt/Python-3.8.13/bin/python3.8 """
execute_args = " "
}
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: " ", no_reboot:true, build_type: 'Release', docker_name: docker_name)
cleanWs()
......@@ -1129,7 +1156,7 @@ pipeline {
-D CMAKE_BUILD_TYPE=Release \
-D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \
-D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """
}
}
steps{
buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
......
# Composable Kernel
> [!NOTE]
> The published documentation is available at [Composable Kernel](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
The Composable Kernel (CK) library provides a programming model for writing performance-critical
kernels for machine learning workloads across multiple architectures (GPUs, CPUs, etc.). The CK library
uses general purpose kernel languages, such as HIP C++.
......
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
......@@ -5,56 +8,51 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
add_compile_options(-std=c++17)
find_package(hip)
add_custom_target(codegen)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
# add include directories
include_directories(BEFORE
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
)
rocm_setup_version(VERSION 1.0)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp)
#printouts fot debug purposes
#message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
#message(STATUS "RELATIVE: ${CK_ROOT}/include")
${CK_ROOT}/include/ck/*.hpp)
# printouts fot debug purposes
# message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
# message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_compile_options(-std=c++17)
##message(STATUS "SOURCE_FILES: ${SOURCES}")
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON)
set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON)
target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>
)
# target_include_directories(ck_host PUBLIC
# $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
# )
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)
rocm_install(
rocm_install_targets(
TARGETS ck_host ck_headers
EXPORT ck_hostTargets
EXPORT ck_host_targets
INCLUDE include
PRIVATE
)
rocm_export_targets(
EXPORT ck_host_targets
NAMESPACE composable_kernel::
)
rocm_install(EXPORT ck_hostTargets
FILE composable_kernelck_hostTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(BUILD_TESTING)
add_subdirectory(test)
add_subdirectory(test)
endif()
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
# do not build the tests when we build the library for various targets
if(NOT GPU_ARCHS)
foreach(TEST_SRC ${TEST_SRCS})
set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP)
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
add_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
if(CK_USE_ALTERNATIVE_PYTHON)
target_link_options(codegen_test_${BASE_NAME} PRIVATE -lstdc++fs)
endif()
add_dependencies(codegen codegen_test_${BASE_NAME})
add_dependencies(tests codegen_test_${BASE_NAME})
add_dependencies(check codegen_test_${BASE_NAME})
add_test(NAME codegen_test_${BASE_NAME} COMMAND codegen_test_${BASE_NAME})
message("adding test codegen_test_${BASE_NAME}")
target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/codegen/test/include)
# TODO: These tests need to be refactored to remove dependency on main ck
# headers and device compilation.
set(TESTS_REQUIRE_DEVICE_COMPILE
grouped_conv_fwd_multiple_d_v1
grouped_conv_fwd_multiple_d_v2
grouped_conv_fwd_multiple_d_v3
grouped_conv_fwd_multiple_d_v4
)
find_package(hip)
foreach(TEST_SRC ${TEST_SRCS})
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
rocm_add_test_executable(codegen_test_${BASE_NAME} ${TEST_SRC})
target_link_libraries(codegen_test_${BASE_NAME} ck_rtc ck_host)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC include)
if(BASE_NAME IN_LIST TESTS_REQUIRE_DEVICE_COMPILE)
target_link_libraries(codegen_test_${BASE_NAME} hip::device)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/include)
target_include_directories(codegen_test_${BASE_NAME} PUBLIC ${CK_ROOT}/library/include)
endforeach()
endif()
endif()
endforeach()
find_package(hip)
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES})
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
......@@ -2,14 +2,14 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <ck/filesystem.hpp>
#include <rtc/filesystem.hpp>
#include <string>
namespace rtc {
struct src_file
{
CK::fs::path path;
fs::path path;
std::string_view content;
};
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace rtc {
#if RTC_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif RTC_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace rtc
#endif // GUARD_RTC_FILESYSTEM_HPP_
......@@ -2,13 +2,13 @@
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <ck/filesystem.hpp>
#include <rtc/filesystem.hpp>
namespace rtc {
struct tmp_dir
{
CK::fs::path path;
fs::path path;
tmp_dir(const std::string& prefix = "");
void execute(const std::string& cmd) const;
......
#include "rtc/hip.hpp"
#include <rtc/hip.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
......@@ -70,9 +70,9 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
for(const auto& src : srcs)
{
CK::fs::path full_path = td.path / src.path;
CK::fs::path parent_path = full_path.parent_path();
CK::fs::create_directories(parent_path);
fs::path full_path = td.path / src.path;
fs::path parent_path = full_path.parent_path();
fs::create_directories(parent_path);
write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp")
{
......@@ -86,7 +86,7 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
td.execute(compiler() + options.flags);
auto out_path = td.path / out;
if(not CK::fs::exists(out_path))
if(not fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string());
......
......@@ -31,10 +31,10 @@ std::string unique_string(const std::string& prefix)
}
tmp_dir::tmp_dir(const std::string& prefix)
: path(CK::fs::temp_directory_path() /
: path(fs::temp_directory_path() /
unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
CK::fs::create_directories(this->path);
fs::create_directories(this->path);
}
void tmp_dir::execute(const std::string& cmd) const
......@@ -43,6 +43,6 @@ void tmp_dir::execute(const std::string& cmd) const
std::system(s.c_str());
}
tmp_dir::~tmp_dir() { CK::fs::remove_all(this->path); }
tmp_dir::~tmp_dir() { fs::remove_all(this->path); }
} // namespace rtc
rocm-docs-core==1.8.2
rocm-docs-core==1.8.3
sphinxcontrib-bibtex==2.6.3
......@@ -103,7 +103,7 @@ requests==2.32.3
# via
# pygithub
# sphinx
rocm-docs-core==1.8.2
rocm-docs-core==1.8.3
# via -r requirements.in
six==1.16.0
# via pybtex
......
......@@ -29,9 +29,9 @@ struct ProblemSize final
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 0;
ck::index_t StrideB = 0;
ck::index_t StrideC = 0;
ck::index_t StrideA = -1;
ck::index_t StrideB = -1;
ck::index_t StrideC = -1;
};
struct ProblemSizeStreamK final
......@@ -40,9 +40,9 @@ struct ProblemSizeStreamK final
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 0;
ck::index_t StrideB = 0;
ck::index_t StrideC = 0;
ck::index_t StrideA = -1;
ck::index_t StrideB = -1;
ck::index_t StrideC = -1;
ck::index_t NumSKBlocks = -1;
};
......@@ -52,9 +52,9 @@ struct ProblemSizeStreamK_universal final
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 0;
ck::index_t StrideB = 0;
ck::index_t StrideC = 0;
ck::index_t StrideA = -1;
ck::index_t StrideB = -1;
ck::index_t StrideC = -1;
ck::index_t Grid_size = -1; // defaults to max occupancy
ck::index_t Streamk_sel = 1; // defaults to 1-tile SK
......@@ -66,18 +66,19 @@ struct ProblemSizeSplitK final
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 0;
ck::index_t StrideB = 0;
ck::index_t StrideC = 0;
ck::index_t StrideA = -1;
ck::index_t StrideB = -1;
ck::index_t StrideC = -1;
ck::index_t KBatch = 1;
};
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 2;
bool time_kernel = false;
// 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
int do_verification = 3;
int init_method = 2;
bool time_kernel = false;
};
template <ck::index_t... Is>
......@@ -126,7 +127,7 @@ bool parse_cmd_args<ProblemSize>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
......@@ -176,7 +177,7 @@ bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
else
{
std::cerr
<< "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
<< "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
......@@ -225,7 +226,7 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
......@@ -275,7 +276,7 @@ bool parse_cmd_args<ProblemSizeSplitK>(int argc,
}
else
{
std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl
std::cerr << "arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
......
......@@ -116,21 +116,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
};
auto f_get_default_stride =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(stride == 0)
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is zero, return a default packed stride
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return col;
return static_cast<std::size_t>(col);
}
else
{
return row;
return static_cast<std::size_t>(row);
}
}
else
return stride;
return static_cast<std::size_t>(stride);
};
StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
......@@ -330,7 +330,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
bool pass = true;
if(config.do_verification)
if((config.do_verification == 1) || (config.do_verification == 3))
{
// CPU verification
auto ref_gemm = ReferenceGemmInstance{};
......@@ -353,13 +353,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass &= !ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
#endif
}
if((config.do_verification == 2) || (config.do_verification == 3))
{
// GPU verification
auto ref_gemm_gpu = ReferenceGemmInstanceGPU{};
auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
......@@ -381,14 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass &= !ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
return !pass;
return pass == true;
}
bool run_gemm_example(int argc, char* argv[])
......
......@@ -117,9 +117,9 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
auto f_get_default_stride =
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == 0)
if(stride == -1)
{
// give a chance if stride is 0, return a default packed stride
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return static_cast<std::size_t>(col);
......@@ -241,7 +241,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
}
bool pass = true;
if(config.do_verification)
if((config.do_verification == 1) || (config.do_verification == 3))
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
......
......@@ -115,21 +115,21 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
};
auto f_get_default_stride =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(stride == 0)
[](std::size_t row, std::size_t col, ck::index_t stride, auto layout) {
if(stride == -1)
{
// give a chance if stride is zero, return a default packed stride
// give a chance if stride is -1, return a default packed stride
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return col;
return static_cast<std::size_t>(col);
}
else
{
return row;
return static_cast<std::size_t>(row);
}
}
else
return stride;
return static_cast<std::size_t>(stride);
};
StrideA = f_get_default_stride(M, K, StrideA, ALayout{});
......@@ -228,7 +228,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
}
bool pass = true;
if(config.do_verification)
if((config.do_verification == 1) || (config.do_verification == 3))
{
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
/*
Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o]
......@@ -60,14 +60,14 @@ struct AddAddRelu
{
const ck::half_t x = c + d0 + d1;
ck::tensor_operation::element_wise::Relu{}.template operator()<ck::half_t>(e, x);
ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
}
__host__ __device__ void
operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const
{
const float x = c + (d0 + d1);
ck::tensor_operation::element_wise::Relu{}.template operator()<float>(e, x);
ck::tensor_operation::element_wise::Relu{}.operator()(e, x);
}
};
......
......@@ -5,3 +5,4 @@ add_example_executable(example_elementwise_permute_4D_fp32_col elementwise_permu
add_example_executable(example_elementwise_permute_4D_fp16_col elementwise_permute_4D_fp16_col.cpp)
add_example_executable(example_elementwise_binary_4D_fp16 elementwise_binary_4D_fp16.cpp)
add_example_executable(example_elementwise_trinary_4D_fp16 elementwise_trinary_4D_fp16.cpp)
add_example_executable(elementwise_scale_permute_amax_2D_fp16_fp8 elementwise_scale_permute_amax_2D_fp16_fp8.cpp)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment