Unverified Commit 8eff4d62 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Add host lib (#1134)



* Format

* Format

* Format

* Remove const

* Use the right template

* Format

* Format

* add row/col instances

* Add missing file

* fixed

* Format

* Updates

* Format

* fixed rrr layout

* Format

* Update test and embed modules

* Restore older version

* Update year

* Set -fPIC

* Format

* Use double for isnan

* rename host folder to codegen + minor fix

* add codegen CI test

* add option to build components without building CK

* fix the groovy syntax

* fix typo

* use the correct function for the codegen stage

---------
Co-authored-by: default avatarJing Zhang <jizha@amd.com>
Co-authored-by: default avatarIllia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: default avatarillsilin <Illia.Silin@amd.com>
parent cf866211
...@@ -264,18 +264,24 @@ def cmake_build(Map conf=[:]){ ...@@ -264,18 +264,24 @@ def cmake_build(Map conf=[:]){
""") """)
sh cmd3 sh cmd3
} }
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
// reduce parallelism when compiling, clang uses too much memory // reduce parallelism when compiling, clang uses too much memory
def nt = nthreads() def nt = nthreads()
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}") def cmd
def execute_cmd = conf.get("execute_cmd", "") def execute_cmd = conf.get("execute_cmd", "")
if(!setup_args.contains("NO_CK_BUILD")){
def cmd = conf.get("cmd", """ def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j${nt} ${config_targets}")
cmd = conf.get("cmd", """
${setup_cmd} ${setup_cmd}
${build_cmd} ${build_cmd}
${execute_cmd} ${execute_cmd}
""") """)
}
else{
cmd = conf.get("cmd", """
${execute_cmd}
""")
}
echo cmd echo cmd
...@@ -667,7 +673,7 @@ pipeline { ...@@ -667,7 +673,7 @@ pipeline {
string( string(
name: 'USE_CUSTOM_DOCKER', name: 'USE_CUSTOM_DOCKER',
defaultValue: '', defaultValue: '',
description: 'If you want to use a custom docker image, please scecify it here (default: OFF).') description: 'If you want to use a custom docker image, please specify it here (default: leave blank).')
string( string(
name: 'ROCMVERSION', name: 'ROCMVERSION',
defaultValue: '6.0', defaultValue: '6.0',
...@@ -712,6 +718,10 @@ pipeline { ...@@ -712,6 +718,10 @@ pipeline {
name: "RUN_PERFORMANCE_TESTS", name: "RUN_PERFORMANCE_TESTS",
defaultValue: false, defaultValue: false,
description: "Run the performance tests (default: OFF)") description: "Run the performance tests (default: OFF)")
booleanParam(
name: "RUN_CODEGEN_TESTS",
defaultValue: true,
description: "Run the codegen tests (default: ON)")
} }
environment{ environment{
dbuser = "${dbuser}" dbuser = "${dbuser}"
...@@ -790,7 +800,34 @@ pipeline { ...@@ -790,7 +800,34 @@ pipeline {
} }
} }
} }
stage("Run Codegen Tests")
{
parallel
{
stage("Run Codegen Tests on MI100/MI200")
{
when {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
options { retry(2) }
agent{ label rocmnode("gfx908 || gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ cd ../codegen && rm -rf build && mkdir build && cd build && \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j check"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Build CK and run Tests") stage("Build CK and run Tests")
{ {
parallel parallel
......
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
if(WIN32)
set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
else()
if(BUILD_SHARED_LIBS)
set(EMBED_USE LD CACHE STRING "Use LD or CArrays to embed data files")
else()
set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
endif()
set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
endif()
if(EMBED_USE STREQUAL "LD")
find_program(EMBED_LD ld REQUIRED)
find_program(EMBED_OBJCOPY objcopy REQUIRED)
endif()
function(embed_wrap_string)
set(options)
set(oneValueArgs VARIABLE AT_COLUMN)
set(multiValueArgs)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
string(LENGTH ${${PARSE_VARIABLE}} string_length)
math(EXPR offset "0")
while(string_length GREATER 0)
if(string_length GREATER ${PARSE_AT_COLUMN})
math(EXPR length "${PARSE_AT_COLUMN}")
else()
math(EXPR length "${string_length}")
endif()
string(SUBSTRING ${${PARSE_VARIABLE}} ${offset} ${length} line)
set(lines "${lines}\n${line}")
math(EXPR string_length "${string_length} - ${length}")
math(EXPR offset "${offset} + ${length}")
endwhile()
set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
endfunction()
function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
set(options)
set(oneValueArgs)
set(multiValueArgs SYMBOLS FILES)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(RESOURCE_ID 100)
list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
list(LENGTH PARSE_FILES FILES_LEN)
if(NOT ${SYMBOLS_LEN} EQUAL ${FILES_LEN})
message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${FILES_LEN}")
endif()
math(EXPR LEN "${SYMBOLS_LEN} - 1")
foreach(idx RANGE ${LEN})
list(GET PARSE_SYMBOLS ${idx} SYMBOL)
list(GET PARSE_FILES ${idx} FILE)
file(RELATIVE_PATH BASE_NAME "${BASE_DIRECTORY}" ${FILE})
if(EMBED_USE STREQUAL "RC")
string(TOUPPER "${SYMBOL}" SYMBOL)
string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
file(TO_NATIVE_PATH "${FILE}" NATIVE_FILE)
string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
string(APPEND INIT_KERNELS "\n {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
else()
set(START_SYMBOL "_binary_${SYMBOL}_start")
set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
if(EMBED_USE STREQUAL "LD")
string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t _binary_${SYMBOL}_size;
const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
")
else()
string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const size_t ${LENGTH_SYMBOL};
")
endif()
string(APPEND INIT_KERNELS "
{ \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
endif()
endforeach()
if(EMBED_USE STREQUAL "RC")
file(WRITE "${EMBED_DIR}/include/resource.h" "
#define TEXTFILE 256
${FILE_IDS}
")
file(WRITE "${EMBED_DIR}/resource.rc" "
#include \"resource.h\"
${RC_FILE_MAPPING}
")
set(EXTERNS "
#include <Windows.h>
#include \"resource.h\"
namespace resource {
std::string_view read(int id)
{
HMODULE handle = GetModuleHandle(nullptr);
HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
HGLOBAL data = LoadResource(handle, rc);
return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
}
}
")
set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
endif()
file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
#include <string_view>
#include <unordered_map>
#include <utility>
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
")
file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
#include <${EMBED_NAME}.hpp>
${EXTERNS}
std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
{
static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
};
return result;
}
")
list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
endfunction()
function(embed_file FILE BASE_DIRECTORY)
message(STATUS " ${FILE}")
file(RELATIVE_PATH REL_FILE "${BASE_DIRECTORY}" ${FILE})
string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
if(EMBED_USE STREQUAL "LD")
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
add_custom_command(
OUTPUT "${OUTPUT_FILE}"
COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
WORKING_DIRECTORY "${BASE_DIRECTORY}"
DEPENDS "${FILE}"
VERBATIM)
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
elseif(EMBED_USE STREQUAL "CArrays")
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
# reads source file contents as hex string
file(READ ${FILE} HEX_STRING HEX)
# wraps the hex string into multiple lines
embed_wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
# adds '0x' prefix and comma suffix before and after every byte respectively
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
# removes trailing comma
string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
file(WRITE "${OUTPUT_FILE}" "
#include <cstddef>
extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
")
set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
endif()
set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
endfunction()
function(add_embed_library EMBED_NAME)
set(options)
set(oneValueArgs RELATIVE)
set(multiValueArgs)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
file(MAKE_DIRECTORY ${EMBED_DIR})
message(STATUS "Embedding kernel files:")
foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
embed_file(${FILE} ${PARSE_RELATIVE})
list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
endforeach()
message(STATUS "Generating embedding library '${EMBED_NAME}'")
generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
if(EMBED_USE STREQUAL "LD")
add_library(${INTERNAL_EMBED_LIB} STATIC ${EMBED_FILES} ${OUTPUT_FILES})
else()
add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
endif()
if(EMBED_USE STREQUAL "CArrays")
target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
endif()
target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
add_library(${EMBED_NAME} INTERFACE)
if(EMBED_USE STREQUAL "RC")
target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
elseif(EMBED_USE STREQUAL "LD")
target_link_libraries(${EMBED_NAME} INTERFACE ${INTERNAL_EMBED_LIB})
else()
target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
endif()
target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
endfunction()
cmake_minimum_required(VERSION 3.16)
project(composable_kernel_host)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CK_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
find_package(ROCM)
include(ROCMInstallTargets)
include(ROCMTest)
list(APPEND CMAKE_MODULE_PATH ${CK_ROOT}/cmake)
include(Embed)
file(GLOB_RECURSE KERNEL_FILES CONFIGURE_DEPENDS
${CK_ROOT}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
message(STATUS "RELATIVE: ${CK_ROOT}/include")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${CK_ROOT}/include)
add_definitions(-std=c++17)
file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
# TODO: Use object library
add_library(ck_host STATIC ${SOURCES})
target_link_libraries(ck_host PRIVATE ck_headers)
set_target_properties(ck_host PROPERTIES
LINKER_LANGUAGE CXX
POSITION_INDEPENDENT_CODE ON)
target_include_directories(ck_host PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
add_executable(ck-template-driver driver/main.cpp)
target_link_libraries(ck-template-driver ck_host)
rocm_install(
TARGETS ck_host ck_headers
EXPORT ck_hostTargets
)
rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(BUILD_TESTING)
add_subdirectory(test)
endif()
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp"
using ck::host::Transform;
struct Emitters
{
std::unordered_map<std::string, std::function<std::vector<std::string>()>> m;
template <class T>
void Register(const std::string& name)
{
m[name] = [] {
auto configs = T::CreateOperations();
return Transform(configs, [](const auto& ops) { return ToTuple(ops); });
};
}
template <class T>
static std::string ToTuple(const T& ops)
{
auto templates = Transform(
ops, [](const auto& op) { return " " + op.ToSolution().ToTemplateString(); });
return "std::tuple<\n" + ck::host::JoinStrings(templates, ",\n") + ">";
}
std::string Emit(const std::string& name) { return ck::host::JoinStrings(m.at(name)(), "\n"); }
std::vector<std::string> List() const
{
return Transform(m, [](auto&& p) { return p.first; });
}
};
int main(int argc, const char* argv[])
{
std::string prog = argv[0];
std::vector<std::string> args(argv + 1, argv + argc);
Emitters e;
e.Register<ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle>(
"DeviceGemmMultipleD_Xdl_CShuffle");
if(args.empty() or std::any_of(args.begin(), args.end(), [](auto arg) {
return arg == "-h" or arg == "--help";
}))
{
std::cout << "USAGE:" << std::endl;
std::cout << " " << prog << " [TEMPLATE]" << std::endl;
std::cout << std::endl;
std::cout << "FLAGS:" << std::endl;
std::cout << " -h, --help Show help" << std::endl;
std::cout << std::endl;
std::cout << "TEMPLATES:" << std::endl;
for(auto x : e.List())
std::cout << " " << x << std::endl;
std::cout << std::endl;
return 0;
}
for(auto name : args)
std::cout << e.Emit(name) << std::endl;
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include <sstream>
#include <iterator>
#include <numeric>
#include "ck/host/types.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Problem
{
std::size_t M = 0;
std::size_t N = 0;
std::size_t K = 0;
bool TransA = false;
bool TransB = false;
bool TransE = false;
std::vector<bool> DsTrans = {};
DataType ADataType = DataType::Half;
DataType BDataType = DataType::Half;
DataType EDataType = DataType::Half;
std::vector<DataType> DsDataType = {};
std::string AElementOp = "ck::tensor_operation::element_wise::PassThrough";
std::string BElementOp = "ck::tensor_operation::element_wise::PassThrough";
std::string CDEElementOp = "ck::Tuple<>";
std::string GetIncludeHeader() const;
std::vector<Solution> GetSolutions(const std::string& arch) const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
#include "ck/host/operation/gemm.hpp"
#include "ck/host/device_gemm_multiple_d/problem.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Operation_Xdl_CShuffle
{
static std::vector<std::vector<Operation_Xdl_CShuffle>> CreateOperations();
static std::vector<Operation_Xdl_CShuffle> CreateOperations(const Problem& prob);
TensorDesc A{};
TensorDesc B{};
DataType acc = DataType::Float;
DataType cs_type = DataType::Half;
std::vector<TensorDesc> Ds = {};
TensorDesc E{};
std::string a_elem_op = PassThrough;
std::string b_elem_op = PassThrough;
std::string cde_elem_op = Bilinear;
std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
operation::TileDesc tile_desc{};
operation::BlockTransferDesc a_block_transfer{};
operation::BlockTransferDesc b_block_transfer{};
operation::CShuffleDesc cshuffle{};
operation::CBlockTransferDesc c_block_transfer{};
Solution ToSolution() const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <string>
#include "ck/host/types.hpp"
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
struct Problem
{
std::size_t M = 0;
std::size_t N = 0;
std::size_t K = 0;
bool TransA = false;
bool TransB = false;
bool TransE = false;
std::vector<bool> DsTrans = {};
DataType ADataType = DataType::Half;
DataType BDataType = DataType::Half;
DataType EDataType = DataType::Half;
std::vector<DataType> DsDataType = {};
std::string AElementOp = PassThrough;
std::string BElementOp = PassThrough;
std::string CDEElementOp = PassThrough;
std::string GetIncludeHeader() const;
std::vector<Solution> GetSolutions(const std::string& arch) const;
};
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <string_view>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
std::unordered_map<std::string_view, std::string_view> GetHeaders();
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck {
namespace host {
namespace operation {
struct TileDesc
{
int block_size = 0;
int m_per_block = 0;
int n_per_block = 0;
int k_per_block = 0;
int ak1 = 0;
int bk1 = 0;
int m_per_XDL = 0;
int n_per_XDL = 0;
int m_Xdl_per_wave = 0;
int n_Xdl_per_wave = 0;
int num_gemmk_prefetch_stage = 0;
};
struct BlockTransferDesc
{
std::string thread_cluster_length = "";
std::string thread_cluster_arrange_order = "";
std::string src_access_order = "";
int src_vec_dim = 0;
int src_scalar_per_vector = 0;
int dst_scalar_per_vector_k1 = 0;
int lds_add_extra_dim = 0;
};
struct CShuffleDesc
{
int m_Xdl_per_wave_per_shuffle = 0;
int n_Xdl_per_wave_per_shuffle = 0;
};
struct CBlockTransferDesc
{
std::string cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl = "";
int scalar_per_vector_n_wave_n_per_Xdl = 0;
};
} // namespace operation
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <cassert>
#include <numeric>
#include <string>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
template <class F>
std::string trim(const std::string& s, F f)
{
auto start = std::find_if_not(s.begin(), s.end(), f);
auto last = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(start), f).base();
return {start, last};
}
inline std::string trim(const std::string& s)
{
return trim(s, [](unsigned char c) { return std::isspace(c); });
}
template <class Strings>
inline std::string JoinStrings(Strings strings, const std::string& delim)
{
auto it = strings.begin();
if(it == strings.end())
return "";
auto nit = std::next(it);
return std::accumulate(nit, strings.end(), *it, [&](std::string x, std::string y) {
return std::move(x) + delim + std::move(y);
});
}
template <class F>
inline std::string
InterpolateString(const std::string& input, F f, std::string start = "${", std::string end = "}")
{
std::string result = "";
result.reserve(input.size());
auto it = input.begin();
while(it != input.end())
{
auto next_start = std::search(it, input.end(), start.begin(), start.end());
auto next_end = std::search(next_start, input.end(), end.begin(), end.end());
result.append(it, next_start);
if(next_start == input.end())
break;
if(next_end == input.end())
{
throw std::runtime_error("Unbalanced brackets");
}
auto r = f(next_start + start.size(), next_end);
result.append(r.begin(), r.end());
it = next_end + end.size();
}
return result;
}
inline std::string InterpolateString(const std::string& input,
const std::unordered_map<std::string, std::string>& vars,
std::string start = "${",
std::string end = "}")
{
return InterpolateString(
input,
[&](auto start_it, auto last_it) {
auto key = trim({start_it, last_it});
auto it = vars.find(key);
if(it == vars.end())
throw std::runtime_error("Unknown key: " + key);
return it->second;
},
std::move(start),
std::move(end));
}
template <class Range, class F>
inline auto Transform(const Range& r, F f) -> std::vector<decltype(f(*r.begin()))>
{
std::vector<decltype(f(*r.begin()))> result;
std::transform(r.begin(), r.end(), std::back_inserter(result), f);
return result;
}
template <class Range1, class Range2, class F>
inline auto Transform(const Range1& r1, const Range2& r2, F f)
-> std::vector<decltype(f(*r1.begin(), *r2.begin()))>
{
std::vector<decltype(f(*r1.begin(), *r2.begin()))> result;
assert(std::distance(r1.begin(), r1.end()) == std::distance(r2.begin(), r2.end()));
std::transform(r1.begin(), r1.end(), r2.begin(), std::back_inserter(result), f);
return result;
}
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <sstream>
#include <utility>
#include <unordered_map>
#include <vector>
namespace ck {
namespace host {
struct Solution
{
Solution() = default;
Solution(std::string str, std::unordered_map<std::string, std::string> values);
std::string ToTemplateString() const;
std::string GetTemplateParameter(const std::string& name) const;
template <class T>
T GetTemplateParameter(const std::string& name) const
{
T result;
std::stringstream ss(GetTemplateParameter(name));
ss >> result;
return result;
}
private:
std::string template_str;
std::unordered_map<std::string, std::string> template_values;
};
enum class DataType
{
Half,
Float,
Int8,
Int32
};
std::string ToString(DataType dt);
enum class Layout
{
Row,
Column
};
std::string ToString(Layout dl);
enum class GemmType
{
Default
};
std::string ToString(GemmType gt);
struct TensorDesc
{
DataType element;
Layout layout;
};
std::string SequenceStr(const std::vector<int>& v);
std::string MakeTuple(const std::vector<std::string>& v);
template <int... xs>
const std::string S = SequenceStr({xs...});
constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
constexpr const char* Bilinear = "ck::tensor_operation::element_wise::Bilinear";
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdint>
#include <unordered_set>
namespace ck {
namespace host {
std::size_t integer_divide_ceil(std::size_t x, std::size_t y);
const std::unordered_set<std::string>& get_xdlop_archs();
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/host/device_gemm_multiple_d/problem.hpp"
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/utils.hpp"
#include <algorithm>
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
std::string Problem::GetIncludeHeader() const
{
return "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp";
}
std::vector<Solution> Problem::GetSolutions(const std::string& arch) const
{
if(get_xdlop_archs().count(arch) == 0)
return {};
auto ops = ck::host::device_gemm_multiple_d::Operation_Xdl_CShuffle::CreateOperations(*this);
std::vector<Solution> result;
std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
return op.ToSolution();
});
return result;
}
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
\ No newline at end of file
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include <cassert>
namespace ck {
namespace host {
namespace device_gemm_multiple_d {
static std::string GetGemmSpec(const std::size_t m,
const std::size_t n,
const std::size_t k,
const std::size_t m_per_block,
const std::size_t n_per_block,
const std::size_t k_per_block)
{
std::string spec = "";
if(integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
spec += "M";
if(integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
spec += "N";
if(integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
spec += "K";
if(spec == "")
return "ck::tensor_operation::device::GemmSpecialization::Default";
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }
std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(const Problem& prob)
{
std::vector<Operation_Xdl_CShuffle> result;
std::vector<operation::TileDesc> tile_descriptions = {
// clang-format off
// Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| NumGemmK|
// Size| Block| Block| Block| | | XDL| XDL| Per| Per| Prefetch|
// | | | | | | | | Wave| Wave| Stage|
// | | | | | | | | | | |
{ 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, 1},
{ 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, 1},
{ 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, 1},
{ 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, 1},
{ 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, 1},
{ 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, 1},
{ 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, 1},
{ 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, 1},
// clang-format on
};
std::vector<operation::BlockTransferDesc> a_block_descriptions_rowmajor = {
// clang-format off
// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
// | | | | | | |
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
// clang-format on
};
std::vector<operation::BlockTransferDesc> a_block_descriptions_colmajor = {
// clang-format off
// ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM|
// Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
// | | | | | | |
// clang-format on
{S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
};
std::vector<operation::BlockTransferDesc> b_block_descriptions_rowmajor = {
// clang-format off
// BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
// | | | | | | |
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
{ S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
{ S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
// clang-format on
};
std::vector<operation::BlockTransferDesc> b_block_descriptions_colmajor = {
// clang-format off
// BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|
// ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|
// Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| |
// | | | | | | |
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
{ S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1},
// clang-format on
};
std::vector<operation::CShuffleDesc> cshuffle_descriptions = {
// clang-format off
// CShuffle| CShuffle|
// MXdlPerWave| NXdlPerWave|
// PerShuffle| PerShuffle|
// | |
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
{ 1, 1},
// clang-format on
};
std::vector<operation::CBlockTransferDesc> c_block_descriptions = {
// clang-format off
// CBlockTransferClusterLengths| CBlockTransfer
// _MBlock_MWaveMPerXdl| ScalarPerVector
// _NBlock_NWaveNPerXdl| _NWaveNPerXdl
// |
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 16, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 4>, 8},
{ S<1, 16, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
{ S<1, 32, 1, 8>, 8},
// clang-format on
};
const auto a_block_descriptions =
prob.TransA ? a_block_descriptions_colmajor : a_block_descriptions_rowmajor;
const auto b_block_descriptions =
prob.TransB ? b_block_descriptions_colmajor : b_block_descriptions_rowmajor;
assert(tile_descriptions.size() == a_block_descriptions.size());
assert(tile_descriptions.size() == b_block_descriptions.size());
assert(tile_descriptions.size() == cshuffle_descriptions.size());
assert(tile_descriptions.size() == c_block_descriptions.size());
for(std::size_t i = 0; i < tile_descriptions.size(); i++)
{
Operation_Xdl_CShuffle x;
x.tile_desc = tile_descriptions[i];
x.a_block_transfer = a_block_descriptions[i];
x.b_block_transfer = b_block_descriptions[i];
x.cshuffle = cshuffle_descriptions[i];
x.c_block_transfer = c_block_descriptions[i];
x.A = TensorDesc{prob.ADataType, ToLayout(prob.TransA)};
x.B = TensorDesc{prob.BDataType, ToLayout(prob.TransB)};
x.E = TensorDesc{prob.EDataType, ToLayout(prob.TransE)};
x.Ds = Transform(prob.DsTrans, prob.DsDataType, [](auto trans, auto dt) {
return TensorDesc{dt, ToLayout(trans)};
});
x.a_elem_op = prob.AElementOp;
x.b_elem_op = prob.BElementOp;
x.cde_elem_op = prob.CDEElementOp;
x.gemm_specialization = GetGemmSpec(prob.M,
prob.N,
prob.K,
x.tile_desc.m_per_block,
x.tile_desc.n_per_block,
x.tile_desc.k_per_block);
result.push_back(x);
}
return result;
}
std::vector<std::vector<Operation_Xdl_CShuffle>> Operation_Xdl_CShuffle::CreateOperations()
{
std::vector<Problem> problems;
for(bool TransA : {true, false})
for(bool TransB : {true, false})
{
Problem prob;
prob.TransA = TransA;
prob.TransB = TransB;
problems.push_back(prob);
}
return Transform(problems, [](const Problem& p) { return CreateOperations(p); });
}
static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate =
"ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle<${LayoutA}, ${LayoutB}, "
"${LayoutDs}, ${LayoutE}, ${ADataType}, ${BDataType}, ${AccDataType}, ${CShuffleDataType}, "
"${DsDataType}, ${EDataType}, ${AElementwiseOperation}, ${BElementwiseOperation}, "
"${CDEElementwiseOperation}, ${GemmSpecialization}, ${NumGemmkPrefetchStage}, ${BlockSize}, "
"${MPerBlock}, ${NPerBlock}, ${KPerBlock}, ${AK1}, ${BK1}, ${MPerXDL}, ${NPerXDL}, "
"${MXdlPerWave}, ${NXdlPerWave}, ${ABlockTransferThreadClusterLengths_AK0_M_AK1}, "
"${ABlockTransferThreadClusterArrangeOrder}, ${ABlockTransferSrcAccessOrder}, "
"${ABlockTransferSrcVectorDim}, ${ABlockTransferSrcScalarPerVector}, "
"${ABlockTransferDstScalarPerVector_AK1}, ${ABlockLdsExtraM}, "
"${BBlockTransferThreadClusterLengths_BK0_N_BK1}, ${BBlockTransferThreadClusterArrangeOrder}, "
"${BBlockTransferSrcAccessOrder}, ${BBlockTransferSrcVectorDim}, "
"${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, "
"${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, "
"${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, "
"${CDEBlockTransferScalarPerVector_NPerBlock}>";
Solution Operation_Xdl_CShuffle::ToSolution() const
{
std::unordered_map<std::string, std::string> values = {
{"LayoutA", ToString(this->A.layout)},
{"LayoutB", ToString(this->B.layout)},
{"LayoutDs",
MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.layout); }))},
{"LayoutE", ToString(this->E.layout)},
{"ADataType", ToString(this->A.element)},
{"BDataType", ToString(this->B.element)},
{"AccDataType", ToString(this->acc)},
{"CShuffleDataType", ToString(this->cs_type)},
{"DsDataType",
MakeTuple(Transform(this->Ds, [](auto tensor) { return ToString(tensor.element); }))},
{"EDataType", ToString(this->E.element)},
{"AElementwiseOperation", this->a_elem_op},
{"BElementwiseOperation", this->b_elem_op},
{"CDEElementwiseOperation", this->cde_elem_op},
{"GemmSpecialization", this->gemm_specialization},
{"NumGemmkPrefetchStage", std::to_string(this->tile_desc.num_gemmk_prefetch_stage)},
{"BlockSize", std::to_string(this->tile_desc.block_size)},
{"MPerBlock", std::to_string(this->tile_desc.m_per_block)},
{"NPerBlock", std::to_string(this->tile_desc.n_per_block)},
{"KPerBlock", std::to_string(this->tile_desc.k_per_block)},
{"AK1", std::to_string(this->tile_desc.ak1)},
{"BK1", std::to_string(this->tile_desc.bk1)},
{"MPerXDL", std::to_string(this->tile_desc.m_per_XDL)},
{"NPerXDL", std::to_string(this->tile_desc.n_per_XDL)},
{"MXdlPerWave", std::to_string(this->tile_desc.m_Xdl_per_wave)},
{"NXdlPerWave", std::to_string(this->tile_desc.n_Xdl_per_wave)},
{"ABlockTransferThreadClusterLengths_AK0_M_AK1",
this->a_block_transfer.thread_cluster_length},
{"ABlockTransferThreadClusterArrangeOrder",
this->a_block_transfer.thread_cluster_arrange_order},
{"ABlockTransferSrcAccessOrder", this->a_block_transfer.src_access_order},
{"ABlockTransferSrcVectorDim", std::to_string(this->a_block_transfer.src_vec_dim)},
{"ABlockTransferSrcScalarPerVector",
std::to_string(this->a_block_transfer.src_scalar_per_vector)},
{"ABlockTransferDstScalarPerVector_AK1",
std::to_string(this->a_block_transfer.dst_scalar_per_vector_k1)},
{"ABlockLdsExtraM", std::to_string(this->a_block_transfer.lds_add_extra_dim)},
{"BBlockTransferThreadClusterLengths_BK0_N_BK1",
this->b_block_transfer.thread_cluster_length},
{"BBlockTransferThreadClusterArrangeOrder",
this->b_block_transfer.thread_cluster_arrange_order},
{"BBlockTransferSrcAccessOrder", this->b_block_transfer.src_access_order},
{"BBlockTransferSrcVectorDim", std::to_string(this->b_block_transfer.src_vec_dim)},
{"BBlockTransferSrcScalarPerVector",
std::to_string(this->b_block_transfer.src_scalar_per_vector)},
{"BBlockTransferDstScalarPerVector_BK1",
std::to_string(this->b_block_transfer.dst_scalar_per_vector_k1)},
{"BBlockLdsExtraN", std::to_string(this->b_block_transfer.lds_add_extra_dim)},
{"CShuffleMXdlPerWavePerShuffle",
std::to_string(this->cshuffle.m_Xdl_per_wave_per_shuffle)},
{"CShuffleNXdlPerWavePerShuffle",
std::to_string(this->cshuffle.n_Xdl_per_wave_per_shuffle)},
{"CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock",
this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
{"CDEBlockTransferScalarPerVector_NPerBlock",
std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
};
return Solution{InterpolateString(DeviceGemmMultipleD_Xdl_CShuffleTemplate, values),
std::move(values)};
}
} // namespace device_gemm_multiple_d
} // namespace host
} // namespace ck
#include "ck/host/headers.hpp"
#include "ck_headers.hpp"
namespace ck {
namespace host {
const std::string config_header = "";
std::unordered_map<std::string_view, std::string_view> GetHeaders()
{
auto headers = ck_headers();
headers.insert(std::make_pair("ck/config.h", config_header));
return headers;
}
} // namespace host
} // namespace ck
\ No newline at end of file
#include "ck/host/types.hpp"
#include "ck/host/stringutils.hpp"
#include <algorithm>
#include <stdexcept>
namespace ck {
namespace host {
Solution::Solution(std::string str, std::unordered_map<std::string, std::string> values)
: template_str(std::move(str)), template_values(std::move(values))
{
}
std::string Solution::ToTemplateString() const { return this->template_str; }
std::string Solution::GetTemplateParameter(const std::string& name) const
{
return this->template_values.at(name);
}
std::string ToString(DataType dt)
{
switch(dt)
{
case DataType::Float: return "float";
case DataType::Half: return "ck::half_t";
case DataType::Int8: return "int8_t";
case DataType::Int32: return "int32_t";
}
throw std::runtime_error("Incorrect data type");
}
std::string ToString(Layout dl)
{
switch(dl)
{
case Layout::Row: return "ck::tensor_layout::gemm::RowMajor";
case Layout::Column: return "ck::tensor_layout::gemm::ColumnMajor";
}
throw std::runtime_error("Incorrect layout");
}
std::string ToString(GemmType gt)
{
switch(gt)
{
case GemmType::Default: return "ck::tensor_operation::device::GemmSpecialization::Default";
}
throw std::runtime_error("Incorrect gemm type");
}
std::string SequenceStr(const std::vector<int>& v)
{
return "ck::Sequence<" +
JoinStrings(Transform(v, [](int x) { return std::to_string(x); }), ", ") + ">";
}
std::string MakeTuple(const std::vector<std::string>& v)
{
return "ck::Tuple<" + JoinStrings(v, ", ") + ">";
}
} // namespace host
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/host/utils.hpp"
namespace ck {
namespace host {
std::size_t integer_divide_ceil(std::size_t x, std::size_t y)
{
return (x + y - std::size_t{1}) / y;
}
const std::unordered_set<std::string>& get_xdlop_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx90a", "gfx908", "gfx940", "gfx942"};
return supported_archs;
}
} // namespace host
} // namespace ck
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
add_subdirectory(rtc)
file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp)
foreach(TEST_SRC ${TEST_SRCS})
get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE)
rocm_add_test_executable(test_host_${BASE_NAME} ${TEST_SRC})
target_link_libraries(test_host_${BASE_NAME} ck_rtc ck_host)
target_include_directories(test_host_${BASE_NAME} PUBLIC include())
endforeach()
#include "ck/host/device_gemm_multiple_d/problem.hpp"
#include "ck/host/device_gemm_multiple_d/operation.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <random>
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
using half = _Float16;
// using half = __fp16;
std::vector<rtc::src_file> get_headers_for_test()
{
std::vector<rtc::src_file> result;
auto hs = ck::host::GetHeaders();
std::transform(
hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
return {p.first, p.second};
});
return result;
}
template <class T>
rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
{
rtc::buffer<T> result(n);
std::mt19937 gen(seed);
std::uniform_real_distribution<double> dis(-1.0);
std::generate(result.begin(), result.end(), [&] { return dis(gen); });
return result;
}
template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
return fabs(x - y) < atol + rtol * fabs(y);
});
}
std::string classify(double x)
{
switch(std::fpclassify(x))
{
case FP_INFINITE: return "inf";
case FP_NAN: return "nan";
case FP_NORMAL: return "normal";
case FP_SUBNORMAL: return "subnormal";
case FP_ZERO: return "zero";
default: return "unknown";
}
}
template <class Buffer>
void print_classification(const Buffer& x)
{
std::unordered_set<std::string> result;
for(const auto& i : x)
result.insert(classify(i));
for(const auto& c : result)
std::cout << c << ", ";
std::cout << std::endl;
}
template <class Buffer>
void print_statistics(const Buffer& x)
{
std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
double num_elements = x.size();
auto mean =
std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
auto stddev = std::sqrt(
std::accumulate(x.begin(),
x.end(),
double{0.0},
[&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
num_elements);
std::cout << "Mean: " << mean << ", ";
std::cout << "StdDev: " << stddev << "\n";
}
template <class Buffer>
void print_preview(const Buffer& x)
{
if(x.size() <= 10)
{
std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; });
}
else
{
std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; });
std::cout << "..., ";
std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; });
}
std::cout << std::endl;
}
template <class T>
struct check_all
{
rtc::buffer<T> data{};
bool operator()(const rtc::buffer<T>& x)
{
if(data.empty())
{
data = x;
return true;
}
if(std::any_of(x.begin(), x.end(), [](double y) { return std::isnan(y); }))
return false;
return allclose(data, x);
}
};
template <class Solution>
auto report(const Solution& solution, bool pass)
{
return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
}
const std::string gemm_compile_check = R"__ck__(
#include <${include}>
extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
using G = ${template};
constexpr auto desc = ${template}::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
ck::make_tuple(),
ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));
static_assert(desc.IsValid(), "Invalid ck gemm.");
if constexpr(desc.IsValid())
{
${template}::Run(desc,
a,
b,
ck::make_tuple(),
c);
}
}
)__ck__";
TEST_CASE(test_problem_kernel)
{
ck::host::device_gemm_multiple_d::Problem prob;
prob.M = 1024;
prob.N = 1024;
prob.K = 1024;
check_all<half> check;
auto a = to_gpu(generate_buffer<half>(1024 * 1024, 0));
auto b = to_gpu(generate_buffer<half>(1024 * 1024, 1));
auto c = to_gpu(generate_buffer<half>(1024 * 1024, 2));
for(auto solution : prob.GetSolutions("gfx90a"))
{
auto src = ck::host::InterpolateString(gemm_compile_check,
{{"include", prob.GetIncludeHeader()},
{"template", solution.ToTemplateString()},
{"m", std::to_string(prob.M)},
{"n", std::to_string(prob.N)},
{"k", std::to_string(prob.K)}});
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
options.kernel_name = "f";
auto k = rtc::compile_kernel(srcs, options);
auto block_size = solution.GetTemplateParameter<std::size_t>("BlockSize");
auto m_per_block = solution.GetTemplateParameter<std::size_t>("MPerBlock");
auto n_per_block = solution.GetTemplateParameter<std::size_t>("NPerBlock");
auto grid_size = ck::host::integer_divide_ceil(prob.M, m_per_block) *
ck::host::integer_divide_ceil(prob.N, n_per_block);
k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());
CHECK(report(solution, check(rtc::from_gpu(c))));
}
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment