Commit 17acbbf4 authored by Alan Turner's avatar Alan Turner
Browse files

Add jit library

parent bef0cb20
......@@ -51,3 +51,6 @@ install.dir*
# directories containing generated documentation
docs/source/_build/
docs/docBin/
# Generated source
library/src/jit_library/solution_instances/
......@@ -17,23 +17,12 @@ include(ROCMInstallSymlinks)
include(ROCMCreatePackage)
include(CheckCXXCompilerFlag)
option(CK_BUILD_JIT_LIB, "Only build the CK JIT Helper Library" OFF)
rocm_setup_version(VERSION 0.2.0)
include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
add_compile_options(-Wno-bit-int-extension)
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif()
## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)
## C++
enable_language(CXX)
set(CMAKE_CXX_STANDARD 17)
......@@ -41,50 +30,65 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
else()
find_package(OpenMP REQUIRED)
endif()
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables.
set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
endif()
if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
endif()
if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
if (NOT CK_BUILD_JIT_LIB)
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
if(USE_BITINT_EXTENSION_INT4)
add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
add_compile_options(-Wno-bit-int-extension)
message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}")
endif()
## Threads
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
link_libraries(Threads::Threads)
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
else()
find_package(OpenMP REQUIRED)
endif()
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables.
set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}")
set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}")
set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}")
if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR )
set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}")
message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}")
endif()
if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR )
set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}")
message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}")
endif()
if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}")
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
link_libraries(hip::device)
add_compile_definitions(__HIP_PLATFORM_HCC__=1)
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
link_libraries(hip::device)
add_compile_definitions(__HIP_PLATFORM_HCC__=1)
## tidy
include(EnableCompilerWarnings)
......@@ -196,6 +200,7 @@ enable_clang_tidy(
-DCK_USE_CLANG_TIDY
)
include(CppCheck)
enable_cppcheck(
CHECKS
......@@ -231,51 +236,60 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
${HIP_INCLUDE_DIRS}
)
if (NOT CK_BUILD_JIT_LIB)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror)
add_compile_options(-Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
)
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
else()
add_subdirectory(library)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
add_compile_options(-Werror)
add_compile_options(-Weverything)
rocm_package_setup_component(jit_library
LIBRARY_NAME composablekernel
PACKAGE_NAME jit_library
)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list})
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
ENDIF()
ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name
)
rocm_package_setup_component(examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)
#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)
......
@PACKAGE_INIT@
set(_composable_kernel_supported_components device_operations utility)
set(_composable_kernel_supported_components device_operations utility jit_library)
foreach(_comp ${composable_kernel_FIND_COMPONENTS})
if(NOT _comp IN_LIST _composable_kernel_supported_components)
......
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
find_program(EMBED_LD ld)
find_program(EMBED_OBJCOPY objcopy)
function(generate_embed_source EMBED_NAME)
set(options)
set(oneValueArgs SRC HEADER RELATIVE)
set(multiValueArgs OBJECTS SYMBOLS)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(EXTERNS)
set(INIT_KERNELS)
list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
list(LENGTH PARSE_OBJECTS OBJECTS_LEN)
if(NOT ${SYMBOLS_LEN} EQUAL ${OBJECTS_LEN})
message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${OBJECTS_LEN}")
endif()
math(EXPR LEN "${SYMBOLS_LEN} - 1")
foreach(idx RANGE ${LEN})
list(GET PARSE_SYMBOLS ${idx} SYMBOL)
list(GET PARSE_OBJECTS ${idx} OBJECT)
set(START_SYMBOL "_binary_${SYMBOL}_start")
set(END_SYMBOL "_binary_${SYMBOL}_end")
string(APPEND EXTERNS "
extern const char ${START_SYMBOL}[];
extern const char ${END_SYMBOL}[];
")
file(RELATIVE_PATH BASE_NAME ${PARSE_RELATIVE} "${OBJECT}")
string(REGEX REPLACE ".[A-Za-z0-9_]$" "" BASE_NAME ${BASE_NAME})
string(APPEND INIT_KERNELS "
{ \"${BASE_NAME}\", { ${START_SYMBOL}, ${END_SYMBOL}} },
")
endforeach()
file(WRITE "${PARSE_HEADER}" "
#include <unordered_map>
#include <string>
#include <utility>
const std::unordered_map<std::string, std::pair<const char*,const char*>>& ${EMBED_NAME}();
")
file(WRITE "${PARSE_SRC}" "
#include <${EMBED_NAME}.hpp>
${EXTERNS}
const std::unordered_map<std::string, std::pair<const char*,const char*>>& ${EMBED_NAME}()
{
static const std::unordered_map<std::string, std::pair<const char*,const char*>> result = {${INIT_KERNELS}};
return result;
}
")
endfunction()
function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
set(WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
# Glob is used to compute the relative path
file(GLOB FILES RELATIVE ${WORKING_DIRECTORY} ${FILE})
foreach(REL_FILE ${FILES})
string(MAKE_C_IDENTIFIER "${REL_FILE}" SYMBOL)
get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
set(${OUTPUT_SYMBOL} ${SYMBOL} PARENT_SCOPE)
set(${OUTPUT_FILE} "${OUT_FILE}" PARENT_SCOPE)
add_custom_command(
OUTPUT "${OUT_FILE}"
COMMAND ${EMBED_LD} -r -o "${OUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUT_FILE}"
WORKING_DIRECTORY ${WORKING_DIRECTORY}
DEPENDS ${FILE}
VERBATIM
)
endforeach()
endfunction()
function(add_embed_library EMBED_NAME)
set(options)
set(oneValueArgs RELATIVE)
set(multiValueArgs)
cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
set(SRC_FILE "${EMBED_DIR}/${EMBED_NAME}.cpp")
set(HEADER_FILE "${EMBED_DIR}/include/${EMBED_NAME}.hpp")
set(WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
set(OUTPUT_FILES)
set(SYMBOLS)
message(STATUS "Embedding files")
foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
embed_file(OUTPUT_FILE OUTPUT_SYMBOL ${FILE})
list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
endforeach()
message(STATUS "Generating embedding library ${EMBED_NAME}")
generate_embed_source(${EMBED_NAME} SRC ${SRC_FILE} HEADER ${HEADER_FILE} OBJECTS ${OUTPUT_FILES} SYMBOLS ${SYMBOLS} RELATIVE ${PARSE_RELATIVE})
add_library(${EMBED_NAME} STATIC ${OUTPUT_FILES} "${SRC_FILE}")
target_include_directories(${EMBED_NAME} PUBLIC "$<BUILD_INTERFACE:${EMBED_DIR}/include>")
target_compile_options(${EMBED_NAME} PRIVATE -Wno-reserved-identifier)
set_target_properties(${EMBED_NAME} PROPERTIES POSITION_INDEPENDENT_CODE On)
endfunction()
......@@ -37,6 +37,42 @@ struct BaseInvoker
virtual ~BaseInvoker() {}
};
struct BaseParameters
{
BaseParameters() = default;
BaseParameters(const BaseParameters&) = default;
BaseParameters& operator=(const BaseParameters&) = default;
virtual void SetAElementOp(const std::string&) {}
virtual void SetBElementOp(const std::string&) {}
virtual void SetCDEElementOp(const std::string&) {}
virtual void SetDsLayout(const std::string&) {}
virtual void SetDsDataType(const std::string&) {}
virtual void SetGemmSpec(const index_t, const index_t, const index_t) {}
virtual index_t GetGridSize(const index_t, const index_t)
{
return 0;
}
virtual index_t GetBlockSize()
{
return 0;
}
virtual std::string GetParametersString()
{
return "";
}
virtual ~BaseParameters() {}
};
struct BaseOperator
{
BaseOperator() = default;
......
......@@ -51,6 +51,26 @@ struct DeviceGemmMultipleD : public BaseOperator
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
virtual std::unique_ptr<BaseParameters> MakeParametersPointer()
{
return std::make_unique<BaseParameters>(BaseParameters{});
}
virtual index_t GetBlockSize() const
{
return 0;
}
virtual index_t GetMPerBlock() const
{
return 0;
}
virtual index_t GetNPerBlock() const
{
return 0;
}
};
} // namespace device
......
......@@ -690,6 +690,308 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return str.str();
}
struct Parameters : BaseParameters
{
template <class S>
static std::string GetSequenceString(S s)
{
auto str = std::stringstream();
str << "ck::Sequence<";
auto size = s.Size();
for(int i = 0; i < size; ++i)
{
str << s.At(i);
if(i < size - 1)
str << ",";
}
str << ">";
return str.str();
}
template <class T>
static std::string GetTypeString(T)
{
return "";
}
template <>
static std::string GetTypeString<float>(float)
{
return "float";
}
template <>
static std::string GetTypeString<ck::half_t>(ck::half_t)
{
return "ck::half_t";
}
template <>
static std::string
GetTypeString<tensor_layout::gemm::RowMajor>(tensor_layout::gemm::RowMajor)
{
return "ck::tensor_layout::gemm::RowMajor";
}
template <>
static std::string
GetTypeString<tensor_layout::gemm::ColumnMajor>(tensor_layout::gemm::ColumnMajor)
{
return "ck::tensor_layout::gemm::ColumnMajor";
}
template <class T>
static std::string GetTupleString(T t)
{
auto str = std::stringstream();
str << "ck::Tuple<";
static_for<0, t.Size(), 1>{}([&](auto i) {
str << GetTypeString(t.At(i));
if(i < t.Size() - 1)
str << ",";
});
str << ">";
return str.str();
}
template <>
static std::string GetTupleString<Tuple<>>(Tuple<>)
{
return "ck::Tuple<>";
}
void SetAElementOp(const std::string& s) override { a_elementwise_op = s; }
void SetBElementOp(const std::string& s) override { b_elementwise_op = s; }
void SetCDEElementOp(const std::string& s) override { cde_elementwise_op = s; }
void SetDsLayout(const std::string& s) override { ds_layout = s; }
void SetDsDataType(const std::string& s) override { ds_data_type = s; }
void SetGemmSpec(const index_t m, const index_t n, const index_t k) override
{
std::string spec = "";
if(math::integer_divide_ceil(m, MPerBlock) * MPerBlock - m != 0)
spec += "M";
if(math::integer_divide_ceil(n, NPerBlock) * NPerBlock - n != 0)
spec += "N";
if(math::integer_divide_ceil(k, KPerBlock) * KPerBlock - k != 0)
spec += "K";
if(spec == "")
gemm_spec = "ck::tensor_operation::device::GemmSpecialization::Default";
else
gemm_spec = "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
index_t GetGridSize(const index_t m, const index_t n) override
{
return math::integer_divide_ceil(m, MPerBlock) *
math::integer_divide_ceil(n, NPerBlock);
}
index_t GetBlockSize() override { return BlockSize; }
std::string GetParametersString() override
{
auto str = std::stringstream();
std::map<LoopScheduler, std::string> LoopSchedToString{
{LoopScheduler::Default, "ck::LoopScheduler::Default"},
{LoopScheduler::Interwave, "ck::LoopScheduler::Interwave"}};
std::map<PipelineVersion, std::string> PipelineVersionToString{
{PipelineVersion::v1, "ck::PipelineVersion::v1"},
{PipelineVersion::v2, "ck::PipelineVersion::v2"}};
// clang-format off
str << "ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle"
<< "<"
<< GetTypeString(ALayout{}) << ", "
<< GetTypeString(BLayout{}) << ", "
<< ds_layout << ", "
<< GetTypeString(ELayout{}) << ", "
<< GetTypeString(ADataType{}) << ", "
<< GetTypeString(BDataType{}) << ", "
<< GetTypeString(AccDataType{}) << ", "
<< GetTypeString(CShuffleDataType{}) << ", "
<< ds_data_type << ", "
<< GetTypeString(EDataType{}) << ", "
<< a_elementwise_op << ", "
<< b_elementwise_op << ", "
<< cde_elementwise_op << ", "
<< gemm_spec << ", "
<< NumGemmKPrefetchStage << ", "
<< BlockSize << ", "
<< MPerBlock << ", "
<< NPerBlock << ", "
<< KPerBlock << ", "
<< AK1 << ", "
<< BK1 << ", "
<< MPerXDL << ", "
<< NPerXDL << ", "
<< MXdlPerWave << ", "
<< NXdlPerWave << ", "
<< GetSequenceString(ABlockTransferThreadClusterLengths_AK0_M_AK1{}) << ", "
<< GetSequenceString(ABlockTransferThreadClusterArrangeOrder{}) << ", "
<< GetSequenceString(ABlockTransferSrcAccessOrder{}) << ", "
<< ABlockTransferSrcVectorDim << ", "
<< ABlockTransferSrcScalarPerVector << ", "
<< ABlockTransferDstScalarPerVector_AK1 << ", "
<< ABlockLdsExtraM << ", "
<< GetSequenceString(BBlockTransferThreadClusterLengths_BK0_N_BK1{}) << ", "
<< GetSequenceString(BBlockTransferThreadClusterArrangeOrder{}) << ", "
<< GetSequenceString(BBlockTransferSrcAccessOrder{}) << ", "
<< BBlockTransferSrcVectorDim << ", "
<< BBlockTransferSrcScalarPerVector << ", "
<< BBlockTransferDstScalarPerVector_BK1 << ", "
<< BBlockLdsExtraN << ", "
<< CShuffleMXdlPerWavePerShuffle << ", "
<< CShuffleNXdlPerWavePerShuffle << ", "
<< GetSequenceString(CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock{}) << ", "
<< CDEBlockTransferScalarPerVector_NPerBlock << ", "
<< LoopSchedToString[LoopSched] << ", "
<< PipelineVersionToString[PipelineVer]
<< ">";
// clang-format on
return str.str();
}
std::string a_elementwise_op = "ck::tensor_operation::element_wise::PassThrough";
std::string b_elementwise_op = "ck::tensor_operation::element_wise::PassThrough";
std::string cde_elementwise_op = "ck::tensor_operation::element_wise::PassThrough";
std::string ds_layout = "ck::Tuple<>";
std::string ds_data_type = "ck::Tuple<>";
std::string gemm_spec = "ck::tensor_operation::device::GemmSpecialization::" +
getGemmSpecializationString(GemmSpec);
};
std::unique_ptr<BaseParameters> MakeParametersPointer() override
{
return std::make_unique<Parameters>(Parameters{});
}
index_t GetBlockSize() const override { return BlockSize; }
index_t GetMPerBlock() const override { return MPerBlock; }
index_t GetNPerBlock() const override { return NPerBlock; }
template <class ADesc, class BDesc, class DsDesc, class EDesc>
struct Descriptor
{
static constexpr auto ds_tuple()
{
return transform_tuples(
[&](auto d) constexpr { return DeviceOp::matrix_padder.PadCDescriptor_M_N(d); },
DsDesc{});
}
using AGridDesc_AK0_M_AK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
DeviceOp::matrix_padder.PadADescriptor_M_K(ADesc{})))>;
using BGridDesc_BK0_N_BK1 =
remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
DeviceOp::matrix_padder.PadBDescriptor_N_K(BDesc{})))>;
using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_tuple()))>;
using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<decltype(
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
using Block2ETileMap = remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(
DeviceOp::matrix_padder.PadCDescriptor_M_N(EDesc{})))>;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1;
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock;
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock;
Block2ETileMap block_2_etile_map;
bool has_main_k_block_loop = true;
bool is_valid = false;
constexpr Descriptor(ADesc a, BDesc b, DsDesc ds, EDesc e)
: a_grid_desc_ak0_m_ak1{GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(
DeviceOp::matrix_padder.PadADescriptor_M_K(a))},
b_grid_desc_bk0_n_bk1{GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
DeviceOp::matrix_padder.PadBDescriptor_N_K(b))},
ds_grid_desc_mblock_mperblock_nblock_nperblock{
GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
transform_tuples(
[&](auto d) constexpr {
return DeviceOp::matrix_padder.PadCDescriptor_M_N(d);
},
ds))},
e_grid_desc_mblock_mperblock_nblock_nperblock{
GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
DeviceOp::matrix_padder.PadCDescriptor_M_N(e))},
block_2_etile_map{GridwiseGemm::MakeDefaultBlock2ETileMap(
DeviceOp::matrix_padder.PadCDescriptor_M_N(e))},
has_main_k_block_loop{GridwiseGemm::CalculateHasMainKBlockLoop(
a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2))},
is_valid{GridwiseGemm::CheckValidity(
(DeviceOp::matrix_padder.PadADescriptor_M_K(a)),
DeviceOp::matrix_padder.PadBDescriptor_N_K(b),
transform_tuples(
[&](auto d) constexpr {
return DeviceOp::matrix_padder.PadCDescriptor_M_N(d);
},
ds),
DeviceOp::matrix_padder.PadCDescriptor_M_N(e),
block_2_etile_map)}
{
}
};
template <class ADesc, class BDesc, class DsDesc, class EDesc>
static constexpr auto make_descriptor(ADesc a, BDesc b, DsDesc ds, EDesc e)
{
return Descriptor<ADesc, BDesc, DsDesc, EDesc>(a, b, ds, e);
}
template <class Desc, class DsPointer>
__device__ static void Run(Desc desc,
const ADataType* __restrict__ p_a_grid,
const BDataType* __restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType* __restrict__ p_e_grid)
{
__shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
assert(desc.is_valid);
if(desc.has_main_k_block_loop)
{
GridwiseGemm::template Run<true>(p_a_grid,
p_b_grid,
p_ds_grid,
p_e_grid,
p_shared_block,
AElementwiseOperation{},
BElementwiseOperation{},
CDEElementwiseOperation{},
desc.a_grid_desc_ak0_m_ak1,
desc.b_grid_desc_bk0_n_bk1,
desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
desc.block_2_etile_map);
}
else
{
GridwiseGemm::template Run<false>(p_a_grid,
p_b_grid,
p_ds_grid,
p_e_grid,
p_shared_block,
AElementwiseOperation{},
BElementwiseOperation{},
CDEElementwiseOperation{},
desc.a_grid_desc_ak0_m_ak1,
desc.b_grid_desc_bk0_n_bk1,
desc.ds_grid_desc_mblock_mperblock_nblock_nperblock,
desc.e_grid_desc_mblock_mperblock_nblock_nperblock,
desc.block_2_etile_map);
}
}
};
} // namespace device
......
......@@ -4,7 +4,7 @@
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/quantization_operation.hpp"
......
......@@ -117,15 +117,15 @@ struct BlockToCTileMap_M00_N0_M01Adapt
static constexpr auto I2 = Number<2>{};
static constexpr auto I3 = Number<3>{};
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default;
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt() = default;
__host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
__host__ __device__ constexpr BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n,
index_t M01 = 8)
: M01_(M01), c_grid_desc_m_n_(c_grid_desc_m_n)
{
}
__host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
__host__ __device__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const
{
const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock);
const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock);
......@@ -203,13 +203,13 @@ struct BlockToCTileMap_M00_N0_M01Adapt
}
template <typename CTileIdx, typename CTileDim>
__host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
__host__ __device__ constexpr bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */,
const CTileDim& /* c_tile_dim */) const
{
return true; // always valid provided that user gets grid size from CalculateGridSize()
}
__host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
__host__ __device__ constexpr bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; }
private:
index_t M01_;
......
......@@ -5,6 +5,7 @@
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
#include <iostream>
namespace ck {
......
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
if (CK_BUILD_JIT_LIB)
add_subdirectory(src/jit_library)
else()
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
endif()
include(Embed)
file(GLOB_RECURSE KERNEL_FILES ${CONFIGURE_DEPENDS}
${PROJECT_SOURCE_DIR}/include/ck/*.hpp)
message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
add_embed_library(ck_headers ${KERNEL_FILES} RELATIVE ${PROJECT_SOURCE_DIR}/build/include)
execute_process(
COMMAND python3 ${CMAKE_CURRENT_SOURCE_DIR}/util/make_instance_strings.py
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../tensor_operation_instance/gpu/
)
set(JIT_LIB_SOURCE
${CMAKE_CURRENT_SOURCE_DIR}/include/device_gemm_multiple_d_xdlop_cshuffle.hpp
)
add_library(jit_library STATIC ${JIT_LIB_SOURCE})
add_library(composable_kernel::jit_library ALIAS jit_library)
set_target_properties(jit_library PROPERTIES LINKER_LANGUAGE CXX)
target_include_directories(jit_library PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
target_link_libraries(jit_library PRIVATE ck_headers)
rocm_install(
TARGETS jit_library ck_headers
EXPORT jit_libraryTargets
)
set(INCLUDE_DIRS
${PROJECT_SOURCE_DIR}/include/ck/
${PROJECT_SOURCE_DIR}/library/src/jit_library/include
${PROJECT_SOURCE_DIR}/library/src/jit_library/solution_instances
${CMAKE_CURRENT_BINARY_DIR}/embed/ck_headers/include
)
rocm_install(DIRECTORY ${INCLUDE_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
rocm_install(
EXPORT jit_libraryTargets
FILE composable_kerneljit_libraryTargets.cmake
NAMESPACE composable_kernel::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
#include <sstream>
#include <iterator>
#include <numeric>
#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp"
#include "ck/ck.hpp"
#include "ck/utility/math.hpp"
#include "ck_headers.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_gemm_multiple_d {
struct Solution
{
std::string template_str;
index_t block_size;
index_t grid_size;
Solution(std::string s, index_t b, index_t g) : template_str(s), block_size(b), grid_size(g)
{}
auto GetStr() const
{
return template_str;
}
auto GetBlockSize() const
{
return block_size;
}
auto GetGridSize() const
{
return grid_size;
}
};
std::string GetGemmSpec(const index_t m,
const index_t n,
const index_t k,
const index_t m_per_block,
const index_t n_per_block,
const index_t k_per_block)
{
std::string spec = "";
if(math::integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
spec += "M";
if(math::integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
spec += "N";
if(math::integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
spec += "K";
if(spec == "")
return "ck::tensor_operation::device::GemmSpecialization::Default";
return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
index_t GetGridSize(const index_t m,
const index_t n,
const index_t m_per_block,
const index_t n_per_block)
{
return math::integer_divide_ceil(m, m_per_block) *
math::integer_divide_ceil(n, n_per_block);
}
const std::unordered_set<std::string>& get_xdlop_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx90a"};
return supported_archs;
}
struct Problem
{
index_t M;
index_t N;
index_t K;
index_t NumDTensors;
bool TransA;
bool TransB;
bool TransCDE;
std::string ADataType;
std::string BDataType;
std::string CDEDataType;
std::string AElementOp;
std::string BElementOp;
std::string CDEElementOp;
std::string CDELayout;
static const index_t ds_layout_idx = 3;
static const index_t ds_data_type_idx = 9;
static const index_t a_elementwise_op_idx = 11;
static const index_t b_elementwise_op_idx = 12;
static const index_t ds_elementwise_op_idx = 13;
static const index_t gemm_spec_idx = 14;
static const index_t block_size_idx = 16;
static const index_t m_per_block_idx = 17;
static const index_t n_per_block_idx = 18;
static const index_t k_per_block_idx = 19;
auto GetInstances(const std::string& arch) const
{
std::vector<std::string> instances;
if (get_xdlop_archs().find(arch) != get_xdlop_archs().end())
{
instance::gemm_add_add_fastgelu_instances all_instances{};
if(TransA and TransB)
instances = all_instances.get_col_col_instances();
else if(TransA and not TransB)
instances = all_instances.get_col_row_instances();
else if(not TransA and not TransB)
instances = all_instances.get_row_row_instances();
else
instances = all_instances.get_row_col_instances();
}
return instances;
}
auto GetHeaders() const
{
return ck_headers();
}
auto GetIncludeHeader() const
{
return instance::gemm_add_add_fastgelu_instances{}.get_include_header();
}
Problem(index_t m,
index_t n,
index_t k,
index_t numDTensors,
bool transA,
bool transB,
bool transCDE,
std::string aDataType,
std::string bDataType,
std::string cdeDataType,
std::string aElementOp,
std::string bElementOp,
std::string cdeElementOp,
std::string cdeLayout)
: M(m),
N(n),
K(k),
NumDTensors(numDTensors),
TransA(transA),
TransB(transB),
TransCDE(transCDE),
ADataType(aDataType),
BDataType(bDataType),
CDEDataType(cdeDataType),
AElementOp(aElementOp),
BElementOp(bElementOp),
CDEElementOp(cdeElementOp),
CDELayout(cdeLayout)
{
}
auto MakeSolution(index_t idx, const std::string& arch) const
{
auto template_str = GetInstances(arch).at(idx);
std::istringstream iss(template_str);
std::vector<std::string> params(std::istream_iterator<std::string>{iss},
std::istream_iterator<std::string>());
params[a_elementwise_op_idx] = AElementOp;
params[b_elementwise_op_idx] = BElementOp;
params[ds_layout_idx] = CDELayout;
params[ds_data_type_idx] = CDEDataType;
params[ds_elementwise_op_idx] = CDEElementOp;
auto block_size_str = params[block_size_idx];
auto m_per_block_str = params[m_per_block_idx];
auto n_per_block_str = params[n_per_block_idx];
auto k_per_block_str = params[k_per_block_idx];
const auto block_size = std::stoi(block_size_str);
const auto m_per_block = std::stoi(m_per_block_str);
const auto n_per_block = std::stoi(n_per_block_str);
const auto k_per_block = std::stoi(k_per_block_str);
const auto grid_size = GetGridSize(M, N, m_per_block, n_per_block);
params[gemm_spec_idx] = GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block);
std::string str = std::accumulate(params.begin() + 1, params.end(), std::string{},
[](const std::string& a, const std::string& b) {
return a.empty() ? b : a + ", " + b;
});
str = params.front() + "< " + str + ">";
return Solution{str, block_size, grid_size};
}
auto GetSolutions(const std::string& arch) const
{
std::vector<Solution> solutions;
const auto num_instances = GetInstances(arch).size();
for (auto i = 0; i < num_instances; ++i)
{
solutions.push_back(MakeSolution(i, arch));
}
return solutions;
}
};
} // namespace device_gemm_multiple_d
} // namespace device
} // namespace tensor_operation
} // namespace ck
import argparse, re, json, os
out_file = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
namespace ck {{
namespace tensor_operation {{
namespace device {{
namespace instance {{
struct {op_name}_instances
{{
static inline std::vector<std::string> {col_row_name} =
{{
{col_row_instances}
}};
static inline std::vector<std::string> {col_col_name} =
{{
{col_col_instances}
}};
static inline std::vector<std::string> {row_row_name} =
{{
{row_row_instances}
}};
static inline std::vector<std::string> {row_col_name} =
{{
{row_col_instances}
}};
static auto get_col_row_instances()
{{
return {col_row_name};
}}
static auto get_col_col_instances()
{{
return {col_col_name};
}}
static auto get_row_row_instances()
{{
return {row_row_name};
}}
static auto get_row_col_instances()
{{
return {row_col_name};
}}
static auto get_include_header()
{{
return "{include_header}";
}}
}};
}} // namespace instance
}} // namespace device
}} // namespace tensor_operation
}} // namespace ck
"""
def strip_sequences(str):
matches = re.findall(r'S<\d+(?:,\s*\d+)*>', str)
for match in matches:
str = str.replace(match, match.replace(' ', ''))
str = str.replace('S<', "ck::Sequence<")
return str
def remove_commas_and_brackets(string):
regex_matches = re.findall(r'ck::Sequence<.*?>', string)
for match in regex_matches:
string = string.replace(match, match.replace(',', '|').replace('<', '%').replace('>', '$'))
string = string.replace(',', '').replace('<', '').replace('>', '')
for match in regex_matches:
string = string.replace(match.replace(',', '|').replace('<', '%').replace('>', '$'), match)
return string
def parse_instances(source):
out_dir = os.path.join(source, "../../../src/jit_library/solution_instances")
aliases = {"F16_F16_Tuple": "ck::Tuple<F16,F16>",
"Row_Row_Tuple": "ck::Tuple<Row,Row>",
"LoopScheduler": "ck::LoopScheduler",
"PipelineVersion": "ck::PipelineVersion",
"Row": "ck::tensor_layout::gemm::RowMajor",
"Col": "ck::tensor_layout::gemm::ColumnMajor",
"F16": "ck::half_t",
"F32": "float"}
device_ops = {"gemm_add_add_fastgelu": "DeviceGemmMultipleD_Xdl_CShuffle",
#"batched_gemm_softmax_gemm": "DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle"
}
for root_, dirs_, files_ in os.walk(source):
for dir in dirs_:
op_name = os.path.split(dir)[-1]
if op_name not in device_ops:
continue
col_row_name = ""
col_col_name = ""
row_row_name = ""
row_col_name = ""
row_row_instances = []
col_row_instances = []
row_col_instances = []
col_col_instances = []
for root, dirs, files in os.walk(os.path.join(root_, dir)):
for file in files:
if not file.endswith(".cpp"):
continue;
file_name = os.path.split(file)[-1]
is_row_row = bool(re.search(".*mk.*kn.*", file_name))
is_col_row = bool(re.search(".*km.*kn.*", file_name))
is_row_col = bool(re.search(".*mk.*nk.*", file_name))
is_col_col = bool(re.search(".*km.*nk.*", file_name))
if is_row_row:
row_row_name = file_name[:-4]
if is_col_row:
col_row_name = file_name[:-4]
if is_row_col:
row_col_name = file_name[:-4]
if is_col_col:
col_col_name = file_name[:-4]
instances_list = []
template_name = device_ops[op_name]
include_header = ""
with open(os.path.join(root, file)) as f:
for line in f:
if "impl" in line:
include_header = line.replace("#include \"", "").replace("\"", "").replace("\n", "")
elif template_name in line:
# Turn all whitespace into single spaces
new_line = " ".join(line.split())
# Remove whitespace from S<*>
new_line = strip_sequences(new_line)
new_line = remove_commas_and_brackets(new_line)
last_char = "\n"
if new_line[-1] == ",":
last_char = ",\n"
new_line = new_line[:-1]
new_line = ' "ck::tensor_operation::device::' + new_line + '",'
for key in aliases:
new_line = new_line.replace(key, aliases[key])
instances_list.append(new_line)
instances_list[-1] = instances_list[-1][:-1]
if is_row_row:
row_row_instances = instances_list
if is_col_row:
col_row_instances = instances_list
if is_row_col:
row_col_instances = instances_list
if is_col_col:
col_col_instances = instances_list
out_file_name = op_name + "_instances.hpp"
if not os.path.exists(out_dir):
os.mkdir(out_dir)
with open(os.path.join(out_dir, out_file_name), "w+") as f:
f.write(out_file.format(op_name=op_name,
col_row_name=col_row_name,
col_row_instances="\n".join(col_row_instances),
col_col_name=col_col_name,
col_col_instances="\n".join(col_col_instances),
row_row_name=row_row_name,
row_row_instances="\n".join(row_row_instances),
row_col_name=row_col_name,
row_col_instances="\n".join(row_col_instances),
include_header=include_header))
def run():
source = "/code/composable_kernel/library/src/tensor_operation_instance/gpu"
parse_instances(source)
if __name__ == '__main__':
run()
\ No newline at end of file
......@@ -21,7 +21,6 @@ ENDFOREACH()
add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
add_library(composablekernels::device_operations ALIAS device_operations)
set(DEV_OPS_INC_DIRS
${PROJECT_SOURCE_DIR}/include/ck/
${PROJECT_SOURCE_DIR}/library/include/ck/
......@@ -56,7 +55,7 @@ target_compile_options(device_operations PRIVATE
)
# install(TARGETS device_operations LIBRARY DESTINATION lib)
rocm_install(TARGETS device_operations
rocm_install(TARGETS device_operations
EXPORT device_operationsTargets)
rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
struct gemm_add_add_fastgelu_instances
{
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance =
{
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
};
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance =
{
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
};
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance =
{
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<16,16,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<8,32,1>, S<0,2,1>, S<0,2,1>, 1, 4, 2, 0, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<0,2,1>, S<0,2,1>, 1, 2, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<0,2,1>, S<0,2,1>, 1, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
};
static inline std::vector<std::string> device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance =
{
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,64,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,32,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,32,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,8>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 8, 8, 1, 1, 1, S<1,16,1,4>, 8, LoopScheduler::Default, PipelineVersion::v2>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Interwave, PipelineVersion::v1>",
"DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, S<4,16,1>, S<1,0,2>, S<1,0,2>, 2, 1, 8, 1, 1, 1, S<1,16,1,4>, 1, LoopScheduler::Default, PipelineVersion::v2>"
};
static auto get_col_row_instances()
{
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance;
}
static auto get_col_col_instances()
{
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance;
}
static auto get_row_row_instances()
{
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance;
}
static auto get_row_col_instances()
{
return device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment