Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
17acbbf4
Commit
17acbbf4
authored
Apr 24, 2023
by
Alan Turner
Browse files
Add jit library
parent
bef0cb20
Changes
16
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
1316 additions
and
104 deletions
+1316
-104
.gitignore
.gitignore
+3
-0
CMakeLists.txt
CMakeLists.txt
+107
-93
Config.cmake.in
Config.cmake.in
+1
-1
cmake/Embed.cmake
cmake/Embed.cmake
+129
-0
include/ck/tensor_operation/gpu/device/device_base.hpp
include/ck/tensor_operation/gpu/device/device_base.hpp
+36
-0
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
...ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+20
-0
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
...n/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+302
-0
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
...k/tensor_operation/gpu/element/element_wise_operation.hpp
+1
-1
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+5
-5
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
...or_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+1
-0
library/CMakeLists.txt
library/CMakeLists.txt
+6
-2
library/src/jit_library/CMakeLists.txt
library/src/jit_library/CMakeLists.txt
+49
-0
library/src/jit_library/include/device_gemm_multiple_d_xdlop_cshuffle.hpp
...library/include/device_gemm_multiple_d_xdlop_cshuffle.hpp
+220
-0
library/src/jit_library/util/make_instance_strings.py
library/src/jit_library/util/make_instance_strings.py
+183
-0
library/src/tensor_operation_instance/gpu/CMakeLists.txt
library/src/tensor_operation_instance/gpu/CMakeLists.txt
+1
-2
library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp
...gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp
+252
-0
No files found.
.gitignore
View file @
17acbbf4
...
...
@@ -51,3 +51,6 @@ install.dir*
# directories containing generated documentation
docs/source/_build/
docs/docBin/
# Generated source
library/src/jit_library/solution_instances/
CMakeLists.txt
View file @
17acbbf4
...
...
@@ -17,23 +17,12 @@ include(ROCMInstallSymlinks)
include
(
ROCMCreatePackage
)
include
(
CheckCXXCompilerFlag
)
option
(
CK_BUILD_JIT_LIB,
"Only build the CK JIT Helper Library"
OFF
)
rocm_setup_version
(
VERSION 0.2.0
)
include
(
TargetFlags
)
list
(
APPEND CMAKE_PREFIX_PATH
${
CMAKE_INSTALL_PREFIX
}
${
CMAKE_INSTALL_PREFIX
}
/llvm
${
CMAKE_INSTALL_PREFIX
}
/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip
)
option
(
USE_BITINT_EXTENSION_INT4,
"Whether to enable clang's BitInt extension to provide int4 data type."
OFF
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_compile_definitions
(
CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
)
add_compile_options
(
-Wno-bit-int-extension
)
message
(
"CK compiled with USE_BITINT_EXTENSION_INT4 set to
${
USE_BITINT_EXTENSION_INT4
}
"
)
endif
()
## Threads
set
(
THREADS_PREFER_PTHREAD_FLAG ON
)
find_package
(
Threads REQUIRED
)
link_libraries
(
Threads::Threads
)
## C++
enable_language
(
CXX
)
set
(
CMAKE_CXX_STANDARD 17
)
...
...
@@ -41,8 +30,22 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set
(
CMAKE_CXX_EXTENSIONS OFF
)
message
(
"CMAKE_CXX_COMPILER_ID:
${
CMAKE_CXX_COMPILER_ID
}
"
)
## OpenMP
if
(
CMAKE_CXX_COMPILER_ID MATCHES
"Clang"
)
if
(
NOT CK_BUILD_JIT_LIB
)
option
(
USE_BITINT_EXTENSION_INT4,
"Whether to enable clang's BitInt extension to provide int4 data type."
OFF
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_compile_definitions
(
CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
)
add_compile_options
(
-Wno-bit-int-extension
)
message
(
"CK compiled with USE_BITINT_EXTENSION_INT4 set to
${
USE_BITINT_EXTENSION_INT4
}
"
)
endif
()
## Threads
set
(
THREADS_PREFER_PTHREAD_FLAG ON
)
find_package
(
Threads REQUIRED
)
link_libraries
(
Threads::Threads
)
## OpenMP
if
(
CMAKE_CXX_COMPILER_ID MATCHES
"Clang"
)
# workaround issue hipcc in rocm3.5 cannot find openmp
set
(
OpenMP_CXX
"
${
CMAKE_CXX_COMPILER
}
"
)
set
(
OpenMP_CXX_FLAGS
"-fopenmp=libomp -Wno-unused-command-line-argument"
)
...
...
@@ -50,41 +53,42 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
set
(
OpenMP_libomp_LIBRARY
${
OpenMP_CXX_LIB_NAMES
}
)
set
(
OpenMP_libgomp_LIBRARY
${
OpenMP_CXX_LIB_NAMES
}
)
set
(
OpenMP_libiomp5_LIBRARY
${
OpenMP_CXX_LIB_NAMES
}
)
else
()
else
()
find_package
(
OpenMP REQUIRED
)
endif
()
message
(
"OpenMP_CXX_LIB_NAMES:
${
OpenMP_CXX_LIB_NAMES
}
"
)
message
(
"OpenMP_gomp_LIBRARY:
${
OpenMP_gomp_LIBRARY
}
"
)
message
(
"OpenMP_pthread_LIBRARY:
${
OpenMP_pthread_LIBRARY
}
"
)
message
(
"OpenMP_CXX_FLAGS:
${
OpenMP_CXX_FLAGS
}
"
)
link_libraries
(
${
OpenMP_gomp_LIBRARY
}
)
link_libraries
(
${
OpenMP_pthread_LIBRARY
}
)
## HIP
find_package
(
HIP REQUIRED
)
# Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables.
set
(
CK_HIP_VERSION_MAJOR
"
${
HIP_VERSION_MAJOR
}
"
)
set
(
CK_HIP_VERSION_MINOR
"
${
HIP_VERSION_MINOR
}
"
)
set
(
CK_HIP_VERSION_PATCH
"
${
HIP_VERSION_PATCH
}
"
)
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR
)
endif
()
message
(
"OpenMP_CXX_LIB_NAMES:
${
OpenMP_CXX_LIB_NAMES
}
"
)
message
(
"OpenMP_gomp_LIBRARY:
${
OpenMP_gomp_LIBRARY
}
"
)
message
(
"OpenMP_pthread_LIBRARY:
${
OpenMP_pthread_LIBRARY
}
"
)
message
(
"OpenMP_CXX_FLAGS:
${
OpenMP_CXX_FLAGS
}
"
)
link_libraries
(
${
OpenMP_gomp_LIBRARY
}
)
link_libraries
(
${
OpenMP_pthread_LIBRARY
}
)
## HIP
find_package
(
HIP REQUIRED
)
# Override HIP version in config.h, if necessary.
# The variables set by find_package() can't be overwritten,
# therefore let's use intermediate variables.
set
(
CK_HIP_VERSION_MAJOR
"
${
HIP_VERSION_MAJOR
}
"
)
set
(
CK_HIP_VERSION_MINOR
"
${
HIP_VERSION_MINOR
}
"
)
set
(
CK_HIP_VERSION_PATCH
"
${
HIP_VERSION_PATCH
}
"
)
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR
)
set
(
CK_HIP_VERSION_MAJOR
"
${
CK_OVERRIDE_HIP_VERSION_MAJOR
}
"
)
message
(
STATUS
"CK_HIP_VERSION_MAJOR overriden with
${
CK_OVERRIDE_HIP_VERSION_MAJOR
}
"
)
endif
()
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_MINOR
)
endif
()
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_MINOR
)
set
(
CK_HIP_VERSION_MINOR
"
${
CK_OVERRIDE_HIP_VERSION_MINOR
}
"
)
message
(
STATUS
"CK_HIP_VERSION_MINOR overriden with
${
CK_OVERRIDE_HIP_VERSION_MINOR
}
"
)
endif
()
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_PATCH
)
endif
()
if
(
DEFINED CK_OVERRIDE_HIP_VERSION_PATCH
)
set
(
CK_HIP_VERSION_PATCH
"
${
CK_OVERRIDE_HIP_VERSION_PATCH
}
"
)
message
(
STATUS
"CK_HIP_VERSION_PATCH overriden with
${
CK_OVERRIDE_HIP_VERSION_PATCH
}
"
)
endif
()
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
link_libraries
(
hip::device
)
add_compile_definitions
(
__HIP_PLATFORM_HCC__=1
)
endif
()
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
link_libraries
(
hip::device
)
add_compile_definitions
(
__HIP_PLATFORM_HCC__=1
)
## tidy
include
(
EnableCompilerWarnings
)
...
...
@@ -196,6 +200,7 @@ enable_clang_tidy(
-DCK_USE_CLANG_TIDY
)
include
(
CppCheck
)
enable_cppcheck
(
CHECKS
...
...
@@ -231,51 +236,60 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set
(
CMAKE_ARCHIVE_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/lib
)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/bin
)
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/library/include
${
HIP_INCLUDE_DIRS
}
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
if
(
NOT CK_BUILD_JIT_LIB
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
add_compile_options
(
-Werror
)
add_compile_options
(
-Weverything
)
endif
()
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
endif
()
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
add_custom_target
(
check COMMAND
${
CMAKE_CTEST_COMMAND
}
--output-on-failure -C
${
CMAKE_CFG_INTDIR
}
)
add_custom_target
(
check COMMAND
${
CMAKE_CTEST_COMMAND
}
--output-on-failure -C
${
CMAKE_CFG_INTDIR
}
)
file
(
GLOB_RECURSE INSTANCE_FILES
"
${
PROJECT_SOURCE_DIR
}
/*/device_*_instance.cpp"
)
file
(
GLOB dir_list RELATIVE
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu/*
)
set
(
CK_DEVICE_INSTANCES
)
FOREACH
(
subdir_path
${
dir_list
}
)
file
(
GLOB_RECURSE INSTANCE_FILES
"
${
PROJECT_SOURCE_DIR
}
/*/device_*_instance.cpp"
)
file
(
GLOB dir_list RELATIVE
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu/*
)
set
(
CK_DEVICE_INSTANCES
)
FOREACH
(
subdir_path
${
dir_list
}
)
IF
(
IS_DIRECTORY
"
${
PROJECT_SOURCE_DIR
}
/library/src/tensor_operation_instance/gpu/
${
subdir_path
}
"
)
list
(
APPEND CK_DEVICE_INSTANCES device_
${
subdir_path
}
_instance
)
ENDIF
()
ENDFOREACH
()
add_custom_target
(
instances DEPENDS utility;
${
CK_DEVICE_INSTANCES
}
SOURCES
${
INSTANCE_FILES
}
)
ENDFOREACH
()
add_custom_target
(
instances DEPENDS utility;
${
CK_DEVICE_INSTANCES
}
SOURCES
${
INSTANCE_FILES
}
)
rocm_package_setup_component
(
tests
rocm_package_setup_component
(
tests
LIBRARY_NAME composablekernel
PACKAGE_NAME tests
# Prevent -static suffix on package name
)
)
rocm_package_setup_component
(
examples
rocm_package_setup_component
(
examples
LIBRARY_NAME composablekernel
PACKAGE_NAME examples
)
)
rocm_package_setup_component
(
profiler
rocm_package_setup_component
(
profiler
LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler
)
)
add_subdirectory
(
library
)
add_subdirectory
(
example
)
add_subdirectory
(
test
)
add_subdirectory
(
profiler
)
add_subdirectory
(
example
)
add_subdirectory
(
test
)
add_subdirectory
(
profiler
)
else
()
add_subdirectory
(
library
)
rocm_package_setup_component
(
jit_library
LIBRARY_NAME composablekernel
PACKAGE_NAME jit_library
)
endif
()
#Create an interface target for the include only files and call it "composablekernels"
include
(
CMakePackageConfigHelpers
)
...
...
Config.cmake.in
View file @
17acbbf4
@PACKAGE_INIT@
set(_composable_kernel_supported_components device_operations utility)
set(_composable_kernel_supported_components device_operations utility
jit_library
)
foreach(_comp ${composable_kernel_FIND_COMPONENTS})
if(NOT _comp IN_LIST _composable_kernel_supported_components)
...
...
cmake/Embed.cmake
0 → 100644
View file @
17acbbf4
#####################################################################################
# The MIT License (MIT)
#
# Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#####################################################################################
find_program
(
EMBED_LD ld
)
find_program
(
EMBED_OBJCOPY objcopy
)
function
(
generate_embed_source EMBED_NAME
)
set
(
options
)
set
(
oneValueArgs SRC HEADER RELATIVE
)
set
(
multiValueArgs OBJECTS SYMBOLS
)
cmake_parse_arguments
(
PARSE
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
EXTERNS
)
set
(
INIT_KERNELS
)
list
(
LENGTH PARSE_SYMBOLS SYMBOLS_LEN
)
list
(
LENGTH PARSE_OBJECTS OBJECTS_LEN
)
if
(
NOT
${
SYMBOLS_LEN
}
EQUAL
${
OBJECTS_LEN
}
)
message
(
FATAL_ERROR
"Symbols and objects dont match:
${
SYMBOLS_LEN
}
!=
${
OBJECTS_LEN
}
"
)
endif
()
math
(
EXPR LEN
"
${
SYMBOLS_LEN
}
- 1"
)
foreach
(
idx RANGE
${
LEN
}
)
list
(
GET PARSE_SYMBOLS
${
idx
}
SYMBOL
)
list
(
GET PARSE_OBJECTS
${
idx
}
OBJECT
)
set
(
START_SYMBOL
"_binary_
${
SYMBOL
}
_start"
)
set
(
END_SYMBOL
"_binary_
${
SYMBOL
}
_end"
)
string
(
APPEND EXTERNS
"
extern const char
${
START_SYMBOL
}
[];
extern const char
${
END_SYMBOL
}
[];
"
)
file
(
RELATIVE_PATH BASE_NAME
${
PARSE_RELATIVE
}
"
${
OBJECT
}
"
)
string
(
REGEX REPLACE
".[A-Za-z0-9_]$"
""
BASE_NAME
${
BASE_NAME
}
)
string
(
APPEND INIT_KERNELS
"
{
\"
${
BASE_NAME
}
\"
, {
${
START_SYMBOL
}
,
${
END_SYMBOL
}
} },
"
)
endforeach
()
file
(
WRITE
"
${
PARSE_HEADER
}
"
"
#include <unordered_map>
#include <string>
#include <utility>
const std::unordered_map<std::string, std::pair<const char*,const char*>>&
${
EMBED_NAME
}
();
"
)
file
(
WRITE
"
${
PARSE_SRC
}
"
"
#include <
${
EMBED_NAME
}
.hpp>
${
EXTERNS
}
const std::unordered_map<std::string, std::pair<const char*,const char*>>&
${
EMBED_NAME
}
()
{
static const std::unordered_map<std::string, std::pair<const char*,const char*>> result = {
${
INIT_KERNELS
}
};
return result;
}
"
)
endfunction
()
function
(
embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE
)
set
(
WORKING_DIRECTORY
${
CMAKE_CURRENT_SOURCE_DIR
}
)
# Glob is used to compute the relative path
file
(
GLOB FILES RELATIVE
${
WORKING_DIRECTORY
}
${
FILE
}
)
foreach
(
REL_FILE
${
FILES
}
)
string
(
MAKE_C_IDENTIFIER
"
${
REL_FILE
}
"
SYMBOL
)
get_filename_component
(
OUTPUT_FILE_DIR
"
${
REL_FILE
}
"
DIRECTORY
)
file
(
MAKE_DIRECTORY
"
${
CMAKE_CURRENT_BINARY_DIR
}
/
${
OUTPUT_FILE_DIR
}
"
)
set
(
OUT_FILE
"
${
CMAKE_CURRENT_BINARY_DIR
}
/
${
REL_FILE
}
.o"
)
set
(
${
OUTPUT_SYMBOL
}
${
SYMBOL
}
PARENT_SCOPE
)
set
(
${
OUTPUT_FILE
}
"
${
OUT_FILE
}
"
PARENT_SCOPE
)
add_custom_command
(
OUTPUT
"
${
OUT_FILE
}
"
COMMAND
${
EMBED_LD
}
-r -o
"
${
OUT_FILE
}
"
-z noexecstack --format=binary
"
${
REL_FILE
}
"
COMMAND
${
EMBED_OBJCOPY
}
--rename-section .data=.rodata,alloc,load,readonly,data,contents
"
${
OUT_FILE
}
"
WORKING_DIRECTORY
${
WORKING_DIRECTORY
}
DEPENDS
${
FILE
}
VERBATIM
)
endforeach
()
endfunction
()
function
(
add_embed_library EMBED_NAME
)
set
(
options
)
set
(
oneValueArgs RELATIVE
)
set
(
multiValueArgs
)
cmake_parse_arguments
(
PARSE
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
file
(
MAKE_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/embed
)
file
(
MAKE_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/embed/
${
EMBED_NAME
}
)
set
(
EMBED_DIR
${
CMAKE_CURRENT_BINARY_DIR
}
/embed/
${
EMBED_NAME
}
)
set
(
SRC_FILE
"
${
EMBED_DIR
}
/
${
EMBED_NAME
}
.cpp"
)
set
(
HEADER_FILE
"
${
EMBED_DIR
}
/include/
${
EMBED_NAME
}
.hpp"
)
set
(
WORKING_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
)
set
(
OUTPUT_FILES
)
set
(
SYMBOLS
)
message
(
STATUS
"Embedding files"
)
foreach
(
FILE
${
PARSE_UNPARSED_ARGUMENTS
}
)
embed_file
(
OUTPUT_FILE OUTPUT_SYMBOL
${
FILE
}
)
list
(
APPEND OUTPUT_FILES
${
OUTPUT_FILE
}
)
list
(
APPEND SYMBOLS
${
OUTPUT_SYMBOL
}
)
endforeach
()
message
(
STATUS
"Generating embedding library
${
EMBED_NAME
}
"
)
generate_embed_source
(
${
EMBED_NAME
}
SRC
${
SRC_FILE
}
HEADER
${
HEADER_FILE
}
OBJECTS
${
OUTPUT_FILES
}
SYMBOLS
${
SYMBOLS
}
RELATIVE
${
PARSE_RELATIVE
}
)
add_library
(
${
EMBED_NAME
}
STATIC
${
OUTPUT_FILES
}
"
${
SRC_FILE
}
"
)
target_include_directories
(
${
EMBED_NAME
}
PUBLIC
"$<BUILD_INTERFACE:
${
EMBED_DIR
}
/include>"
)
target_compile_options
(
${
EMBED_NAME
}
PRIVATE -Wno-reserved-identifier
)
set_target_properties
(
${
EMBED_NAME
}
PROPERTIES POSITION_INDEPENDENT_CODE On
)
endfunction
()
include/ck/tensor_operation/gpu/device/device_base.hpp
View file @
17acbbf4
...
...
@@ -37,6 +37,42 @@ struct BaseInvoker
virtual
~
BaseInvoker
()
{}
};
struct
BaseParameters
{
BaseParameters
()
=
default
;
BaseParameters
(
const
BaseParameters
&
)
=
default
;
BaseParameters
&
operator
=
(
const
BaseParameters
&
)
=
default
;
virtual
void
SetAElementOp
(
const
std
::
string
&
)
{}
virtual
void
SetBElementOp
(
const
std
::
string
&
)
{}
virtual
void
SetCDEElementOp
(
const
std
::
string
&
)
{}
virtual
void
SetDsLayout
(
const
std
::
string
&
)
{}
virtual
void
SetDsDataType
(
const
std
::
string
&
)
{}
virtual
void
SetGemmSpec
(
const
index_t
,
const
index_t
,
const
index_t
)
{}
virtual
index_t
GetGridSize
(
const
index_t
,
const
index_t
)
{
return
0
;
}
virtual
index_t
GetBlockSize
()
{
return
0
;
}
virtual
std
::
string
GetParametersString
()
{
return
""
;
}
virtual
~
BaseParameters
()
{}
};
struct
BaseOperator
{
BaseOperator
()
=
default
;
...
...
include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
View file @
17acbbf4
...
...
@@ -51,6 +51,26 @@ struct DeviceGemmMultipleD : public BaseOperator
CDEElementwiseOperation
cde_element_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseParameters
>
MakeParametersPointer
()
{
return
std
::
make_unique
<
BaseParameters
>
(
BaseParameters
{});
}
virtual
index_t
GetBlockSize
()
const
{
return
0
;
}
virtual
index_t
GetMPerBlock
()
const
{
return
0
;
}
virtual
index_t
GetNPerBlock
()
const
{
return
0
;
}
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
View file @
17acbbf4
...
...
@@ -690,6 +690,308 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
return
str
.
str
();
}
struct
Parameters
:
BaseParameters
{
template
<
class
S
>
static
std
::
string
GetSequenceString
(
S
s
)
{
auto
str
=
std
::
stringstream
();
str
<<
"ck::Sequence<"
;
auto
size
=
s
.
Size
();
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
str
<<
s
.
At
(
i
);
if
(
i
<
size
-
1
)
str
<<
","
;
}
str
<<
">"
;
return
str
.
str
();
}
template
<
class
T
>
static
std
::
string
GetTypeString
(
T
)
{
return
""
;
}
template
<
>
static
std
::
string
GetTypeString
<
float
>
(
float
)
{
return
"float"
;
}
template
<
>
static
std
::
string
GetTypeString
<
ck
::
half_t
>
(
ck
::
half_t
)
{
return
"ck::half_t"
;
}
template
<
>
static
std
::
string
GetTypeString
<
tensor_layout
::
gemm
::
RowMajor
>
(
tensor_layout
::
gemm
::
RowMajor
)
{
return
"ck::tensor_layout::gemm::RowMajor"
;
}
template
<
>
static
std
::
string
GetTypeString
<
tensor_layout
::
gemm
::
ColumnMajor
>
(
tensor_layout
::
gemm
::
ColumnMajor
)
{
return
"ck::tensor_layout::gemm::ColumnMajor"
;
}
template
<
class
T
>
static
std
::
string
GetTupleString
(
T
t
)
{
auto
str
=
std
::
stringstream
();
str
<<
"ck::Tuple<"
;
static_for
<
0
,
t
.
Size
(),
1
>
{}([
&
](
auto
i
)
{
str
<<
GetTypeString
(
t
.
At
(
i
));
if
(
i
<
t
.
Size
()
-
1
)
str
<<
","
;
});
str
<<
">"
;
return
str
.
str
();
}
template
<
>
static
std
::
string
GetTupleString
<
Tuple
<>>
(
Tuple
<>
)
{
return
"ck::Tuple<>"
;
}
void
SetAElementOp
(
const
std
::
string
&
s
)
override
{
a_elementwise_op
=
s
;
}
void
SetBElementOp
(
const
std
::
string
&
s
)
override
{
b_elementwise_op
=
s
;
}
void
SetCDEElementOp
(
const
std
::
string
&
s
)
override
{
cde_elementwise_op
=
s
;
}
void
SetDsLayout
(
const
std
::
string
&
s
)
override
{
ds_layout
=
s
;
}
void
SetDsDataType
(
const
std
::
string
&
s
)
override
{
ds_data_type
=
s
;
}
void
SetGemmSpec
(
const
index_t
m
,
const
index_t
n
,
const
index_t
k
)
override
{
std
::
string
spec
=
""
;
if
(
math
::
integer_divide_ceil
(
m
,
MPerBlock
)
*
MPerBlock
-
m
!=
0
)
spec
+=
"M"
;
if
(
math
::
integer_divide_ceil
(
n
,
NPerBlock
)
*
NPerBlock
-
n
!=
0
)
spec
+=
"N"
;
if
(
math
::
integer_divide_ceil
(
k
,
KPerBlock
)
*
KPerBlock
-
k
!=
0
)
spec
+=
"K"
;
if
(
spec
==
""
)
gemm_spec
=
"ck::tensor_operation::device::GemmSpecialization::Default"
;
else
gemm_spec
=
"ck::tensor_operation::device::GemmSpecialization::"
+
spec
+
"Padding"
;
}
index_t
GetGridSize
(
const
index_t
m
,
const
index_t
n
)
override
{
return
math
::
integer_divide_ceil
(
m
,
MPerBlock
)
*
math
::
integer_divide_ceil
(
n
,
NPerBlock
);
}
index_t
GetBlockSize
()
override
{
return
BlockSize
;
}
std
::
string
GetParametersString
()
override
{
auto
str
=
std
::
stringstream
();
std
::
map
<
LoopScheduler
,
std
::
string
>
LoopSchedToString
{
{
LoopScheduler
::
Default
,
"ck::LoopScheduler::Default"
},
{
LoopScheduler
::
Interwave
,
"ck::LoopScheduler::Interwave"
}};
std
::
map
<
PipelineVersion
,
std
::
string
>
PipelineVersionToString
{
{
PipelineVersion
::
v1
,
"ck::PipelineVersion::v1"
},
{
PipelineVersion
::
v2
,
"ck::PipelineVersion::v2"
}};
// clang-format off
str
<<
"ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle"
<<
"<"
<<
GetTypeString
(
ALayout
{})
<<
", "
<<
GetTypeString
(
BLayout
{})
<<
", "
<<
ds_layout
<<
", "
<<
GetTypeString
(
ELayout
{})
<<
", "
<<
GetTypeString
(
ADataType
{})
<<
", "
<<
GetTypeString
(
BDataType
{})
<<
", "
<<
GetTypeString
(
AccDataType
{})
<<
", "
<<
GetTypeString
(
CShuffleDataType
{})
<<
", "
<<
ds_data_type
<<
", "
<<
GetTypeString
(
EDataType
{})
<<
", "
<<
a_elementwise_op
<<
", "
<<
b_elementwise_op
<<
", "
<<
cde_elementwise_op
<<
", "
<<
gemm_spec
<<
", "
<<
NumGemmKPrefetchStage
<<
", "
<<
BlockSize
<<
", "
<<
MPerBlock
<<
", "
<<
NPerBlock
<<
", "
<<
KPerBlock
<<
", "
<<
AK1
<<
", "
<<
BK1
<<
", "
<<
MPerXDL
<<
", "
<<
NPerXDL
<<
", "
<<
MXdlPerWave
<<
", "
<<
NXdlPerWave
<<
", "
<<
GetSequenceString
(
ABlockTransferThreadClusterLengths_AK0_M_AK1
{})
<<
", "
<<
GetSequenceString
(
ABlockTransferThreadClusterArrangeOrder
{})
<<
", "
<<
GetSequenceString
(
ABlockTransferSrcAccessOrder
{})
<<
", "
<<
ABlockTransferSrcVectorDim
<<
", "
<<
ABlockTransferSrcScalarPerVector
<<
", "
<<
ABlockTransferDstScalarPerVector_AK1
<<
", "
<<
ABlockLdsExtraM
<<
", "
<<
GetSequenceString
(
BBlockTransferThreadClusterLengths_BK0_N_BK1
{})
<<
", "
<<
GetSequenceString
(
BBlockTransferThreadClusterArrangeOrder
{})
<<
", "
<<
GetSequenceString
(
BBlockTransferSrcAccessOrder
{})
<<
", "
<<
BBlockTransferSrcVectorDim
<<
", "
<<
BBlockTransferSrcScalarPerVector
<<
", "
<<
BBlockTransferDstScalarPerVector_BK1
<<
", "
<<
BBlockLdsExtraN
<<
", "
<<
CShuffleMXdlPerWavePerShuffle
<<
", "
<<
CShuffleNXdlPerWavePerShuffle
<<
", "
<<
GetSequenceString
(
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
{})
<<
", "
<<
CDEBlockTransferScalarPerVector_NPerBlock
<<
", "
<<
LoopSchedToString
[
LoopSched
]
<<
", "
<<
PipelineVersionToString
[
PipelineVer
]
<<
">"
;
// clang-format on
return
str
.
str
();
}
std
::
string
a_elementwise_op
=
"ck::tensor_operation::element_wise::PassThrough"
;
std
::
string
b_elementwise_op
=
"ck::tensor_operation::element_wise::PassThrough"
;
std
::
string
cde_elementwise_op
=
"ck::tensor_operation::element_wise::PassThrough"
;
std
::
string
ds_layout
=
"ck::Tuple<>"
;
std
::
string
ds_data_type
=
"ck::Tuple<>"
;
std
::
string
gemm_spec
=
"ck::tensor_operation::device::GemmSpecialization::"
+
getGemmSpecializationString
(
GemmSpec
);
};
std
::
unique_ptr
<
BaseParameters
>
MakeParametersPointer
()
override
{
return
std
::
make_unique
<
Parameters
>
(
Parameters
{});
}
index_t
GetBlockSize
()
const
override
{
return
BlockSize
;
}
index_t
GetMPerBlock
()
const
override
{
return
MPerBlock
;
}
index_t
GetNPerBlock
()
const
override
{
return
NPerBlock
;
}
template
<
class
ADesc
,
class
BDesc
,
class
DsDesc
,
class
EDesc
>
struct
Descriptor
{
static
constexpr
auto
ds_tuple
()
{
return
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
DsDesc
{});
}
using
AGridDesc_AK0_M_AK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
ADesc
{})))
>
;
using
BGridDesc_BK0_N_BK1
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
BDesc
{})))
>
;
using
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
ds_tuple
()))
>
;
using
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
EDesc
{})))
>
;
using
Block2ETileMap
=
remove_cvref_t
<
decltype
(
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
EDesc
{})))
>
;
AGridDesc_AK0_M_AK1
a_grid_desc_ak0_m_ak1
;
BGridDesc_BK0_N_BK1
b_grid_desc_bk0_n_bk1
;
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
e_grid_desc_mblock_mperblock_nblock_nperblock
;
Block2ETileMap
block_2_etile_map
;
bool
has_main_k_block_loop
=
true
;
bool
is_valid
=
false
;
constexpr
Descriptor
(
ADesc
a
,
BDesc
b
,
DsDesc
ds
,
EDesc
e
)
:
a_grid_desc_ak0_m_ak1
{
GridwiseGemm
::
MakeDefaultAGridDescriptor_AK0_M_AK1
(
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
a
))},
b_grid_desc_bk0_n_bk1
{
GridwiseGemm
::
MakeDefaultBGridDescriptor_BK0_N_BK1
(
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
b
))},
ds_grid_desc_mblock_mperblock_nblock_nperblock
{
GridwiseGemm
::
MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
ds
))},
e_grid_desc_mblock_mperblock_nblock_nperblock
{
GridwiseGemm
::
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
e
))},
block_2_etile_map
{
GridwiseGemm
::
MakeDefaultBlock2ETileMap
(
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
e
))},
has_main_k_block_loop
{
GridwiseGemm
::
CalculateHasMainKBlockLoop
(
a_grid_desc_ak0_m_ak1
.
GetLength
(
I0
)
*
a_grid_desc_ak0_m_ak1
.
GetLength
(
I2
))},
is_valid
{
GridwiseGemm
::
CheckValidity
(
(
DeviceOp
::
matrix_padder
.
PadADescriptor_M_K
(
a
)),
DeviceOp
::
matrix_padder
.
PadBDescriptor_N_K
(
b
),
transform_tuples
(
[
&
](
auto
d
)
constexpr
{
return
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
d
);
},
ds
),
DeviceOp
::
matrix_padder
.
PadCDescriptor_M_N
(
e
),
block_2_etile_map
)}
{
}
};
template
<
class
ADesc
,
class
BDesc
,
class
DsDesc
,
class
EDesc
>
static
constexpr
auto
make_descriptor
(
ADesc
a
,
BDesc
b
,
DsDesc
ds
,
EDesc
e
)
{
return
Descriptor
<
ADesc
,
BDesc
,
DsDesc
,
EDesc
>
(
a
,
b
,
ds
,
e
);
}
template
<
class
Desc
,
class
DsPointer
>
__device__
static
void
Run
(
Desc
desc
,
const
ADataType
*
__restrict__
p_a_grid
,
const
BDataType
*
__restrict__
p_b_grid
,
DsPointer
p_ds_grid
,
EDataType
*
__restrict__
p_e_grid
)
{
__shared__
char
p_shared_block
[
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()];
assert
(
desc
.
is_valid
);
if
(
desc
.
has_main_k_block_loop
)
{
GridwiseGemm
::
template
Run
<
true
>(
p_a_grid
,
p_b_grid
,
p_ds_grid
,
p_e_grid
,
p_shared_block
,
AElementwiseOperation
{},
BElementwiseOperation
{},
CDEElementwiseOperation
{},
desc
.
a_grid_desc_ak0_m_ak1
,
desc
.
b_grid_desc_bk0_n_bk1
,
desc
.
ds_grid_desc_mblock_mperblock_nblock_nperblock
,
desc
.
e_grid_desc_mblock_mperblock_nblock_nperblock
,
desc
.
block_2_etile_map
);
}
else
{
GridwiseGemm
::
template
Run
<
false
>(
p_a_grid
,
p_b_grid
,
p_ds_grid
,
p_e_grid
,
p_shared_block
,
AElementwiseOperation
{},
BElementwiseOperation
{},
CDEElementwiseOperation
{},
desc
.
a_grid_desc_ak0_m_ak1
,
desc
.
b_grid_desc_bk0_n_bk1
,
desc
.
ds_grid_desc_mblock_mperblock_nblock_nperblock
,
desc
.
e_grid_desc_mblock_mperblock_nblock_nperblock
,
desc
.
block_2_etile_map
);
}
}
};
}
// namespace device
...
...
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
View file @
17acbbf4
...
...
@@ -4,7 +4,7 @@
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math
_v2
.hpp"
#include "ck/utility/math.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/quantization_operation.hpp"
...
...
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @
17acbbf4
...
...
@@ -117,15 +117,15 @@ struct BlockToCTileMap_M00_N0_M01Adapt
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
__device__
BlockToCTileMap_M00_N0_M01Adapt
()
=
default
;
__host__
__device__
constexpr
BlockToCTileMap_M00_N0_M01Adapt
()
=
default
;
__host__
__device__
BlockToCTileMap_M00_N0_M01Adapt
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
__host__
__device__
constexpr
BlockToCTileMap_M00_N0_M01Adapt
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
8
)
:
M01_
(
M01
),
c_grid_desc_m_n_
(
c_grid_desc_m_n
)
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
__host__
__device__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
...
...
@@ -203,13 +203,13 @@ struct BlockToCTileMap_M00_N0_M01Adapt
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
/* c_tile_idx */
,
__host__
__device__
constexpr
bool
ValidCTileIndex
(
const
CTileIdx
&
/* c_tile_idx */
,
const
CTileDim
&
/* c_tile_dim */
)
const
{
return
true
;
// always valid provided that user gets grid size from CalculateGridSize()
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
/* c_grid_desc_m_n */
)
const
{
return
true
;
}
__host__
__device__
constexpr
bool
CheckValidity
(
const
CGridDesc_M_N
&
/* c_grid_desc_m_n */
)
const
{
return
true
;
}
private:
index_t
M01_
;
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
View file @
17acbbf4
...
...
@@ -5,6 +5,7 @@
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp"
#include <iostream>
namespace
ck
{
...
...
library/CMakeLists.txt
View file @
17acbbf4
add_subdirectory
(
src/tensor_operation_instance/gpu
)
add_subdirectory
(
src/utility
)
if
(
CK_BUILD_JIT_LIB
)
add_subdirectory
(
src/jit_library
)
else
()
add_subdirectory
(
src/tensor_operation_instance/gpu
)
add_subdirectory
(
src/utility
)
endif
()
library/src/jit_library/CMakeLists.txt
0 → 100644
View file @
17acbbf4
include
(
Embed
)
file
(
GLOB_RECURSE KERNEL_FILES
${
CONFIGURE_DEPENDS
}
${
PROJECT_SOURCE_DIR
}
/include/ck/*.hpp
)
message
(
STATUS
"KERNEL_FILES:
${
KERNEL_FILES
}
"
)
add_embed_library
(
ck_headers
${
KERNEL_FILES
}
RELATIVE
${
PROJECT_SOURCE_DIR
}
/build/include
)
execute_process
(
COMMAND python3
${
CMAKE_CURRENT_SOURCE_DIR
}
/util/make_instance_strings.py
WORKING_DIRECTORY
${
CMAKE_CURRENT_SOURCE_DIR
}
/../tensor_operation_instance/gpu/
)
set
(
JIT_LIB_SOURCE
${
CMAKE_CURRENT_SOURCE_DIR
}
/include/device_gemm_multiple_d_xdlop_cshuffle.hpp
)
add_library
(
jit_library STATIC
${
JIT_LIB_SOURCE
}
)
add_library
(
composable_kernel::jit_library ALIAS jit_library
)
set_target_properties
(
jit_library PROPERTIES LINKER_LANGUAGE CXX
)
target_include_directories
(
jit_library PUBLIC
$<BUILD_INTERFACE:
${
CMAKE_CURRENT_SOURCE_DIR
}
/include>
)
target_link_libraries
(
jit_library PRIVATE ck_headers
)
rocm_install
(
TARGETS jit_library ck_headers
EXPORT jit_libraryTargets
)
set
(
INCLUDE_DIRS
${
PROJECT_SOURCE_DIR
}
/include/ck/
${
PROJECT_SOURCE_DIR
}
/library/src/jit_library/include
${
PROJECT_SOURCE_DIR
}
/library/src/jit_library/solution_instances
${
CMAKE_CURRENT_BINARY_DIR
}
/embed/ck_headers/include
)
rocm_install
(
DIRECTORY
${
INCLUDE_DIRS
}
DESTINATION
${
CMAKE_INSTALL_INCLUDEDIR
}
/ck
)
rocm_install
(
EXPORT jit_libraryTargets
FILE composable_kerneljit_libraryTargets.cmake
NAMESPACE composable_kernel::
DESTINATION
${
CMAKE_INSTALL_LIBDIR
}
/cmake/composable_kernel
)
library/src/jit_library/include/device_gemm_multiple_d_xdlop_cshuffle.hpp
0 → 100644
View file @
17acbbf4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <iterator>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_set>
#include <vector>

#include "ck/solution_instances/gemm_add_add_fastgelu_instances.hpp"
#include "ck/ck.hpp"
#include "ck/utility/math.hpp"
#include "ck_headers.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_gemm_multiple_d
{
// One fully-instantiated kernel: the complete template string to JIT-compile
// plus the launch parameters derived from it.
struct Solution
{
    std::string template_str; // full "Op< arg, arg, ... >" instantiation text
    index_t block_size;       // thread-block size parsed from the instance string
    index_t grid_size;        // grid size computed from problem and tile sizes

    // Takes the string by value and moves it into place, avoiding the
    // extra copy the original member-wise copy construction performed.
    Solution(std::string s, index_t b, index_t g)
        : template_str(std::move(s)), block_size(b), grid_size(g)
    {
    }

    // Accessors return by value, matching the original behavior.
    auto GetStr() const { return template_str; }
    auto GetBlockSize() const { return block_size; }
    auto GetGridSize() const { return grid_size; }
};
// Choose the GemmSpecialization enum (as a source string) for the given
// problem and tile sizes. A dimension contributes its letter (M/N/K) when
// its extent is not a whole multiple of the per-block tile size, i.e. the
// kernel must pad it; with no padding needed, Default is returned.
//
// 'inline' is required: this function is defined in a header, and a
// non-inline definition would cause multiple-definition (ODR) errors as
// soon as the header is included from more than one translation unit.
inline std::string GetGemmSpec(const index_t m,
                               const index_t n,
                               const index_t k,
                               const index_t m_per_block,
                               const index_t n_per_block,
                               const index_t k_per_block)
{
    std::string spec = "";
    // ceil(x / tile) * tile - x != 0  <=>  x is not tile-aligned
    if(math::integer_divide_ceil(m, m_per_block) * m_per_block - m != 0)
        spec += "M";
    if(math::integer_divide_ceil(n, n_per_block) * n_per_block - n != 0)
        spec += "N";
    if(math::integer_divide_ceil(k, k_per_block) * k_per_block - k != 0)
        spec += "K";
    if(spec == "")
        return "ck::tensor_operation::device::GemmSpecialization::Default";

    return "ck::tensor_operation::device::GemmSpecialization::" + spec + "Padding";
}
// Number of work-groups needed to tile an M x N output with
// m_per_block x n_per_block tiles; partial edge tiles each count as one
// full work-group.
//
// 'inline' is required because this definition lives in a header; without
// it, including the header from two translation units violates the ODR.
inline index_t GetGridSize(const index_t m,
                           const index_t n,
                           const index_t m_per_block,
                           const index_t n_per_block)
{
    return math::integer_divide_ceil(m, m_per_block) *
           math::integer_divide_ceil(n, n_per_block);
}
// GPU architectures this JIT path supports via xdlops (currently gfx90a
// only). Returns a reference to a function-local static, initialized once
// and thread-safe since C++11.
//
// 'inline' is required because this definition lives in a header; a
// non-inline definition would be emitted in every including translation
// unit and break the ODR at link time.
inline const std::unordered_set<std::string>& get_xdlop_archs()
{
    static std::unordered_set<std::string> supported_archs{"gfx90a"};
    return supported_archs;
}
// Describes one GEMM-with-multiple-D problem (shapes, layouts, data types
// and elementwise ops, all held as source strings) and converts the
// pre-generated instance template strings into launch-ready Solutions.
struct Problem
{
    index_t M;
    index_t N;
    index_t K;
    index_t NumDTensors; // number of auxiliary D tensors
    bool TransA;         // true when A is transposed (column-major)
    bool TransB;         // true when B is transposed (column-major)
    bool TransCDE;       // NOTE(review): stored by the ctor but never read here
    std::string ADataType;
    std::string BDataType;
    std::string CDEDataType;
    std::string AElementOp;
    std::string BElementOp;
    std::string CDEElementOp;
    std::string CDELayout;

    // Zero-based positions of template arguments inside the
    // whitespace-separated instance strings produced by
    // util/make_instance_strings.py. These must stay in sync with that
    // generator's output format.
    static const index_t ds_layout_idx         = 3;
    static const index_t ds_data_type_idx      = 9;
    static const index_t a_elementwise_op_idx  = 11;
    static const index_t b_elementwise_op_idx  = 12;
    static const index_t ds_elementwise_op_idx = 13;
    static const index_t gemm_spec_idx         = 14;
    static const index_t block_size_idx        = 16;
    static const index_t m_per_block_idx       = 17;
    static const index_t n_per_block_idx       = 18;
    static const index_t k_per_block_idx       = 19;

    // Return the instance template strings matching this problem's A/B
    // layouts, or an empty vector when `arch` has no xdlops support.
    auto GetInstances(const std::string& arch) const
    {
        std::vector<std::string> instances;
        if(get_xdlop_archs().find(arch) != get_xdlop_archs().end())
        {
            instance::gemm_add_add_fastgelu_instances all_instances{};
            // Trans == true selects the column-major instance set.
            if(TransA and TransB)
                instances = all_instances.get_col_col_instances();
            else if(TransA and not TransB)
                instances = all_instances.get_col_row_instances();
            else if(not TransA and not TransB)
                instances = all_instances.get_row_row_instances();
            else
                instances = all_instances.get_row_col_instances();
        }
        return instances;
    }

    // Embedded CK headers needed to JIT-compile a solution.
    auto GetHeaders() const { return ck_headers(); }

    // Header declaring the device op referenced by the instance strings.
    auto GetIncludeHeader() const
    {
        return instance::gemm_add_add_fastgelu_instances{}.get_include_header();
    }

    Problem(index_t m,
            index_t n,
            index_t k,
            index_t numDTensors,
            bool transA,
            bool transB,
            bool transCDE,
            std::string aDataType,
            std::string bDataType,
            std::string cdeDataType,
            std::string aElementOp,
            std::string bElementOp,
            std::string cdeElementOp,
            std::string cdeLayout)
        : M(m),
          N(n),
          K(k),
          NumDTensors(numDTensors),
          TransA(transA),
          TransB(transB),
          TransCDE(transCDE),
          ADataType(aDataType),
          BDataType(bDataType),
          CDEDataType(cdeDataType),
          AElementOp(aElementOp),
          BElementOp(bElementOp),
          CDEElementOp(cdeElementOp),
          CDELayout(cdeLayout)
    {
    }

    // Build the idx-th Solution for `arch`: split the instance template on
    // whitespace, patch in this problem's ops/layout/type and the computed
    // GemmSpecialization, then re-join into "Name< a, b, ... >".
    // Throws std::out_of_range via .at() when idx is invalid.
    auto MakeSolution(index_t idx, const std::string& arch) const
    {
        auto template_str = GetInstances(arch).at(idx);
        // Tokenize on whitespace; token positions follow the *_idx table above.
        std::istringstream iss(template_str);
        std::vector<std::string> params(std::istream_iterator<std::string>{iss},
                                        std::istream_iterator<std::string>());
        params[a_elementwise_op_idx]  = AElementOp;
        params[b_elementwise_op_idx]  = BElementOp;
        params[ds_layout_idx]         = CDELayout;
        params[ds_data_type_idx]      = CDEDataType;
        params[ds_elementwise_op_idx] = CDEElementOp;
        auto block_size_str  = params[block_size_idx];
        auto m_per_block_str = params[m_per_block_idx];
        auto n_per_block_str = params[n_per_block_idx];
        auto k_per_block_str = params[k_per_block_idx];
        // Tile sizes are numeric literals in the template string.
        const auto block_size  = std::stoi(block_size_str);
        const auto m_per_block = std::stoi(m_per_block_str);
        const auto n_per_block = std::stoi(n_per_block_str);
        const auto k_per_block = std::stoi(k_per_block_str);
        const auto grid_size   = GetGridSize(M, N, m_per_block, n_per_block);
        params[gemm_spec_idx] =
            GetGemmSpec(M, N, K, m_per_block, n_per_block, k_per_block);
        // Join tokens [1..end) with ", "; token 0 is the op name.
        std::string str = std::accumulate(
            params.begin() + 1,
            params.end(),
            std::string{},
            [](const std::string& a, const std::string& b) {
                return a.empty() ? b : a + ", " + b;
            });
        str = params.front() + "< " + str + ">";
        return Solution{str, block_size, grid_size};
    }

    // Materialize every available instance for `arch` as a Solution.
    // NOTE(review): GetInstances() is re-evaluated inside each
    // MakeSolution call, and `i` (int) is compared against a size_t.
    auto GetSolutions(const std::string& arch) const
    {
        std::vector<Solution> solutions;
        const auto num_instances = GetInstances(arch).size();
        for(auto i = 0; i < num_instances; ++i)
        {
            solutions.push_back(MakeSolution(i, arch));
        }
        return solutions;
    }
};
}
// namespace device_gemm_multiple_d
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/jit_library/util/make_instance_strings.py
0 → 100644
View file @
17acbbf4
import
argparse
,
re
,
json
,
os
# C++ header template written out for each generated *_instances.hpp file.
# Doubled braces ({{ }}) are literal braces in the emitted C++; the
# single-brace fields are filled via str.format() in parse_instances().
out_file = """// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <vector>
#include <memory>
namespace ck {{
namespace tensor_operation {{
namespace device {{
namespace instance {{
struct {op_name}_instances
{{
static inline std::vector<std::string> {col_row_name} =
{{
{col_row_instances}
}};
static inline std::vector<std::string> {col_col_name} =
{{
{col_col_instances}
}};
static inline std::vector<std::string> {row_row_name} =
{{
{row_row_instances}
}};
static inline std::vector<std::string> {row_col_name} =
{{
{row_col_instances}
}};
static auto get_col_row_instances()
{{
return {col_row_name};
}}
static auto get_col_col_instances()
{{
return {col_col_name};
}}
static auto get_row_row_instances()
{{
return {row_row_name};
}}
static auto get_row_col_instances()
{{
return {row_col_name};
}}
static auto get_include_header()
{{
return "{include_header}";
}}
}};
}} // namespace instance
}} // namespace device
}} // namespace tensor_operation
}} // namespace ck
"""
def strip_sequences(str):
    """Normalize S<...> sequence literals in an instance string.

    Removes the whitespace inside every ``S<d, d, ...>`` occurrence and
    rewrites the ``S<`` prefix to the fully-qualified ``ck::Sequence<``.
    (NOTE: the parameter name shadows the builtin ``str``; kept for
    interface compatibility.)
    """
    # Collapse "S<1, 2, 3>" -> "S<1,2,3>" for each sequence found.
    for seq in re.findall(r'S<\d+(?:,\s*\d+)*>', str):
        str = str.replace(seq, seq.replace(' ', ''))
    # Qualify the sequence template name.
    return str.replace('S<', "ck::Sequence<")
def remove_commas_and_brackets(string):
    """Strip every ',', '<' and '>' from ``string`` EXCEPT inside
    ``ck::Sequence<...>`` occurrences, which are preserved verbatim.

    Works by masking each sequence's separators with placeholder
    characters ('|', '%', '$'), stripping globally, then unmasking.
    """
    def _mask(seq):
        # Placeholder encoding that survives the global strip below.
        return seq.replace(',', '|').replace('<', '%').replace('>', '$')

    sequences = re.findall(r'ck::Sequence<.*?>', string)
    # Protect each sequence literal.
    for seq in sequences:
        string = string.replace(seq, _mask(seq))
    # Strip separators everywhere else.
    for ch in (',', '<', '>'):
        string = string.replace(ch, '')
    # Restore the protected sequences.
    for seq in sequences:
        string = string.replace(_mask(seq), seq)
    return string
def parse_instances(source):
    """Scan `source` for instance .cpp files of known device ops and emit
    one *_instances.hpp header per op into the JIT library's
    solution_instances directory (path resolved relative to `source`).
    """
    out_dir = os.path.join(source, "../../../src/jit_library/solution_instances")
    # Short alias -> fully-qualified C++ name substitutions applied to
    # every extracted instance line. Order matters: dict order is
    # preserved, so longer keys (e.g. "F16_F16_Tuple") are replaced
    # before their substrings (e.g. "F16").
    aliases = {"F16_F16_Tuple": "ck::Tuple<F16,F16>",
               "Row_Row_Tuple": "ck::Tuple<Row,Row>",
               "LoopScheduler": "ck::LoopScheduler",
               "PipelineVersion": "ck::PipelineVersion",
               "Row": "ck::tensor_layout::gemm::RowMajor",
               "Col": "ck::tensor_layout::gemm::ColumnMajor",
               "F16": "ck::half_t",
               "F32": "float"}
    # Op directory name -> device-op template class to look for in sources.
    device_ops = {"gemm_add_add_fastgelu": "DeviceGemmMultipleD_Xdl_CShuffle",
                  #"batched_gemm_softmax_gemm": "DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle"
                  }
    for root_, dirs_, files_ in os.walk(source):
        # NOTE(review): `dir` and `file` below shadow Python builtins.
        for dir in dirs_:
            op_name = os.path.split(dir)[-1]
            if op_name not in device_ops:
                continue
            col_row_name = ""
            col_col_name = ""
            row_row_name = ""
            row_col_name = ""
            row_row_instances = []
            col_row_instances = []
            row_col_instances = []
            col_col_instances = []
            for root, dirs, files in os.walk(os.path.join(root_, dir)):
                for file in files:
                    if not file.endswith(".cpp"):
                        continue;
                    file_name = os.path.split(file)[-1]
                    # Layout combination is encoded in the file name
                    # (mk/km = A row/col-major, kn/nk = B row/col-major).
                    is_row_row = bool(re.search(".*mk.*kn.*", file_name))
                    is_col_row = bool(re.search(".*km.*kn.*", file_name))
                    is_row_col = bool(re.search(".*mk.*nk.*", file_name))
                    is_col_col = bool(re.search(".*km.*nk.*", file_name))
                    if is_row_row:
                        row_row_name = file_name[:-4]  # strip ".cpp"
                    if is_col_row:
                        col_row_name = file_name[:-4]
                    if is_row_col:
                        row_col_name = file_name[:-4]
                    if is_col_col:
                        col_col_name = file_name[:-4]
                    instances_list = []
                    template_name = device_ops[op_name]
                    include_header = ""
                    with open(os.path.join(root, file)) as f:
                        for line in f:
                            if "impl" in line:
                                # Extract the bare header path from an
                                # '#include "...impl..."' line.
                                include_header = line.replace("#include \"", "").replace("\"", "").replace("\n", "")
                            elif template_name in line:
                                # Turn all whitespace into single spaces
                                new_line = " ".join(line.split())
                                # Remove whitespace from S<*>
                                new_line = strip_sequences(new_line)
                                new_line = remove_commas_and_brackets(new_line)
                                # NOTE(review): last_char is computed but
                                # never used afterwards.
                                last_char = "\n"
                                if new_line[-1] == ",":
                                    last_char = ",\n"
                                    new_line = new_line[:-1]
                                # Quote the instance as a C++ string literal.
                                new_line = ' "ck::tensor_operation::device::' + new_line + '",'
                                for key in aliases:
                                    new_line = new_line.replace(key, aliases[key])
                                instances_list.append(new_line)
                    # Drop the trailing comma on the final initializer entry.
                    instances_list[-1] = instances_list[-1][:-1]
                    if is_row_row:
                        row_row_instances = instances_list
                    if is_col_row:
                        col_row_instances = instances_list
                    if is_row_col:
                        row_col_instances = instances_list
                    if is_col_col:
                        col_col_instances = instances_list
            out_file_name = op_name + "_instances.hpp"
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            # Fill the out_file template and write the generated header.
            with open(os.path.join(out_dir, out_file_name), "w+") as f:
                f.write(out_file.format(op_name=op_name,
                                        col_row_name=col_row_name,
                                        col_row_instances="\n".join(col_row_instances),
                                        col_col_name=col_col_name,
                                        col_col_instances="\n".join(col_col_instances),
                                        row_row_name=row_row_name,
                                        row_row_instances="\n".join(row_row_instances),
                                        row_col_name=row_col_name,
                                        row_col_instances="\n".join(row_col_instances),
                                        include_header=include_header))
def run(source="/code/composable_kernel/library/src/tensor_operation_instance/gpu"):
    """Generate the instance-string headers.

    source: root of the tensor_operation_instance GPU tree to scan.
            Defaults to the historical hardcoded location for backward
            compatibility; can also be overridden via the
            CK_INSTANCE_SOURCE_DIR environment variable.
    """
    # Env override lets the build system relocate the tree without
    # editing this script; unset, behavior is unchanged.
    source = os.environ.get("CK_INSTANCE_SOURCE_DIR", source)
    parse_instances(source)


if __name__ == '__main__':
    run()
\ No newline at end of file
library/src/tensor_operation_instance/gpu/CMakeLists.txt
View file @
17acbbf4
...
...
@@ -21,7 +21,6 @@ ENDFOREACH()
add_library
(
device_operations STATIC
${
CK_DEVICE_INSTANCES
}
)
add_library
(
composablekernels::device_operations ALIAS device_operations
)
set
(
DEV_OPS_INC_DIRS
${
PROJECT_SOURCE_DIR
}
/include/ck/
${
PROJECT_SOURCE_DIR
}
/library/include/ck/
...
...
library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/gemm_add_add_fastgelu_instances.hpp
0 → 100644
View file @
17acbbf4
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment