Commit 50540084 authored by Adam Osewski's avatar Adam Osewski
Browse files

Merge branch 'develop' into aosewski/gemm_tile_loop

parents 9c4d265f 2474dddb
...@@ -12,6 +12,11 @@ Full documentation for Composable Kernel is not yet available. ...@@ -12,6 +12,11 @@ Full documentation for Composable Kernel is not yet available.
- Improve proformance of normalization kernel - Improve proformance of normalization kernel
### Added ### Added
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
- Added support on MI300A/MI300X.
- Added support on NAVI3x. - Added support on NAVI3x.
- Added user tutorial (#563). - Added user tutorial (#563).
- Added more instances for irregular GEMM sizes (#560). - Added more instances for irregular GEMM sizes (#560).
......
...@@ -5,6 +5,39 @@ project(composable_kernel) ...@@ -5,6 +5,39 @@ project(composable_kernel)
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-D__int8__)
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-D__fp8__)
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-D__fp16__)
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-D__fp32__)
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-D__fp64__)
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-D__bf16__)
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__)
endif()
if(DL_KERNELS)
add_definitions(-DDL_KERNELS)
endif()
if(INSTANCES_ONLY)
add_definitions(-DINSTANCES_ONLY)
endif()
enable_testing() enable_testing()
set(ROCM_SYMLINK_LIBS OFF) set(ROCM_SYMLINK_LIBS OFF)
...@@ -16,11 +49,47 @@ include(ROCMSetupVersion) ...@@ -16,11 +49,47 @@ include(ROCMSetupVersion)
include(ROCMInstallSymlinks) include(ROCMInstallSymlinks)
include(ROCMCreatePackage) include(ROCMCreatePackage)
include(CheckCXXCompilerFlag) include(CheckCXXCompilerFlag)
include(ROCMCheckTargetIds)
rocm_setup_version(VERSION 0.2.0) rocm_setup_version(VERSION 0.2.0)
include(TargetFlags) include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
message("GPU_TARGETS= ${GPU_TARGETS}")
message("checking which targets are supported")
#This is the list of targets to be used in case GPU_TARGETS is not set on command line
#These targets will be filtered and only supported ones will be used
#Setting GPU_TARGETS on command line will override this list
if(NOT PROFILER_ONLY)
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
else()
add_definitions(-DPROFILER_ONLY)
if(GPU_TARGETS)
message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx9, gfx10, or gfx11")
endif()
if(GPU_ARCH MATCHES "gfx9")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942")
elseif(GPU_ARCH MATCHES "gfx10")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030")
elseif(GPU_ARCH MATCHES "gfx11")
rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102")
else()
message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx9, gfx10, or gfx11")
endif()
endif()
message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}")
set(AMDGPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " ")
if(GPU_TARGETS)
message("Building CK for the following targets: ${GPU_TARGETS}")
else()
message("Building CK for the following targets: ${AMDGPU_TARGETS}")
endif()
find_package(hip)
option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF) option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF)
...@@ -258,31 +327,76 @@ file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp" ...@@ -258,31 +327,76 @@ file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp"
file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
set(CK_DEVICE_INSTANCES) set(CK_DEVICE_INSTANCES)
FOREACH(subdir_path ${dir_list}) FOREACH(subdir_path ${dir_list})
IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}") set(target_dir)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}")
ENDIF() set(cmake_instance)
file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance)
set(add_inst 0)
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8")
#message("fp8 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16")
#message("fp16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32")
#message("fp32 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64")
#message("fp64 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16")
#message("bf16 instance found!")
set(add_inst 1)
endif()
if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8")
#message("int8 instance found!")
set(add_inst 1)
endif()
if(NOT "${cmake_instance}" MATCHES "DTYPES")
#message("instance should be built for all types!")
set(add_inst 1)
endif()
if(add_inst EQUAL 1 OR NOT DEFINED DTYPES)
list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance)
endif()
ENDIF()
ENDFOREACH() ENDFOREACH()
add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES})
add_subdirectory(library)
rocm_package_setup_component(tests if(NOT DEFINED INSTANCES_ONLY)
if(NOT DEFINED PROFILER_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME tests # Prevent -static suffix on package name PACKAGE_NAME tests # Prevent -static suffix on package name
) )
rocm_package_setup_component(examples rocm_package_setup_component(examples
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME examples PACKAGE_NAME examples
) )
add_subdirectory(example)
add_subdirectory(test)
rocm_package_setup_component(profiler rocm_package_setup_component(profiler
LIBRARY_NAME composablekernel LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler PACKAGE_NAME ckProfiler
) )
add_subdirectory(profiler)
add_subdirectory(library) else()
add_subdirectory(example) #When building PROFILER_ONLY, label the package with GPU_ARCH
add_subdirectory(test) rocm_package_setup_component(profiler
add_subdirectory(profiler) LIBRARY_NAME composablekernel
PACKAGE_NAME ckProfiler_${GPU_ARCH}
)
add_subdirectory(profiler)
endif()
endif()
#Create an interface target for the include only files and call it "composablekernels" #Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
......
...@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
libpthread-stubs0-dev \ libpthread-stubs0-dev \
llvm-amdgpu \ llvm-amdgpu \
pkg-config \ pkg-config \
python \
python3 \ python3 \
python3-dev \ python3-dev \
python3-pip \ python3-pip \
...@@ -63,6 +64,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- ...@@ -63,6 +64,10 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
#Install latest version of cmake #Install latest version of cmake
RUN wget -qO /usr/local/bin/ninja.gz https://github.com/ninja-build/ninja/releases/latest/download/ninja-linux.zip
RUN gunzip /usr/local/bin/ninja.gz
RUN chmod a+x /usr/local/bin/ninja
RUN git clone https://github.com/nico/ninjatracing.git
RUN apt purge --auto-remove -y cmake RUN apt purge --auto-remove -y cmake
RUN apt update RUN apt update
RUN apt install -y software-properties-common lsb-release RUN apt install -y software-properties-common lsb-release
......
...@@ -509,8 +509,7 @@ def Build_CK(Map conf=[:]){ ...@@ -509,8 +509,7 @@ def Build_CK(Map conf=[:]){
cmake_build(conf) cmake_build(conf)
dir("build"){ dir("build"){
//run tests and examples //run tests and examples
def nt = nthreads() sh 'make -j check'
sh 'make -j${nt} check'
if (navi_node == 0 ){ if (navi_node == 0 ){
//we only need the ckProfiler to run the performance tests, so we pack and stash it //we only need the ckProfiler to run the performance tests, so we pack and stash it
//do not stash profiler on Navi nodes //do not stash profiler on Navi nodes
...@@ -741,7 +740,7 @@ pipeline { ...@@ -741,7 +740,7 @@ pipeline {
} }
agent{ label rocmnode("navi21") } agent{ label rocmnode("navi21") }
environment{ environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
} }
...@@ -749,6 +748,22 @@ pipeline { ...@@ -749,6 +748,22 @@ pipeline {
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
} }
} }
stage("Build CK and run Tests on Navi32")
{
when {
beforeAgent true
expression { !params.RUN_FULL_QA.toBoolean() }
}
agent{ label rocmnode("navi32") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DDTYPES="fp16;fp32;bf16" -DGPU_TARGETS="gfx1101" """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1101" -DDTYPES="fp16;fp32;bf16" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """
}
steps{
Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
} }
} }
......
...@@ -52,6 +52,8 @@ CK is released under the MIT license. [License File](/LICENSE) ...@@ -52,6 +52,8 @@ CK is released under the MIT license. [License File](/LICENSE)
```bash ```bash
DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile . DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile .
``` ```
Pre-built dockers are available from this public repo:
https://hub.docker.com/r/rocm/composable_kernel/tags
## Launch docker ## Launch docker
...@@ -76,12 +78,26 @@ mkdir build && cd build ...@@ -76,12 +78,26 @@ mkdir build && cd build
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_CXX_FLAGS="-O3" \
-D CMAKE_BUILD_TYPE=Release \ -D CMAKE_BUILD_TYPE=Release \
-D GPU_TARGETS="gfx908;gfx90a" \ -D GPU_TARGETS="gfx908;gfx90a" \
.. ..
``` ```
If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the
current compiler.
Additional cmake flags can be used to significantly speed-up the build:
INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library
while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency.
DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances
of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works).
DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl
instances. Those instances are only needed for the NAVI2x platforms.
### Build examples and tests ### Build examples and tests
```bash ```bash
......
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp) add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp)
target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations)
...@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable ...@@ -18,3 +19,4 @@ target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable
add_executable(client_gemm_quantization gemm_quantization.cpp) add_executable(client_gemm_quantization gemm_quantization.cpp)
target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations) target_link_libraries(client_gemm_quantization PRIVATE composable_kernel::device_operations)
endif()
...@@ -101,13 +101,15 @@ template <ck::index_t NumDimSpatial, ...@@ -101,13 +101,15 @@ template <ck::index_t NumDimSpatial,
typename WeiLayout, typename WeiLayout,
typename OutLayout> typename OutLayout>
bool run_grouped_conv_bwd_weight( bool run_grouped_conv_bwd_weight(
ck::index_t G, const ck::index_t G,
ck::index_t N, const ck::index_t N,
ck::index_t K, const ck::index_t K,
ck::index_t C, const ck::index_t C,
const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths, const std::array<ck::index_t, NumDimSpatial>& input_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths, const std::array<ck::index_t, NumDimSpatial>& filter_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths, const std::array<ck::index_t, NumDimSpatial>& output_spatial_lengths,
const std::array<ck::index_t, NumDimSpatial + 3>& input_strides,
const std::array<ck::index_t, NumDimSpatial + 3>& output_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides, const std::array<ck::index_t, NumDimSpatial>& conv_filter_strides,
const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations, const std::array<ck::index_t, NumDimSpatial>& conv_filter_dilations,
const std::array<ck::index_t, NumDimSpatial>& input_left_pads, const std::array<ck::index_t, NumDimSpatial>& input_left_pads,
...@@ -157,6 +159,8 @@ bool run_grouped_conv_bwd_weight( ...@@ -157,6 +159,8 @@ bool run_grouped_conv_bwd_weight(
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
...@@ -224,6 +228,8 @@ bool run_grouped_conv_bwd_weight( ...@@ -224,6 +228,8 @@ bool run_grouped_conv_bwd_weight(
input_spatial_lengths, input_spatial_lengths,
filter_spatial_lengths, filter_spatial_lengths,
output_spatial_lengths, output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides, conv_filter_strides,
conv_filter_dilations, conv_filter_dilations,
input_left_pads, input_left_pads,
......
...@@ -22,6 +22,15 @@ static constexpr ck::index_t C = 192; ...@@ -22,6 +22,15 @@ static constexpr ck::index_t C = 192;
static constexpr ck::index_t X = 3; static constexpr ck::index_t X = 3;
static constexpr ck::index_t Wi = 28; static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Wo = 28; static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{N * Wi * C, Wi* C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{N * Wo * K, Wo* K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1};
int main() int main()
{ {
...@@ -31,7 +40,19 @@ int main() ...@@ -31,7 +40,19 @@ int main()
OutDataType, OutDataType,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout>(G, N, K, C, {Wi}, {X}, {Wo}, {1}, {1}, {1}, {1}) OutLayout>(G,
N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS ? EXIT_SUCCESS
: EXIT_FAILURE; : EXIT_FAILURE;
} }
...@@ -25,6 +25,17 @@ static constexpr ck::index_t Hi = 28; ...@@ -25,6 +25,17 @@ static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 28; static constexpr ck::index_t Wi = 28;
static constexpr ck::index_t Ho = 28; static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 28; static constexpr ck::index_t Wo = 28;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Hi * Wi * C, Hi* Wi* C, Wi* C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Ho * Wo * K, Ho* Wo* K, Wo* K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
int main() int main()
{ {
...@@ -34,8 +45,19 @@ int main() ...@@ -34,8 +45,19 @@ int main()
OutDataType, OutDataType,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout>( OutLayout>(G,
G, N, K, C, {Hi, Wi}, {Y, X}, {Ho, Wo}, {1, 1}, {1, 1}, {1, 1}, {1, 1}) N,
K,
C,
input_spatial_lengths,
filter_spatial_lengths,
output_spatial_lengths,
input_strides,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS ? EXIT_SUCCESS
: EXIT_FAILURE; : EXIT_FAILURE;
} }
...@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3; ...@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28; static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28; static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3; static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main() int main()
{ {
...@@ -41,13 +52,15 @@ int main() ...@@ -41,13 +52,15 @@ int main()
N, N,
K, K,
C, C,
{Di, Hi, Wi}, input_spatial_lengths,
{Z, Y, X}, filter_spatial_lengths,
{Do, Ho, Wo}, output_spatial_lengths,
{1, 1, 1}, input_strides,
{1, 1, 1}, output_strides,
{1, 1, 1}, conv_filter_strides,
{1, 1, 1}) conv_filter_dilations,
input_left_pads,
input_right_pads)
? EXIT_SUCCESS ? EXIT_SUCCESS
: EXIT_FAILURE; : EXIT_FAILURE;
} }
...@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3; ...@@ -28,6 +28,17 @@ static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28; static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28; static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3; static constexpr ck::index_t Wo = 3;
static constexpr std::array<ck::index_t, NumDimSpatial> input_spatial_lengths{Di, Hi, Wi};
static constexpr std::array<ck::index_t, NumDimSpatial> filter_spatial_lengths{Z, Y, X};
static constexpr std::array<ck::index_t, NumDimSpatial> output_spatial_lengths{Do, Ho, Wo};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> input_strides{
N * Di * Hi * Wi * C, Di* Hi* Wi* C, Hi* Wi* C, Wi* C, C, 1};
static constexpr std::array<ck::index_t, NumDimSpatial + 3> output_strides{
N * Do * Ho * Wo * K, Do* Ho* Wo* K, Ho* Wo* K, Wo* K, K, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
static constexpr std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
int main() int main()
{ {
...@@ -37,17 +48,20 @@ int main() ...@@ -37,17 +48,20 @@ int main()
OutDataType, OutDataType,
InLayout, InLayout,
WeiLayout, WeiLayout,
OutLayout>(G, OutLayout>(
N, G,
K, N,
C, K,
{Di, Hi, Wi}, C,
{Z, Y, X}, {Di, Hi, Wi},
{Do, Ho, Wo}, {Z, Y, X},
{1, 1, 1}, {Do, Ho, Wo},
{1, 1, 1}, {N * Di * Hi * Wi * C, Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C, 1},
{1, 1, 1}, {N * Do * Ho * Wo * K, Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K, 1},
{1, 1, 1}) {1, 1, 1},
{1, 1, 1},
{1, 1, 1},
{1, 1, 1})
? EXIT_SUCCESS ? EXIT_SUCCESS
: EXIT_FAILURE; : EXIT_FAILURE;
} }
...@@ -2,6 +2,31 @@ cmake_minimum_required(VERSION 3.15) ...@@ -2,6 +2,31 @@ cmake_minimum_required(VERSION 3.15)
project(ck_app) project(ck_app)
add_compile_options(-std=c++17) add_compile_options(-std=c++17)
if (DTYPES)
add_definitions(-DDTYPES)
if (DTYPES MATCHES "int8")
add_definitions(-D__int8__)
endif()
if (DTYPES MATCHES "fp8")
add_definitions(-D__fp8__)
endif()
if (DTYPES MATCHES "fp16")
add_definitions(-D__fp16__)
endif()
if (DTYPES MATCHES "fp32")
add_definitions(-D__fp32__)
endif()
if (DTYPES MATCHES "fp64")
add_definitions(-D__fp64__)
endif()
if (DTYPES MATCHES "bf16")
add_definitions(-D__bf16__)
endif()
message("DTYPES macro set to ${DTYPES}")
else()
add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__)
endif()
find_package(composable_kernel 1.0.0 COMPONENTS device_operations) find_package(composable_kernel 1.0.0 COMPONENTS device_operations)
find_package(hip REQUIRED PATHS /opt/rocm) find_package(hip REQUIRED PATHS /opt/rocm)
message(STATUS "Build with HIP ${hip_VERSION}") message(STATUS "Build with HIP ${hip_VERSION}")
......
...@@ -67,6 +67,7 @@ else() ...@@ -67,6 +67,7 @@ else()
-Wunused -Wunused
-Wno-reserved-identifier -Wno-reserved-identifier
-Werror -Werror
-Wno-option-ignored
-Wsign-compare -Wsign-compare
-Wno-extra-semi-stmt -Wno-extra-semi-stmt
) )
......
...@@ -7,8 +7,8 @@ API Reference Guide ...@@ -7,8 +7,8 @@ API Reference Guide
Introduction Introduction
================= =================
This document contains details of the APIs for the Composable Kernel (CK) library and introduces some of the key design This document contains details of the APIs for the Composable Kernel (CK) library and introduces
principles that are used to write new classes that extend CK functionality. some of the key design principles that are used to write new classes that extend CK functionality.
================= =================
Using CK API Using CK API
...@@ -30,8 +30,8 @@ DeviceMem ...@@ -30,8 +30,8 @@ DeviceMem
Kernels For Flashattention Kernels For Flashattention
--------------------------- ---------------------------
The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists the classes that are The Flashattention algorithm is defined in :cite:t:`dao2022flashattention`. This sections lists
used in the CK GPU implementation of Flashattention. the classes that are used in the CK GPU implementation of Flashattention.
**Gridwise classes** **Gridwise classes**
......
...@@ -2,15 +2,16 @@ ...@@ -2,15 +2,16 @@
Supported Primitives Guide Supported Primitives Guide
========================== ==========================
This document contains details of supported primitives in Composable Kernel (CK). In contrast to the API Reference This document contains details of supported primitives in Composable Kernel (CK). In contrast to the
Guide, the Supported Primitives Guide is an introduction to the math which underpins the algorithms implemented in CK. API Reference Guide, the Supported Primitives Guide is an introduction to the math which underpins
the algorithms implemented in CK.
------------ ------------
Softmax Softmax
------------ ------------
For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the softmax of concatenated For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can decompose the
:math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as, softmax of concatenated :math:`x = [ x^{(1)}\ | \ \ldots \ | \ x^{(T)} ]` as,
.. math:: .. math::
:nowrap: :nowrap:
...@@ -25,8 +26,8 @@ For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can d ...@@ -25,8 +26,8 @@ For vectors :math:`x^{(1)}, x^{(2)}, \ldots, x^{(T)}` of size :math:`B` we can d
where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and where :math:`f(x^{(j)}) = \exp( x^{(j)} - m(x^{(j)}) )` is of size :math:`B` and
:math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar. :math:`z(x^{(j)}) = f(x_1^{(j)})+ \ldots+ f(x_B^{(j)})` is a scalar.
For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size :math:`B_r \times B_c` we can For a matrix :math:`X` composed of :math:`T_r \times T_c` tiles, :math:`X_{ij}`, of size
compute the row-wise softmax as follows. :math:`B_r \times B_c` we can compute the row-wise softmax as follows.
For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate, For :math:`j` from :math:`1` to :math:`T_c`, and :math:`i` from :math:`1` to :math:`T_r` calculate,
......
=================== ===================
CK docker hub CK Docker Hub
=================== ===================
`Docker hub <https://hub.docker.com/r/rocm/composable_kernel>`_
------------------------------------- -------------------------------------
Why do I need this? Why do I need this?
------------------------------------- -------------------------------------
To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images. To make our lives easier and bring Composable Kernel dependencies together, we recommend using
docker images that can be found on `Docker Hub <https://hub.docker.com/r/rocm/composable_kernel>`_.
------------------------------------- -------------------------------------
So what is Composable Kernel? So what is Composable Kernel?
------------------------------------- -------------------------------------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. Composable Kernel (CK) library aims to provide a programming model for writing performance critical
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
through general purpose kernel languages, like HIP C++.
To get the CK library:: To get the CK library::
git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git
run a docker container:: run a docker container::
docker run \ docker run \
...@@ -30,7 +30,7 @@ run a docker container:: ...@@ -30,7 +30,7 @@ run a docker container::
--group-add sudo \ --group-add sudo \
-w /root/workspace \ -w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ rocm/composable_kernel:ck_ub20.04_rocm5.6 \
/bin/bash /bin/bash
and build the CK:: and build the CK::
...@@ -58,7 +58,9 @@ We can also run specific examples or tests like:: ...@@ -58,7 +58,9 @@ We can also run specific examples or tests like::
./bin/example_gemm_xdl_fp16 ./bin/example_gemm_xdl_fp16
./bin/test_gemm_fp16 ./bin/test_gemm_fp16
For more details visit `CK github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_, `CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example)>`_, `even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_. For more details visit `CK github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_,
`CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example)>`_,
`even more CK examples <https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example>`_.
------------------------------------- -------------------------------------
And what is inside? And what is inside?
...@@ -74,12 +76,11 @@ The docker images have everything you need for running CK including: ...@@ -74,12 +76,11 @@ The docker images have everything you need for running CK including:
Which image is right for me? Which image is right for me?
------------------------------------- -------------------------------------
Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are: Let's take a look at the image naming, for example ``ck_ub20.04_rocm5.6``. The image specs are:
* "ck" - made for running Composable Kernel * ``ck`` - made for running Composable Kernel;
* "ub20.04" - based on Ubuntu 20.04 * ``ub20.04`` - based on Ubuntu 20.04;
* "rocm5.4" - ROCm platform version 5.4 * ``rocm5.6`` - ROCm platform version 5.6.
* "release" - compiler version is release
So just pick the right image for your project dependencies and you're all set. So just pick the right image for your project dependencies and you're all set.
...@@ -87,7 +88,9 @@ So just pick the right image for your project dependencies and you're all set. ...@@ -87,7 +88,9 @@ So just pick the right image for your project dependencies and you're all set.
DIY starts here DIY starts here
------------------------------------- -------------------------------------
If you need to customize a docker image or just can't stop tinkering, feel free to adjust the `Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_ for your needs. If you need to customize a docker image or just can't stop tinkering, feel free to adjust the
`Dockerfile <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile>`_
for your needs.
------------------------------------- -------------------------------------
License License
......
...@@ -12,12 +12,15 @@ This document contains instructions for installing, using, and contributing to C ...@@ -12,12 +12,15 @@ This document contains instructions for installing, using, and contributing to C
Methodology Methodology
----------- -----------
Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. Composable Kernel (CK) library aims to provide a programming model for writing performance critical
kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc,
through general purpose kernel languages, like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability: CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model * A tile-based programming model
* Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". * Algorithm complexity reduction for complex ML operators, using innovative technique we call
"Tensor Coordinate Transformation".
.. image:: data/ck_component.png .. image:: data/ck_component.png
:alt: CK Components :alt: CK Components
......
...@@ -6,15 +6,26 @@ CK Hello world ...@@ -6,15 +6,26 @@ CK Hello world
Motivation Motivation
------------------------------------- -------------------------------------
This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever. This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who
would like to optimize their pipelines and squeeze every performance drop by adding Composable
Kernel (CK) library to their projects. We would like to make the CK library approachable so
the tutorial is not based on the latest release and doesn't have all the bleeding edge features,
but it will be reproducible now and forever.
During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project. During this tutorial we will have an introduction to the CK library, we will build it and run some
examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go
in depth and breadth and get familiar with other tools and ways to integrate CK into your project.
------------------------------------- -------------------------------------
Description Description
------------------------------------- -------------------------------------
Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more. Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and
efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast
and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create
new ones. The library has components required for majority of modern neural networks architectures
including matrix multiplication, convolution, contraction, reduction, attention modules, variety of
activation functions, fused operators and many more.
So how do we (almost) reach the speed of light? CK acceleration abilities are based on: So how do we (almost) reach the speed of light? CK acceleration abilities are based on:
...@@ -24,15 +35,18 @@ So how do we (almost) reach the speed of light? CK acceleration abilities are ba ...@@ -24,15 +35,18 @@ So how do we (almost) reach the speed of light? CK acceleration abilities are ba
* Hardware acceleration use. * Hardware acceleration use.
* Support of low precision data types including fp16, bf16, int8 and int4. * Support of low precision data types including fp16, bf16, int8 and int4.
If you are excited and need more technical details and benchmarking results - read this awesome `blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_. If you are excited and need more technical details and benchmarking results - read this awesome
`blog post <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_.
For more details visit our `github repo <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_. For more details visit our `github repository <https://github.com/ROCmSoftwarePlatform/composable_kernel>`_.
------------------------------------- -------------------------------------
Hardware targets Hardware targets
------------------------------------- -------------------------------------
CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture CK library fully supports `gfx908` and `gfx90a` GPU architectures and only some operators are
supported for `gfx1030`. Let's check the hardware you have at hand and decide on the target
GPU architecture.
========== ========= ========== =========
GPU Target AMD GPU GPU Target AMD GPU
...@@ -42,7 +56,8 @@ gfx90a Radeon Instinct MI210, MI250, MI250X ...@@ -42,7 +56,8 @@ gfx90a Radeon Instinct MI210, MI250, MI250X
gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT
========== ========= ========== =========
There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if you don't have an AMD GPU at hand. There are also `cloud options <https://aws.amazon.com/ec2/instance-types/g4/>`_ you can find if
you don't have an AMD GPU at hand.
------------------------------------- -------------------------------------
Build the library Build the library
...@@ -54,9 +69,13 @@ First let's clone the library and rebase to the tested version:: ...@@ -54,9 +69,13 @@ First let's clone the library and rebase to the tested version::
cd composable_kernel/ cd composable_kernel/
git checkout tutorial_hello_world git checkout tutorial_hello_world
To make our lives easier we prepared `docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version. To make our lives easier we prepared
`docker images <https://hub.docker.com/r/rocm/composable_kernel>`_ with all the necessary
dependencies. Pick the right image and create a container. In this tutorial we use
``rocm/composable_kernel:ck_ub20.04_rocm5.6`` image, it is based on Ubuntu 20.04 and
ROCm v5.6.
If your current folder is ${HOME}, start the docker container with:: If your current folder is ``${HOME}``, start the docker container with::
docker run \ docker run \
-it \ -it \
...@@ -64,20 +83,23 @@ If your current folder is ${HOME}, start the docker container with:: ...@@ -64,20 +83,23 @@ If your current folder is ${HOME}, start the docker container with::
--group-add sudo \ --group-add sudo \
-w /root/workspace \ -w /root/workspace \
-v ${HOME}:/root/workspace \ -v ${HOME}:/root/workspace \
rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ rocm/composable_kernel:ck_ub20.04_rocm5.6 \
/bin/bash /bin/bash
If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure. If your current folder is different from ``${HOME}``, adjust the line ``-v ${HOME}:/root/workspace``
to fit your folder structure.
Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library:: Inside the docker container current folder is ``~/workspace``, library path is
``~/workspace/composable_kernel``, navigate to the library::
cd composable_kernel/ cd composable_kernel/
Create and go to the "build" directory:: Create and go to the ``build`` directory::
mkdir build && cd build mkdir build && cd build
In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag:: In the previous section we talked about target GPU architecture. Once you decide which one is right
for you, run CMake using the right ``GPU_TARGETS`` flag::
cmake \ cmake \
-D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_PREFIX_PATH=/opt/rocm \
...@@ -87,7 +109,7 @@ In the previous section we talked about target GPU architecture. Once you decide ...@@ -87,7 +109,7 @@ In the previous section we talked about target GPU architecture. Once you decide
-D BUILD_DEV=OFF \ -D BUILD_DEV=OFF \
-D GPU_TARGETS="gfx908;gfx90a;gfx1030" .. -D GPU_TARGETS="gfx908;gfx90a;gfx1030" ..
If everything went well the cmake run will end up with:: If everything went well the CMake run will end up with::
-- Configuring done -- Configuring done
-- Generating done -- Generating done
...@@ -118,9 +140,12 @@ We can also run them separately, here is a separate example execution:: ...@@ -118,9 +140,12 @@ We can also run them separately, here is a separate example execution::
./bin/example_gemm_xdl_fp16 1 1 1 ./bin/example_gemm_xdl_fp16 1 1 1
The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change. The arguments ``1 1 1`` mean that we want to run this example in the mode: verify results with CPU,
initialize matrices with integers and benchmark the kernel execution. You can play around with
these parameters and see how output and execution results change.
If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like:: If everything goes well and you have a device based on `gfx908` or `gfx90a` architecture you should see
something like::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
...@@ -130,14 +155,15 @@ If everything goes well and you have a device based on gfx908 or gfx90a architec ...@@ -130,14 +155,15 @@ If everything goes well and you have a device based on gfx908 or gfx90a architec
Start running 10 times... Start running 10 times...
Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1
Meanwhile, running it on a gfx1030 device should result in:: Meanwhile, running it on a `gfx1030` device should result in::
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem
But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like:: But don't panic, some of the operators are supported on `gfx1030` architecture, so you can run a
separate example like::
./bin/example_gemm_dl_fp16 1 1 1 ./bin/example_gemm_dl_fp16 1 1 1
...@@ -154,7 +180,14 @@ and it should result in something nice similar to:: ...@@ -154,7 +180,14 @@ and it should result in something nice similar to::
Start running 10 times... Start running 10 times...
Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1> Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1>
Or we can run a separate test:: .. note::
There was a new CMake flag ``DL_KERNELS`` added in the latest versions of CK. If you use one of
the newest versions of the library and do not see the above results when running
``example_gemm_dl_fp16``, it might be necessary to add ``-D DL_KERNELS=ON`` to your CMake command
in order to build the operators supported on the `gfx1030` architecture.
We can also run a separate test::
ctest -R test_gemm_fp16 ctest -R test_gemm_fp16
...@@ -169,6 +202,9 @@ If everything goes well you should see something like:: ...@@ -169,6 +202,9 @@ If everything goes well you should see something like::
Summary Summary
----------- -----------
In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task. In this tutorial we took the first look at the Composable Kernel library, built it on your system
and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different
configs to find out the best one for your hardware and task.
P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure! P.S.: Don't forget to switch off the cloud instance if you have launched one, you can find better
ways to spend your money for sure!
...@@ -2,11 +2,14 @@ add_custom_target(example_gemm_dl) ...@@ -2,11 +2,14 @@ add_custom_target(example_gemm_dl)
add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp) add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp)
add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp) add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_fp32) add_dependencies(example_gemm_dl example_gemm_dl_fp32)
add_dependencies(example_gemm_dl example_gemm_dl_fp16) add_dependencies(example_gemm_dl example_gemm_dl_fp16)
add_dependencies(example_gemm_dl example_gemm_dl_int8)
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_int8)
endif()
if(USE_BITINT_EXTENSION_INT4) if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp) add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp)
...@@ -19,13 +22,16 @@ add_custom_target(example_gemm_xdl) ...@@ -19,13 +22,16 @@ add_custom_target(example_gemm_xdl)
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp)
add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp) add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp)
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_fp16) add_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16) add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_dependencies(example_gemm_xdl example_gemm_xdl_int8)
add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16) add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16)
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_int8)
endif()
if(USE_BITINT_EXTENSION_INT4) if(USE_BITINT_EXTENSION_INT4)
add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp) add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_int4) add_dependencies(example_gemm_xdl example_gemm_xdl_int4)
...@@ -44,6 +50,8 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS ...@@ -44,6 +50,8 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS
add_dependencies(example_gemm_wmma example_gemm_wmma_fp16) add_dependencies(example_gemm_wmma example_gemm_wmma_fp16)
endif() endif()
add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp)
if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942") if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942")
add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp) add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_f8) add_dependencies(example_gemm_xdl example_gemm_xdl_f8)
......
...@@ -33,6 +33,19 @@ struct ProblemSize final ...@@ -33,6 +33,19 @@ struct ProblemSize final
ck::index_t StrideC = 4096; ck::index_t StrideC = 4096;
}; };
struct ProblemSizeStreamK final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t NumSKBlocks = -1;
};
struct ExecutionConfig final struct ExecutionConfig final
{ {
bool do_verification = true; bool do_verification = true;
...@@ -48,8 +61,17 @@ using Col = ck::tensor_layout::gemm::ColumnMajor; ...@@ -48,8 +61,17 @@ using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
inline bool template <typename ProblemType>
parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config) bool parse_cmd_args(int, char*[], ProblemType&, ExecutionConfig&)
{
return false;
}
template <>
bool parse_cmd_args<ProblemSize>(int argc,
char* argv[],
ProblemSize& problem_size,
ExecutionConfig& config)
{ {
if(argc == 1) if(argc == 1)
{ {
...@@ -87,3 +109,52 @@ parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfi ...@@ -87,3 +109,52 @@ parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfi
return true; return true;
} }
template <>
bool parse_cmd_args<ProblemSizeStreamK>(int argc,
char* argv[],
ProblemSizeStreamK& problem_size,
ExecutionConfig& config)
{
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc >= 10)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.M = std::stoi(argv[4]);
problem_size.N = std::stoi(argv[5]);
problem_size.K = std::stoi(argv[6]);
problem_size.StrideA = std::stoi(argv[7]);
problem_size.StrideB = std::stoi(argv[8]);
problem_size.StrideC = std::stoi(argv[9]);
if(argc >= 11)
{
problem_size.NumSKBlocks = std::stoi(argv[10]);
}
}
else
{
std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)"
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: NumSKBlocks(optional)" << std::endl;
return false;
}
return true;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment