Commit 3552041a authored by danyao12's avatar danyao12
Browse files

Merge branch 'develop' into ck_tile/fa_bwd_opt

parents e8927110 733f33af
* @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
* @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
# Documentation files
docs/* @ROCm/rocm-documentation @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
*.md @ROCm/rocm-documentation @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
*.rst @ROCm/rocm-documentation @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
.readthedocs.yaml @ROCm/rocm-documentation @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
docs/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
*.md @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
*.rst @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
.readthedocs.yaml @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
# Header directory for Doxygen documentation
library/include/* @ROCm/rocm-documentation @zjing14 @junliume @illsilin @carlushuang @aosewski @yigex
library/include/ @ROCm/rocm-documentation @junliume @illsilin @carlushuang @aosewski @poyenc @geyyer @bartekxk
File mode changed from 100644 to 100755
# Changelog for Composable Kernel
Full documentation for Composable Kernel is not yet available.
Documentation for Composable Kernel available at [https://rocm.docs.amd.com/projects/composable_kernel/en/latest/](https://rocm.docs.amd.com/projects/composable_kernel/en/latest/).
## CK for ROCm 6.1.0
## Composable Kernel 1.1.0 for ROCm 6.1.0
### Additions
* Added generic instances for GEMM XDL operations (#1161)
* Added gamma and beta parameters for the layernorm and groupnorm bwd operations (#1133)
* Introduced wrapper sublibrary (limited functionality). (#1071, #1098, #1108, #1126)
* Added an option to vary the number of warm-up cycles and iterations for ckProfiler (#1124)
### Optimizations
* New performance optimizations for GEMM operations on MI200 and MI300 architectures (#1135)
### Fixes
* Reduced the build time for most GPU architectures (#1084)
* Fixed some conversion issues for fp8 data type (#1099)
### Changes
None
### Known issues
None
## CK for ROCm 6.0.0
## Composable Kernel 1.1.0 for ROCm 6.0.0
### Fixes
* Fixed a hazard associated with inline v_dot (#808)
* Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
* Fixed a hazard associated with inline v_dot (#808)
* Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
### Optimizations
None
### Additions
* Added an image to a column kernel (#867)
* Added a column to an image kernel (#930)
* Support for 3D grouped convolution on RDNA 3 GPUs (#935, #950, #985)
......@@ -42,18 +50,22 @@ None
* Support for Batched GEMM DL (#732)
### Changes
* Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
## CK 0.2.0 for ROCm 5.7.0
* Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
## Composable Kernel 0.2.0 for ROCm 5.7.0
### Fixes
* Fixed a bug in 6-dimensional kernels (#555)
* Fixed a test case failure with grouped convolution backward weight (#524)
### Optimizations
* Improved the performance of the normalization kernel
### Additions
* New CMake flags:
* "DL_KERNELS"-* Must be set to "ON" in order to build the GEMM DL and batched_gemm_multi_d_dl instances
* "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
......@@ -71,4 +83,5 @@ None
* MaxPool and AvgPool forward (#815); MaxPool backward (#750)
### Changes
None
......@@ -111,8 +111,16 @@ message("checking which targets are supported")
#These targets will be filtered and only supported ones will be used
#Setting GPU_TARGETS on command line will override this list
if(NOT PROFILER_ONLY)
if(NOT ENABLE_ASAN_PACKAGING)
#build CK for all supported targets
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102")
TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201")
else()
#build CK only for xnack-supported targets
rocm_check_target_ids(DEFAULT_GPU_TARGETS
TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+")
set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE)
endif()
else()
add_definitions(-DPROFILER_ONLY)
set(GPU_TARGETS "" CACHE STRING "" FORCE)
......@@ -148,7 +156,7 @@ if (GPU_TARGETS)
add_definitions(-DCK_USE_XDL)
set(CK_USE_XDL "ON")
endif()
if (GPU_TARGETS MATCHES "gfx11")
if (GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12")
add_definitions(-DCK_USE_WMMA)
set(CK_USE_WMMA "ON")
endif()
......@@ -227,6 +235,16 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
# https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_macros.html
# _GLIBCXX_ASSERTIONS
# Undefined by default. When defined, enables extra error checking in the form of
# precondition assertions, such as bounds checking in strings and null pointer
# checks when dereferencing smart pointers
option(USE_GLIBCXX_ASSERTIONS "Turn on additional c++ library checks." OFF)
if(USE_GLIBCXX_ASSERTIONS)
add_compile_options(-Wp,-D_GLIBCXX_ASSERTIONS)
endif()
## HIP
set(CMAKE_HIP_PLATFORM amd)
set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
......@@ -442,6 +460,13 @@ if(BUILD_DEV)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
add_compile_options(-fcolor-diagnostics)
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9)
add_compile_options(-fdiagnostics-color=always)
endif()
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
......
......@@ -23,11 +23,11 @@ RUN if [ "$ROCMVERSION" != "6.2" ]; then \
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - && \
sh -c "echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] $DEB_ROCM_REPO focal main > /etc/apt/sources.list.d/rocm.list" && \
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCMVERSION/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list'; \
elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc1" ]; then \
elif [ "$ROCMVERSION" = "6.2" ] && [ "$compiler_version" = "rc3" ]; then \
sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amdgpu-install-internal_6.2-20.04-1_all.deb --no-check-certificate" && \
apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install dialog libpopt0 rsync && DEBIAN_FRONTEND=noninteractive apt-get install ./amdgpu-install-internal_6.2-20.04-1_all.deb && \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-8 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=1794148; \
sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 6.2 rel-45 > /etc/apt/sources.list.d/rocm-build.list' && \
amdgpu-repo --amdgpu-build=2003709; \
fi
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"
......
......@@ -315,6 +315,10 @@ def buildHipClangJob(Map conf=[:]){
if (params.COMPILER_VERSION == "amd-staging" || params.COMPILER_VERSION == "amd-mainline-open" || params.COMPILER_COMMIT != ""){
dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' "
}
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
def variant = env.STAGE_NAME
......@@ -366,6 +370,11 @@ def runCKProfiler(Map conf=[:]){
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1 "
}
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3')
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3')
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"
def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' "
def variant = env.STAGE_NAME
......@@ -653,7 +662,7 @@ def process_results(Map conf=[:]){
}
//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1;
CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.1; RUN_CK_TILE_TESTS=true
0 21 * * * % ROCMVERSION=6.1;hipTensor_test=true
0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;COMPILER_COMMIT=;USE_SCCACHE=false
0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline-open;COMPILER_COMMIT=;USE_SCCACHE=false
......@@ -661,6 +670,9 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCM
pipeline {
agent none
triggers {
parameterizedCron(CRON_SETTINGS)
}
options {
parallelsAlwaysFailFast()
}
......@@ -721,6 +733,10 @@ pipeline {
name: "RUN_CODEGEN_TESTS",
defaultValue: true,
description: "Run the codegen tests (default: ON)")
booleanParam(
name: "RUN_CK_TILE_TESTS",
defaultValue: false,
description: "Run the ck_tile tests (default: OFF)")
booleanParam(
name: "BUILD_INSTANCES_ONLY",
defaultValue: false,
......@@ -813,7 +829,6 @@ pipeline {
beforeAgent true
expression { params.RUN_CODEGEN_TESTS.toBoolean() }
}
options { retry(2) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = "NO_CK_BUILD"
......@@ -830,6 +845,52 @@ pipeline {
}
}
}
}
stage("Run CK_TILE Tests")
{
parallel
{
stage("Run CK_TILE Tests on gfx90a")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
stage("Run CK_TILE Tests on gfx942")
{
when {
beforeAgent true
expression { params.RUN_CK_TILE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx942") }
environment{
setup_args = "NO_CK_BUILD"
execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \
make -j64 tile_example_fmha_fwd tile_example_fmha_bwd && \
cd ../ &&
example/ck_tile/01_fmha/script/smoke_test_fwd.sh && \
example/ck_tile/01_fmha/script/smoke_test_bwd.sh"""
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Build CK and run Tests")
{
......@@ -886,10 +947,10 @@ pipeline {
}
agent{ label rocmnode("gfx90a") }
environment{
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1100;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
-DGPU_TARGETS="gfx908;gfx90a" \
-DGPU_TARGETS="gfx1100;gfx90a" \
-DCMAKE_CXX_COMPILER="${build_compiler()}" \
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
}
......@@ -970,7 +1031,7 @@ pipeline {
beforeAgent true
expression { params.RUN_PERFORMANCE_TESTS.toBoolean() }
}
options { retry(2) }
options { retry(1) }
agent{ label rocmnode("gfx90a")}
environment{
setup_args = """ -DGPU_TARGETS="gfx90a" -DBUILD_DEV=On """
......
......@@ -39,6 +39,14 @@ target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE c
add_executable(client_conv3d_fwd_convinvscale_fp8
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
# Fwd convscale + Bias
add_executable(client_conv3d_fwd_convscale_add_fp8
grouped_convnd_fwd_convscale_add/conv3d_fwd_convscale_add_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_kernel::device_conv_operations)
# Fwd convscale + ReLU
add_executable(client_conv3d_fwd_convscale_relu_fp8
grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations)
# Fwd convscale
add_executable(client_conv3d_fwd_convscale_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp)
......@@ -51,6 +59,10 @@ target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel:
add_executable(client_conv3d_fwd_convscale_fp8_bf8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_conv3d_fwd_convscale_bf8_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
# Bwd data bilinear
add_executable(client_grouped_convnd_bwd_data_bilinear_residual_fp16
grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp)
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::bf8_t;
using WeiDataType = ck::f8_t;
using CShuffleDataType = float;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::bf8_t;
using BComputeDataType = ck::f8_t;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int main()
{
return run_grouped_conv_fwd_convscale<NumDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout,
3,
AComputeDataType,
BComputeDataType>(
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
? EXIT_SUCCESS
: EXIT_FAILURE;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <string>
#include <vector>
#include "ck/ck.hpp"
#include "ck/utility/type.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_add.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ConvScaleAdd = ck::tensor_operation::element_wise::ConvScaleAdd;
using F32 = float;
using BiasDataType = F32;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths,
const std::size_t& ds_size)
{
// G * N * C * <output spatial lengths product> * (2 * K * <filter spatial lengths product> +
// <number of scale factors>)
ck::index_t G = weights_lengths[0];
ck::index_t N = output_lengths[1];
ck::index_t K = weights_lengths[1];
ck::index_t C = weights_lengths[2];
return G * N * C *
std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) *
(static_cast<std::size_t>(2) * K *
std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
std::end(weights_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) +
ds_size);
}
template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
return sizeof(InDataType) * std::accumulate(std::begin(input_lengths),
std::end(input_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
return sizeof(WeiDataType) * std::accumulate(std::begin(weights_lengths),
std::end(weights_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
}
template <ck::index_t NumDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout,
ck::index_t NumNonSpatialDim = 3,
typename AComputeType = InDataType,
typename BComputeType = AComputeType>
bool run_grouped_conv_fwd_convscale_add(
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
{
namespace ctc = ck::tensor_layout::convolution;
static_assert(NumDimSpatial == 3 && ck::is_same_v<InLayout, ctc::NDHWGC> &&
ck::is_same_v<WeiLayout, ctc::GKZYXC> &&
ck::is_same_v<OutLayout, ctc::NDHWGK>,
"Unsupported configuration");
const ck::index_t G = in_lengths[4];
const ck::index_t N = in_lengths[0];
const ck::index_t K = wei_lengths[1];
const ck::index_t C = in_lengths[5];
const ck::index_t Z = wei_lengths[2];
const ck::index_t Y = wei_lengths[3];
const ck::index_t X = wei_lengths[4];
const ck::index_t Di = in_lengths[1];
const ck::index_t Hi = in_lengths[2];
const ck::index_t Wi = in_lengths[3];
const ck::index_t Do = out_lengths[1];
const ck::index_t Ho = out_lengths[2];
const ck::index_t Wo = out_lengths[3];
const std::size_t in_mem_size = sizeof(InDataType) * N * Di * Hi * Wi * G * C;
const std::size_t wei_mem_size = sizeof(WeiDataType) * G * K * Z * Y * X * C;
const std::size_t out_mem_size = sizeof(OutDataType) * N * Do * Ho * Wo * G * K;
const std::size_t bias_mem_size = sizeof(BiasDataType) * N * Do * Ho * Wo * G * K;
SimpleDeviceMem in(in_mem_size);
SimpleDeviceMem wei(wei_mem_size);
SimpleDeviceMem out(out_mem_size);
SimpleDeviceMem bias(bias_mem_size);
float scale_in = float(std::rand()) / float(RAND_MAX);
float scale_wei = float(std::rand()) / float(RAND_MAX);
float scale_out = float(std::rand()) / float(RAND_MAX);
// We have NDHWGC/GKZYXC/NDHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
const std::array<ck::index_t, NumDimSpatial + 3> input_lengths{G, N, C, Di, Hi, Wi};
const std::array<ck::index_t, NumDimSpatial + 3> input_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
const std::array<ck::index_t, NumDimSpatial + 3> weights_lengths{G, K, C, Z, Y, X};
const std::array<ck::index_t, NumDimSpatial + 3> weights_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
const std::array<ck::index_t, NumDimSpatial + 3> output_lengths{G, N, K, Do, Ho, Wo};
const std::array<ck::index_t, NumDimSpatial + 3> output_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
const std::array<ck::index_t, NumDimSpatial + 3> bias_lengths{G, N, K, Do, Ho, Wo};
const std::array<ck::index_t, NumDimSpatial + 3> bias_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
const std::array<ck::index_t, NumDimSpatial> conv_filter_strides{1, 1, 1};
const std::array<ck::index_t, NumDimSpatial> conv_filter_dilations{1, 1, 1};
const std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
const std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
std::size_t ds_size = 3 + 1; // 3 element-wise scale multipliers + 1 elementwise Bias
std::size_t flop = GetFlops<NumDimSpatial>(output_lengths, weights_lengths, ds_size);
std::size_t num_bytes = in_mem_size + wei_mem_size + sizeof(float) + sizeof(float) +
sizeof(float) + out_mem_size + bias_mem_size;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<BiasDataType>,
OutDataType,
PassThrough,
PassThrough,
ConvScaleAdd,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
std::array<const void*, 1>{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
input_lengths,
input_strides,
weights_lengths,
weights_strides,
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 1>{
{bias_lengths}},
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 1>{
{bias_strides}},
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ConvScaleAdd{scale_in, scale_wei, scale_out});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return false;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
std::array<const void*, 1>{bias.GetDeviceBuffer()},
out.GetDeviceBuffer(),
input_lengths,
input_strides,
weights_lengths,
weights_strides,
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 1>{
{bias_lengths}},
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 1>{
{bias_strides}},
output_lengths,
output_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ConvScaleAdd{scale_in, scale_wei, scale_out});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return true;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using CShuffleDataType = float;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int main()
{
return run_grouped_conv_fwd_convscale_add<NumDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout,
3,
AComputeDataType,
BComputeDataType>(
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
? EXIT_SUCCESS
: EXIT_FAILURE;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <string>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
template <ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetFlops(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths,
const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths,
const std::size_t& ds_size)
{
// G * N * C * <output spatial lengths product> * (2 * K * <filter spatial lengths product> +
// <number of scale factors>)
ck::index_t G = weights_lengths[0];
ck::index_t N = output_lengths[1];
ck::index_t K = weights_lengths[1];
ck::index_t C = weights_lengths[2];
return G * N * C *
std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) *
(static_cast<std::size_t>(2) * K *
std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim),
std::end(weights_lengths),
static_cast<std::size_t>(1),
std::multiplies<>()) +
ds_size);
}
template <typename InDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetInputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& input_lengths)
{
// sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
return sizeof(InDataType) * std::accumulate(std::begin(input_lengths),
std::end(input_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
template <typename WeiDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetWeightByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& weights_lengths)
{
// sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
return sizeof(WeiDataType) * std::accumulate(std::begin(weights_lengths),
std::end(weights_lengths),
static_cast<std::size_t>(1),
std::multiplies<>());
}
template <typename OutDataType, ck::index_t NumDimSpatial, ck::index_t NumNonSpatialDim = 3>
std::size_t
GetOutputByte(const std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>& output_lengths)
{
// sizeof(OutDataType) * (G * N * K * <output spatial lengths product>);
return sizeof(OutDataType) * std::accumulate(std::begin(output_lengths),
std::end(output_lengths),
static_cast<std::size_t>(1),
std::multiplies<std::size_t>());
}
template <ck::index_t NumDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout,
ck::index_t NumNonSpatialDim = 3,
typename AComputeType = InDataType,
typename BComputeType = AComputeType>
bool run_grouped_conv_fwd_convscale_relu(
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_lengths,
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_lengths,
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_lengths)
{
std::size_t in_mem_size = GetInputByte<InDataType, NumDimSpatial>(in_lengths);
std::size_t wei_mem_size = GetWeightByte<WeiDataType, NumDimSpatial>(wei_lengths);
std::size_t out_mem_size = GetOutputByte<OutDataType, NumDimSpatial>(out_lengths);
SimpleDeviceMem in(in_mem_size);
SimpleDeviceMem wei(wei_mem_size);
SimpleDeviceMem out(out_mem_size);
float scale_in = float(std::rand()) / float(RAND_MAX);
float scale_wei = float(std::rand()) / float(RAND_MAX);
float scale_out = float(std::rand()) / float(RAND_MAX);
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> in_strides;
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> wei_strides;
std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim> out_strides;
in_strides.fill(0);
wei_strides.fill(0);
out_strides.fill(0);
in_strides.back() = 1;
wei_strides.back() = 1;
out_strides.back() = 1;
std::partial_sum(rbegin(in_lengths),
std::prev(rend(in_lengths)),
std::next(rbegin(in_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(wei_lengths),
std::prev(rend(wei_lengths)),
std::next(rbegin(wei_strides)),
std::multiplies<>{});
std::partial_sum(rbegin(out_lengths),
std::prev(rend(out_lengths)),
std::next(rbegin(out_strides)),
std::multiplies<>{});
// transpose NDHWGC/KZYXGC/NDHWGK to GNDHWC/GKZYXC/GNDHWK to GNCDHW/GKCZYX/GNKDHW
std::rotate(std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 2), rend(in_lengths));
std::rotate(rbegin(in_lengths),
std::next(rbegin(in_lengths)),
std::next(rbegin(in_lengths), NumDimSpatial + 1));
std::rotate(std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 2), rend(in_strides));
std::rotate(rbegin(in_strides),
std::next(rbegin(in_strides)),
std::next(rbegin(in_strides), NumDimSpatial + 1));
std::rotate(rbegin(wei_lengths),
std::next(rbegin(wei_lengths)),
std::next(rbegin(wei_lengths), NumDimSpatial + 1));
std::rotate(rbegin(wei_strides),
std::next(rbegin(wei_strides)),
std::next(rbegin(wei_strides), NumDimSpatial + 1));
std::rotate(
std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 2), rend(out_lengths));
std::rotate(rbegin(out_lengths),
std::next(rbegin(out_lengths)),
std::next(rbegin(out_lengths), NumDimSpatial + 1));
std::rotate(
std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 2), rend(out_strides));
std::rotate(rbegin(out_strides),
std::next(rbegin(out_strides)),
std::next(rbegin(out_strides), NumDimSpatial + 1));
std::array<ck::index_t, NumDimSpatial> conv_filter_strides;
std::array<ck::index_t, NumDimSpatial> conv_filter_dilations;
std::array<ck::index_t, NumDimSpatial> input_left_pads;
std::array<ck::index_t, NumDimSpatial> input_right_pads;
conv_filter_strides.fill(1);
conv_filter_dilations.fill(1);
input_left_pads.fill(1);
input_right_pads.fill(1);
std::size_t ds_size = 3 + 1; // 3 element-wise scale multipliers + 1 elementwise Relu
std::size_t flop = GetFlops<NumDimSpatial>(out_lengths, wei_lengths, ds_size);
std::size_t num_bytes =
in_mem_size + wei_mem_size + sizeof(float) + sizeof(float) + sizeof(float) + out_mem_size;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
PassThrough,
PassThrough,
ConvScaleRelu,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
std::array<const void*, 0>{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ConvScaleRelu{scale_in, scale_wei, scale_out});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return false;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
std::array<const void*, 0>{},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
std::array<std::array<ck::index_t, NumDimSpatial + NumNonSpatialDim>, 0>{},
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ConvScaleRelu{scale_in, scale_wei, scale_out});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return true;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using CShuffleDataType = float;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int main()
{
return run_grouped_conv_fwd_convscale_relu<NumDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout,
3,
AComputeDataType,
BComputeDataType>(
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
? EXIT_SUCCESS
: EXIT_FAILURE;
}
......@@ -7,19 +7,23 @@
#include <initializer_list>
#include <vector>
#include "ck/utility/common_header.hpp"
// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
struct SimpleDeviceMem
{
......@@ -204,6 +208,14 @@ void PerformGemm(const ck::index_t M,
int main(int argc, char* argv[])
{
bool is_supported = ck::is_xdl_supported();
if(!is_supported)
{
std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
using DataType = ck::half_t;
const auto thread_layout =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<64>{}, ck::Number<4>{}),
......@@ -213,3 +225,4 @@ int main(int argc, char* argv[])
3840, 4096, 4096, tile_shape, thread_layout);
return 0;
}
#endif
......@@ -7,18 +7,21 @@
#include <initializer_list>
#include <vector>
#include "ck/library/utility/host_tensor.hpp"
#include "ck/utility/common_header.hpp"
// __gfx9__ defined in the above header via ck.hpp
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/wrapper/layout.hpp"
#include "ck/wrapper/tensor.hpp"
#include "ck/wrapper/operations/copy.hpp"
#include "ck/wrapper/operations/gemm.hpp"
#include "ck/wrapper/utils/kernel_utils.hpp"
#include "ck/host_utility/device_prop.hpp"
struct SimpleDeviceMem
{
......@@ -296,6 +299,14 @@ void PerformGemm(const ck::index_t M,
int main(int argc, char* argv[])
{
bool is_supported = ck::is_xdl_supported();
if(!is_supported)
{
std::cout << "WARNING: xdl example not supported on the platform " << ck::get_device_name()
<< std::endl;
return 0;
}
using DataType = ck::half_t;
const auto thread_layout =
ck::wrapper::make_layout(ck::make_tuple(ck::Number<4>{}, ck::Number<64>{}, ck::Number<1>{}),
......@@ -305,3 +316,4 @@ int main(int argc, char* argv[])
3840, 4096, 4096, tile_shape, thread_layout);
return 0;
}
#endif
rocm-docs-core==1.4.1
rocm-docs-core==1.6.0
sphinxcontrib-bibtex==2.6.2
......@@ -103,7 +103,7 @@ requests==2.31.0
# via
# pygithub
# sphinx
rocm-docs-core==1.4.1
rocm-docs-core==1.6.0
# via -r requirements.in
six==1.16.0
# via
......
......@@ -22,6 +22,8 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16)
add_example_executable(example_gemm_xdl_fp16_v2 gemm_xdl_fp16_v2.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v2)
add_example_executable(example_gemm_xdl_fp16_streamk_v3 gemm_xdl_fp16_streamk_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_streamk_v3)
add_example_executable(example_gemm_xdl_fp16_v3 gemm_xdl_fp16_v3.cpp)
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_v3)
add_example_executable(example_gemm_xdl_fp8_v3 gemm_xdl_fp8_v3.cpp)
......
......@@ -7,3 +7,21 @@
#arg3: run kernel # of times (>1)
./bin/example_gemm_xdl 0 1 5
```
# Instructions for ```example_gemm_xdl_fp16_streamk_v3```
## Run ```example_gemm_xdl_fp16_streamk_v3```
```bash
arg1: verification (0=no, 1=yes)
arg2: initialization (0=no init, 1=integer value, 2=decimal value)
arg3: time kernel (0=no, 1=yes)
arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)
arg11: Grid_size(-1 for max occupancy)
bin/example_gemm_xdl_fp16_streamk_v3 1 2 1 3840 4096 4096 4096 4096 4096 1 -1
a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1}
b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1}
c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1}
problem {M:3840, N:4096, K:4096, SA:4096, SB:4096, SC:4096, MP:4032, NP:4096, KRead:4096, KP:4096, AK0:512, BK0:2048, MBlock: 18, NBlock: 16, Stream-K Selection:1, Grid size:-1}
Perf: 0.292022 ms, 441.23 TFlops, 330.348 GB/s, DeviceGemmXdlUniversal<MNPadding, RRR> BlkSize: 256, BlkTile: 224x256x64, WaveTile: 16x16, WaveMap: 7x8, VmemReadVec: 8x8, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3, BlkGemmPipelinePrefetchStages: 2
```
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -45,6 +45,19 @@ struct ProblemSizeStreamK final
ck::index_t NumSKBlocks = -1;
};
struct ProblemSizeStreamK_universal final
{
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
ck::index_t Grid_size = -1; // defaults to max occupancy
ck::index_t Streamk_sel = 1; // defaults to 1-tile SK
};
struct ProblemSizeSplitK final
{
......@@ -123,6 +136,57 @@ bool parse_cmd_args<ProblemSize>(int argc,
return true;
}
template <>
bool parse_cmd_args<ProblemSizeStreamK_universal>(int argc,
char* argv[],
ProblemSizeStreamK_universal& problem_size,
ExecutionConfig& config)
{
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
}
else if(argc >= 10)
{
config.do_verification = std::stoi(argv[1]);
config.init_method = std::stoi(argv[2]);
config.time_kernel = std::stoi(argv[3]);
problem_size.M = std::stoi(argv[4]);
problem_size.N = std::stoi(argv[5]);
problem_size.K = std::stoi(argv[6]);
problem_size.StrideA = std::stoi(argv[7]);
problem_size.StrideB = std::stoi(argv[8]);
problem_size.StrideC = std::stoi(argv[9]);
if(argc >= 11)
{
problem_size.Streamk_sel = std::stoi(argv[10]);
problem_size.Grid_size = std::stoi(argv[11]);
}
}
else
{
std::cerr
<< "arg1: verification (0=no, 1=yes)" << std::endl
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: stream-k select (-1: default config, 0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
return false;
}
return true;
}
template <>
bool parse_cmd_args<ProblemSizeStreamK>(int argc,
char* argv[],
......@@ -165,7 +229,8 @@ bool parse_cmd_args<ProblemSizeStreamK>(int argc,
<< std::endl
<< "arg3: time kernel (0=no, 1=yes)" << std::endl
<< "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl
<< "arg10: NumSKBlocks(optional)" << std::endl;
<< "arg10: stream-k select (0: all DP, 1: 1-tile SK, 2: 2-tile SK)"
<< "\narg11: Grid_size(-1 for max occupancy)" << std::endl;
return false;
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp"
using ADataType = ck::half_t;
using BDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using CDataType = ck::half_t;
using ALayout = Row;
using BLayout = Row;
using CLayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off
using DeviceGemmV2_Streamk_Instance =
ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle_Streamk_V3<
ALayout, BLayout, CLayout,
ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,
PassThrough, PassThrough, PassThrough, GemmDefault,
256,
224, 256,
64, 8, 2,
16, 16,
7, 8,
S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
2, 8, 8, 0,
S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
1, 8, 2, 0,
1, 2, S<1, 32, 1, 8>, 8,
ck::BlockGemmPipelineScheduler::Intrawave,ck::BlockGemmPipelineVersion::v3>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_gemm_example_streamk_v2.inc"
int main(int argc, char* argv[]) { return !run_gemm_universal_streamk_example(argc, argv); }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment