Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
@@ -45,7 +45,6 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
@@ -71,17 +70,17 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
## half
#find_path(HALF_INCLUDE_DIR half.hpp)
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/half/include")
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
rocm_create_package(
NAME CK-${CK_BACKEND}
DESCRIPTION "High Performance Composable Kernels for AMD GPUs"
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
LDCONFIG
)
## half
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/include/half")
message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")
## tidy
include(EnableCompilerWarnings)
set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
@@ -184,7 +183,6 @@ enable_clang_tidy(
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer
${CK_TIDY_CHECKS}
${CK_TIDY_ERRORS}
HEADER_FILTER
@@ -214,70 +212,36 @@ enable_cppcheck(
unmatchedSuppression
FORCE
SOURCES
host/host_tensor/src
host/driver_offline/src
composable_kernel/src/kernel_wrapper
library/src
INCLUDE
host/host_tensor/include
host/device/include
host/solver/include
host/driver_offline/include
composable_kernel/include/*
${CMAKE_CURRENT_SOURCE_DIR}/include
${CMAKE_CURRENT_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/library/include
DEFINE
CPPCHECK=1
__linux__=1
)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)
file(GLOB_RECURSE COMPOSABLE_KERNEL_HEADERS "composable_kernel/include/*/*.hpp")
file(GLOB_RECURSE DEVICE_OPS_HEADERS "device_operation/include/*.hpp")
file(GLOB_RECURSE DEVICE_OPS_SOURCE "device_operation/*.cpp")
configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
set(CK_HEADERS ${COMPOSABLE_KERNEL_HEADERS} ${DEVICE_OPS_HEADERS})
set(CK_SOURCE ${DEVICE_OPS_SOURCE})
add_library(composable_kernel
${CK_SOURCE}
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/device_operation/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/host/include>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/host/host_tensor/include>
)
# The following should eventually be removed
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/utility>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_operation>
)
target_include_directories(composable_kernel PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/composable_kernel/include/tensor_description>
)
# clang_tidy_check(composable_kernel)
SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
target_compile_options(composable_kernel PRIVATE -Werror)
target_compile_options(composable_kernel PRIVATE -Weverything)
add_compile_options(-Werror)
add_compile_options(-Weverything)
endif()
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/hip_version.hpp")
add_subdirectory(host)
add_subdirectory(device_operation)
add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(profiler)
add_subdirectory(test)
add_subdirectory(profiler)
FROM ubuntu:18.04
ARG ROCMVERSION=4.5
ARG ROCMVERSION=5.0
ARG OSDB_BKC_VERSION
RUN set -xe
@@ -17,7 +17,7 @@ def cmake_build(Map conf=[:]){
def compiler = conf.get("compiler","/opt/rocm/bin/hipcc")
def config_targets = conf.get("config_targets","check")
def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "")
def build_envs = "CTEST_PARALLEL_LEVEL=4 MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0 " + conf.get("build_env","")
def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
def prefixpath = conf.get("prefixpath","/opt/rocm")
def setup_args = conf.get("setup_args","")
@@ -60,7 +60,8 @@ def cmake_build(Map conf=[:]){
cd build
"""
def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. ")
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(nproc) ${config_targets}")
// reduce parallelism when compiling; clang uses too much memory
def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}")
def execute_cmd = conf.get("execute_cmd", "")
def cmd = conf.get("cmd", """
@@ -177,15 +178,27 @@ pipeline {
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
stage('Build Profiler: gfx908')
stage('Build Profiler: Release, gfx908')
{
agent { label rocmnode("gfx908")}
agent { label rocmnode("nogpu")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
build_cmd = "make -j\$(nproc) -k ckProfiler"
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, build_cmd:build_cmd, no_reboot:true, build_type: 'Release')
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
}
}
stage('Build Profiler: Debug, gfx908')
{
agent { label rocmnode("nogpu")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
// allow failure until the debug build is stabilized (currently hits compiler crashes)
catchError(buildResult: 'SUCCESS', stageResult: 'FAILURE') {
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Debug')
}
}
}
stage('Clang Format') {
@@ -207,6 +220,24 @@ pipeline {
}
}
}
stage("Tests")
{
parallel
{
stage("Run Tests: gfx908")
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
}
}
}
}
// enable after the cmake file supports packaging
// stage("Packages") {
// when {
@@ -222,4 +253,4 @@ pipeline {
// }
// }
}
}
\ No newline at end of file
}
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
template <typename GridwiseOp, typename... Xs>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
run_gridwise_operation(Xs... xs)
{
GridwiseOp{}.Run(xs...);
}
#endif
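For reference, run_gridwise_operation simply forwards its arguments to a default-constructed GridwiseOp, so it is launched like any other HIP kernel. Below is a minimal, self-contained sketch of such a launch; the ToyGridwiseOp functor and the grid/block sizes are illustrative assumptions and are not part of this commit.

#include <hip/hip_runtime.h>

// Illustrative (hypothetical) functor: each thread writes its global id.
// Any type with a __device__ Run(...) method can be dispatched through
// run_gridwise_operation in the same way.
struct ToyGridwiseOp
{
    __device__ void Run(int* out) const
    {
        const int gid = blockIdx.x * blockDim.x + threadIdx.x;
        out[gid] = gid;
    }
};

void launch_toy_op(int* d_out, int num_blocks, int block_size)
{
    // run_gridwise_operation<GridwiseOp, Xs...> is the kernel defined above.
    hipLaunchKernelGGL((run_gridwise_operation<ToyGridwiseOp, int*>),
                       dim3(num_blocks),
                       dim3(block_size),
                       0,       // dynamic LDS size in bytes
                       nullptr, // default stream
                       d_out);
}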
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_blockwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto in_block_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
__syncthreads();
// do element-wise pre-reduction operation
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
auto in_block_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
int indexOffset = 0;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
// load block data from global memory to LDS; double buffering is not used yet (to be improved)
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
__syncthreads();
// construct the indices for the current toReduce blocks
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
indexOffset += BlockBufferSize;
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
__shared__ int block_indices_buffer[BlockBufferSize];
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
auto in_block_idx_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_1d_id = get_block_1d_id();
constexpr auto in_block_desc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}, Number<BlockBufferSize>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_val_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
auto blockwise_src_idx_load =
BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
int,
int,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(src2dDesc,
make_multi_index(block_global_1d_id, 0),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
// load block data from global memory to LDS; double buffering is not used yet (to be improved)
blockwise_src_val_load.RunRead(src2dDesc, src_global_val_buf);
blockwise_src_idx_load.RunRead(src2dDesc, src_global_idx_buf);
blockwise_src_val_load.RunWrite(in_block_desc, in_block_val_buf);
blockwise_src_idx_load.RunWrite(in_block_desc, in_block_idx_buf);
__syncthreads();
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
blockwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
blockwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(block_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
};
} // namespace ck
#endif
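A reading aid for the loop structure above: each block reduces one row of the 2D source in LDS tiles of BlockBufferSize = GredAccessesPerThreadInBlock * BlockSize elements, and the final iteration reduces only the leftover sub-tiles. The host-side C++ sketch below (function and parameter names chosen for illustration only) reproduces the toReduceBlocks / BlocksInOneOp arithmetic so it can be checked in isolation.

#include <cassert>
#include <cstdio>

// Mirrors the tiling arithmetic in GridwiseReduction_xy_to_x_blockwise::Run:
// a row of toReduceLength elements is consumed in tiles of
// BlockSize * GredAccessesPerThreadInBlock elements, and the last iteration
// reduces only the remaining sub-tiles (BlocksInOneOp).
void print_tiling(int toReduceLength, int BlockSize, int GredAccessesPerThreadInBlock)
{
    const int toReduceBlocks = (toReduceLength + BlockSize - 1) / BlockSize;

    for(int reducedBlocks = 0; reducedBlocks < toReduceBlocks;
        reducedBlocks += GredAccessesPerThreadInBlock)
    {
        const int BlocksInOneOp =
            (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                ? GredAccessesPerThreadInBlock
                : toReduceBlocks - reducedBlocks;

        assert(BlocksInOneOp >= 1 && BlocksInOneOp <= GredAccessesPerThreadInBlock);
        std::printf("iteration at block %d reduces %d sub-tile(s)\n",
                    reducedBlocks, BlocksInOneOp);
    }
}

int main()
{
    print_tiling(3000, /*BlockSize=*/256, /*GredAccessesPerThreadInBlock=*/4);
    return 0;
}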
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredThreadBufferLength>
struct GridwiseReduction_xy_to_x_direct_threadwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// do element-wise pre-reduction operation
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_buf;
using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
index_t indexStart = 0;
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce2(
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);
indexStart += GredThreadBufferLength;
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true> in_thread_idx_buf;
using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
decltype(in_thread_idx_buf),
opReduce,
nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
int,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc, make_multi_index(thread_global_1d_id, 0));
constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += GredThreadBufferLength)
{
threadwise_src_val_load.Run(src2dDesc,
src_global_val_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(src2dDesc,
src_global_idx_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_idx_buf);
// do the reduction on the Thread Buffer
threadwise_reduce::Reduce(
in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
false>(
dst1dDesc, make_multi_index(thread_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
false>(dst1dDesc,
make_multi_index(thread_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
};
};
} // namespace ck
#endif
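All Run variants above share the same alpha/beta epilogue: the accumulated value is scaled by alpha (skipped when alpha equals one), converted to dstDataType, and, when beta is non-zero, blended with the value already stored at the destination. A scalar restatement of that epilogue follows, using float for both data types purely for illustration.

#include <cstdio>

// Scalar restatement of the epilogue shared by the Run<1>/Run<2>/Run<3> variants:
// dst = dstDataType(alpha * accumulated) + beta * previous_dst,
// where the scaling steps are skipped for alpha == 1 and beta == 0.
float reduction_epilogue(float accumulated, float alpha, float beta, float previous_dst)
{
    if(alpha != 1.0f)
        accumulated *= alpha;

    float dst = accumulated; // stands in for type_convert<dstDataType>(...)

    if(beta != 0.0f)
        dst += previous_dst * beta;

    return dst;
}

int main()
{
    // reduced value 10, alpha 2, beta 0.5, previous output 4 -> 2*10 + 0.5*4 = 22
    std::printf("%f\n", reduction_epilogue(10.0f, 2.0f, 0.5f, 4.0f));
    return 0;
}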
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_warpwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
typename dstDataType,
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
bool isFirstCall,
bool isLastCall,
index_t GredAccessesPerThreadInWarp>
struct GridwiseReduction_xy_to_x_direct_warpwise
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
using posUnaryOpType =
typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
(void)indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_buf;
using warpwise_reduce =
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const posUnaryOpType posUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// do element-wise pre-reduction operation
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(
dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf(I0) * beta;
}
auto threadwise_dst_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)ws_indices_global;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_buf;
using warpwise_reduce =
WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
index_t indexOffset = 0;
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_load.Run(
src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);
// apply the unary operation before reducing (needed by AMAX); for MIN/MAX, nothing is
// actually done here
warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce2(
in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset);
indexOffset += warpSize * GredAccessesPerThreadInWarp;
threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
template <>
__device__ static void Run<3>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
srcDataType alpha,
const srcDataType* const __restrict__ ws_values_global,
dstDataType beta,
dstDataType* const __restrict__ p_dst_global,
const int* const __restrict__ ws_indices_global,
int* const __restrict__ indices_global)
{
(void)origReduceLen;
const auto zeroVal = opReduce::GetReductionZeroVal();
const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, src2dDesc.GetElementSpaceSize());
auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_dst_global, dst1dDesc.GetElementSpaceSize());
auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
indices_global, dst1dDesc.GetElementSpaceSize());
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
in_thread_val_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true>
in_thread_idx_buf;
using warpwise_reduce = WarpReduceWithIndicesInput<decltype(in_thread_val_buf),
decltype(in_thread_idx_buf),
BlockSize,
opReduce,
nanPropaOpt>;
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));
index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
index_t warp_global_1d_id = thread_global_1d_id / warpSize;
index_t thread_inwarp_id = thread_global_1d_id % warpSize;
auto threadwise_src_val_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
compType,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
auto threadwise_src_idx_load = ThreadwiseTensorSliceTransfer_v2<int,
int,
src2dDescType,
decltype(ThreadBufferDesc),
ThreadBufferLengths,
Sequence<0, 1>,
1,
1,
1,
false>(
src2dDesc,
make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));
constexpr auto in_thread_copy_step =
make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);
for(index_t reducedLength = 0; reducedLength < toReduceLength;
reducedLength += warpSize * GredAccessesPerThreadInWarp)
{
threadwise_src_val_load.Run(src2dDesc,
src_global_val_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_val_buf);
threadwise_src_idx_load.Run(src2dDesc,
src_global_idx_buf,
ThreadBufferDesc,
make_tuple(I0, I0),
in_thread_idx_buf);
// do the warp-wise reduction on data of all thread buffers
warpwise_reduce::Reduce(
in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));
threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
// The first thread in the warp stores the reduced result to the global location
// representing the Warp
if(thread_inwarp_id == 0)
{
if(!float_equal_one{}(alpha))
accuValue_buf(I0) *= type_convert<compType>(alpha);
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;
dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);
if(!float_equal_zero{}(beta))
{
auto threadwise_dst_load =
ThreadwiseTensorSliceTransfer_v2<dstDataType,
dstDataType,
dst1dDescType,
decltype(ReducedDataDesc),
Sequence<1>,
Sequence<0>,
0,
1,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;
threadwise_dst_load.Run(dst1dDesc,
dst_global_val_buf,
ReducedDataDesc,
make_tuple(I0),
priorDstValue_buf);
dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
}
auto threadwise_dst_val_store =
ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
dstDataType,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
auto threadwise_dst_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
dst1dDescType,
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(dst1dDesc,
make_multi_index(warp_global_1d_id));
threadwise_dst_val_store.Run(
ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
threadwise_dst_idx_store.Run(
ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
}
};
};
} // namespace ck
#endif
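The warp-wise variant differs from the block-wise one mainly in its indexing: every warp owns one output row, each lane starts GredAccessesPerThreadInWarp columns after the previous lane, and the whole warp advances warpSize * GredAccessesPerThreadInWarp columns per iteration. The sketch below spells out that index arithmetic; warpSize = 64 is an assumption here (gfx9-class wavefront size), whereas the kernel itself uses the built-in warpSize.

#include <cstdio>

// Index arithmetic used by GridwiseReduction_xy_to_x_direct_warpwise:
// warp_global_1d_id selects the output row, thread_inwarp_id selects the
// lane's starting column, and the slice window moves by
// warpSize * GredAccessesPerThreadInWarp columns per loop iteration.
int main()
{
    const int warpSize                    = 64; // assumed wavefront size
    const int GredAccessesPerThreadInWarp = 2;

    const int thread_ids[] = {0, 63, 64, 255};
    for(int thread_global_1d_id : thread_ids)
    {
        const int warp_global_1d_id = thread_global_1d_id / warpSize;
        const int thread_inwarp_id  = thread_global_1d_id % warpSize;

        std::printf("thread %3d -> row %d, start column %d, column step %d\n",
                    thread_global_1d_id,
                    warp_global_1d_id,
                    thread_inwarp_id * GredAccessesPerThreadInWarp,
                    warpSize * GredAccessesPerThreadInWarp);
    }
    return 0;
}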
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace ck {
template <index_t BlockSize,
typename srcDataType,
          typename dstDataType, // used only for the beta argument, which this kernel ignores
typename compType,
typename src2dDescType,
typename dst1dDescType,
ReduceTensorOp_t op,
NanPropagation_t nanPropaOpt,
ReduceTensorIndices_t reduceIndicesOpt,
index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
using opReduce = typename reduce_binary_operator<compType, op>::opType;
using preUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
using posUnaryOpType = typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;
static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));
using blockwise_reduce =
BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;
static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();
static constexpr auto I0 = Number<0>{};
template <int RunId>
__device__ static void Run(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global);
template <>
__device__ static void Run<1>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)ws_indices_global;
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
accuValue_buf(I0) = zeroVal;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_buf);
__syncthreads();
// do element-wise pre-reduction operation
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_buf);
}
};
template <>
__device__ static void Run<2>(const src2dDescType& src2dDesc,
const dst1dDescType& dst1dDesc,
int origReduceLen,
int BlkGroupSize,
srcDataType alpha,
const srcDataType* const __restrict__ p_src_global,
dstDataType beta,
srcDataType* const __restrict__ ws_values_global,
int* const __restrict__ ws_indices_global)
{
(void)alpha; // unused
(void)beta; // unused
const auto zeroVal = opReduce::GetReductionZeroVal();
// LDS
__shared__ compType p_in_block_values_buffer[BlockBufferSize];
__shared__ int p_in_block_indices_buffer[BlockBufferSize];
const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
auto in_block_val_buf =
make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_values_buffer, BlockBufferSize);
auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
p_in_block_indices_buffer, BlockBufferSize);
StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;
accuValue_buf(I0) = zeroVal;
accuIndex_buf(I0) = 0;
const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
const int divider = origReduceLen;
const preUnaryOpType preUnaryOp(divider);
const index_t thread_local_id = get_thread_local_1d_id();
const index_t block_global_id = get_block_1d_id();
const index_t blkgroup_id = block_global_id / BlkGroupSize;
const index_t block_local_id = block_global_id % BlkGroupSize;
const index_t reduceSizePerBlock =
(((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
BlockBufferSize) *
BlockBufferSize;
constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));
using ThreadSliceLengths = Sequence<1, GredAccessesPerThreadInBlock>;
using ThreadClusterLengths = Sequence<1, BlockSize>;
auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
InMemoryDataOperationEnum_t::Set,
Sequence<1, BlockBufferSize>,
ThreadSliceLengths,
ThreadClusterLengths,
Sequence<0, 1>,
srcDataType,
compType,
src2dDescType,
decltype(in_block_desc),
Sequence<0, 1>,
Sequence<0, 1>,
1,
1,
1,
1,
1,
1,
false,
true>(
src2dDesc,
make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
in_block_desc,
make_multi_index(0, 0));
constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);
const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;
int indexOffset = block_local_id * reduceSizePerBlock;
for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
reducedBlocks += GredAccessesPerThreadInBlock)
{
blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);
blockwise_src_load.RunRead(src2dDesc, src_global_buf);
blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);
__syncthreads();
            // apply the unary operation before reducing; it is needed by AMAX, while for
            // MIN/MAX nothing is actually done here
blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);
index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
? GredAccessesPerThreadInBlock
: toReduceBlocks - reducedBlocks;
blockwise_reduce::Reduce2(in_block_val_buf,
in_block_idx_buf,
BlocksInOneOp,
accuValue_buf(I0),
accuIndex_buf(I0));
indexOffset += BlockBufferSize;
blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
}
constexpr auto ReducedDataDesc =
make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));
const auto workspace_desc =
make_naive_tensor_descriptor_packed(make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));
// The first thread in the block stores the reduced result to the global location
// representing the block
if(thread_local_id == 0)
{
auto threadwise_workspace_val_store =
ThreadwiseTensorSliceTransfer_v1r3<compType,
srcDataType,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
auto threadwise_workspace_idx_store =
ThreadwiseTensorSliceTransfer_v1r3<int,
int,
decltype(ReducedDataDesc),
decltype(workspace_desc),
Sequence<1>,
Sequence<0>,
0,
1,
InMemoryDataOperationEnum_t::Set,
1,
true>(workspace_desc,
make_multi_index(block_global_id));
threadwise_workspace_val_store.Run(ReducedDataDesc,
make_tuple(I0),
accuValue_buf,
workspace_desc,
workspace_global_val_buf);
threadwise_workspace_idx_store.Run(ReducedDataDesc,
make_tuple(I0),
accuIndex_buf,
workspace_desc,
workspace_global_idx_buf);
}
};
};
} // namespace ck
#endif
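// Illustrative sketch (not part of the library): how reduceSizePerBlock is derived in the
// multi-block kernel above. The reduced dimension is first split evenly over the BlkGroupSize
// blocks of a block group, and each block's share is then rounded up to a whole number of
// BlockBufferSize tiles so that the copy loop always advances in full BlockBufferSize steps.
// The function name below is an assumption for this example only.
inline int compute_reduce_size_per_block(int toReduceLength, int BlkGroupSize, int BlockBufferSize)
{
    const int perGroup = (toReduceLength + BlkGroupSize - 1) / BlkGroupSize; // ceil split over blocks
    const int numTiles = (perGroup + BlockBufferSize - 1) / BlockBufferSize; // ceil to whole tiles
    return numTiles * BlockBufferSize;
    // e.g. toReduceLength = 1000, BlkGroupSize = 4, BlockBufferSize = 256
    //      -> perGroup = 250, numTiles = 1, reduceSizePerBlock = 256
}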
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename buffer2dDescType,
bool blockIsOneRow,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
using compType = typename opReduce::dataType;
static constexpr auto buffer2dDesc = buffer2dDescType{};
static constexpr index_t BlockSize =
blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
static constexpr index_t NumBlocks =
blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface does not accumulate on indices
template <typename BufferType>
__device__ static void
Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType opData = type_convert<compType>(block_buffer[offset]);
binop::calculate(lAccuData, opData);
}
offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
block_buffer(offset) = lAccuData;
__syncthreads();
for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
{
if(thread_local_id < indOffset)
{
index_t offset1 =
blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id + indOffset))
: buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
compType opData1 = type_convert<compType>(block_buffer[offset1]);
compType opData2 = type_convert<compType>(block_buffer[offset2]);
binop::calculate(opData1, opData2);
block_buffer(offset1) = type_convert<compType>(opData1);
}
__syncthreads();
}
if(thread_local_id == 0)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
binop::calculate(accuData, tmpVal);
}
};
// This interface accumulates on both data values and indices
template <typename BufferType, typename IdxBufferType>
__device__ static void Reduce2(BufferType& block_buffer,
IdxBufferType& block_indices_buffer,
index_t toReduceBlocks,
compType& accuData,
int& accuIndex)
{
const index_t thread_local_id = get_thread_local_1d_id();
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
if constexpr(blockIsOneRow)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 =
buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id));
index_t offset2 = buffer2dDesc.CalculateOffset(
make_tuple(otherDimInd, thread_local_id + indOffset));
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
__syncthreads();
}
}
if(thread_local_id == 0)
{
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));
compType tmpVal = type_convert<compType>(block_buffer[offset]);
int tmpIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
}
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
}
}
else
{
index_t offset;
for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
{
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));
compType currVal = type_convert<compType>(block_buffer[offset]);
int currIndex = block_indices_buffer[offset];
binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
}
offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
block_buffer(offset) = lAccuData;
block_indices_buffer(offset) = lAccuIndex;
__syncthreads();
for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
{
if(thread_local_id % (indOffset * 2) == 0)
{
index_t offset1 = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
index_t offset2 =
buffer2dDesc.CalculateOffset(make_tuple(thread_local_id + indOffset, 0));
compType currVal1 = type_convert<compType>(block_buffer[offset1]);
compType currVal2 = type_convert<compType>(block_buffer[offset2]);
int currIndex1 = block_indices_buffer[offset1];
int currIndex2 = block_indices_buffer[offset2];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
block_buffer(offset1) = type_convert<compType>(currVal1);
block_indices_buffer(offset1) = currIndex1;
}
__syncthreads();
}
if(thread_local_id == 0)
{
compType tmpVal = type_convert<compType>(block_buffer[0]);
int tmpIndex = block_indices_buffer[0];
binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
}
}
};
template <typename BufferType>
__device__ static void set_buffer_value(BufferType& block_buffer, compType value)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_buffer(offset) = value;
__syncthreads();
}
};
    // Initialize the block-wise indices buffer; the index for each element in the block-wise
    // data buffer is calculated from its position in the buffer and the global starting index
template <typename IdxBufferType>
__device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_indices_buffer(offset) = offset + indexStart;
__syncthreads();
}
};
// Execute unary operation on the block buffer elements
template <typename unary_op_type, typename BufferType>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
{
index_t thread_id = get_thread_local_1d_id();
for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
{
index_t offset = blockIsOneRow
? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
: buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));
block_buffer(offset) = unary_op(block_buffer[offset]);
__syncthreads();
}
};
};
}; // end of namespace ck
#endif
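// Illustrative sketch (not part of the library): the second phase of Reduce() above is a
// standard shared-memory tree reduction. Stripped of the 2d descriptor indexing it looks like
// the device function below; the library's binop additionally handles NaN propagation and,
// in Reduce2, index tracking. BLOCK_SIZE, buf and reduce_op are assumptions for this example.
template <int BLOCK_SIZE, typename T, typename Op>
__device__ void tree_reduce_in_lds(T* buf, Op reduce_op)
{
    const int tid = hipThreadIdx_x;

    for(int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2)
    {
        if(tid < stride)
            buf[tid] = reduce_op(buf[tid], buf[tid + stride]); // fold the upper half into the lower half

        __syncthreads(); // every thread must see the updated buffer before the next stride
    }
    // after the loop, buf[0] holds the reduction over all BLOCK_SIZE elements
}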
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
using compType = typename opReduce::dataType;
    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction requires a StaticBuffer!");
    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as compType!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface does not accumulate on indices
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(accuData, thread_buffer[I]); });
};
// This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method during the first-pass reduction
__device__ static void
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = I + indexStart;
binop::calculate(accuData, thread_buffer[I], accuIndex, currIndex);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
template <typename BufferType,
typename IdxBufferType,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
using compType = typename opReduce::dataType;
    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction requires a StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction requires a StaticBuffer for indices!");
    static_assert(
        std::is_same<typename BufferType::type, compType>::value,
        "Data type of StaticBuffer for Thread-wise reduction should be the same as compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same size!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
// This interface accumulates on both data values and indices and
    // is called by the Direct_ThreadWise reduction method during the second-pass reduction
__device__ static void Reduce(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(accuData, thread_buffer[I], accuIndex, thread_indices_buffer[I]);
});
};
// Set the elements in the per-thread buffer to a specific value
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
};
};
}; // end of namespace ck
#endif
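// Illustrative sketch (not part of the library): what the indexed accumulation in
// ThreadReduce::Reduce2 above amounts to for a MAX-type reduction. The real code folds
// through binop::calculate, which is selected from opReduce and also handles NaN
// propagation; the function name and float type below are assumptions for this example.
__device__ inline void accumulate_max_with_index(
    float& accuData, int& accuIndex, const float* thread_buffer, int len, int indexStart)
{
    for(int i = 0; i < len; ++i)
    {
        const int currIndex = i + indexStart; // position in the buffer plus the thread's global start

        if(thread_buffer[i] > accuData) // keep the larger value and remember where it came from
        {
            accuData  = thread_buffer[i];
            accuIndex = currIndex;
        }
    }
}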
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace ck {
template <typename BufferType, index_t BlockSize, typename opReduce, NanPropagation_t nanPropaOpt>
struct WarpReduce
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be the "
                  "same as compType!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
static constexpr bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
// This interface does not accumulate on indices
__device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
{
if constexpr(have_builtin_shuffle)
ReduceImpl1(thread_buffer, accuData);
else
ReduceImpl2(thread_buffer, accuData);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void ReduceImpl1(const BufferType& thread_buffer, compType& accuData)
{
compType lAccuData = opReduce::GetReductionZeroVal();
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
// synchronize among all threads in this warp
__all(1);
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
binop::calculate(lAccuData, tmpVal);
__all(1);
}
binop::calculate(accuData, lAccuData);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void ReduceImpl2(const BufferType& thread_buffer, compType& accuData)
{
compType lAccuData = opReduce::GetReductionZeroVal();
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { binop::calculate(lAccuData, thread_buffer[I]); });
__syncthreads();
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
__shared__ compType shuffle_buffer[BlockSize];
compType* myBuffer = &shuffle_buffer[warpId * warpSize];
myBuffer[thread_inwarp_id] = lAccuData;
__syncthreads();
for(index_t stride = warpSize / 2; stride > 0; stride /= 2)
{
if(thread_inwarp_id < stride)
{
compType currVal1 = myBuffer[thread_inwarp_id];
compType currVal2 = myBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2);
myBuffer[thread_inwarp_id] = currVal1;
}
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myBuffer[0]);
};
    // This interface accumulates on both data values and indices and is called by the
    // Direct_WarpWise reduction method during the first-pass reduction
__device__ static void
Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
{
if constexpr(have_builtin_shuffle)
Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
else
Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void Reduce2Impl1(const BufferType& thread_buffer,
compType& accuData,
int& accuIndex,
int indexStart)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_inwarp_id = get_thread_local_1d_id() % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
});
// synchronize among all threads in this warp
__all(1);
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
__all(1);
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void Reduce2Impl2(const BufferType& thread_buffer,
compType& accuData,
int& accuIndex,
int indexStart)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
});
__shared__ compType shuffle_data_buffer[BlockSize];
__shared__ int shuffle_indices_buffer[BlockSize];
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
myDataBuffer[thread_inwarp_id] = lAccuData;
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
__syncthreads();
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType currVal1 = myDataBuffer[thread_inwarp_id];
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
myDataBuffer[thread_inwarp_id] = currVal1;
myIndicesBuffer[thread_inwarp_id] = currIndex1;
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
};
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
__all(1);
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
__all(1);
};
};
template <typename BufferType,
typename IdxBufferType,
index_t BlockSize,
typename opReduce,
NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread indices buffer for WarpWise reduction should be a StaticBuffer!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be the "
                  "same as compType!");
    static_assert(
        std::is_same<typename IdxBufferType::type, index_t>::value,
        "Indices type of per-thread StaticBuffer for WarpWise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same size!");
static constexpr index_t ThreadBufferLen = BufferType::Size();
static constexpr bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;
    // This interface accumulates on both data values and indices and is called by the
    // Direct_WarpWise reduction method during the second-pass reduction
__device__ static void Reduce(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
if constexpr(have_builtin_shuffle)
ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
else
ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__ static void ReduceImpl1(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
});
// synchronize among all threads in this warp
__all(1);
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
int tmpIndex = __shfl_down(lAccuIndex, stride, warpSize);
binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
__all(1);
}
binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
};
    // This interface implementation does not use HIP built-in device shuffling functions,
    // since HIP does not provide built-in shuffling functions for fp16
__device__ static void ReduceImpl2(const BufferType& thread_buffer,
const IdxBufferType& thread_indices_buffer,
compType& accuData,
int& accuIndex)
{
compType lAccuData = opReduce::GetReductionZeroVal();
int lAccuIndex = 0;
index_t thread_id = get_thread_local_1d_id();
index_t warpId = thread_id / warpSize;
index_t thread_inwarp_id = thread_id % warpSize;
static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
});
__shared__ compType shuffle_data_buffer[BlockSize];
__shared__ int shuffle_indices_buffer[BlockSize];
compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
int* myIndicesBuffer = &shuffle_indices_buffer[warpId * warpSize];
myDataBuffer[thread_inwarp_id] = lAccuData;
myIndicesBuffer[thread_inwarp_id] = lAccuIndex;
__syncthreads();
for(index_t stride = 1; stride < warpSize; stride *= 2)
{
compType currVal1 = myDataBuffer[thread_inwarp_id];
compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
int currIndex1 = myIndicesBuffer[thread_inwarp_id];
int currIndex2 = myIndicesBuffer[thread_inwarp_id + stride];
binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
myDataBuffer[thread_inwarp_id] = currVal1;
myIndicesBuffer[thread_inwarp_id] = currIndex1;
__syncthreads();
}
if(thread_inwarp_id == 0)
binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
};
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });
__all(1);
};
// Execute unary operation on the per-thread buffer elements
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
static_for<0, ThreadBufferLen, 1>{}(
[&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });
__all(1);
};
};
}; // end of namespace ck
#endif
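// Illustrative sketch (not part of the library): ReduceImpl1 above relies on the HIP
// cross-lane __shfl_down intrinsic. A minimal float warp sum with the same shape is shown
// below; the library instead folds through binop::calculate starting from the reduction's
// zero value. The function name below is an assumption for this example only.
__device__ inline float warp_sum_shfl_down(float lane_val)
{
    for(int stride = warpSize / 2; stride > 0; stride /= 2)
        lane_val += __shfl_down(lane_val, stride, warpSize); // pull the value from the lane 'stride' ahead

    return lane_val; // lane 0 of each warp ends up holding the sum over the whole warp
}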
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t M1PerThread = CK_PARAM_M1PerThread;
constexpr index_t N1PerThread = CK_PARAM_N1PerThread;
constexpr index_t KPerThread = CK_PARAM_KPerThread;
constexpr index_t M1N1ThreadClusterM10 = CK_PARAM_M1N1ThreadClusterM10;
constexpr index_t M1N1ThreadClusterN10 = CK_PARAM_M1N1ThreadClusterN10;
constexpr index_t M1N1ThreadClusterM11 = CK_PARAM_M1N1ThreadClusterM11;
constexpr index_t M1N1ThreadClusterN11 = CK_PARAM_M1N1ThreadClusterN11;
using ABlockTransferThreadSliceLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1>;
using ABlockTransferThreadClusterLengths_K_M0_M1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_M1 =
CK_PARAM_ABlockTransferDstScalarPerVector_M1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1>;
using BBlockTransferThreadClusterLengths_K_N0_N1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_N1 =
CK_PARAM_BBlockTransferDstScalarPerVector_N1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HAS_MAIN_KBLOCK_LOOP);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP);
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k_m0_m1_grid_desc,
void* p_b_k_n0_n1_grid_desc,
void* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));
const auto descs = transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW));
const auto a_k_m_grid_desc = descs[I0];
const auto b_k_n_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
                                        InMemoryDataOperationEnum_t::Set, /* TODO: tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
auto a_k_m0_m1_grid_desc = GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
auto b_k_n0_n1_grid_desc = GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
auto c_m0_m10_m11_n0_n10_n11_grid_desc =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<decltype(a_k_m0_m1_grid_desc)*>(p_a_k_m0_m1_grid_desc) = a_k_m0_m1_grid_desc;
*static_cast<decltype(b_k_n0_n1_grid_desc)*>(p_b_k_n0_n1_grid_desc) = b_k_n0_n1_grid_desc;
*static_cast<decltype(c_m0_m10_m11_n0_n10_n11_grid_desc)*>(
p_c_m0_m10_m11_n0_n10_n11_grid_desc) = c_m0_m10_m11_n0_n10_n11_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
};
};
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k_m0_m1_grid_desc,
const void CONSTANT* p_b_k_n0_n1_grid_desc,
const void CONSTANT* p_c_m0_m10_m11_n0_n10_n11_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1));
constexpr auto a_k_m_grid_desc = descs[I0];
constexpr auto b_k_n_grid_desc = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AKMGridDesc = decltype(a_k_m_grid_desc);
using BKNGridDesc = decltype(b_k_n_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemmDlops_km_kn_mn_v1r2<BlockSize,
FloatAB,
FloatAcc,
FloatC,
                                        InMemoryDataOperationEnum_t::Set, /* TODO: tunable */
AKMGridDesc,
BKNGridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
M1PerThread,
N1PerThread,
KPerThread,
M1N1ThreadClusterM10,
M1N1ThreadClusterN10,
M1N1ThreadClusterM11,
M1N1ThreadClusterN11,
ABlockTransferThreadSliceLengths_K_M0_M1,
ABlockTransferThreadClusterLengths_K_M0_M1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_M1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K_N0_N1,
BBlockTransferThreadClusterLengths_K_N0_N1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_N1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
constexpr auto a_k_m0_m1_grid_desc_tmp =
GridwiseGemm::MakeAKM0M1GridDescriptor(a_k_m_grid_desc);
constexpr auto b_k_n0_n1_grid_desc_tmp =
GridwiseGemm::MakeBKN0N1GridDescriptor(b_k_n_grid_desc);
constexpr auto c_m0_m10_m11_n0_n10_n11_grid_desc_tmp =
GridwiseGemm::MakeCM0M10M11N0N10N11GridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockIdToM0N0BlockClusterAdaptor(c_m_n_grid_desc);
using AKM0M1GridDesc = decltype(a_k_m0_m1_grid_desc_tmp);
using BKN0N1GridDesc = decltype(b_k_n0_n1_grid_desc_tmp);
using CM0M10M11N0N10N11GridDesc = decltype(c_m0_m10_m11_n0_n10_n11_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k_m0_m1_grid_desc =
*reinterpret_cast<const AKM0M1GridDesc*>((const void*)p_a_k_m0_m1_grid_desc);
const auto b_k_n0_n1_grid_desc =
*reinterpret_cast<const BKN0N1GridDesc*>((const void*)p_b_k_n0_n1_grid_desc);
const auto c_m0_m10_m11_n0_n10_n11_grid_desc =
*reinterpret_cast<const CM0M10M11N0N10N11GridDesc*>(
(const void*)p_c_m0_m10_m11_n0_n10_n11_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k_m0_m1_grid_desc,
b_k_n0_n1_grid_desc,
c_m0_m10_m11_n0_n10_n11_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
};
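// Illustrative sketch (not part of the library): the output spatial sizes computed in the
// *_prepare kernel above follow the usual convolution shape formula, where the effective
// filter extent after dilation is dilation * (filter_size - 1) + 1. The function name below
// is an assumption for this example only.
inline int conv_output_size(
    int in_size, int left_pad, int right_pad, int dilation, int filter_size, int stride)
{
    return (in_size + left_pad + right_pad - dilation * (filter_size - 1) - 1) / stride + 1;
    // e.g. hi = 28, pads = 1/1, dilation = 1, y = 3, stride = 1  ->  ho = 28
}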
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare(
int n,
int c,
int hi,
int wi,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(n, c, hi, wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(k, c, y, x));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(n, k, ho, wo));
const auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
}
};
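// The compute kernel: recovers the descriptors prepared above through CONSTANT
// pointers, reserves the LDS workspace required by the gridwise GEMM, and runs the
// xdlops pipeline on the A/B/C grids.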
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
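// The fixed 256x256x28x28 / 3x3 reference shapes below never touch the GPU data
// path; they exist only so that decltype can recover the descriptor types under
// which the prepared descriptors were stored in the workspace.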
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using BGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor);
};
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave = CK_PARAM_MPerWave;
constexpr index_t NPerWave = CK_PARAM_NPerWave;
constexpr index_t MRepeat = CK_PARAM_MRepeat;
constexpr index_t NRepeat = CK_PARAM_NRepeat;
constexpr index_t K1 = CK_PARAM_K1;
using ABlockTransferThreadSliceLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;
constexpr index_t ABlockTransferSrcVectorDim = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
CK_PARAM_ABlockTransferDstScalarPerVector_K1;
constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);
using BBlockTransferThreadSliceLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;
constexpr index_t BBlockTransferSrcVectorDim = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
CK_PARAM_BBlockTransferDstScalarPerVector_K1;
constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;
constexpr index_t CThreadTransferSrcDstVectorDim = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
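// NHWC/KYXC/NHWK variant of the same prepare kernel. In this layout the input
// tensor is lowered onto GEMM A (and the weight onto GEMM B), which is why the
// longer step-hack sequences now sit on the A side instead of the B side.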
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
int n,
int hi,
int wi,
int c,
int k,
int y,
int x,
int convStrideH,
int convStrideW,
int convDilationY,
int convDilationX,
int leftPadH,
int leftPadW,
int rightPadH,
int rightPadW,
void* p_a_k0_m_k1_grid_desc,
void* p_b_k0_n_k1_grid_desc,
void* p_c_m0_m1_m2_n_grid_desc,
void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t ho = (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
const index_t wo = (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;
const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c));
const auto wei_k_y_x_c_desc = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c));
const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k));
const auto descs = transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(
in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(convStrideH, convStrideW),
make_tuple(convDilationY, convDilationX),
make_tuple(leftPadH, leftPadW),
make_tuple(rightPadH, rightPadW),
Number<K1>{});
const auto a_k0_m_k1_grid_desc = descs[I0];
const auto b_k0_n_k1_grid_desc = descs[I1];
const auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
auto cblockid_to_m0_n0_block_cluster_adaptor =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
if(hipThreadIdx_x == 0)
{
*static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
a_k0_m_k1_grid_desc;
*static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
b_k0_n_k1_grid_desc;
*static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
c_m0_m1_m2_n_grid_desc;
*static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
}
};
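// Compute kernel for the NHWC variant; same structure as the NCHW one: read the
// prepared descriptors from CONSTANT memory, allocate LDS, and run the gridwise
// xdlops GEMM.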
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_a_k0_m_k1_grid_desc,
const void CONSTANT* p_b_k0_n_k1_grid_desc,
const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto in_n_hi_wi_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto wei_k_y_x_c_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256));
constexpr auto out_n_ho_wo_k_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
constexpr auto descs =
transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(in_n_hi_wi_c_desc,
wei_k_y_x_c_desc,
out_n_ho_wo_k_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
Number<K1>{});
constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
constexpr auto c_m_n_grid_desc = descs[I2];
using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
using CMNGridDesc = decltype(c_m_n_grid_desc);
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
make_tuple(
Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));
using CGridStepHacks = decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 1, 0, 0>{}),
make_tuple(Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 0, 0, 0>{},
Sequence<0, 0, 2, 0, 0>{})));
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
using GridwiseGemm =
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AK0MK1GridDesc,
BK0NK1GridDesc,
CMNGridDesc,
MPerBlock,
NPerBlock,
KPerBlock,
MPerWave,
NPerWave,
K1,
MRepeat,
NRepeat,
ABlockTransferThreadSliceLengths_K0_M_K1,
ABlockTransferThreadClusterLengths_K0_M_K1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorDim,
ABlockTransferSrcScalarPerVector,
ABlockTransferDstScalarPerVector_K1,
AThreadTransferSrcResetCoordinateAfterRun,
BBlockTransferThreadSliceLengths_K0_N_K1,
BBlockTransferThreadClusterLengths_K0_N_K1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorDim,
BBlockTransferSrcScalarPerVector,
BBlockTransferDstScalarPerVector_K1,
BThreadTransferSrcResetCoordinateAfterRun,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks,
false>;
constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);
using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
using CBlockIdToM0N0BlockClusterAdaptor = decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);
const auto a_k0_m_k1_grid_desc =
*reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
const auto b_k0_n_k1_grid_desc =
*reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
const auto c_m0_m1_m2_n_grid_desc =
*reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
const auto cblockid_to_m0_n0_block_cluster_adaptor =
*reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
(const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);
constexpr index_t shared_block_size =
GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseGemm::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_k0_m_k1_grid_desc,
b_k0_n_k1_grid_desc,
c_m0_m1_m2_n_grid_desc,
cblockid_to_m0_n0_block_cluster_adaptor);
};
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
using namespace ck;
constexpr DataTypeEnum_t ABDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);
using FloatAB = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC = typename get_datatype_from_enum<CDataTypeEnum>::type;
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr auto GN0 = Number<CK_PARAM_GN0>{};
constexpr auto GK1 = Number<CK_PARAM_GK1>{};
constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11;
constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11;
constexpr index_t GK0PerBlock = CK_PARAM_GK0PerBlock;
constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11;
constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11;
constexpr index_t BK0PerThread = CK_PARAM_BK0PerThread;
using BM10BN10ThreadClusterBM10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBM10Xs>;
using BM10BN10ThreadClusterBN10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBN10Xs>;
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>;
using ABlockTransferSrcAccessOrder = Sequence<3, 2, 1, 0, 4>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
Sequence<CK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>;
using BBlockTransferSrcAccessOrder = Sequence<4, 3, 2, 0, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
Sequence<CK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;
using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>;
constexpr index_t CThreadTransferSrcDstVectorDim = 5;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HasMainKBlockLoop);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HasDoubleTailKBlockLoop);
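// v6r1 dlops path: the convolution is lowered to a tensor contraction instead of a
// plain GEMM. Its prepare kernel packs all four derived descriptors (A, B, C and
// the block-id-to-tile map) into a single tuple stored at p_desc_tuple.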
extern "C" __global__ void
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(int N_,
int C_,
int Hi_,
int Wi_,
int K_,
int Y_,
int X_,
int ConvStrideH_,
int ConvStrideW_,
int ConvDilationH_,
int ConvDilationW_,
int InLeftPadH_,
int InLeftPadW_,
int InRightPadH_,
int InRightPadW_,
void* p_desc_tuple)
{
index_t N = static_cast<index_t>(N_);
index_t C = static_cast<index_t>(C_);
index_t Hi = static_cast<index_t>(Hi_);
index_t Wi = static_cast<index_t>(Wi_);
index_t K = static_cast<index_t>(K_);
index_t Y = static_cast<index_t>(Y_);
index_t X = static_cast<index_t>(X_);
index_t ConvStrideH = static_cast<index_t>(ConvStrideH_);
index_t ConvStrideW = static_cast<index_t>(ConvStrideW_);
index_t ConvDilationH = static_cast<index_t>(ConvDilationH_);
index_t ConvDilationW = static_cast<index_t>(ConvDilationW_);
index_t InLeftPadH = static_cast<index_t>(InLeftPadH_);
index_t InLeftPadW = static_cast<index_t>(InLeftPadW_);
index_t InRightPadH = static_cast<index_t>(InRightPadH_);
index_t InRightPadW = static_cast<index_t>(InRightPadW_);
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
const index_t Ho =
(Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1;
const index_t Wo =
(Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;
const auto in_n_c_hi_wi_desc = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi));
const auto wei_k_c_y_x_desc = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X));
const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));
const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(ConvStrideH, ConvStrideW),
make_tuple(ConvDilationH, ConvDilationW),
make_tuple(InLeftPadH, InLeftPadW),
make_tuple(InRightPadH, InRightPadW),
GN0,
GK1);
const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
{
auto desc_tuple =
make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1),
GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1),
GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1),
GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
*static_cast<decltype(desc_tuple)*>(p_desc_tuple) = desc_tuple;
}
};
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
const void CONSTANT* p_desc_tuple)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
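// As in the xdlops wrappers, the constexpr reference shapes below only pin down the
// descriptor types; the live descriptors are read back from p_desc_tuple further down.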
constexpr auto in_n_c_hi_wi_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto wei_k_c_y_x_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
constexpr auto out_n_k_ho_wo_desc =
make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
constexpr auto descs =
transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(wei_k_c_y_x_desc,
in_n_c_hi_wi_desc,
out_n_k_ho_wo_desc,
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
make_tuple(1, 1),
GN0,
GK1);
constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];
using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);
using AGridStepHacks =
decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1+: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3+: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 1-: GM0
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 2-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0>{}, // 3-: GM11
Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using BGridStepHacks = decltype(make_tuple(
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 1+: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 2+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{}, // 3+: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}), // 4+: GK1
make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GK0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 1-: GN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 2-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{}, // 3-: GN11
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1
using CGridStepHacks = decltype(make_tuple(
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
make_tuple(
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1
using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
using BGridMoveSliceWindowStepHacks =
Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;
using GridwiseContraction =
GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
BlockSize,
FloatAB,
FloatAcc,
FloatC,
InMemoryDataOperationEnum_t::Set,
AGridDesc_GK0_GM0_GM1_GK1,
BGridDesc_GK0_GN0_GN1_GK1,
CGridDesc_GM0_GM1_GN0_GN1,
GM1PerBlockGM11,
GN1PerBlockGN11,
GK0PerBlock,
BM1PerThreadBM11,
BN1PerThreadBN11,
BK0PerThread,
BM10BN10ThreadClusterBM10Xs,
BM10BN10ThreadClusterBN10Xs,
ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferThreadClusterArrangeOrder,
ABlockTransferSrcAccessOrder,
ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
ABlockTransferSrcVectorTensorContiguousDimOrder,
BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferThreadClusterArrangeOrder,
BBlockTransferSrcAccessOrder,
BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
BBlockTransferSrcVectorTensorContiguousDimOrder,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AGridStepHacks,
BGridStepHacks,
CGridStepHacks,
AGridMoveSliceWindowStepHacks,
BGridMoveSliceWindowStepHacks>;
using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
a_grid_desc_gk0_gm0_gm1_gk1));
using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
b_grid_desc_gk0_gn0_gn1_gk1));
using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
c_grid_desc_gm0_gm1_gn0_gn1));
using CGridBlockCluster_BlockId_To_GM10_GN10 =
decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
c_grid_desc_gm0_gm1_gn0_gn1));
using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{},
BGridDesc_GK0_GN0_GN10_GN11_GK1{},
CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{},
CGridBlockCluster_BlockId_To_GM10_GN10{}));
const auto desc_tuple =
*reinterpret_cast<const DescTuple*>(cast_pointer_to_generic_address_space(p_desc_tuple));
const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0];
const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1];
const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2];
const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3];
constexpr index_t shared_block_size =
GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);
__shared__ FloatAB p_shared_block[shared_block_size];
GridwiseContraction::Run(p_a_grid,
p_b_grid,
p_c_grid,
p_shared_block,
a_grid_desc_gk0_gm0_gm10_gm11_gk1,
b_grid_desc_gk0_gn0_gn10_gn11_gk1,
c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
c_grid_block_cluster_blockid_to_gm10_gn10,
integral_constant<bool, HasMainKBlockLoop>{},
integral_constant<bool, HasDoubleTailKBlockLoop>{});
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
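// Prepare kernel for the "reduce all dimensions" case: the source tensor is merged
// into a single dimension and then unmerged into a [1, totalLength] 2-D descriptor.
// When padding is requested, the reduce length is rounded up to a multiple of
// BlockSize * GredAccessesPerThreadInBlock. The 2-D source descriptor is written at
// workspace offset 0 and the 1-D destination descriptor at offset 2048.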
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
// accurate strides are not needed here: these dummy descriptors only fix the expected reference types
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
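// The reduction kernel proper: rebuilds the descriptors from the workspace via the
// reference types above and dispatches the blockwise 2-D reduction; RunId 2 is
// selected when flattened indices are requested, RunId 1 otherwise.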
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
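// Variant in which only the trailing num_toReduceDims dimensions are reduced: the
// invariant dimensions are merged into the row of src2dDesc and the reduced
// dimensions into its column, while dst1dDesc merges all output dimensions.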
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
if constexpr(src2d_need_padding)
{
const auto srcPad =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
// accurate strides are not needed here: these dummy descriptors only fix the expected reference types
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
// used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredAccessesPerThreadInBlock>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
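// Indices are only produced when the reduce operation is indexable (e.g. MIN/MAX/AMAX)
// and the caller requested them via reduceIndicesOpt.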
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
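    // Build the tensor descriptors on the device from the runtime lengths/strides and store
    // them in the workspace: src2dDesc at byte offset 0, dst1dDesc at byte offset 2048.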
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
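    // All input dimensions are reduced by this kernel instance: the destination collapses to a
    // single element and the flattened source is viewed as a 1 x totalLength 2-d descriptor.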
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
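    // Round the per-block-group share of the reduce length up to a multiple of copySliceLen;
    // the leftover is covered by the pad transform below.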
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
template <index_t srcDims>
struct get_ref_desc_types
{
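    // Compile-time stand-ins whose types match the descriptors written by the prepare kernel;
    // the dummy lengths (8) and pad amounts (2) only matter for deducing those types.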
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
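    // Workspace layout: descriptors occupy the first 4096 bytes (src2dDesc at 0, dst1dDesc at
    // 2048); the multi-block partial-result buffer (ws_buf1) starts at 4096, and the optional
    // indices buffer (ws_buf2) sits ws_buf2_bytes_offset bytes past ws_buf1.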
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;
constexpr index_t num_toReduceDims = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
static_assert(num_invariantDims > 0, "This kernel requires at least one invariant (non-reduced) dimension!");
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
int outStride0,
int outStride1,
int outStride2,
int outStride3,
int outStride4,
int outStride5,
void* __restrict__ ws_global)
{
(void)GridSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const int dstStrides[6] = {
outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
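    // The invariant dimensions come first, so the output lengths are presumably the leading
    // dstDims entries of srcLengths; only the output strides are passed in separately.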
const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto toReduceDimLengths = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
const auto invariantDimLengths =
make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});
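    // Merge the leading invariant dimensions into dimension 0 and the trailing to-reduce
    // dimensions into dimension 1, yielding the 2-d view consumed by the reduction kernel.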
auto src2dDesc =
transform_tensor_descriptor(srcDesc,
make_tuple(make_merge_transform(invariantDimLengths),
make_merge_transform(toReduceDimLengths)),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
auto dst1dDesc = transform_tensor_descriptor(
dstDesc,
make_tuple(make_merge_transform(tupleDstLengths)),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
const auto invariantLen = src2dDesc.GetLength(Number<0>{});
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
const index_t reduceSizePerBlock =
(((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
copySliceLen;
if constexpr(src2d_need_padding)
{
const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pass_through_transform(invariantLen),
make_pad_transform(toReduceLen, 0, srcPad)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
static constexpr auto ref_toReduceDimLengths =
typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
static constexpr auto ref_invariantDimLengths =
typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));
static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
make_tuple(invariantDims{}, toReduceDims{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
ref_dstDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the BlockWise and MultiBlock methods
using refType_src2dDesc_padded_34 = decltype(
transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pass_through_transform(ref_invariantLen),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dst1dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
using refType_src2dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)p_dst_global;
(void)indices_global;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
GredAccessesPerThreadInBlock>;
void* const ws_buf2_global =
ws_buf2_bytes_offset > 0
? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
: nullptr;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
BlkGroupSize,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<srcDataType* const __restrict__>(ws_buf1_global),
static_cast<int* const __restrict__>(ws_buf2_global));
};
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;
using srcDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
using compType =
typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;
constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);
constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
? NanPropagation_t::NOT_PROPAGATE_NAN
: NanPropagation_t::PROPAGATE_NAN;
constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
? ReduceTensorIndices_t::NO_INDICES
: ReduceTensorIndices_t::FLATTENED_INDICES;
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);
constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");
constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};
return make_tuple_from_array_and_index_seq(lengths, index_seq);
};
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
return make_tuple(Ns...);
};
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
int BlkGroupSize,
int inLength0,
int inLength1,
int inLength2,
int inLength3,
int inLength4,
int inLength5,
int inStride0,
int inStride1,
int inStride2,
int inStride3,
int inStride4,
int inStride5,
void* __restrict__ ws_global)
{
(void)BlkGroupSize;
void* p_src2dDesc = ws_global;
void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;
const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
const auto tupleDstLengths = make_tuple(1);
const auto tupleDstStrides = make_tuple(1);
const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);
const auto one_dim_srcDesc = transform_tensor_descriptor(
srcDesc,
make_tuple(make_merge_transform(tupleSrcLengths)),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
auto src2dDesc = transform_tensor_descriptor(
one_dim_srcDesc,
make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
constexpr int invariantLen = 1;
const auto toReduceLen = src2dDesc.GetLength(Number<1>{});
constexpr auto copySliceLen = GredThreadBufferLength;
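    // Pad the invariant length up to GridSize * BlockSize and the reduce length up to a
    // multiple of the thread buffer length, so that every thread sees full-sized tiles.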
if constexpr(src2d_need_padding)
{
const auto srcPad1 = GridSize * BlockSize - invariantLen;
const auto srcPad2 =
((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;
auto src2dDesc_2 =
transform_tensor_descriptor(src2dDesc,
make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
make_pad_transform(toReduceLen, 0, srcPad2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
}
if constexpr(dst1d_need_padding)
{
const auto dstPad = GridSize * BlockSize - invariantLen;
auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc,
make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{}));
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
}
else
{
if(get_thread_local_1d_id() == 0)
*static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
}
};
template <index_t srcDims>
struct get_ref_desc_types
{
static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    // accurate strides are not needed here to get the expected reference type
static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));
static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
ref_srcDesc,
make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
make_tuple(Sequence<0>{}));
static constexpr auto ref_src2dDesc =
transform_tensor_descriptor(ref_one_dim_srcDesc,
make_tuple(make_unmerge_transform(
make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0, 1>{}));
static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
static constexpr auto ref_toReduceLen = ref_src2dDesc.GetLength(Number<1>{});
    // used by the DirectThreadWise and DirectWarpWise methods
using refType_src2dDesc_padded_12 =
decltype(transform_tensor_descriptor(ref_src2dDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
make_pad_transform(ref_toReduceLen, 0, 2)),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0>{}, Sequence<1>{})));
using refType_dst1dDesc_padded =
decltype(transform_tensor_descriptor(ref_dstDesc,
make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
make_tuple(Sequence<0>{}),
make_tuple(Sequence<0>{})));
using refType_src2dDesc = decltype(ref_src2dDesc);
using refType_dst1dDesc = decltype(ref_dstDesc);
};
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_src2dDesc_padded_12*>(p_src2dDesc));
else
return (*reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc));
};
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
if constexpr(need_padding)
return (*reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc));
else
return (*reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc));
};
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
int BlkGroupSize,
float alpha,
const void* __restrict__ p_src_global,
float beta,
void* __restrict__ p_dst_global,
const void CONSTANT* ws_global,
long ws_buf2_bytes_offset,
void* __restrict__ indices_global)
{
(void)BlkGroupSize;
(void)ws_buf2_bytes_offset;
const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);
using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
srcDataType,
dstDataType,
compType,
decltype(src2dDesc),
decltype(dst1dDesc),
op,
nanPropaOpt,
reduceIndicesOpt,
true,
true,
GredThreadBufferLength>;
constexpr int RunId = need_indices ? 2 : 1;
gridwise_2d_reduce::template Run<RunId>(
src2dDesc,
dst1dDesc,
origReduceLen,
alpha,
static_cast<const srcDataType* const __restrict__>(p_src_global),
beta,
static_cast<dstDataType* const __restrict__>(p_dst_global),
static_cast<const int* const __restrict__>(nullptr),
static_cast<int* const __restrict__>(indices_global));
};