Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
9dce6851
Commit
9dce6851
authored
Mar 10, 2022
by
Jing Zhang
Browse files
merge develop
parents
3cc57101
5d37d7bf
Changes
473
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
57 additions
and
5829 deletions
+57
-5829
CMakeLists.txt
CMakeLists.txt
+18
-54
Dockerfile
Dockerfile
+1
-1
Jenkinsfile
Jenkinsfile
+38
-7
composable_kernel/include/gridwise_operation_wrapper.hpp
composable_kernel/include/gridwise_operation_wrapper.hpp
+0
-14
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
...sor_operation/gridwise_generic_2d_reduction_blockwise.hpp
+0
-623
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
...ation/gridwise_generic_2d_reduction_direct_threadwise.hpp
+0
-501
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
...eration/gridwise_generic_2d_reduction_direct_warpwise.hpp
+0
-542
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
...or_operation/gridwise_generic_2d_reduction_multiblock.hpp
+0
-376
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
...nclude/tensor_operation/reduction_functions_blockwise.hpp
+0
-271
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp
...clude/tensor_operation/reduction_functions_threadwise.hpp
+0
-141
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
...include/tensor_operation/reduction_functions_warpwise.hpp
+0
-371
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp
...ution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp
+0
-369
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp
...tion_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp
+0
-357
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp
...tion_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp
+0
-356
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp
...ution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp
+0
-400
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
...eneric_reduction_first_call_blockwise_reduce_all_dims.cpp
+0
-271
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
...ic_reduction_first_call_blockwise_reduce_partial_dims.cpp
+0
-305
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
...neric_reduction_first_call_multiblock_reduce_all_dims.cpp
+0
-276
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
...c_reduction_first_call_multiblock_reduce_partial_dims.cpp
+0
-310
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
...neric_reduction_first_call_threadwise_reduce_all_dims.cpp
+0
-284
No files found.
CMakeLists.txt
View file @
9dce6851
...
...
@@ -45,7 +45,6 @@ message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message
(
"OpenMP_pthread_LIBRARY:
${
OpenMP_pthread_LIBRARY
}
"
)
message
(
"OpenMP_CXX_FLAGS:
${
OpenMP_CXX_FLAGS
}
"
)
# set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
link_libraries
(
${
OpenMP_gomp_LIBRARY
}
)
link_libraries
(
${
OpenMP_pthread_LIBRARY
}
)
...
...
@@ -71,17 +70,17 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
endif
()
message
(
STATUS
"Build with HIP
${
HIP_VERSION
}
"
)
## half
#find_path(HALF_INCLUDE_DIR half.hpp)
set
(
HALF_INCLUDE_DIR
"
${
PROJECT_SOURCE_DIR
}
/external/half/include"
)
message
(
"HALF_INCLUDE_DIR:
${
HALF_INCLUDE_DIR
}
"
)
rocm_create_package
(
NAME CK-
${
CK_BACKEND
}
DESCRIPTION
"High Performance Composable Kernel
s
for AMD GPUs"
DESCRIPTION
"High Performance Composable Kernel for AMD GPUs"
LDCONFIG
)
## half
set
(
HALF_INCLUDE_DIR
"
${
PROJECT_SOURCE_DIR
}
/external/include/half"
)
message
(
"HALF_INCLUDE_DIR:
${
HALF_INCLUDE_DIR
}
"
)
## tidy
include
(
EnableCompilerWarnings
)
set
(
CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name
)
...
...
@@ -184,7 +183,6 @@ enable_clang_tidy(
-cppcoreguidelines-narrowing-conversions
-altera-struct-pack-align
-cppcoreguidelines-prefer-member-initializer
${
CK_TIDY_CHECKS
}
${
CK_TIDY_ERRORS
}
HEADER_FILTER
...
...
@@ -214,70 +212,36 @@ enable_cppcheck(
unmatchedSuppression
FORCE
SOURCES
host/host_tensor/src
host/driver_offline/src
composable_kernel/src/kernel_wrapper
library/src
INCLUDE
host/host_tensor/include
host/device/include
host/solver/include
host/driver_offline/include
composable_kernel/include/*
${
CMAKE_CURRENT_SOURCE_DIR
}
/include
${
CMAKE_CURRENT_BINARY_DIR
}
/include
${
CMAKE_CURRENT_SOURCE_DIR
}
/library/include
DEFINE
CPPCHECK=1
__linux__=1
)
set
(
CMAKE_LIBRARY_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/lib
)
set
(
CMAKE_ARCHIVE_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/lib
)
set
(
CMAKE_RUNTIME_OUTPUT_DIRECTORY
${
CMAKE_CURRENT_BINARY_DIR
}
/bin
)
file
(
GLOB_RECURSE COMPOSABLE_KERNEL_HEADERS
"composable_kernel/include/*/*.hpp"
)
file
(
GLOB_RECURSE DEVICE_OPS_HEADERS
"device_operation/include/*.hpp"
)
file
(
GLOB_RECURSE DEVICE_OPS_SOURCE
"device_operation/*.cpp"
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/ck/hip_version.hpp.in"
"
${
PROJECT_BINARY_DIR
}
/include/ck/hip_version.hpp"
)
set
(
CK_HEADERS
${
COMPOSABLE_KERNEL_HEADERS
}
${
DEVICE_OPS_HEADERS
}
)
set
(
CK_SOURCE
${
DEVICE_OPS_SOURCE
}
)
add_library
(
composable_kernel
${
CK_SOURCE
}
include_directories
(
BEFORE
${
PROJECT_SOURCE_DIR
}
/include
${
PROJECT_BINARY_DIR
}
/include
${
PROJECT_SOURCE_DIR
}
/library/include
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include>
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/device_operation/include>
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/host/include>
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/host/host_tensor/include>
)
# The following should eventually be removed
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/utility>
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_operation>
)
target_include_directories
(
composable_kernel PUBLIC
$<BUILD_INTERFACE:
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/tensor_description>
)
# clang_tidy_check(composable_kernel)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
target
_compile_options
(
composable_kernel PRIVATE
-Werror
)
target
_compile_options
(
composable_kernel PRIVATE
-Weverything
)
add
_compile_options
(
-Werror
)
add
_compile_options
(
-Weverything
)
endif
()
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/composable_kernel/include/hip_version.hpp.in"
"
${
PROJECT_BINARY_DIR
}
/composable_kernel/include/hip_version.hpp"
)
add_subdirectory
(
host
)
add_subdirectory
(
device_operation
)
add_subdirectory
(
library
)
add_subdirectory
(
example
)
add_subdirectory
(
profiler
)
add_subdirectory
(
test
)
add_subdirectory
(
profiler
)
Dockerfile
View file @
9dce6851
FROM
ubuntu:18.04
ARG
ROCMVERSION=
4.5
ARG
ROCMVERSION=
5.0
ARG
OSDB_BKC_VERSION
RUN
set
-xe
...
...
Jenkinsfile
View file @
9dce6851
...
...
@@ -17,7 +17,7 @@ def cmake_build(Map conf=[:]){
def
compiler
=
conf
.
get
(
"compiler"
,
"/opt/rocm/bin/hipcc"
)
def
config_targets
=
conf
.
get
(
"config_targets"
,
"check"
)
def
debug_flags
=
"-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined "
+
conf
.
get
(
"extradebugflags"
,
""
)
def
build_envs
=
"CTEST_PARALLEL_LEVEL=4
MIOPEN_CONV_PRECISE_ROCBLAS_TIMING=0
"
+
conf
.
get
(
"build_env"
,
""
)
def
build_envs
=
"CTEST_PARALLEL_LEVEL=4 "
+
conf
.
get
(
"build_env"
,
""
)
def
prefixpath
=
conf
.
get
(
"prefixpath"
,
"/opt/rocm"
)
def
setup_args
=
conf
.
get
(
"setup_args"
,
""
)
...
...
@@ -60,7 +60,8 @@ def cmake_build(Map conf=[:]){
cd build
"""
def
setup_cmd
=
conf
.
get
(
"setup_cmd"
,
"${cmake_envs} cmake ${setup_args} .. "
)
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(nproc) ${config_targets}"
)
// reduce parallelism when compiling, clang uses too much memory
def
build_cmd
=
conf
.
get
(
"build_cmd"
,
"${build_envs} dumb-init make -j\$(( \$(nproc) / 1 )) ${config_targets}"
)
def
execute_cmd
=
conf
.
get
(
"execute_cmd"
,
""
)
def
cmd
=
conf
.
get
(
"cmd"
,
"""
...
...
@@ -177,15 +178,27 @@ pipeline {
// buildHipClangJobAndReboot(build_cmd: build_cmd, no_reboot:true, prefixpath: '/opt/rocm', build_type: 'debug')
// }
// }
stage
(
'Build Profiler: gfx908'
)
stage
(
'Build Profiler:
Release,
gfx908'
)
{
agent
{
label
rocmnode
(
"
gfx908
"
)}
agent
{
label
rocmnode
(
"
nogpu
"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
build_cmd
=
"make -j\$(nproc) -k ckProfiler"
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
build_cmd:
build_cmd
,
no_reboot:
true
,
build_type:
'Release'
)
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
config_targets:
"ckProfiler"
,
no_reboot:
true
,
build_type:
'Release'
)
}
}
stage
(
'Build Profiler: Debug, gfx908'
)
{
agent
{
label
rocmnode
(
"nogpu"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps
{
// until we stabilize debug build due to compiler crashes
catchError
(
buildResult:
'SUCCESS'
,
stageResult:
'FAILURE'
)
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
config_targets:
"ckProfiler"
,
no_reboot:
true
,
build_type:
'Debug'
)
}
}
}
stage
(
'Clang Format'
)
{
...
...
@@ -207,6 +220,24 @@ pipeline {
}
}
}
stage
(
"Tests"
)
{
parallel
{
stage
(
"Run Tests: gfx908"
)
{
agent
{
label
rocmnode
(
"gfx908"
)}
environment
{
setup_args
=
""" -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
}
steps
{
buildHipClangJobAndReboot
(
setup_args:
setup_args
,
config_targets:
"check"
,
no_reboot:
true
,
build_type:
'Release'
)
}
}
}
}
// enable after the cmake file supports packaging
// stage("Packages") {
// when {
...
...
@@ -222,4 +253,4 @@ pipeline {
// }
// }
}
}
\ No newline at end of file
}
composable_kernel/include/gridwise_operation_wrapper.hpp
deleted
100644 → 0
View file @
3cc57101
#ifndef CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
#define CK_GRIDWISE_OPERATION_KERNEL_WRAPPER
template
<
typename
GridwiseOp
,
typename
...
Xs
>
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
run_gridwise_operation
(
Xs
...
xs
)
{
GridwiseOp
{}.
Run
(
xs
...);
}
#endif
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_blockwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace
ck
{
template
<
index_t
BlockSize
,
typename
srcDataType
,
typename
dstDataType
,
typename
compType
,
typename
src2dDescType
,
typename
dst1dDescType
,
ReduceTensorOp_t
op
,
NanPropagation_t
nanPropaOpt
,
ReduceTensorIndices_t
reduceIndicesOpt
,
bool
isFirstCall
,
bool
isLastCall
,
index_t
GredAccessesPerThreadInBlock
>
struct
GridwiseReduction_xy_to_x_blockwise
{
using
opReduce
=
typename
reduce_binary_operator
<
compType
,
op
>::
opType
;
using
preUnaryOpType
=
typename
reduce_unary_operator
<
compType
,
op
,
isFirstCall
,
isLastCall
>::
preUnaryOp
;
using
posUnaryOpType
=
typename
reduce_unary_operator
<
compType
,
op
,
isFirstCall
,
isLastCall
>::
posUnaryOp
;
static
constexpr
auto
buffer2dDesc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
GredAccessesPerThreadInBlock
>
{},
Number
<
BlockSize
>
{}));
using
blockwise_reduce
=
BlockwiseReduction_2d_block_buffer
<
decltype
(
buffer2dDesc
),
true
,
opReduce
,
nanPropaOpt
>
;
static
constexpr
index_t
BlockBufferSize
=
buffer2dDesc
.
GetElementSize
();
static
constexpr
auto
I0
=
Number
<
0
>
{};
template
<
int
RunId
>
__device__
static
void
Run
(
const
src2dDescType
&
src2dDesc
,
const
dst1dDescType
&
dst1dDesc
,
int
origReduceLen
,
srcDataType
alpha
,
const
srcDataType
*
const
__restrict__
p_src_global
,
dstDataType
beta
,
dstDataType
*
const
__restrict__
p_dst_global
,
const
int
*
const
__restrict__
ws_indices_global
,
int
*
const
__restrict__
indices_global
);
template
<
>
__device__
static
void
Run
<
1
>
(
const
src2dDescType
&
src2dDesc
,
const
dst1dDescType
&
dst1dDesc
,
int
origReduceLen
,
srcDataType
alpha
,
const
srcDataType
*
const
__restrict__
p_src_global
,
dstDataType
beta
,
dstDataType
*
const
__restrict__
p_dst_global
,
const
int
*
const
__restrict__
ws_indices_global
,
int
*
const
__restrict__
indices_global
)
{
(
void
)
ws_indices_global
;
(
void
)
indices_global
;
// LDS
__shared__
compType
p_in_block_buffer
[
BlockBufferSize
];
const
auto
zeroVal
=
opReduce
::
GetReductionZeroVal
();
const
auto
src_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_src_global
,
src2dDesc
.
GetElementSpaceSize
(),
type_convert
<
srcDataType
>
(
zeroVal
));
auto
dst_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_dst_global
,
dst1dDesc
.
GetElementSpaceSize
());
auto
in_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
p_in_block_buffer
,
BlockBufferSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
compType
,
1
,
true
>
accuValue_buf
;
accuValue_buf
(
I0
)
=
zeroVal
;
const
auto
toReduceLength
=
src2dDesc
.
GetLength
(
Number
<
1
>
{});
const
int
divider
=
origReduceLen
;
const
preUnaryOpType
preUnaryOp
(
divider
);
const
posUnaryOpType
posUnaryOp
(
divider
);
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_1d_id
=
get_block_1d_id
();
constexpr
auto
in_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{},
Number
<
BlockBufferSize
>
{}));
using
ThreadSliceLengths
=
Sequence
<
1
,
GredAccessesPerThreadInBlock
>
;
using
ThreadClusterLengths
=
Sequence
<
1
,
BlockSize
>
;
auto
blockwise_src_load
=
BlockwiseTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperationEnum_t
::
Set
,
Sequence
<
1
,
BlockBufferSize
>
,
ThreadSliceLengths
,
ThreadClusterLengths
,
Sequence
<
0
,
1
>
,
srcDataType
,
compType
,
src2dDescType
,
decltype
(
in_block_desc
),
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
false
,
true
>
(
src2dDesc
,
make_multi_index
(
block_global_1d_id
,
0
),
in_block_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
in_block_copy_step
=
make_multi_index
(
0
,
BlockBufferSize
);
const
index_t
toReduceBlocks
=
(
toReduceLength
+
BlockSize
-
1
)
/
BlockSize
;
for
(
index_t
reducedBlocks
=
0
;
reducedBlocks
<
toReduceBlocks
;
reducedBlocks
+=
GredAccessesPerThreadInBlock
)
{
blockwise_src_load
.
RunRead
(
src2dDesc
,
src_global_buf
);
blockwise_src_load
.
RunWrite
(
in_block_desc
,
in_block_buf
);
__syncthreads
();
// do element-wise pre-reduction operation
blockwise_reduce
::
operate_on_elements
(
preUnaryOp
,
in_block_buf
);
index_t
BlocksInOneOp
=
(
reducedBlocks
<
toReduceBlocks
-
GredAccessesPerThreadInBlock
)
?
GredAccessesPerThreadInBlock
:
toReduceBlocks
-
reducedBlocks
;
blockwise_reduce
::
Reduce
(
in_block_buf
,
BlocksInOneOp
,
accuValue_buf
(
I0
));
blockwise_src_load
.
MoveSrcSliceWindow
(
src2dDesc
,
in_block_copy_step
);
}
accuValue_buf
(
I0
)
=
posUnaryOp
(
accuValue_buf
[
I0
]);
constexpr
auto
ReducedDataDesc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if
(
thread_local_id
==
0
)
{
if
(
!
float_equal_one
{}(
alpha
))
accuValue_buf
(
I0
)
*=
type_convert
<
compType
>
(
alpha
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
dstValue_buf
;
dstValue_buf
(
I0
)
=
type_convert
<
dstDataType
>
(
accuValue_buf
[
I0
]);
if
(
!
float_equal_zero
{}(
beta
))
{
auto
threadwise_dst_load
=
ThreadwiseTensorSliceTransfer_v2
<
dstDataType
,
dstDataType
,
dst1dDescType
,
decltype
(
ReducedDataDesc
),
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
1
,
false
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
priorDstValue_buf
;
threadwise_dst_load
.
Run
(
dst1dDesc
,
dst_global_buf
,
ReducedDataDesc
,
make_tuple
(
I0
),
priorDstValue_buf
);
dstValue_buf
(
I0
)
+=
priorDstValue_buf
[
I0
]
*
beta
;
}
auto
threadwise_dst_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
dstDataType
,
dstDataType
,
decltype
(
ReducedDataDesc
),
dst1dDescType
,
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
false
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
threadwise_dst_store
.
Run
(
ReducedDataDesc
,
make_tuple
(
I0
),
dstValue_buf
,
dst1dDesc
,
dst_global_buf
);
}
};
template
<
>
__device__
static
void
Run
<
2
>
(
const
src2dDescType
&
src2dDesc
,
const
dst1dDescType
&
dst1dDesc
,
int
origReduceLen
,
srcDataType
alpha
,
const
srcDataType
*
const
__restrict__
p_src_global
,
dstDataType
beta
,
dstDataType
*
const
__restrict__
p_dst_global
,
const
int
*
const
__restrict__
ws_indices_global
,
int
*
const
__restrict__
indices_global
)
{
(
void
)
ws_indices_global
;
// LDS
__shared__
compType
p_in_block_buffer
[
BlockBufferSize
];
__shared__
int
block_indices_buffer
[
BlockBufferSize
];
const
auto
zeroVal
=
opReduce
::
GetReductionZeroVal
();
const
auto
src_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_src_global
,
src2dDesc
.
GetElementSpaceSize
(),
type_convert
<
srcDataType
>
(
zeroVal
));
auto
dst_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_dst_global
,
dst1dDesc
.
GetElementSpaceSize
());
auto
dst_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
indices_global
,
dst1dDesc
.
GetElementSpaceSize
());
auto
in_block_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
p_in_block_buffer
,
BlockBufferSize
);
auto
in_block_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
block_indices_buffer
,
BlockBufferSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
compType
,
1
,
true
>
accuValue_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
int
,
1
,
true
>
accuIndex_buf
;
accuValue_buf
(
I0
)
=
zeroVal
;
accuIndex_buf
(
I0
)
=
0
;
const
auto
toReduceLength
=
src2dDesc
.
GetLength
(
Number
<
1
>
{});
const
int
divider
=
origReduceLen
;
const
preUnaryOpType
preUnaryOp
(
divider
);
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_1d_id
=
get_block_1d_id
();
constexpr
auto
in_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{},
Number
<
BlockBufferSize
>
{}));
using
ThreadSliceLengths
=
Sequence
<
1
,
GredAccessesPerThreadInBlock
>
;
using
ThreadClusterLengths
=
Sequence
<
1
,
BlockSize
>
;
auto
blockwise_src_load
=
BlockwiseTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperationEnum_t
::
Set
,
Sequence
<
1
,
BlockBufferSize
>
,
ThreadSliceLengths
,
ThreadClusterLengths
,
Sequence
<
0
,
1
>
,
srcDataType
,
compType
,
src2dDescType
,
decltype
(
in_block_desc
),
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
false
,
true
>
(
src2dDesc
,
make_multi_index
(
block_global_1d_id
,
0
),
in_block_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
in_block_copy_step
=
make_multi_index
(
0
,
BlockBufferSize
);
const
index_t
toReduceBlocks
=
(
toReduceLength
+
BlockSize
-
1
)
/
BlockSize
;
int
indexOffset
=
0
;
for
(
index_t
reducedBlocks
=
0
;
reducedBlocks
<
toReduceBlocks
;
reducedBlocks
+=
GredAccessesPerThreadInBlock
)
{
// load block data from global to LDS, no use of double buffers (to be improved)
blockwise_src_load
.
RunRead
(
src2dDesc
,
src_global_buf
);
blockwise_src_load
.
RunWrite
(
in_block_desc
,
in_block_val_buf
);
__syncthreads
();
// construct the indices for the current toReduce blocks
blockwise_reduce
::
init_buffer_indices
(
in_block_idx_buf
,
indexOffset
);
// unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
// done here
blockwise_reduce
::
operate_on_elements
(
preUnaryOp
,
in_block_val_buf
);
index_t
BlocksInOneOp
=
(
reducedBlocks
<
toReduceBlocks
-
GredAccessesPerThreadInBlock
)
?
GredAccessesPerThreadInBlock
:
toReduceBlocks
-
reducedBlocks
;
blockwise_reduce
::
Reduce2
(
in_block_val_buf
,
in_block_idx_buf
,
BlocksInOneOp
,
accuValue_buf
(
I0
),
accuIndex_buf
(
I0
));
indexOffset
+=
BlockBufferSize
;
blockwise_src_load
.
MoveSrcSliceWindow
(
src2dDesc
,
in_block_copy_step
);
}
constexpr
auto
ReducedDataDesc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if
(
thread_local_id
==
0
)
{
if
(
!
float_equal_one
{}(
alpha
))
accuValue_buf
(
I0
)
*=
type_convert
<
compType
>
(
alpha
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
dstValue_buf
;
dstValue_buf
(
I0
)
=
type_convert
<
dstDataType
>
(
accuValue_buf
[
I0
]);
if
(
!
float_equal_zero
{}(
beta
))
{
auto
threadwise_dst_load
=
ThreadwiseTensorSliceTransfer_v2
<
dstDataType
,
dstDataType
,
dst1dDescType
,
decltype
(
ReducedDataDesc
),
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
1
,
false
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
priorDstValue_buf
;
threadwise_dst_load
.
Run
(
dst1dDesc
,
dst_global_val_buf
,
ReducedDataDesc
,
make_tuple
(
I0
),
priorDstValue_buf
);
dstValue_buf
(
I0
)
+=
priorDstValue_buf
[
I0
]
*
beta
;
}
auto
threadwise_dst_val_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
dstDataType
,
dstDataType
,
decltype
(
ReducedDataDesc
),
dst1dDescType
,
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
false
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
auto
threadwise_dst_idx_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
int
,
int
,
decltype
(
ReducedDataDesc
),
dst1dDescType
,
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
false
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
threadwise_dst_val_store
.
Run
(
ReducedDataDesc
,
make_tuple
(
I0
),
dstValue_buf
,
dst1dDesc
,
dst_global_val_buf
);
threadwise_dst_idx_store
.
Run
(
ReducedDataDesc
,
make_tuple
(
I0
),
accuIndex_buf
,
dst1dDesc
,
dst_global_idx_buf
);
}
};
template
<
>
__device__
static
void
Run
<
3
>
(
const
src2dDescType
&
src2dDesc
,
const
dst1dDescType
&
dst1dDesc
,
int
origReduceLen
,
srcDataType
alpha
,
const
srcDataType
*
const
__restrict__
ws_values_global
,
dstDataType
beta
,
dstDataType
*
const
__restrict__
p_dst_global
,
const
int
*
const
__restrict__
ws_indices_global
,
int
*
const
__restrict__
indices_global
)
{
(
void
)
origReduceLen
;
// LDS
__shared__
compType
p_in_block_buffer
[
BlockBufferSize
];
__shared__
int
block_indices_buffer
[
BlockBufferSize
];
const
auto
zeroVal
=
opReduce
::
GetReductionZeroVal
();
const
auto
src_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
ws_values_global
,
src2dDesc
.
GetElementSpaceSize
(),
type_convert
<
srcDataType
>
(
zeroVal
));
const
auto
src_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
ws_indices_global
,
src2dDesc
.
GetElementSpaceSize
());
auto
dst_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
p_dst_global
,
dst1dDesc
.
GetElementSpaceSize
());
auto
dst_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Global
>
(
indices_global
,
dst1dDesc
.
GetElementSpaceSize
());
auto
in_block_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
p_in_block_buffer
,
BlockBufferSize
);
auto
in_block_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum_t
::
Lds
>
(
block_indices_buffer
,
BlockBufferSize
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
compType
,
1
,
true
>
accuValue_buf
;
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
int
,
1
,
true
>
accuIndex_buf
;
accuValue_buf
(
I0
)
=
zeroVal
;
accuIndex_buf
(
I0
)
=
0
;
const
auto
toReduceLength
=
src2dDesc
.
GetLength
(
Number
<
1
>
{});
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_1d_id
=
get_block_1d_id
();
constexpr
auto
in_block_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{},
Number
<
BlockBufferSize
>
{}));
using
ThreadSliceLengths
=
Sequence
<
1
,
GredAccessesPerThreadInBlock
>
;
using
ThreadClusterLengths
=
Sequence
<
1
,
BlockSize
>
;
auto
blockwise_src_val_load
=
BlockwiseTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperationEnum_t
::
Set
,
Sequence
<
1
,
BlockBufferSize
>
,
ThreadSliceLengths
,
ThreadClusterLengths
,
Sequence
<
0
,
1
>
,
srcDataType
,
compType
,
src2dDescType
,
decltype
(
in_block_desc
),
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
false
,
true
>
(
src2dDesc
,
make_multi_index
(
block_global_1d_id
,
0
),
in_block_desc
,
make_multi_index
(
0
,
0
));
auto
blockwise_src_idx_load
=
BlockwiseTensorSliceTransfer_v4
<
BlockSize
,
InMemoryDataOperationEnum_t
::
Set
,
Sequence
<
1
,
BlockBufferSize
>
,
ThreadSliceLengths
,
ThreadClusterLengths
,
Sequence
<
0
,
1
>
,
int
,
int
,
src2dDescType
,
decltype
(
in_block_desc
),
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
false
,
true
>
(
src2dDesc
,
make_multi_index
(
block_global_1d_id
,
0
),
in_block_desc
,
make_multi_index
(
0
,
0
));
constexpr
auto
in_block_copy_step
=
make_multi_index
(
0
,
BlockBufferSize
);
const
index_t
toReduceBlocks
=
(
toReduceLength
+
BlockSize
-
1
)
/
BlockSize
;
for
(
index_t
reducedBlocks
=
0
;
reducedBlocks
<
toReduceBlocks
;
reducedBlocks
+=
GredAccessesPerThreadInBlock
)
{
// load block data from global to LDS, no use of double buffers (to be improved)
blockwise_src_val_load
.
RunRead
(
src2dDesc
,
src_global_val_buf
);
blockwise_src_idx_load
.
RunRead
(
src2dDesc
,
src_global_idx_buf
);
blockwise_src_val_load
.
RunWrite
(
in_block_desc
,
in_block_val_buf
);
blockwise_src_idx_load
.
RunWrite
(
in_block_desc
,
in_block_idx_buf
);
__syncthreads
();
index_t
BlocksInOneOp
=
(
reducedBlocks
<
toReduceBlocks
-
GredAccessesPerThreadInBlock
)
?
GredAccessesPerThreadInBlock
:
toReduceBlocks
-
reducedBlocks
;
blockwise_reduce
::
Reduce2
(
in_block_val_buf
,
in_block_idx_buf
,
BlocksInOneOp
,
accuValue_buf
(
I0
),
accuIndex_buf
(
I0
));
blockwise_src_val_load
.
MoveSrcSliceWindow
(
src2dDesc
,
in_block_copy_step
);
blockwise_src_idx_load
.
MoveSrcSliceWindow
(
src2dDesc
,
in_block_copy_step
);
}
constexpr
auto
ReducedDataDesc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{}));
// The first thread in the block stores the reduced result to the global location
// representing the block
if
(
thread_local_id
==
0
)
{
if
(
!
float_equal_one
{}(
alpha
))
accuValue_buf
(
I0
)
*=
type_convert
<
compType
>
(
alpha
);
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
dstValue_buf
;
dstValue_buf
(
I0
)
=
type_convert
<
dstDataType
>
(
accuValue_buf
[
I0
]);
if
(
!
float_equal_zero
{}(
beta
))
{
auto
threadwise_dst_load
=
ThreadwiseTensorSliceTransfer_v2
<
dstDataType
,
dstDataType
,
dst1dDescType
,
decltype
(
ReducedDataDesc
),
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
1
,
true
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
StaticBuffer
<
AddressSpaceEnum_t
::
Vgpr
,
dstDataType
,
1
,
true
>
priorDstValue_buf
;
threadwise_dst_load
.
Run
(
dst1dDesc
,
dst_global_val_buf
,
ReducedDataDesc
,
make_tuple
(
I0
),
priorDstValue_buf
);
dstValue_buf
(
I0
)
+=
priorDstValue_buf
[
I0
]
*
beta
;
}
auto
threadwise_dst_val_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
dstDataType
,
dstDataType
,
decltype
(
ReducedDataDesc
),
dst1dDescType
,
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
true
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
auto
threadwise_dst_idx_store
=
ThreadwiseTensorSliceTransfer_v1r3
<
int
,
int
,
decltype
(
ReducedDataDesc
),
dst1dDescType
,
Sequence
<
1
>
,
Sequence
<
0
>
,
0
,
1
,
InMemoryDataOperationEnum_t
::
Set
,
1
,
true
>
(
dst1dDesc
,
make_multi_index
(
block_global_1d_id
));
threadwise_dst_val_store
.
Run
(
ReducedDataDesc
,
make_tuple
(
I0
),
dstValue_buf
,
dst1dDesc
,
dst_global_val_buf
);
threadwise_dst_idx_store
.
Run
(
ReducedDataDesc
,
make_tuple
(
I0
),
accuIndex_buf
,
dst1dDesc
,
dst_global_idx_buf
);
}
};
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_threadwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace
ck
{
// Grid-wise generic 2D reduction, "direct threadwise" variant.
//
// Each thread independently reduces one row of the 2D source tensor
// (dim 0 = invariant/output index, dim 1 = the length being reduced) with no
// cross-thread cooperation: thread `t` handles output element `t` (its source
// window starts at (t, 0) and only ever moves along dim 1).
//
// The RunId non-type template argument selects one of three flavours:
//   Run<1> - value-only reduction:  dst = alpha * reduce(src) + beta * dst
//   Run<2> - value + index reduction, indices computed from scratch and
//            written to indices_global
//   Run<3> - value + index reduction resuming from a workspace that already
//            holds partial values (ws_values_global) and their source
//            indices (ws_indices_global)
template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          typename compType, // accumulation type used while reducing
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          bool isFirstCall,
          bool isLastCall,
          index_t GredThreadBufferLength> // elements loaded & reduced per loop iteration
struct GridwiseReduction_xy_to_x_direct_threadwise
{
    // binary accumulation operator selected by `op` (also supplies the
    // reduction identity via GetReductionZeroVal())
    using opReduce = typename reduce_binary_operator<compType, op>::opType;

    // element-wise ops applied before / after accumulation; which of them is
    // a no-op depends on op and on isFirstCall/isLastCall (presumably e.g.
    // |x| before an AMAX pass, 1/N after an AVG pass -- confirm against
    // reduction_operator.hpp)
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;

    static constexpr auto I0 = Number<0>{};

    // Primary template is declared but never defined: only the explicit
    // specializations Run<1>/Run<2>/Run<3> below exist.
    // NOTE(review): in-class explicit specialization of a member function
    // template is a compiler extension (accepted by hipcc/clang), not
    // standard C++.
    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global);

    // Run<1>: value-only reduction; the two index pointers are unused.
    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;
        (void)indices_global;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        // out-of-bounds source reads are filled with the reduction identity,
        // so padded elements do not perturb the result
        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());

        // per-thread register staging buffer for one tile of the row
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_buf;

        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        // origReduceLen is the logical (unpadded) reduce length, used as the
        // divisor for averaging-style ops
        const int divider = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);
        const posUnaryOpType posUnaryOp(divider);

        using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        // global -> register copy of one (1 x GredThreadBufferLength) slice
        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                    compType,
                                                                    src2dDescType,
                                                                    decltype(ThreadBufferDesc),
                                                                    ThreadBufferLengths,
                                                                    Sequence<0, 1>,
                                                                    1,
                                                                    1,
                                                                    1,
                                                                    false>(
            src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        // march the window along the reduce dimension, one tile at a time
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // do element-wise pre-reduction operation
            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>(alpha);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

        dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

        // blend with the prior destination contents only when beta != 0,
        // avoiding a needless global read in the common beta == 0 case
        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        true>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
        }

        auto threadwise_dst_store =
            ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               true>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

        threadwise_dst_store.Run(
            ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
    };

    // Run<2>: value + index reduction; the winning element's position along
    // the reduce dimension is tracked (via Reduce2 and the running
    // indexStart offset) and written to indices_global.
    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_buf;

        using threadwise_reduce = ThreadReduce<decltype(in_thread_buf), opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider = origReduceLen;

        // no posUnaryOp here: index-producing ops (MIN/MAX/AMAX) need no
        // post-scaling of the reduced value
        const preUnaryOpType preUnaryOp(divider);

        using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                    compType,
                                                                    src2dDescType,
                                                                    decltype(ThreadBufferDesc),
                                                                    ThreadBufferLengths,
                                                                    Sequence<0, 1>,
                                                                    1,
                                                                    1,
                                                                    1,
                                                                    false>(
            src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        // running offset of the current tile along the reduce dimension, so
        // tile-local winner positions can be turned into absolute indices
        index_t indexStart = 0;
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            threadwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);

            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce2(
                in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexStart);

            indexStart += GredThreadBufferLength;

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>(alpha);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

        dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
        }

        auto threadwise_dst_val_store =
            ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

        auto threadwise_dst_idx_store =
            ThreadwiseTensorSliceTransfer_v1r3<int,
                                               int,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

        threadwise_dst_val_store.Run(
            ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
        threadwise_dst_idx_store.Run(
            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
    };

    // Run<3>: second pass of a two-pass indexed reduction. Inputs are the
    // workspace values and the indices recorded by the first pass; the
    // pre-unary op was already applied then, so none is applied here
    // (and origReduceLen is unused).
    template <>
    __device__ static void Run<3>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ ws_values_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)origReduceLen;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        // parallel register buffers: candidate values and their source indices
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredThreadBufferLength, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredThreadBufferLength, true>
            in_thread_idx_buf;

        using threadwise_reduce = ThreadReduceWithIndicesInput<decltype(in_thread_val_buf),
                                                               decltype(in_thread_idx_buf),
                                                               opReduce,
                                                               nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<1, GredThreadBufferLength>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredThreadBufferLength>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();

        auto threadwise_src_val_load =
            ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                             compType,
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc, make_multi_index(thread_global_1d_id, 0));

        auto threadwise_src_idx_load =
            ThreadwiseTensorSliceTransfer_v2<int,
                                             int,
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc, make_multi_index(thread_global_1d_id, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, GredThreadBufferLength);

        // both windows must advance in lockstep so value/index pairs stay aligned
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += GredThreadBufferLength)
        {
            threadwise_src_val_load.Run(src2dDesc,
                                        src_global_val_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_val_buf);
            threadwise_src_idx_load.Run(src2dDesc,
                                        src_global_idx_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_idx_buf);

            // do the reduction on the Thread Buffer
            threadwise_reduce::Reduce(
                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));

            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        if(!float_equal_one{}(alpha))
            accuValue_buf(I0) *= type_convert<compType>(alpha);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

        dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

        if(!float_equal_zero{}(beta))
        {
            auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                                        dstDataType,
                                                                        dst1dDescType,
                                                                        decltype(ReducedDataDesc),
                                                                        Sequence<1>,
                                                                        Sequence<0>,
                                                                        0,
                                                                        1,
                                                                        1,
                                                                        false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

            threadwise_dst_load.Run(
                dst1dDesc, dst_global_val_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

            dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
        }

        auto threadwise_dst_val_store =
            ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                               dstDataType,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

        auto threadwise_dst_idx_store =
            ThreadwiseTensorSliceTransfer_v1r3<int,
                                               int,
                                               decltype(ReducedDataDesc),
                                               dst1dDescType,
                                               Sequence<1>,
                                               Sequence<0>,
                                               0,
                                               1,
                                               InMemoryDataOperationEnum_t::Set,
                                               1,
                                               false>(
                dst1dDesc, make_multi_index(thread_global_1d_id));

        threadwise_dst_val_store.Run(
            ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
        threadwise_dst_idx_store.Run(
            ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
    };
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_direct_warpwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_DIRECT_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_warpwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
namespace
ck
{
// Grid-wise generic 2D reduction, "direct warpwise" variant.
//
// One warp cooperatively reduces one row of the 2D source tensor
// (dim 0 = invariant/output index, dim 1 = the length being reduced): warp
// `w` handles output element `w`, each lane reading an interleaved chunk of
// GredAccessesPerThreadInWarp elements, with cross-lane combination done by
// WarpReduce. Only lane 0 of each warp writes the final result.
//
// The RunId non-type template argument selects one of three flavours:
//   Run<1> - value-only reduction:  dst = alpha * reduce(src) + beta * dst
//   Run<2> - value + index reduction, indices computed from scratch and
//            written to indices_global
//   Run<3> - value + index reduction resuming from a workspace that already
//            holds partial values (ws_values_global) and their source
//            indices (ws_indices_global)
template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          typename compType, // accumulation type used while reducing
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          bool isFirstCall,
          bool isLastCall,
          index_t GredAccessesPerThreadInWarp> // elements per lane per loop iteration
struct GridwiseReduction_xy_to_x_direct_warpwise
{
    // binary accumulation operator selected by `op` (also supplies the
    // reduction identity via GetReductionZeroVal())
    using opReduce = typename reduce_binary_operator<compType, op>::opType;

    // element-wise ops applied before / after accumulation; which of them is
    // a no-op depends on op and on isFirstCall/isLastCall (presumably e.g.
    // |x| before an AMAX pass, 1/N after an AVG pass -- confirm against
    // reduction_operator.hpp)
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, isFirstCall, isLastCall>::posUnaryOp;

    static constexpr auto I0 = Number<0>{};

    // Primary template is declared but never defined: only the explicit
    // specializations Run<1>/Run<2>/Run<3> below exist.
    // NOTE(review): in-class explicit specialization of a member function
    // template is a compiler extension (accepted by hipcc/clang), not
    // standard C++.
    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               dstDataType* const __restrict__ p_dst_global,
                               const int* const __restrict__ ws_indices_global,
                               int* const __restrict__ indices_global);

    // Run<1>: value-only reduction; the two index pointers are unused.
    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;
        (void)indices_global;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        // out-of-bounds source reads are filled with the reduction identity,
        // so padded elements do not perturb the result
        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());

        // per-lane register staging buffer for this lane's chunk of the row
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
            in_thread_buf;

        using warpwise_reduce =
            WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        // origReduceLen is the logical (unpadded) reduce length, used as the
        // divisor for averaging-style ops
        const int divider = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);
        const posUnaryOpType posUnaryOp(divider);

        using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
        // map each thread to (warp, lane): the warp picks the row, the lane
        // picks the starting chunk within it
        index_t warp_global_1d_id = thread_global_1d_id / warpSize;
        index_t thread_inwarp_id  = thread_global_1d_id % warpSize;

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                    compType,
                                                                    src2dDescType,
                                                                    decltype(ThreadBufferDesc),
                                                                    ThreadBufferLengths,
                                                                    Sequence<0, 1>,
                                                                    1,
                                                                    1,
                                                                    1,
                                                                    false>(
            src2dDesc,
            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));

        // the whole warp consumes warpSize * GredAccessesPerThreadInWarp
        // elements per iteration
        constexpr auto in_thread_copy_step =
            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);

        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += warpSize * GredAccessesPerThreadInWarp)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // do element-wise pre-reduction operation
            warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);
            // do the warp-wise reduction on data of all thread buffers
            warpwise_reduce::Reduce(in_thread_buf, accuValue_buf(I0));

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        accuValue_buf(I0) = posUnaryOp(accuValue_buf[I0]);

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the warp stores the reduced result to the global location
        // representing the Warp
        if(thread_inwarp_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>(alpha);

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

            dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

            // blend with the prior destination contents only when beta != 0
            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(
                        dst1dDesc, make_multi_index(warp_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(
                    dst1dDesc, dst_global_buf, ReducedDataDesc, make_tuple(I0), priorDstValue_buf);

                dstValue_buf(I0) += priorDstValue_buf(I0) * beta;
            }

            auto threadwise_dst_store =
                ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(
                    dst1dDesc, make_multi_index(warp_global_1d_id));

            threadwise_dst_store.Run(
                ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_buf);
        }
    };

    // Run<2>: value + index reduction; the winning element's position along
    // the reduce dimension is tracked (via Reduce2 and the running
    // indexOffset) and written to indices_global by lane 0.
    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)ws_indices_global;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
            in_thread_buf;

        using warpwise_reduce =
            WarpReduce<decltype(in_thread_buf), BlockSize, opReduce, nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider = origReduceLen;

        // no posUnaryOp here: index-producing ops (MIN/MAX/AMAX) need no
        // post-scaling of the reduced value
        const preUnaryOpType preUnaryOp(divider);

        using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
        index_t warp_global_1d_id   = thread_global_1d_id / warpSize;
        index_t thread_inwarp_id    = thread_global_1d_id % warpSize;

        auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                                                    compType,
                                                                    src2dDescType,
                                                                    decltype(ThreadBufferDesc),
                                                                    ThreadBufferLengths,
                                                                    Sequence<0, 1>,
                                                                    1,
                                                                    1,
                                                                    1,
                                                                    false>(
            src2dDesc,
            make_multi_index(warp_global_1d_id, thread_inwarp_id * GredAccessesPerThreadInWarp));

        constexpr auto in_thread_copy_step =
            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);

        // running offset of the current warp-tile along the reduce dimension,
        // so tile-local winner positions can be turned into absolute indices
        index_t indexOffset = 0;
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += warpSize * GredAccessesPerThreadInWarp)
        {
            threadwise_src_load.Run(
                src2dDesc, src_global_buf, ThreadBufferDesc, make_tuple(I0, I0), in_thread_buf);

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            warpwise_reduce::operate_on_elements(preUnaryOp, in_thread_buf);

            // do the warp-wise reduction on data of all thread buffers
            warpwise_reduce::Reduce2(
                in_thread_buf, accuValue_buf(I0), accuIndex_buf(I0), indexOffset);

            indexOffset += warpSize * GredAccessesPerThreadInWarp;

            threadwise_src_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the warp stores the reduced result to the global location
        // representing the Warp
        if(thread_inwarp_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>(alpha);

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

            dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(
                        dst1dDesc, make_multi_index(warp_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(
                    dst1dDesc, make_multi_index(warp_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(
                    dst1dDesc, make_multi_index(warp_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    };

    // Run<3>: second pass of a two-pass indexed reduction. Inputs are the
    // workspace values and the indices recorded by the first pass; the
    // pre-unary op was already applied then, so none is applied here
    // (and origReduceLen is unused).
    template <>
    __device__ static void Run<3>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ ws_values_global,
                                  dstDataType beta,
                                  dstDataType* const __restrict__ p_dst_global,
                                  const int* const __restrict__ ws_indices_global,
                                  int* const __restrict__ indices_global)
    {
        (void)origReduceLen;

        const auto zeroVal = opReduce::GetReductionZeroVal();

        const auto src_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        const auto src_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, src2dDesc.GetElementSpaceSize());
        auto dst_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_dst_global, dst1dDesc.GetElementSpaceSize());
        auto dst_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            indices_global, dst1dDesc.GetElementSpaceSize());

        // parallel register buffers: candidate values and their source indices
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, GredAccessesPerThreadInWarp, true>
            in_thread_val_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, GredAccessesPerThreadInWarp, true>
            in_thread_idx_buf;

        using warpwise_reduce = WarpReduceWithIndicesInput<decltype(in_thread_val_buf),
                                                           decltype(in_thread_idx_buf),
                                                           BlockSize,
                                                           opReduce,
                                                           nanPropaOpt>;

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<1, GredAccessesPerThreadInWarp>;
        constexpr auto ThreadBufferDesc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<GredAccessesPerThreadInWarp>{}));

        index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id();
        index_t warp_global_1d_id   = thread_global_1d_id / warpSize;
        index_t thread_inwarp_id    = thread_global_1d_id % warpSize;

        auto threadwise_src_val_load =
            ThreadwiseTensorSliceTransfer_v2<srcDataType,
                                             compType,
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc,
                make_multi_index(warp_global_1d_id,
                                 thread_inwarp_id * GredAccessesPerThreadInWarp));

        auto threadwise_src_idx_load =
            ThreadwiseTensorSliceTransfer_v2<int,
                                             int,
                                             src2dDescType,
                                             decltype(ThreadBufferDesc),
                                             ThreadBufferLengths,
                                             Sequence<0, 1>,
                                             1,
                                             1,
                                             1,
                                             false>(
                src2dDesc,
                make_multi_index(warp_global_1d_id,
                                 thread_inwarp_id * GredAccessesPerThreadInWarp));

        constexpr auto in_thread_copy_step =
            make_multi_index(0, warpSize * GredAccessesPerThreadInWarp);

        // both windows must advance in lockstep so value/index pairs stay aligned
        for(index_t reducedLength = 0; reducedLength < toReduceLength;
            reducedLength += warpSize * GredAccessesPerThreadInWarp)
        {
            threadwise_src_val_load.Run(src2dDesc,
                                        src_global_val_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_val_buf);
            threadwise_src_idx_load.Run(src2dDesc,
                                        src_global_idx_buf,
                                        ThreadBufferDesc,
                                        make_tuple(I0, I0),
                                        in_thread_idx_buf);

            // do the warp-wise reduction on data of all thread buffers
            warpwise_reduce::Reduce(
                in_thread_val_buf, in_thread_idx_buf, accuValue_buf(I0), accuIndex_buf(I0));

            threadwise_src_val_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
            threadwise_src_idx_load.MoveSrcSliceWindow(src2dDesc, in_thread_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        // The first thread in the warp stores the reduced result to the global location
        // representing the Warp
        if(thread_inwarp_id == 0)
        {
            if(!float_equal_one{}(alpha))
                accuValue_buf(I0) *= type_convert<compType>(alpha);

            StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> dstValue_buf;

            dstValue_buf(I0) = type_convert<dstDataType>(accuValue_buf[I0]);

            if(!float_equal_zero{}(beta))
            {
                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<dstDataType,
                                                     dstDataType,
                                                     dst1dDescType,
                                                     decltype(ReducedDataDesc),
                                                     Sequence<1>,
                                                     Sequence<0>,
                                                     0,
                                                     1,
                                                     1,
                                                     true>(
                        dst1dDesc, make_multi_index(warp_global_1d_id));

                StaticBuffer<AddressSpaceEnum_t::Vgpr, dstDataType, 1, true> priorDstValue_buf;

                threadwise_dst_load.Run(dst1dDesc,
                                        dst_global_val_buf,
                                        ReducedDataDesc,
                                        make_tuple(I0),
                                        priorDstValue_buf);

                dstValue_buf(I0) += priorDstValue_buf[I0] * beta;
            }

            auto threadwise_dst_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<dstDataType,
                                                   dstDataType,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(
                    dst1dDesc, make_multi_index(warp_global_1d_id));

            auto threadwise_dst_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   dst1dDescType,
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(
                    dst1dDesc, make_multi_index(warp_global_1d_id));

            threadwise_dst_val_store.Run(
                ReducedDataDesc, make_tuple(I0), dstValue_buf, dst1dDesc, dst_global_val_buf);
            threadwise_dst_idx_store.Run(
                ReducedDataDesc, make_tuple(I0), accuIndex_buf, dst1dDesc, dst_global_idx_buf);
        }
    };
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/gridwise_generic_2d_reduction_multiblock.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_GENERIC_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_blockwise.hpp"
#include "blockwise_tensor_slice_transfer.hpp"
namespace
ck
{
// Grid-wise generic 2-D reduction, multi-block flavor: BlkGroupSize consecutive
// thread-blocks cooperate on each output element. Every block reduces its slice of
// the (invariant, to-reduce) 2-D source view and writes one partial result per
// output element into a global workspace (values only for RunId == 1, values plus
// element indices for RunId == 2). A follow-up kernel presumably finishes the
// reduction from the workspace — alpha/beta scaling is explicitly NOT applied here
// (both are cast to void below). TODO confirm against the second-stage kernel.
template <index_t BlockSize,
          typename srcDataType,
          typename dstDataType,
          // not used together with the beta input
          typename compType,
          typename src2dDescType,
          typename dst1dDescType,
          ReduceTensorOp_t op,
          NanPropagation_t nanPropaOpt,
          ReduceTensorIndices_t reduceIndicesOpt,
          index_t GredAccessesPerThreadInBlock>
struct GridwiseReduction_xy_to_x_multiblock
{
    // reduction binary operator plus the element-wise operators applied before/after
    // reducing, all selected from the requested ReduceTensorOp_t
    using opReduce = typename reduce_binary_operator<compType, op>::opType;
    using preUnaryOpType =
        typename reduce_unary_operator<compType, op, true, false>::preUnaryOp;
    using posUnaryOpType =
        typename reduce_unary_operator<compType, op, true, false>::posUnaryOp;

    // layout of the LDS staging buffer: GredAccessesPerThreadInBlock rows of BlockSize
    static constexpr auto buffer2dDesc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<GredAccessesPerThreadInBlock>{}, Number<BlockSize>{}));

    // block-wise reducer over the LDS buffer; blockIsOneRow == true, i.e. the
    // BlockSize threads are laid out along dimension 1 of buffer2dDesc
    using blockwise_reduce =
        BlockwiseReduction_2d_block_buffer<decltype(buffer2dDesc), true, opReduce, nanPropaOpt>;

    // number of elements each block stages in LDS per outer-loop iteration
    static constexpr index_t BlockBufferSize = buffer2dDesc.GetElementSize();

    static constexpr auto I0 = Number<0>{};

    // Primary declaration; only the RunId == 1 (values) and RunId == 2
    // (values + indices) specializations below are defined.
    template <int RunId>
    __device__ static void Run(const src2dDescType& src2dDesc,
                               const dst1dDescType& dst1dDesc,
                               int origReduceLen,
                               int BlkGroupSize,
                               srcDataType alpha,
                               const srcDataType* const __restrict__ p_src_global,
                               dstDataType beta,
                               srcDataType* const __restrict__ ws_values_global,
                               int* const __restrict__ ws_indices_global);

    // RunId == 1: reduce values only; indices workspace is untouched.
    template <>
    __device__ static void Run<1>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)ws_indices_global;
        (void)alpha; // unused
        (void)beta;  // unused

        const auto zeroVal = opReduce::GetReductionZeroVal();

        // LDS
        __shared__ compType p_in_block_buffer[BlockBufferSize];

        // out-of-range source reads are padded with the reduction's neutral value
        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));
        // workspace holds BlkGroupSize partial results per output element
        auto workspace_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_buf =
            make_dynamic_buffer<AddressSpaceEnum_t::Lds>(p_in_block_buffer, BlockBufferSize);
        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;

        accuValue_buf(I0) = zeroVal;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        // the original (pre-tiling) reduce length is used as the averaging divider
        const int divider = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        // BlkGroupSize consecutive blocks share one output row
        const index_t blkgroup_id    = block_global_id / BlkGroupSize;
        const index_t block_local_id = block_global_id % BlkGroupSize;

        // per-block share of the reduce dimension, rounded up to a multiple of
        // BlockBufferSize so every tile load is full-width
        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        // cooperative global -> LDS tile loader
        auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
                                                                 InMemoryDataOperationEnum_t::Set,
                                                                 Sequence<1, BlockBufferSize>,
                                                                 ThreadSliceLengths,
                                                                 ThreadClusterLengths,
                                                                 Sequence<0, 1>,
                                                                 srcDataType,
                                                                 compType,
                                                                 src2dDescType,
                                                                 decltype(in_block_desc),
                                                                 Sequence<0, 1>,
                                                                 Sequence<0, 1>,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 false,
                                                                 true>(
            src2dDesc,
            make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
            in_block_desc,
            make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // stage one BlockBufferSize-wide tile of this block's slice in LDS
            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_buf);

            __syncthreads();

            // do element-wise pre-reduction operation
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_buf);

            // last iteration may cover fewer than GredAccessesPerThreadInBlock rows
            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce(in_block_buf, BlocksInOneOp, accuValue_buf(I0));

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc = make_naive_tensor_descriptor_packed(
            make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_store.Run(ReducedDataDesc,
                                           make_tuple(I0),
                                           accuValue_buf,
                                           workspace_desc,
                                           workspace_global_buf);
        }
    };

    // RunId == 2: reduce values AND track the global index of the winning element
    // (used by index-producing ops such as MIN/MAX/AMAX).
    template <>
    __device__ static void Run<2>(const src2dDescType& src2dDesc,
                                  const dst1dDescType& dst1dDesc,
                                  int origReduceLen,
                                  int BlkGroupSize,
                                  srcDataType alpha,
                                  const srcDataType* const __restrict__ p_src_global,
                                  dstDataType beta,
                                  srcDataType* const __restrict__ ws_values_global,
                                  int* const __restrict__ ws_indices_global)
    {
        (void)alpha; // unused
        (void)beta;  // unused

        const auto zeroVal = opReduce::GetReductionZeroVal();

        // LDS
        __shared__ compType p_in_block_values_buffer[BlockBufferSize];
        __shared__ int p_in_block_indices_buffer[BlockBufferSize];

        const auto src_global_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            p_src_global, src2dDesc.GetElementSpaceSize(), type_convert<srcDataType>(zeroVal));

        // separate value/index workspaces, each BlkGroupSize entries per output element
        auto workspace_global_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_values_global, dst1dDesc.GetLength(I0) * BlkGroupSize);
        auto workspace_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Global>(
            ws_indices_global, dst1dDesc.GetLength(I0) * BlkGroupSize);

        auto in_block_val_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_values_buffer, BlockBufferSize);
        auto in_block_idx_buf = make_dynamic_buffer<AddressSpaceEnum_t::Lds>(
            p_in_block_indices_buffer, BlockBufferSize);

        StaticBuffer<AddressSpaceEnum_t::Vgpr, compType, 1, true> accuValue_buf;
        StaticBuffer<AddressSpaceEnum_t::Vgpr, int, 1, true> accuIndex_buf;

        accuValue_buf(I0) = zeroVal;
        accuIndex_buf(I0) = 0;

        const auto toReduceLength = src2dDesc.GetLength(Number<1>{});
        const int divider         = origReduceLen;

        const preUnaryOpType preUnaryOp(divider);

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t blkgroup_id     = block_global_id / BlkGroupSize;
        const index_t block_local_id  = block_global_id % BlkGroupSize;

        const index_t reduceSizePerBlock =
            (((toReduceLength + BlkGroupSize - 1) / BlkGroupSize + BlockBufferSize - 1) /
             BlockBufferSize) *
            BlockBufferSize;

        constexpr auto in_block_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<BlockSize * GredAccessesPerThreadInBlock>{}));

        using ThreadSliceLengths   = Sequence<1, GredAccessesPerThreadInBlock>;
        using ThreadClusterLengths = Sequence<1, BlockSize>;

        auto blockwise_src_load = BlockwiseTensorSliceTransfer_v4<BlockSize,
                                                                 InMemoryDataOperationEnum_t::Set,
                                                                 Sequence<1, BlockBufferSize>,
                                                                 ThreadSliceLengths,
                                                                 ThreadClusterLengths,
                                                                 Sequence<0, 1>,
                                                                 srcDataType,
                                                                 compType,
                                                                 src2dDescType,
                                                                 decltype(in_block_desc),
                                                                 Sequence<0, 1>,
                                                                 Sequence<0, 1>,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 1,
                                                                 false,
                                                                 true>(
            src2dDesc,
            make_multi_index(blkgroup_id, block_local_id * reduceSizePerBlock),
            in_block_desc,
            make_multi_index(0, 0));

        constexpr auto in_block_copy_step = make_multi_index(0, BlockBufferSize);

        const index_t toReduceBlocks = (reduceSizePerBlock + BlockSize - 1) / BlockSize;

        // global index of the first element covered by this block's current tile
        int indexOffset = block_local_id * reduceSizePerBlock;

        for(index_t reducedBlocks = 0; reducedBlocks < toReduceBlocks;
            reducedBlocks += GredAccessesPerThreadInBlock)
        {
            // seed the LDS index buffer with each element's global position
            blockwise_reduce::init_buffer_indices(in_block_idx_buf, indexOffset);

            blockwise_src_load.RunRead(src2dDesc, src_global_buf);
            blockwise_src_load.RunWrite(in_block_desc, in_block_val_buf);

            __syncthreads();

            // unary operation before reducing, needed by AMAX; For MIN/MAX, nothing is actually
            // done here
            blockwise_reduce::operate_on_elements(preUnaryOp, in_block_val_buf);

            index_t BlocksInOneOp = (reducedBlocks < toReduceBlocks - GredAccessesPerThreadInBlock)
                                        ? GredAccessesPerThreadInBlock
                                        : toReduceBlocks - reducedBlocks;

            blockwise_reduce::Reduce2(in_block_val_buf,
                                      in_block_idx_buf,
                                      BlocksInOneOp,
                                      accuValue_buf(I0),
                                      accuIndex_buf(I0));

            indexOffset += BlockBufferSize;

            blockwise_src_load.MoveSrcSliceWindow(src2dDesc, in_block_copy_step);
        }

        constexpr auto ReducedDataDesc =
            make_naive_tensor_descriptor_packed(make_tuple(Number<1>{}));

        const auto workspace_desc = make_naive_tensor_descriptor_packed(
            make_tuple(dst1dDesc.GetLength(I0) * BlkGroupSize));

        // The first thread in the block stores the reduced result to the global location
        // representing the block
        if(thread_local_id == 0)
        {
            auto threadwise_workspace_val_store =
                ThreadwiseTensorSliceTransfer_v1r3<compType,
                                                   srcDataType,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            auto threadwise_workspace_idx_store =
                ThreadwiseTensorSliceTransfer_v1r3<int,
                                                   int,
                                                   decltype(ReducedDataDesc),
                                                   decltype(workspace_desc),
                                                   Sequence<1>,
                                                   Sequence<0>,
                                                   0,
                                                   1,
                                                   InMemoryDataOperationEnum_t::Set,
                                                   1,
                                                   true>(workspace_desc,
                                                         make_multi_index(block_global_id));

            threadwise_workspace_val_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuValue_buf,
                                               workspace_desc,
                                               workspace_global_val_buf);

            threadwise_workspace_idx_store.Run(ReducedDataDesc,
                                               make_tuple(I0),
                                               accuIndex_buf,
                                               workspace_desc,
                                               workspace_global_idx_buf);
        }
    };
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/reduction_functions_blockwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace
ck
{
// Block-wise reduction over a 2-D LDS buffer. The blockIsOneRow flag selects how
// the BlockSize threads map onto the buffer: when true a thread owns one column
// (dimension 1), when false a thread owns one row (dimension 0). The other
// dimension's extent (NumBlocks) is swept serially by each thread.
template <typename buffer2dDescType,
          bool blockIsOneRow,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct BlockwiseReduction_2d_block_buffer
{
    using compType = typename opReduce::dataType;

    static constexpr auto buffer2dDesc = buffer2dDescType{};

    // number of participating threads, taken from the thread-mapped dimension
    static constexpr index_t BlockSize =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<1>{}) : buffer2dDesc.GetLength(Number<0>{});
    // number of sub-buffers each thread sweeps serially
    static constexpr index_t NumBlocks =
        blockIsOneRow ? buffer2dDesc.GetLength(Number<0>{}) : buffer2dDesc.GetLength(Number<1>{});

    // binary accumulation operator wrapped with the requested NaN-propagation policy
    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // This interface does not accumulate on indices
    //
    // Phase 1: each thread serially folds its toReduceBlocks elements into a local
    // accumulator and parks it at position 0 of its lane. Phase 2: a strided
    // tree reduction across lanes. Phase 3: thread 0 folds the result into accuData.
    // NOTE(review): the halving tree (BlockSize / 2, / 4, ...) appears to assume
    // BlockSize is a power of two — TODO confirm at instantiation sites.
    template <typename BufferType>
    __device__ static void
    Reduce(BufferType& block_buffer, index_t toReduceBlocks, compType& accuData)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetReductionZeroVal();

        index_t offset;

        // phase 1: serial per-thread accumulation along the non-thread dimension
        for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
        {
            offset = blockIsOneRow
                         ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_local_id))
                         : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

            compType opData = type_convert<compType>(block_buffer[offset]);
            binop::calculate(lAccuData, opData);
        }

        offset = blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                               : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

        block_buffer(offset) = lAccuData;

        __syncthreads();

        // phase 2: strided tree reduction across threads; the barrier is outside the
        // divergent branch so all threads reach it
        for(index_t indOffset = BlockSize / 2; indOffset > 0; indOffset /= 2)
        {
            if(thread_local_id < indOffset)
            {
                index_t offset1 =
                    blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(0, thread_local_id))
                                  : buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

                index_t offset2 = blockIsOneRow
                                      ? buffer2dDesc.CalculateOffset(
                                            make_tuple(0, thread_local_id + indOffset))
                                      : buffer2dDesc.CalculateOffset(
                                            make_tuple(thread_local_id + indOffset, 0));

                compType opData1 = type_convert<compType>(block_buffer[offset1]);
                compType opData2 = type_convert<compType>(block_buffer[offset2]);
                binop::calculate(opData1, opData2);
                block_buffer(offset1) = type_convert<compType>(opData1);
            }

            __syncthreads();
        }

        // phase 3: fold the block-wide result into the caller's accumulator
        if(thread_local_id == 0)
        {
            compType tmpVal = type_convert<compType>(block_buffer[0]);

            binop::calculate(accuData, tmpVal);
        }
    };

    // This interface accumulates on both data values and indices
    //
    // Same three-phase scheme as Reduce(), but every value travels together with its
    // index in block_indices_buffer. NOTE(review): in the blockIsOneRow branch the
    // __syncthreads() sits inside the `thread_local_id % ...` tree loop but outside
    // the divergent if — all threads execute it; the tree again appears to assume a
    // power-of-two BlockSize — TODO confirm.
    template <typename BufferType, typename IdxBufferType>
    __device__ static void Reduce2(BufferType& block_buffer,
                                   IdxBufferType& block_indices_buffer,
                                   index_t toReduceBlocks,
                                   compType& accuData,
                                   int& accuIndex)
    {
        const index_t thread_local_id = get_thread_local_1d_id();
        compType lAccuData            = opReduce::GetReductionZeroVal();
        int lAccuIndex                = 0;

        if constexpr(blockIsOneRow)
        {
            // tree-reduce each row across threads, one row at a time
            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
                {
                    if(thread_local_id % (indOffset * 2) == 0)
                    {
                        index_t offset1 = buffer2dDesc.CalculateOffset(
                            make_tuple(otherDimInd, thread_local_id));
                        index_t offset2 = buffer2dDesc.CalculateOffset(
                            make_tuple(otherDimInd, thread_local_id + indOffset));

                        compType currVal1 = type_convert<compType>(block_buffer[offset1]);
                        compType currVal2 = type_convert<compType>(block_buffer[offset2]);
                        int currIndex1    = block_indices_buffer[offset1];
                        int currIndex2    = block_indices_buffer[offset2];

                        binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
                        block_buffer(offset1)         = type_convert<compType>(currVal1);
                        block_indices_buffer(offset1) = currIndex1;
                    }

                    __syncthreads();
                }
            }

            // thread 0 folds the per-row winners (column 0) into the accumulator
            if(thread_local_id == 0)
            {
                for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
                {
                    index_t offset = buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, 0));

                    compType tmpVal = type_convert<compType>(block_buffer[offset]);
                    int tmpIndex    = block_indices_buffer[offset];

                    binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
                }

                binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
            }
        }
        else
        {
            index_t offset;

            // phase 1: serial per-thread accumulation along this thread's row
            for(index_t otherDimInd = 0; otherDimInd < toReduceBlocks; otherDimInd++)
            {
                offset =
                    buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, otherDimInd));

                compType currVal = type_convert<compType>(block_buffer[offset]);
                int currIndex    = block_indices_buffer[offset];

                binop::calculate(lAccuData, currVal, lAccuIndex, currIndex);
            }

            offset = buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));

            block_buffer(offset)         = lAccuData;
            block_indices_buffer(offset) = lAccuIndex;

            __syncthreads();

            // phase 2: tree reduction across threads (column 0)
            for(index_t indOffset = 1; indOffset < BlockSize; indOffset *= 2)
            {
                if(thread_local_id % (indOffset * 2) == 0)
                {
                    index_t offset1 =
                        buffer2dDesc.CalculateOffset(make_tuple(thread_local_id, 0));
                    index_t offset2 = buffer2dDesc.CalculateOffset(
                        make_tuple(thread_local_id + indOffset, 0));

                    compType currVal1 = type_convert<compType>(block_buffer[offset1]);
                    compType currVal2 = type_convert<compType>(block_buffer[offset2]);
                    int currIndex1    = block_indices_buffer[offset1];
                    int currIndex2    = block_indices_buffer[offset2];

                    binop::calculate(currVal1, currVal2, currIndex1, currIndex2);
                    block_buffer(offset1)         = type_convert<compType>(currVal1);
                    block_indices_buffer(offset1) = currIndex1;
                }

                __syncthreads();
            }

            // phase 3: thread 0 folds the block-wide winner into the accumulator
            if(thread_local_id == 0)
            {
                compType tmpVal = type_convert<compType>(block_buffer[0]);
                int tmpIndex    = block_indices_buffer[0];

                binop::calculate(accuData, tmpVal, accuIndex, tmpIndex);
            }
        }
    };

    // Fill the whole 2-D block buffer with one value; each thread writes its lane
    // across all NumBlocks sub-buffers.
    template <typename BufferType>
    __device__ static void set_buffer_value(BufferType& block_buffer, compType value)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset =
                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = value;

            __syncthreads();
        }
    };

    // Initialize the block-wise indices buffer, the index for each element in the block-wise
    // data buffer is calculated according to its position in the buffer and the global starting
    // index
    template <typename IdxBufferType>
    __device__ static void init_buffer_indices(IdxBufferType& block_indices_buffer, int indexStart)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset =
                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            // element's global index == its linear offset in the buffer + tile start
            block_indices_buffer(offset) = offset + indexStart;

            __syncthreads();
        }
    };

    // Execute unary operation on the block buffer elements
    template <typename unary_op_type, typename BufferType>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& block_buffer)
    {
        index_t thread_id = get_thread_local_1d_id();

        for(index_t otherDimInd = 0; otherDimInd < NumBlocks; otherDimInd++)
        {
            index_t offset =
                blockIsOneRow ? buffer2dDesc.CalculateOffset(make_tuple(otherDimInd, thread_id))
                              : buffer2dDesc.CalculateOffset(make_tuple(thread_id, otherDimInd));

            block_buffer(offset) = unary_op(block_buffer[offset]);

            __syncthreads();
        }
    };
};
};
// end of namespace ck
#endif
composable_kernel/include/tensor_operation/reduction_functions_threadwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#define CK_REDUCTION_FUNCTIONS_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace
ck
{
// Per-thread reduction helpers: fold the contents of a thread-private StaticBuffer
// into a scalar accumulator, optionally tracking the winning element's index.
template <typename BufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduce
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    // number of elements held by the per-thread buffer (a compile-time constant)
    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // binary accumulation operator wrapped with the requested NaN-propagation policy
    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // Fold every buffer element into accuData; indices are not tracked.
    __device__ static void Reduce(const BufferType& thread_buffer, compType& accuData)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto iElem) { binop::calculate(accuData, thread_buffer[iElem]); });
    };

    // Fold every buffer element into accuData while tracking the winner's index;
    // element i carries global index indexStart + i. Used by the Direct_ThreadWise
    // reduction method for the first-time reduction pass.
    __device__ static void
    Reduce2(const BufferType& thread_buffer, compType& accuData, int& accuIndex, int indexStart)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) {
            int elemGlobalIdx = iElem + indexStart;

            binop::calculate(accuData, thread_buffer[iElem], accuIndex, elemGlobalIdx);
        });
    };

    // Fill the whole per-thread buffer with one value.
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) { thread_buffer(iElem) = value; });
    };

    // Apply a unary operator to each buffer element in place.
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto iElem) { thread_buffer(iElem) = unary_op(thread_buffer[iElem]); });
    };
};
// Per-thread reduction over a value buffer paired with a pre-computed index buffer:
// the caller supplies the index of each element instead of deriving it from position.
template <typename BufferType, typename IdxBufferType, typename opReduce, NanPropagation_t nanPropaOpt>
struct ThreadReduceWithIndicesInput
{
    using compType = typename opReduce::dataType;

    static_assert(BufferType::IsStaticBuffer(), "Thread-wise reduction needs use StaticBuffer!");

    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Thread-wise reduction needs use StaticBuffer for indices!");

    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of StaticBuffer for Thread-wise reduction should be same as the compType!");

    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type of StaticBuffer for Thread-wise reduction should be index_t!");

    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    // element count of the (equally sized) value and index buffers
    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // binary accumulation operator wrapped with the requested NaN-propagation policy
    using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    // Fold every (value, index) pair into the accumulators. Used by the
    // Direct_ThreadWise reduction method for the second-time reduction pass.
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) {
            binop::calculate(
                accuData, thread_buffer[iElem], accuIndex, thread_indices_buffer[iElem]);
        });
    };

    // Fill the whole per-thread value buffer with one value.
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto iElem) { thread_buffer(iElem) = value; });
    };

    // Apply a unary operator to each value-buffer element in place.
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto iElem) { thread_buffer(iElem) = unary_op(thread_buffer[iElem]); });
    };
};
};
// end of namespace ck
#endif
composable_kernel/include/tensor_operation/reduction_functions_warpwise.hpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#define CK_REDUCTION_FUNCTIONS_WARPWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_binop.hpp"
namespace
ck
{
template
<
typename
BufferType
,
index_t
BlockSize
,
typename
opReduce
,
NanPropagation_t
nanPropaOpt
>
struct
WarpReduce
{
using
compType
=
typename
opReduce
::
dataType
;
using
binop
=
detail
::
binop_with_nan_check
<
nanPropaOpt
,
opReduce
,
compType
>
;
static_assert
(
BufferType
::
IsStaticBuffer
(),
"Per-thread buffer for WarpWise reduction should be StaticBuffer!"
);
static_assert
(
std
::
is_same
<
typename
BufferType
::
type
,
compType
>::
value
,
"Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
"the compType!"
);
static
constexpr
index_t
ThreadBufferLen
=
BufferType
::
Size
();
static
constexpr
bool
have_builtin_shuffle
=
std
::
is_same
<
compType
,
float
>::
value
||
std
::
is_same
<
compType
,
double
>::
value
;
// This interface does not accumulate on indices
__device__
static
void
Reduce
(
const
BufferType
&
thread_buffer
,
compType
&
accuData
)
{
if
constexpr
(
have_builtin_shuffle
)
ReduceImpl1
(
thread_buffer
,
accuData
);
else
ReduceImpl2
(
thread_buffer
,
accuData
);
};
// This interface implementation uses HIP built-in device shuffling functions
__device__
static
void
ReduceImpl1
(
const
BufferType
&
thread_buffer
,
compType
&
accuData
)
{
compType
lAccuData
=
opReduce
::
GetReductionZeroVal
();
static_for
<
0
,
ThreadBufferLen
,
1
>
{}(
[
&
](
auto
I
)
{
binop
::
calculate
(
lAccuData
,
thread_buffer
[
I
]);
});
// synchronize among all threads in this warp
__all
(
1
);
for
(
index_t
stride
=
warpSize
/
2
;
stride
>
0
;
stride
/=
2
)
{
compType
tmpVal
=
__shfl_down
(
lAccuData
,
stride
,
warpSize
);
binop
::
calculate
(
lAccuData
,
tmpVal
);
__all
(
1
);
}
binop
::
calculate
(
accuData
,
lAccuData
);
};
// This interface implementation does not use HIP built-in device shuffling functions
// since for fp16, built-in shuffling functions is not provided by HIP
__device__
static
void
ReduceImpl2
(
const
BufferType
&
thread_buffer
,
compType
&
accuData
)
{
compType
lAccuData
=
opReduce
::
GetReductionZeroVal
();
static_for
<
0
,
ThreadBufferLen
,
1
>
{}(
[
&
](
auto
I
)
{
binop
::
calculate
(
lAccuData
,
thread_buffer
[
I
]);
});
__syncthreads
();
index_t
thread_id
=
get_thread_local_1d_id
();
index_t
warpId
=
thread_id
/
warpSize
;
index_t
thread_inwarp_id
=
thread_id
%
warpSize
;
__shared__
compType
shuffle_buffer
[
BlockSize
];
compType
*
myBuffer
=
&
shuffle_buffer
[
warpId
*
warpSize
];
myBuffer
[
thread_inwarp_id
]
=
lAccuData
;
__syncthreads
();
for
(
index_t
stride
=
warpSize
/
2
;
stride
>
0
;
stride
/=
2
)
{
if
(
thread_inwarp_id
<
stride
)
{
compType
currVal1
=
myBuffer
[
thread_inwarp_id
];
compType
currVal2
=
myBuffer
[
thread_inwarp_id
+
stride
];
binop
::
calculate
(
currVal1
,
currVal2
);
myBuffer
[
thread_inwarp_id
]
=
currVal1
;
}
__syncthreads
();
}
if
(
thread_inwarp_id
==
0
)
binop
::
calculate
(
accuData
,
myBuffer
[
0
]);
};
// First-pass warp reduction over data AND indices (Direct_WarpWise method).
// Dispatches at compile time: the shuffle implementation when HIP builtin
// shuffles exist for compType, otherwise the LDS fallback. indexStart is the
// global offset of this warp's first element, used to synthesize indices.
__device__ static void Reduce2(const BufferType& thread_buffer,
                               compType& accuData,
                               int& accuIndex,
                               int indexStart)
{
    if constexpr(have_builtin_shuffle)
    {
        Reduce2Impl1(thread_buffer, accuData, accuIndex, indexStart);
    }
    else
    {
        Reduce2Impl2(thread_buffer, accuData, accuIndex, indexStart);
    }
};
// Shuffle-based first-pass reduction that tracks the winning index alongside
// the value. Indices are synthesized as
//   laneId * ThreadBufferLen + elementOffset + indexStart.
// Lanes combine with an ascending-stride __shfl_down doubling tree; lane 0
// holds the warp result and is the only lane that updates accuData/accuIndex.
__device__ static void Reduce2Impl1(const BufferType& thread_buffer,
                                    compType& accuData,
                                    int& accuIndex,
                                    int indexStart)
{
    compType lane_value = opReduce::GetReductionZeroVal();
    int lane_index      = 0;

    const index_t laneId = get_thread_local_1d_id() % warpSize;

    // Fold private elements, synthesizing each element's global index.
    static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
        const int elem_index = laneId * ThreadBufferLen + I + indexStart;

        binop::calculate(lane_value, thread_buffer[I], lane_index, elem_index);
    });

    // Barrier across the lanes of this warp.
    __all(1);

    // Recursive-doubling tree over value/index pairs.
    for(index_t stride = 1; stride < warpSize; stride *= 2)
    {
        compType pulled_value = __shfl_down(lane_value, stride, warpSize);
        int pulled_index      = __shfl_down(lane_index, stride, warpSize);

        binop::calculate(lane_value, pulled_value, lane_index, pulled_index);
        __all(1);
    }

    if(laneId == 0)
        binop::calculate(accuData, lane_value, accuIndex, lane_index);
};
// This interface implementation does not use HIP built-in device shuffling functions since for
// fp16, built-in shuffling functions is not provided by HIP.
//
// LDS fallback for the first-pass value+index warp reduction. Each warp stages
// its lane partials (value and synthesized global index) into its own
// warpSize-wide slice of BlockSize-wide shared arrays, then reduces the slice
// with a recursive-doubling tree. Lane 0 folds the slice result into
// accuData/accuIndex.
__device__ static void Reduce2Impl2(const BufferType& thread_buffer,
                                    compType& accuData,
                                    int& accuIndex,
                                    int indexStart)
{
    compType lAccuData = opReduce::GetReductionZeroVal();
    int lAccuIndex     = 0;

    index_t thread_id        = get_thread_local_1d_id();
    index_t warpId           = thread_id / warpSize;
    index_t thread_inwarp_id = thread_id % warpSize;

    // Fold private elements; each element's global index is
    // laneId * ThreadBufferLen + elementOffset + indexStart.
    static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
        int currIndex = thread_inwarp_id * ThreadBufferLen + I + indexStart;

        binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, currIndex);
    });

    __shared__ compType shuffle_data_buffer[BlockSize];
    __shared__ int shuffle_indices_buffer[BlockSize];

    // This warp's private warpSize-wide windows of the shared arrays.
    compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
    int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

    myDataBuffer[thread_inwarp_id]    = lAccuData;
    myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

    __syncthreads();

    for(index_t stride = 1; stride < warpSize; stride *= 2)
    {
        // BUGFIX: only lanes whose partner lies inside this warp's slice take
        // part. The previous unguarded read of [thread_inwarp_id + stride]
        // reached past the warp's slice — and, for the last warp, past the end
        // of the BlockSize-sized shared arrays (out-of-bounds read whose
        // garbage could win the value/index comparison). With the guard, lane
        // 0 still accumulates the full warp result via recursive doubling.
        if(thread_inwarp_id + stride < warpSize)
        {
            compType currVal1 = myDataBuffer[thread_inwarp_id];
            compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
            int currIndex1    = myIndicesBuffer[thread_inwarp_id];
            int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

            binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

            myDataBuffer[thread_inwarp_id]    = currVal1;
            myIndicesBuffer[thread_inwarp_id] = currIndex1;
        }
        __syncthreads();
    }

    if(thread_inwarp_id == 0)
        binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
};
// Fill every element of the per-thread buffer with `value`
// (e.g. to reset it to the reduction identity), then barrier the warp.
// cppcheck-suppress constParameter
__device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
{
    static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

    __all(1);
};
// Apply `unary_op` elementwise, in place, to the per-thread buffer
// (e.g. a pre/post-processing transform around the reduction), then
// barrier the warp.
template <typename unary_op_type>
__device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
{
    static_for<0, ThreadBufferLen, 1>{}(
        [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

    __all(1);
};
};
// Second-pass warp-wise reduction for reductions that carry indices: the
// per-thread buffers already hold value/index pairs produced by a first pass,
// so indices come from IdxBufferType instead of being synthesized.
template <typename BufferType,
          typename IdxBufferType,
          index_t BlockSize,
          typename opReduce,
          NanPropagation_t nanPropaOpt>
struct WarpReduceWithIndicesInput
{
    // Accumulation type and (optionally NaN-propagating) binary reduce op.
    using compType = typename opReduce::dataType;
    using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;

    static_assert(BufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer!");
    static_assert(IdxBufferType::IsStaticBuffer(),
                  "Per-thread buffer for WarpWise reduction should be StaticBuffer for indices!");
    static_assert(std::is_same<typename BufferType::type, compType>::value,
                  "Data type of per-thread StaticBuffer for WarpWise reduction should be same as "
                  "the compType!");
    static_assert(std::is_same<typename IdxBufferType::type, index_t>::value,
                  "Indices type per-thread of StaticBuffer for WarpWise reduction should be index_t!");
    static_assert(BufferType::Size() == IdxBufferType::Size(),
                  "StaticBuffers for data and indices should have the same sizes!");

    static constexpr index_t ThreadBufferLen = BufferType::Size();

    // HIP provides __shfl_down only for float/double; other types use the LDS path.
    static constexpr bool have_builtin_shuffle =
        std::is_same<compType, float>::value || std::is_same<compType, double>::value;

    // This interface accumulates on both data values and indices and is called by Direct_WarpWise
    // reduction method at second-time reduction
    __device__ static void Reduce(const BufferType& thread_buffer,
                                  const IdxBufferType& thread_indices_buffer,
                                  compType& accuData,
                                  int& accuIndex)
    {
        if constexpr(have_builtin_shuffle)
            ReduceImpl1(thread_buffer, thread_indices_buffer, accuData, accuIndex);
        else
            ReduceImpl2(thread_buffer, thread_indices_buffer, accuData, accuIndex);
    };

    // This interface implementation uses HIP built-in device shuffling functions.
    // Recursive-doubling __shfl_down tree over value/index pairs; lane 0 ends up
    // with the warp result (only lane 0's accuData/accuIndex are meaningful).
    __device__ static void ReduceImpl1(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData = opReduce::GetReductionZeroVal();
        int lAccuIndex     = 0;

        // Fold this thread's value/index pairs.
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        // synchronize among all threads in this warp
        __all(1);

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            compType tmpVal = __shfl_down(lAccuData, stride, warpSize);
            int tmpIndex    = __shfl_down(lAccuIndex, stride, warpSize);

            binop::calculate(lAccuData, tmpVal, lAccuIndex, tmpIndex);
            __all(1);
        }

        binop::calculate(accuData, lAccuData, accuIndex, lAccuIndex);
    };

    // This interface implementation does not use HIP built-in device shuffling functions
    // since for fp16, built-in shuffling functions is not provided by HIP.
    // Each warp reduces its own warpSize-wide slice of BlockSize-wide shared
    // arrays with a recursive-doubling tree; lane 0 commits the result.
    __device__ static void ReduceImpl2(const BufferType& thread_buffer,
                                       const IdxBufferType& thread_indices_buffer,
                                       compType& accuData,
                                       int& accuIndex)
    {
        compType lAccuData = opReduce::GetReductionZeroVal();
        int lAccuIndex     = 0;

        index_t thread_id        = get_thread_local_1d_id();
        index_t warpId           = thread_id / warpSize;
        index_t thread_inwarp_id = thread_id % warpSize;

        static_for<0, ThreadBufferLen, 1>{}([&](auto I) {
            binop::calculate(lAccuData, thread_buffer[I], lAccuIndex, thread_indices_buffer[I]);
        });

        __shared__ compType shuffle_data_buffer[BlockSize];
        __shared__ int shuffle_indices_buffer[BlockSize];

        // This warp's private windows of the shared arrays.
        compType* myDataBuffer = &shuffle_data_buffer[warpId * warpSize];
        int* myIndicesBuffer   = &shuffle_indices_buffer[warpId * warpSize];

        myDataBuffer[thread_inwarp_id]    = lAccuData;
        myIndicesBuffer[thread_inwarp_id] = lAccuIndex;

        __syncthreads();

        for(index_t stride = 1; stride < warpSize; stride *= 2)
        {
            // BUGFIX: guard so a lane only reads a partner inside this warp's
            // slice. The previous unguarded [thread_inwarp_id + stride] read
            // past the slice — and past the shared arrays for the last warp
            // (out-of-bounds read). Recursive doubling with the guard still
            // leaves the full warp result in slot 0.
            if(thread_inwarp_id + stride < warpSize)
            {
                compType currVal1 = myDataBuffer[thread_inwarp_id];
                compType currVal2 = myDataBuffer[thread_inwarp_id + stride];
                int currIndex1    = myIndicesBuffer[thread_inwarp_id];
                int currIndex2    = myIndicesBuffer[thread_inwarp_id + stride];

                binop::calculate(currVal1, currVal2, currIndex1, currIndex2);

                myDataBuffer[thread_inwarp_id]    = currVal1;
                myIndicesBuffer[thread_inwarp_id] = currIndex1;
            }
            __syncthreads();
        }

        if(thread_inwarp_id == 0)
            binop::calculate(accuData, myDataBuffer[0], accuIndex, myIndicesBuffer[0]);
    };

    // Fill every element of the per-thread data buffer with `value`, then
    // barrier the warp.
    // cppcheck-suppress constParameter
    __device__ static void set_buffer_value(BufferType& thread_buffer, compType value)
    {
        static_for<0, ThreadBufferLen, 1>{}([&](auto I) { thread_buffer(I) = value; });

        __all(1);
    };

    // Execute unary operation on the per-thread buffer elements (in place),
    // then barrier the warp.
    template <typename unary_op_type>
    __device__ static void operate_on_elements(unary_op_type& unary_op, BufferType& thread_buffer)
    {
        static_for<0, ThreadBufferLen, 1>{}(
            [&](auto I) { thread_buffer(I) = unary_op(thread_buffer[I]); });

        __all(1);
    };
};
};
// end of namespace ck
#endif
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw.cpp
deleted
100644 → 0
View file @
3cc57101
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
constexpr
DataTypeEnum_t
ABDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_ABDataTypeEnum
);
constexpr
DataTypeEnum_t
AccDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_AccDataTypeEnum
);
constexpr
DataTypeEnum_t
CDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_CDataTypeEnum
);
using
FloatAB
=
typename
get_datatype_from_enum
<
ABDataTypeEnum
>::
type
;
using
FloatAcc
=
typename
get_datatype_from_enum
<
AccDataTypeEnum
>::
type
;
using
FloatC
=
typename
get_datatype_from_enum
<
CDataTypeEnum
>::
type
;
constexpr
index_t
BlockSize
=
CK_PARAM_BlockSize
;
constexpr
index_t
MPerBlock
=
CK_PARAM_MPerBlock
;
constexpr
index_t
NPerBlock
=
CK_PARAM_NPerBlock
;
constexpr
index_t
KPerBlock
=
CK_PARAM_KPerBlock
;
constexpr
index_t
M1PerThread
=
CK_PARAM_M1PerThread
;
constexpr
index_t
N1PerThread
=
CK_PARAM_N1PerThread
;
constexpr
index_t
KPerThread
=
CK_PARAM_KPerThread
;
constexpr
index_t
M1N1ThreadClusterM10
=
CK_PARAM_M1N1ThreadClusterM10
;
constexpr
index_t
M1N1ThreadClusterN10
=
CK_PARAM_M1N1ThreadClusterN10
;
constexpr
index_t
M1N1ThreadClusterM11
=
CK_PARAM_M1N1ThreadClusterM11
;
constexpr
index_t
M1N1ThreadClusterN11
=
CK_PARAM_M1N1ThreadClusterN11
;
using
ABlockTransferThreadSliceLengths_K_M0_M1
=
Sequence
<
CK_PARAM_ABlockTransferThreadSliceLengths_K_M0_M1
>
;
using
ABlockTransferThreadClusterLengths_K_M0_M1
=
Sequence
<
CK_PARAM_ABlockTransferThreadClusterLengths_K_M0_M1
>
;
using
ABlockTransferThreadClusterArrangeOrder
=
Sequence
<
CK_PARAM_ABlockTransferThreadClusterArrangeOrder
>
;
using
ABlockTransferSrcAccessOrder
=
Sequence
<
CK_PARAM_ABlockTransferSrcAccessOrder
>
;
constexpr
index_t
ABlockTransferSrcVectorDim
=
CK_PARAM_ABlockTransferSrcVectorDim
;
constexpr
index_t
ABlockTransferSrcScalarPerVector
=
CK_PARAM_ABlockTransferSrcScalarPerVector
;
constexpr
index_t
ABlockTransferDstScalarPerVector_M1
=
CK_PARAM_ABlockTransferDstScalarPerVector_M1
;
constexpr
bool
AThreadTransferSrcResetCoordinateAfterRun
=
static_cast
<
bool
>
(
CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun
);
using
BBlockTransferThreadSliceLengths_K_N0_N1
=
Sequence
<
CK_PARAM_BBlockTransferThreadSliceLengths_K_N0_N1
>
;
using
BBlockTransferThreadClusterLengths_K_N0_N1
=
Sequence
<
CK_PARAM_BBlockTransferThreadClusterLengths_K_N0_N1
>
;
using
BBlockTransferThreadClusterArrangeOrder
=
Sequence
<
CK_PARAM_BBlockTransferThreadClusterArrangeOrder
>
;
using
BBlockTransferSrcAccessOrder
=
Sequence
<
CK_PARAM_BBlockTransferSrcAccessOrder
>
;
constexpr
index_t
BBlockTransferSrcVectorDim
=
CK_PARAM_BBlockTransferSrcVectorDim
;
constexpr
index_t
BBlockTransferSrcScalarPerVector
=
CK_PARAM_BBlockTransferSrcScalarPerVector
;
constexpr
index_t
BBlockTransferDstScalarPerVector_N1
=
CK_PARAM_BBlockTransferDstScalarPerVector_N1
;
constexpr
bool
BThreadTransferSrcResetCoordinateAfterRun
=
static_cast
<
bool
>
(
CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun
);
using
CThreadTransferSrcDstAccessOrder
=
Sequence
<
CK_PARAM_CThreadTransferSrcDstAccessOrder
>
;
constexpr
index_t
CThreadTransferSrcDstVectorDim
=
CK_PARAM_CThreadTransferSrcDstVectorDim
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
CK_PARAM_CThreadTransferDstScalarPerVector
;
constexpr
bool
HasMainKBlockLoop
=
static_cast
<
bool
>
(
CK_PARAM_HAS_MAIN_KBLOCK_LOOP
);
constexpr
bool
HasDoubleTailKBlockLoop
=
static_cast
<
bool
>
(
CK_PARAM_HAS_DOUBLE_TAIL_KBLOCK_LOOP
);
extern
"C"
__global__
void
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw_prepare
(
int
n
,
int
c
,
int
hi
,
int
wi
,
int
k
,
int
y
,
int
x
,
int
convStrideH
,
int
convStrideW
,
int
convDilationY
,
int
convDilationX
,
int
leftPadH
,
int
leftPadW
,
int
rightPadH
,
int
rightPadW
,
void
*
p_a_k_m0_m1_grid_desc
,
void
*
p_b_k_n0_n1_grid_desc
,
void
*
p_c_m0_m10_m11_n0_n10_n11_grid_desc
,
void
*
p_cblockid_to_m0_n0_block_cluster_adaptor
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
const
index_t
ho
=
(
hi
+
leftPadH
+
rightPadH
-
convDilationY
*
(
y
-
1
)
-
1
)
/
convStrideH
+
1
;
const
index_t
wo
=
(
wi
+
leftPadW
+
rightPadW
-
convDilationX
*
(
x
-
1
)
-
1
)
/
convStrideW
+
1
;
const
auto
in_n_c_hi_wi_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
n
,
c
,
hi
,
wi
));
const
auto
wei_k_c_y_x_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
k
,
c
,
y
,
x
));
const
auto
out_n_k_ho_wo_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
n
,
k
,
ho
,
wo
));
const
auto
descs
=
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad
(
wei_k_c_y_x_desc
,
in_n_c_hi_wi_desc
,
out_n_k_ho_wo_desc
,
make_tuple
(
convStrideH
,
convStrideW
),
make_tuple
(
convDilationY
,
convDilationX
),
make_tuple
(
leftPadH
,
leftPadW
),
make_tuple
(
rightPadH
,
rightPadW
));
const
auto
a_k_m_grid_desc
=
descs
[
I0
];
const
auto
b_k_n_grid_desc
=
descs
[
I1
];
const
auto
c_m_n_grid_desc
=
descs
[
I2
];
using
AKMGridDesc
=
decltype
(
a_k_m_grid_desc
);
using
BKNGridDesc
=
decltype
(
b_k_n_grid_desc
);
using
CMNGridDesc
=
decltype
(
c_m_n_grid_desc
);
using
AGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{})));
using
BGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{})));
using
CGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{})));
using
AGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
;
using
BGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
;
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v1r2
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperationEnum_t
::
Set
,
/* ToDo tunable */
AKMGridDesc
,
BKNGridDesc
,
CMNGridDesc
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
M1PerThread
,
N1PerThread
,
KPerThread
,
M1N1ThreadClusterM10
,
M1N1ThreadClusterN10
,
M1N1ThreadClusterM11
,
M1N1ThreadClusterN11
,
ABlockTransferThreadSliceLengths_K_M0_M1
,
ABlockTransferThreadClusterLengths_K_M0_M1
,
ABlockTransferThreadClusterArrangeOrder
,
ABlockTransferSrcAccessOrder
,
ABlockTransferSrcVectorDim
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_M1
,
AThreadTransferSrcResetCoordinateAfterRun
,
BBlockTransferThreadSliceLengths_K_N0_N1
,
BBlockTransferThreadClusterLengths_K_N0_N1
,
BBlockTransferThreadClusterArrangeOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_N1
,
BThreadTransferSrcResetCoordinateAfterRun
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGridStepHacks
,
BGridStepHacks
,
CGridStepHacks
,
AGridMoveSliceWindowStepHacks
,
BGridMoveSliceWindowStepHacks
>
;
auto
a_k_m0_m1_grid_desc
=
GridwiseGemm
::
MakeAKM0M1GridDescriptor
(
a_k_m_grid_desc
);
auto
b_k_n0_n1_grid_desc
=
GridwiseGemm
::
MakeBKN0N1GridDescriptor
(
b_k_n_grid_desc
);
auto
c_m0_m10_m11_n0_n10_n11_grid_desc
=
GridwiseGemm
::
MakeCM0M10M11N0N10N11GridDescriptor
(
c_m_n_grid_desc
);
auto
cblockid_to_m0_n0_block_cluster_adaptor
=
GridwiseGemm
::
MakeCBlockIdToM0N0BlockClusterAdaptor
(
c_m_n_grid_desc
);
if
(
hipThreadIdx_x
==
0
)
{
*
static_cast
<
decltype
(
a_k_m0_m1_grid_desc
)
*>
(
p_a_k_m0_m1_grid_desc
)
=
a_k_m0_m1_grid_desc
;
*
static_cast
<
decltype
(
b_k_n0_n1_grid_desc
)
*>
(
p_b_k_n0_n1_grid_desc
)
=
b_k_n0_n1_grid_desc
;
*
static_cast
<
decltype
(
c_m0_m10_m11_n0_n10_n11_grid_desc
)
*>
(
p_c_m0_m10_m11_n0_n10_n11_grid_desc
)
=
c_m0_m10_m11_n0_n10_n11_grid_desc
;
*
static_cast
<
decltype
(
cblockid_to_m0_n0_block_cluster_adaptor
)
*>
(
p_cblockid_to_m0_n0_block_cluster_adaptor
)
=
cblockid_to_m0_n0_block_cluster_adaptor
;
};
};
extern
"C"
__global__
void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
#endif
convolution_forward_implicit_gemm_v4r4_dlops_nchw_kcyx_nkhw
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
const
void
CONSTANT
*
p_a_k_m0_m1_grid_desc
,
const
void
CONSTANT
*
p_b_k_n0_n1_grid_desc
,
const
void
CONSTANT
*
p_c_m0_m10_m11_n0_n10_n11_grid_desc
,
const
void
CONSTANT
*
p_cblockid_to_m0_n0_block_cluster_adaptor
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
in_n_c_hi_wi_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
256
,
256
,
28
,
28
));
constexpr
auto
wei_k_c_y_x_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
256
,
256
,
3
,
3
));
constexpr
auto
out_n_k_ho_wo_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
256
,
256
,
28
,
28
));
constexpr
auto
descs
=
transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad
(
wei_k_c_y_x_desc
,
in_n_c_hi_wi_desc
,
out_n_k_ho_wo_desc
,
make_tuple
(
1
,
1
),
make_tuple
(
1
,
1
),
make_tuple
(
1
,
1
),
make_tuple
(
1
,
1
));
constexpr
auto
a_k_m_grid_desc
=
descs
[
I0
];
constexpr
auto
b_k_n_grid_desc
=
descs
[
I1
];
constexpr
auto
c_m_n_grid_desc
=
descs
[
I2
];
using
AKMGridDesc
=
decltype
(
a_k_m_grid_desc
);
using
BKNGridDesc
=
decltype
(
b_k_n_grid_desc
);
using
CMNGridDesc
=
decltype
(
c_m_n_grid_desc
);
using
AGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{})));
using
BGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{})));
using
CGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{})));
using
AGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
;
using
BGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
;
using
GridwiseGemm
=
GridwiseGemmDlops_km_kn_mn_v1r2
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperationEnum_t
::
Set
,
/* ToDo tunable */
AKMGridDesc
,
BKNGridDesc
,
CMNGridDesc
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
M1PerThread
,
N1PerThread
,
KPerThread
,
M1N1ThreadClusterM10
,
M1N1ThreadClusterN10
,
M1N1ThreadClusterM11
,
M1N1ThreadClusterN11
,
ABlockTransferThreadSliceLengths_K_M0_M1
,
ABlockTransferThreadClusterLengths_K_M0_M1
,
ABlockTransferThreadClusterArrangeOrder
,
ABlockTransferSrcAccessOrder
,
ABlockTransferSrcVectorDim
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_M1
,
AThreadTransferSrcResetCoordinateAfterRun
,
BBlockTransferThreadSliceLengths_K_N0_N1
,
BBlockTransferThreadClusterLengths_K_N0_N1
,
BBlockTransferThreadClusterArrangeOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_N1
,
BThreadTransferSrcResetCoordinateAfterRun
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGridStepHacks
,
BGridStepHacks
,
CGridStepHacks
,
AGridMoveSliceWindowStepHacks
,
BGridMoveSliceWindowStepHacks
>
;
constexpr
auto
a_k_m0_m1_grid_desc_tmp
=
GridwiseGemm
::
MakeAKM0M1GridDescriptor
(
a_k_m_grid_desc
);
constexpr
auto
b_k_n0_n1_grid_desc_tmp
=
GridwiseGemm
::
MakeBKN0N1GridDescriptor
(
b_k_n_grid_desc
);
constexpr
auto
c_m0_m10_m11_n0_n10_n11_grid_desc_tmp
=
GridwiseGemm
::
MakeCM0M10M11N0N10N11GridDescriptor
(
c_m_n_grid_desc
);
constexpr
auto
cblockid_to_m0_n0_block_cluster_adaptor_tmp
=
GridwiseGemm
::
MakeCBlockIdToM0N0BlockClusterAdaptor
(
c_m_n_grid_desc
);
using
AKM0M1GridDesc
=
decltype
(
a_k_m0_m1_grid_desc_tmp
);
using
BKN0N1GridDesc
=
decltype
(
b_k_n0_n1_grid_desc_tmp
);
using
CM0M10M11N0N10N11GridDesc
=
decltype
(
c_m0_m10_m11_n0_n10_n11_grid_desc_tmp
);
using
CBlockIdToM0N0BlockClusterAdaptor
=
decltype
(
cblockid_to_m0_n0_block_cluster_adaptor_tmp
);
const
auto
a_k_m0_m1_grid_desc
=
*
reinterpret_cast
<
const
AKM0M1GridDesc
*>
((
const
void
*
)
p_a_k_m0_m1_grid_desc
);
const
auto
b_k_n0_n1_grid_desc
=
*
reinterpret_cast
<
const
BKN0N1GridDesc
*>
((
const
void
*
)
p_b_k_n0_n1_grid_desc
);
const
auto
c_m0_m10_m11_n0_n10_n11_grid_desc
=
*
reinterpret_cast
<
const
CM0M10M11N0N10N11GridDesc
*>
(
(
const
void
*
)
p_c_m0_m10_m11_n0_n10_n11_grid_desc
);
const
auto
cblockid_to_m0_n0_block_cluster_adaptor
=
*
reinterpret_cast
<
const
CBlockIdToM0N0BlockClusterAdaptor
*>
(
(
const
void
*
)
p_cblockid_to_m0_n0_block_cluster_adaptor
);
constexpr
index_t
shared_block_size
=
GridwiseGemm
::
GetSharedMemoryNumberOfByte
()
/
sizeof
(
FloatAB
);
__shared__
FloatAB
p_shared_block
[
shared_block_size
];
GridwiseGemm
::
Run
(
p_a_grid
,
p_b_grid
,
p_c_grid
,
p_shared_block
,
a_k_m0_m1_grid_desc
,
b_k_n0_n1_grid_desc
,
c_m0_m10_m11_n0_n10_n11_grid_desc
,
cblockid_to_m0_n0_block_cluster_adaptor
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
};
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw.cpp
deleted
100644 → 0
View file @
3cc57101
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp"
using
namespace
ck
;
constexpr
DataTypeEnum_t
ABDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_ABDataTypeEnum
);
constexpr
DataTypeEnum_t
AccDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_AccDataTypeEnum
);
constexpr
DataTypeEnum_t
CDataTypeEnum
=
static_cast
<
DataTypeEnum_t
>
(
CK_PARAM_CDataTypeEnum
);
using
FloatAB
=
typename
get_datatype_from_enum
<
ABDataTypeEnum
>::
type
;
using
FloatAcc
=
typename
get_datatype_from_enum
<
AccDataTypeEnum
>::
type
;
using
FloatC
=
typename
get_datatype_from_enum
<
CDataTypeEnum
>::
type
;
constexpr
index_t
BlockSize
=
CK_PARAM_BlockSize
;
constexpr
index_t
MPerBlock
=
CK_PARAM_MPerBlock
;
constexpr
index_t
NPerBlock
=
CK_PARAM_NPerBlock
;
constexpr
index_t
KPerBlock
=
CK_PARAM_KPerBlock
;
constexpr
index_t
MPerWave
=
CK_PARAM_MPerWave
;
constexpr
index_t
NPerWave
=
CK_PARAM_NPerWave
;
constexpr
index_t
MRepeat
=
CK_PARAM_MRepeat
;
constexpr
index_t
NRepeat
=
CK_PARAM_NRepeat
;
constexpr
index_t
K1
=
CK_PARAM_K1
;
using
ABlockTransferThreadSliceLengths_K0_M_K1
=
Sequence
<
CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1
>
;
using
ABlockTransferThreadClusterLengths_K0_M_K1
=
Sequence
<
CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1
>
;
using
ABlockTransferThreadClusterArrangeOrder
=
Sequence
<
CK_PARAM_ABlockTransferThreadClusterArrangeOrder
>
;
using
ABlockTransferSrcAccessOrder
=
Sequence
<
CK_PARAM_ABlockTransferSrcAccessOrder
>
;
constexpr
index_t
ABlockTransferSrcVectorDim
=
CK_PARAM_ABlockTransferSrcVectorDim
;
constexpr
index_t
ABlockTransferSrcScalarPerVector
=
CK_PARAM_ABlockTransferSrcScalarPerVector
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K1
=
CK_PARAM_ABlockTransferDstScalarPerVector_K1
;
constexpr
bool
AThreadTransferSrcResetCoordinateAfterRun
=
static_cast
<
bool
>
(
CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun
);
using
BBlockTransferThreadSliceLengths_K0_N_K1
=
Sequence
<
CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1
>
;
using
BBlockTransferThreadClusterLengths_K0_N_K1
=
Sequence
<
CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1
>
;
using
BBlockTransferThreadClusterArrangeOrder
=
Sequence
<
CK_PARAM_BBlockTransferThreadClusterArrangeOrder
>
;
using
BBlockTransferSrcAccessOrder
=
Sequence
<
CK_PARAM_BBlockTransferSrcAccessOrder
>
;
constexpr
index_t
BBlockTransferSrcVectorDim
=
CK_PARAM_BBlockTransferSrcVectorDim
;
constexpr
index_t
BBlockTransferSrcScalarPerVector
=
CK_PARAM_BBlockTransferSrcScalarPerVector
;
constexpr
index_t
BBlockTransferDstScalarPerVector_K1
=
CK_PARAM_BBlockTransferDstScalarPerVector_K1
;
constexpr
bool
BThreadTransferSrcResetCoordinateAfterRun
=
static_cast
<
bool
>
(
CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun
);
using
CThreadTransferSrcDstAccessOrder
=
Sequence
<
CK_PARAM_CThreadTransferSrcDstAccessOrder
>
;
constexpr
index_t
CThreadTransferSrcDstVectorDim
=
CK_PARAM_CThreadTransferSrcDstVectorDim
;
constexpr
index_t
CThreadTransferDstScalarPerVector
=
CK_PARAM_CThreadTransferDstScalarPerVector
;
extern
"C"
__global__
void
convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw_prepare
(
int
n
,
int
c
,
int
hi
,
int
wi
,
int
k
,
int
y
,
int
x
,
int
convStrideH
,
int
convStrideW
,
int
convDilationY
,
int
convDilationX
,
int
leftPadH
,
int
leftPadW
,
int
rightPadH
,
int
rightPadW
,
void
*
p_a_k0_m_k1_grid_desc
,
void
*
p_b_k0_n_k1_grid_desc
,
void
*
p_c_m0_m1_m2_n_grid_desc
,
void
*
p_cblockid_to_m0_n0_block_cluster_adaptor
)
{
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
const
index_t
ho
=
(
hi
+
leftPadH
+
rightPadH
-
convDilationY
*
(
y
-
1
)
-
1
)
/
convStrideH
+
1
;
const
index_t
wo
=
(
wi
+
leftPadW
+
rightPadW
-
convDilationX
*
(
x
-
1
)
-
1
)
/
convStrideW
+
1
;
const
auto
in_n_c_hi_wi_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
n
,
c
,
hi
,
wi
));
const
auto
wei_k_c_y_x_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
k
,
c
,
y
,
x
));
const
auto
out_n_k_ho_wo_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
n
,
k
,
ho
,
wo
));
const
auto
descs
=
transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad
(
wei_k_c_y_x_desc
,
in_n_c_hi_wi_desc
,
out_n_k_ho_wo_desc
,
make_tuple
(
convStrideH
,
convStrideW
),
make_tuple
(
convDilationY
,
convDilationX
),
make_tuple
(
leftPadH
,
leftPadW
),
make_tuple
(
rightPadH
,
rightPadW
),
Number
<
K1
>
{});
const
auto
a_k0_m_k1_grid_desc
=
descs
[
I0
];
const
auto
b_k0_n_k1_grid_desc
=
descs
[
I1
];
const
auto
c_m_n_grid_desc
=
descs
[
I2
];
using
AK0MK1GridDesc
=
decltype
(
a_k0_m_k1_grid_desc
);
using
BK0NK1GridDesc
=
decltype
(
b_k0_n_k1_grid_desc
);
using
CMNGridDesc
=
decltype
(
c_m_n_grid_desc
);
using
AGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{})));
using
BGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
0
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
2
,
0
,
0
,
0
>
{})));
using
CGridStepHacks
=
decltype
(
make_tuple
(
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
1
,
0
,
0
>
{}),
make_tuple
(
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
0
,
0
,
0
>
{},
Sequence
<
0
,
0
,
2
,
0
,
0
>
{})));
using
AGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
>
;
using
BGridMoveSliceWindowStepHacks
=
Sequence
<
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
,
2
,
0
,
0
>
;
using
GridwiseGemm
=
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
<
BlockSize
,
FloatAB
,
FloatAcc
,
FloatC
,
InMemoryDataOperationEnum_t
::
Set
,
AK0MK1GridDesc
,
BK0NK1GridDesc
,
CMNGridDesc
,
MPerBlock
,
NPerBlock
,
KPerBlock
,
MPerWave
,
NPerWave
,
K1
,
MRepeat
,
NRepeat
,
ABlockTransferThreadSliceLengths_K0_M_K1
,
ABlockTransferThreadClusterLengths_K0_M_K1
,
ABlockTransferThreadClusterArrangeOrder
,
ABlockTransferSrcAccessOrder
,
ABlockTransferSrcVectorDim
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_K1
,
AThreadTransferSrcResetCoordinateAfterRun
,
BBlockTransferThreadSliceLengths_K0_N_K1
,
BBlockTransferThreadClusterLengths_K0_N_K1
,
BBlockTransferThreadClusterArrangeOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorDim
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_K1
,
BThreadTransferSrcResetCoordinateAfterRun
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AGridStepHacks
,
BGridStepHacks
,
CGridStepHacks
,
AGridMoveSliceWindowStepHacks
,
BGridMoveSliceWindowStepHacks
,
false
>
;
auto
c_m0_m1_m2_n_grid_desc
=
GridwiseGemm
::
MakeCM0M1M2NGridDescriptor
(
c_m_n_grid_desc
);
auto
cblockid_to_m0_n0_block_cluster_adaptor
=
GridwiseGemm
::
MakeCBlockClusterAdaptor
(
c_m_n_grid_desc
);
if
(
hipThreadIdx_x
==
0
)
{
*
static_cast
<
remove_cv_t
<
decltype
(
a_k0_m_k1_grid_desc
)
>*>
(
p_a_k0_m_k1_grid_desc
)
=
a_k0_m_k1_grid_desc
;
*
static_cast
<
remove_cv_t
<
decltype
(
b_k0_n_k1_grid_desc
)
>*>
(
p_b_k0_n_k1_grid_desc
)
=
b_k0_n_k1_grid_desc
;
*
static_cast
<
decltype
(
c_m0_m1_m2_n_grid_desc
)
*>
(
p_c_m0_m1_m2_n_grid_desc
)
=
c_m0_m1_m2_n_grid_desc
;
*
static_cast
<
decltype
(
cblockid_to_m0_n0_block_cluster_adaptor
)
*>
(
p_cblockid_to_m0_n0_block_cluster_adaptor
)
=
cblockid_to_m0_n0_block_cluster_adaptor
;
}
};
// Compute kernel for the v4r4 xdlops forward convolution, hard-wired to one
// NCHW problem (N=C=K=256, Hi=Wi=28, 3x3 filter, unit stride/dilation, 1-pixel
// pads). The compile-time descriptors below exist only to recover the *types*
// of the runtime grid descriptors, which arrive through opaque CONSTANT
// pointers filled in by the matching *_prepare kernel.
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
    convolution_forward_implicit_gemm_v4r4_xdlops_nchw_kcyx_nkhw(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
        const void CONSTANT* p_a_k0_m_k1_grid_desc,
        const void CONSTANT* p_b_k0_n_k1_grid_desc,
        const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
        const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    constexpr auto in_n_c_hi_wi_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));
    constexpr auto wei_k_c_y_x_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));
    constexpr auto out_n_k_ho_wo_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    constexpr auto descs = transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        make_tuple(1, 1), // conv strides
        make_tuple(1, 1), // conv dilations
        make_tuple(1, 1), // left pads
        make_tuple(1, 1), // right pads
        Number<K1>{});

    constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
    constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
    constexpr auto c_m_n_grid_desc         = descs[I2];

    using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
    using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
    using CMNGridDesc    = decltype(c_m_n_grid_desc);

    // Coordinate-step "hack" tables consumed by GridwiseGemm. The 0/1/2 flag
    // values are tied to the descriptor-transform layout and are preserved
    // verbatim from the original wrapper — do not edit without re-deriving.
    using AGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));

    using BGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));

    using CGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{})));

    using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;
    using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;

    using GridwiseGemm =
        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
                                                InMemoryDataOperationEnum_t::Set,
                                                AK0MK1GridDesc,
                                                BK0NK1GridDesc,
                                                CMNGridDesc,
                                                MPerBlock,
                                                NPerBlock,
                                                KPerBlock,
                                                MPerWave,
                                                NPerWave,
                                                K1,
                                                MRepeat,
                                                NRepeat,
                                                ABlockTransferThreadSliceLengths_K0_M_K1,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ABlockTransferSrcAccessOrder,
                                                ABlockTransferSrcVectorDim,
                                                ABlockTransferSrcScalarPerVector,
                                                ABlockTransferDstScalarPerVector_K1,
                                                AThreadTransferSrcResetCoordinateAfterRun,
                                                BBlockTransferThreadSliceLengths_K0_N_K1,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                BBlockTransferSrcAccessOrder,
                                                BBlockTransferSrcVectorDim,
                                                BBlockTransferSrcScalarPerVector,
                                                BBlockTransferDstScalarPerVector_K1,
                                                BThreadTransferSrcResetCoordinateAfterRun,
                                                CThreadTransferSrcDstAccessOrder,
                                                CThreadTransferSrcDstVectorDim,
                                                CThreadTransferDstScalarPerVector,
                                                AGridStepHacks,
                                                BGridStepHacks,
                                                CGridStepHacks,
                                                AGridMoveSliceWindowStepHacks,
                                                BGridMoveSliceWindowStepHacks,
                                                false>;

    constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
        GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
    constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
        GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);

    using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
    using CBlockIdToM0N0BlockClusterAdaptor =
        decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);

    // Reinterpret the opaque CONSTANT buffers as the typed descriptors written
    // by the *_prepare kernel (the double cast strips the CONSTANT qualifier).
    const auto a_k0_m_k1_grid_desc =
        *reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
    const auto b_k0_n_k1_grid_desc =
        *reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
    const auto c_m0_m1_m2_n_grid_desc =
        *reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
    const auto cblockid_to_m0_n0_block_cluster_adaptor =
        *reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
            (const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);

    // LDS workspace sized by the gridwise GEMM, expressed in FloatAB elements.
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

    __shared__ FloatAB p_shared_block[shared_block_size];

    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
                      p_c_grid,
                      p_shared_block,
                      a_k0_m_k1_grid_desc,
                      b_k0_n_k1_grid_desc,
                      c_m0_m1_m2_n_grid_desc,
                      cblockid_to_m0_n0_block_cluster_adaptor);
};
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk.cpp
deleted
100644 → 0
View file @
3cc57101
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp"
using namespace ck;

// Every tunable below is injected by the build system through -DCK_PARAM_*
// macros, so one source file compiles into many kernel variants.
constexpr DataTypeEnum_t ABDataTypeEnum  = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum   = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);

// Concrete element types resolved from the enum selections above.
using FloatAB  = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC   = typename get_datatype_from_enum<CDataTypeEnum>::type;

// Blocking / xdlops tiling parameters.
constexpr index_t BlockSize = CK_PARAM_BlockSize;
constexpr index_t MPerBlock = CK_PARAM_MPerBlock;
constexpr index_t NPerBlock = CK_PARAM_NPerBlock;
constexpr index_t KPerBlock = CK_PARAM_KPerBlock;
constexpr index_t MPerWave  = CK_PARAM_MPerWave;
constexpr index_t NPerWave  = CK_PARAM_NPerWave;
constexpr index_t MRepeat   = CK_PARAM_MRepeat;
constexpr index_t NRepeat   = CK_PARAM_NRepeat;
constexpr index_t K1        = CK_PARAM_K1;

// A-tensor (K0, M, K1) blockwise-copy configuration.
using ABlockTransferThreadSliceLengths_K0_M_K1 =
    Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_K0_M_K1>;
using ABlockTransferThreadClusterLengths_K0_M_K1 =
    Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_K0_M_K1>;
using ABlockTransferThreadClusterArrangeOrder =
    Sequence<CK_PARAM_ABlockTransferThreadClusterArrangeOrder>;
using ABlockTransferSrcAccessOrder = Sequence<CK_PARAM_ABlockTransferSrcAccessOrder>;

constexpr index_t ABlockTransferSrcVectorDim       = CK_PARAM_ABlockTransferSrcVectorDim;
constexpr index_t ABlockTransferSrcScalarPerVector = CK_PARAM_ABlockTransferSrcScalarPerVector;
constexpr index_t ABlockTransferDstScalarPerVector_K1 =
    CK_PARAM_ABlockTransferDstScalarPerVector_K1;

constexpr bool AThreadTransferSrcResetCoordinateAfterRun =
    static_cast<bool>(CK_PARAM_AThreadTransferSrcResetCoordinateAfterRun);

// B-tensor (K0, N, K1) blockwise-copy configuration.
using BBlockTransferThreadSliceLengths_K0_N_K1 =
    Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_K0_N_K1>;
using BBlockTransferThreadClusterLengths_K0_N_K1 =
    Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_K0_N_K1>;
using BBlockTransferThreadClusterArrangeOrder =
    Sequence<CK_PARAM_BBlockTransferThreadClusterArrangeOrder>;
using BBlockTransferSrcAccessOrder = Sequence<CK_PARAM_BBlockTransferSrcAccessOrder>;

constexpr index_t BBlockTransferSrcVectorDim       = CK_PARAM_BBlockTransferSrcVectorDim;
constexpr index_t BBlockTransferSrcScalarPerVector = CK_PARAM_BBlockTransferSrcScalarPerVector;
constexpr index_t BBlockTransferDstScalarPerVector_K1 =
    CK_PARAM_BBlockTransferDstScalarPerVector_K1;

constexpr bool BThreadTransferSrcResetCoordinateAfterRun =
    static_cast<bool>(CK_PARAM_BThreadTransferSrcResetCoordinateAfterRun);

// C-tensor threadwise-copy configuration.
using CThreadTransferSrcDstAccessOrder = Sequence<CK_PARAM_CThreadTransferSrcDstAccessOrder>;

constexpr index_t CThreadTransferSrcDstVectorDim    = CK_PARAM_CThreadTransferSrcDstVectorDim;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;
// "Prepare" kernel: builds the A/B/C grid descriptors for a runtime-sized
// NHWC/KYXC/NHWK forward convolution and persists them into the caller-provided
// device buffers, so the compute kernel can reload them without redoing the
// (expensive) descriptor math per launch.
extern "C" __global__ void convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk_prepare(
    int n,
    int hi,
    int wi,
    int c,
    int k,
    int y,
    int x,
    int convStrideH,
    int convStrideW,
    int convDilationY,
    int convDilationX,
    int leftPadH,
    int leftPadW,
    int rightPadH,
    int rightPadW,
    void* p_a_k0_m_k1_grid_desc,
    void* p_b_k0_n_k1_grid_desc,
    void* p_c_m0_m1_m2_n_grid_desc,
    void* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    // Standard convolution output-extent formula (floor division).
    const index_t ho =
        (hi + leftPadH + rightPadH - convDilationY * (y - 1) - 1) / convStrideH + 1;
    const index_t wo =
        (wi + leftPadW + rightPadW - convDilationX * (x - 1) - 1) / convStrideW + 1;

    const auto in_n_hi_wi_c_desc  = make_naive_tensor_descriptor_packed(make_tuple(n, hi, wi, c));
    const auto wei_k_y_x_c_desc   = make_naive_tensor_descriptor_packed(make_tuple(k, y, x, c));
    const auto out_n_ho_wo_k_desc = make_naive_tensor_descriptor_packed(make_tuple(n, ho, wo, k));

    const auto descs =
        transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(in_n_hi_wi_c_desc,
                                                                      wei_k_y_x_c_desc,
                                                                      out_n_ho_wo_k_desc,
                                                                      make_tuple(convStrideH, convStrideW),
                                                                      make_tuple(convDilationY, convDilationX),
                                                                      make_tuple(leftPadH, leftPadW),
                                                                      make_tuple(rightPadH, rightPadW),
                                                                      Number<K1>{});

    const auto a_k0_m_k1_grid_desc = descs[I0];
    const auto b_k0_n_k1_grid_desc = descs[I1];
    const auto c_m_n_grid_desc     = descs[I2];

    using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc);
    using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc);
    using CMNGridDesc    = decltype(c_m_n_grid_desc);

    // Step-hack flag tables for GridwiseGemm; 0/1/2 values are tied to the
    // descriptor-transform layout and are preserved verbatim. In this NHWC
    // wrapper B (weight) uses the trivial 5-entry table and A (im2col'd input)
    // the 13-entry one.
    using BGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));

    using AGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));

    using CGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{})));

    using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
    using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;

    using GridwiseGemm =
        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
                                                InMemoryDataOperationEnum_t::Set,
                                                AK0MK1GridDesc,
                                                BK0NK1GridDesc,
                                                CMNGridDesc,
                                                MPerBlock,
                                                NPerBlock,
                                                KPerBlock,
                                                MPerWave,
                                                NPerWave,
                                                K1,
                                                MRepeat,
                                                NRepeat,
                                                ABlockTransferThreadSliceLengths_K0_M_K1,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ABlockTransferSrcAccessOrder,
                                                ABlockTransferSrcVectorDim,
                                                ABlockTransferSrcScalarPerVector,
                                                ABlockTransferDstScalarPerVector_K1,
                                                AThreadTransferSrcResetCoordinateAfterRun,
                                                BBlockTransferThreadSliceLengths_K0_N_K1,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                BBlockTransferSrcAccessOrder,
                                                BBlockTransferSrcVectorDim,
                                                BBlockTransferSrcScalarPerVector,
                                                BBlockTransferDstScalarPerVector_K1,
                                                BThreadTransferSrcResetCoordinateAfterRun,
                                                CThreadTransferSrcDstAccessOrder,
                                                CThreadTransferSrcDstVectorDim,
                                                CThreadTransferDstScalarPerVector,
                                                AGridStepHacks,
                                                BGridStepHacks,
                                                CGridStepHacks,
                                                AGridMoveSliceWindowStepHacks,
                                                BGridMoveSliceWindowStepHacks,
                                                false>;

    auto c_m0_m1_m2_n_grid_desc = GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
    auto cblockid_to_m0_n0_block_cluster_adaptor =
        GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);

    // A single workitem writes the descriptors out (kernel is launched for
    // exactly this purpose; remaining threads do redundant descriptor math).
    if(hipThreadIdx_x == 0)
    {
        *static_cast<remove_cv_t<decltype(a_k0_m_k1_grid_desc)>*>(p_a_k0_m_k1_grid_desc) =
            a_k0_m_k1_grid_desc;
        *static_cast<remove_cv_t<decltype(b_k0_n_k1_grid_desc)>*>(p_b_k0_n_k1_grid_desc) =
            b_k0_n_k1_grid_desc;
        *static_cast<decltype(c_m0_m1_m2_n_grid_desc)*>(p_c_m0_m1_m2_n_grid_desc) =
            c_m0_m1_m2_n_grid_desc;
        *static_cast<decltype(cblockid_to_m0_n0_block_cluster_adaptor)*>(
            p_cblockid_to_m0_n0_block_cluster_adaptor) = cblockid_to_m0_n0_block_cluster_adaptor;
    }
};
// Compute kernel for the v4r4 xdlops forward convolution, hard-wired to one
// NHWC problem (N=C=K=256, Hi=Wi=28, 3x3 filter, unit stride/dilation, 1-pixel
// pads). Compile-time descriptors below only fix the *types*; the actual
// runtime descriptors are read back from CONSTANT buffers filled by the
// matching *_prepare kernel.
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
    convolution_forward_implicit_gemm_v4r4_xdlops_nhwc_kyxc_nhwk(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
        const void CONSTANT* p_a_k0_m_k1_grid_desc,
        const void CONSTANT* p_b_k0_n_k1_grid_desc,
        const void CONSTANT* p_c_m0_m1_m2_n_grid_desc,
        const void CONSTANT* p_cblockid_to_m0_n0_block_cluster_adaptor)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    constexpr auto in_n_hi_wi_c_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));
    constexpr auto wei_k_y_x_c_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 3, 3, 256));
    constexpr auto out_n_ho_wo_k_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 28, 28, 256));

    constexpr auto descs =
        transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk(in_n_hi_wi_c_desc,
                                                                      wei_k_y_x_c_desc,
                                                                      out_n_ho_wo_k_desc,
                                                                      make_tuple(1, 1), // strides
                                                                      make_tuple(1, 1), // dilations
                                                                      make_tuple(1, 1), // left pads
                                                                      make_tuple(1, 1), // right pads
                                                                      Number<K1>{});

    constexpr auto a_k0_m_k1_grid_desc_tmp = descs[I0];
    constexpr auto b_k0_n_k1_grid_desc_tmp = descs[I1];
    constexpr auto c_m_n_grid_desc         = descs[I2];

    using AK0MK1GridDesc = decltype(a_k0_m_k1_grid_desc_tmp);
    using BK0NK1GridDesc = decltype(b_k0_n_k1_grid_desc_tmp);
    using CMNGridDesc    = decltype(c_m_n_grid_desc);

    // Step-hack flag tables for GridwiseGemm; values preserved verbatim from
    // the original wrapper (B = weight: trivial table; A = im2col'd input).
    using BGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}),
        make_tuple(
            Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{}, Sequence<0, 0, 0, 0, 0>{})));

    using AGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0>{})));

    using CGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 1, 0, 0>{}),
        make_tuple(Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 0, 0, 0>{},
                   Sequence<0, 0, 2, 0, 0>{})));

    using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0>;
    using BGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0>;

    using GridwiseGemm =
        GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3<BlockSize,
                                                FloatAB,
                                                FloatAcc,
                                                FloatC,
                                                InMemoryDataOperationEnum_t::Set,
                                                AK0MK1GridDesc,
                                                BK0NK1GridDesc,
                                                CMNGridDesc,
                                                MPerBlock,
                                                NPerBlock,
                                                KPerBlock,
                                                MPerWave,
                                                NPerWave,
                                                K1,
                                                MRepeat,
                                                NRepeat,
                                                ABlockTransferThreadSliceLengths_K0_M_K1,
                                                ABlockTransferThreadClusterLengths_K0_M_K1,
                                                ABlockTransferThreadClusterArrangeOrder,
                                                ABlockTransferSrcAccessOrder,
                                                ABlockTransferSrcVectorDim,
                                                ABlockTransferSrcScalarPerVector,
                                                ABlockTransferDstScalarPerVector_K1,
                                                AThreadTransferSrcResetCoordinateAfterRun,
                                                BBlockTransferThreadSliceLengths_K0_N_K1,
                                                BBlockTransferThreadClusterLengths_K0_N_K1,
                                                BBlockTransferThreadClusterArrangeOrder,
                                                BBlockTransferSrcAccessOrder,
                                                BBlockTransferSrcVectorDim,
                                                BBlockTransferSrcScalarPerVector,
                                                BBlockTransferDstScalarPerVector_K1,
                                                BThreadTransferSrcResetCoordinateAfterRun,
                                                CThreadTransferSrcDstAccessOrder,
                                                CThreadTransferSrcDstVectorDim,
                                                CThreadTransferDstScalarPerVector,
                                                AGridStepHacks,
                                                BGridStepHacks,
                                                CGridStepHacks,
                                                AGridMoveSliceWindowStepHacks,
                                                BGridMoveSliceWindowStepHacks,
                                                false>;

    constexpr auto c_m0_m1_m2_n_grid_desc_tmp =
        GridwiseGemm::MakeCM0M1M2NGridDescriptor(c_m_n_grid_desc);
    constexpr auto cblockid_to_m0_n0_block_cluster_adaptor_tmp =
        GridwiseGemm::MakeCBlockClusterAdaptor(c_m_n_grid_desc);

    using CM0M1M2NGridDesc = decltype(c_m0_m1_m2_n_grid_desc_tmp);
    using CBlockIdToM0N0BlockClusterAdaptor =
        decltype(cblockid_to_m0_n0_block_cluster_adaptor_tmp);

    // Recover typed descriptors from the opaque CONSTANT buffers (double cast
    // strips the CONSTANT qualifier).
    const auto a_k0_m_k1_grid_desc =
        *reinterpret_cast<const AK0MK1GridDesc*>((const void*)p_a_k0_m_k1_grid_desc);
    const auto b_k0_n_k1_grid_desc =
        *reinterpret_cast<const BK0NK1GridDesc*>((const void*)p_b_k0_n_k1_grid_desc);
    const auto c_m0_m1_m2_n_grid_desc =
        *reinterpret_cast<const CM0M1M2NGridDesc*>((const void*)p_c_m0_m1_m2_n_grid_desc);
    const auto cblockid_to_m0_n0_block_cluster_adaptor =
        *reinterpret_cast<const CBlockIdToM0N0BlockClusterAdaptor*>(
            (const void*)p_cblockid_to_m0_n0_block_cluster_adaptor);

    // LDS workspace sized by the gridwise GEMM, expressed in FloatAB elements.
    constexpr index_t shared_block_size =
        GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

    __shared__ FloatAB p_shared_block[shared_block_size];

    GridwiseGemm::Run(p_a_grid,
                      p_b_grid,
                      p_c_grid,
                      p_shared_block,
                      a_k0_m_k1_grid_desc,
                      b_k0_n_k1_grid_desc,
                      c_m0_m1_m2_n_grid_desc,
                      cblockid_to_m0_n0_block_cluster_adaptor);
};
composable_kernel/src/kernel_wrapper/convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw.cpp
deleted
100644 → 0
View file @
3cc57101
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_contraction_dlops_v1r2.hpp"
#include "transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp"
using namespace ck;

// Tunables injected by the build system via -DCK_PARAM_* macros; the fixed
// Sequence<...> orders below are part of the v6r1 dlops contraction layout.
constexpr DataTypeEnum_t ABDataTypeEnum  = static_cast<DataTypeEnum_t>(CK_PARAM_ABDataTypeEnum);
constexpr DataTypeEnum_t AccDataTypeEnum = static_cast<DataTypeEnum_t>(CK_PARAM_AccDataTypeEnum);
constexpr DataTypeEnum_t CDataTypeEnum   = static_cast<DataTypeEnum_t>(CK_PARAM_CDataTypeEnum);

using FloatAB  = typename get_datatype_from_enum<ABDataTypeEnum>::type;
using FloatAcc = typename get_datatype_from_enum<AccDataTypeEnum>::type;
using FloatC   = typename get_datatype_from_enum<CDataTypeEnum>::type;

constexpr index_t BlockSize = CK_PARAM_BlockSize;

constexpr auto GN0 = Number<CK_PARAM_GN0>{};
constexpr auto GK1 = Number<CK_PARAM_GK1>{};

constexpr index_t GM1PerBlockGM11 = CK_PARAM_GM1PerBlockGM11;
constexpr index_t GN1PerBlockGN11 = CK_PARAM_GN1PerBlockGN11;
constexpr index_t GK0PerBlock     = CK_PARAM_GK0PerBlock;

constexpr index_t BM1PerThreadBM11 = CK_PARAM_BM1PerThreadBM11;
constexpr index_t BN1PerThreadBN11 = CK_PARAM_BN1PerThreadBN11;
constexpr index_t BK0PerThread     = CK_PARAM_BK0PerThread;

using BM10BN10ThreadClusterBM10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBM10Xs>;
using BM10BN10ThreadClusterBN10Xs = Sequence<CK_PARAM_BM10BN10ThreadClusterBN10Xs>;

// A-tensor (GK0, GM0, GM10, GM11, GK1) blockwise-copy configuration.
using ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1 =
    Sequence<CK_PARAM_ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1 =
    Sequence<CK_PARAM_ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferThreadClusterArrangeOrder = Sequence<1, 2, 3, 0, 4>;
using ABlockTransferSrcAccessOrder            = Sequence<3, 2, 1, 0, 4>;
using ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
    Sequence<CK_PARAM_ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1 =
    Sequence<CK_PARAM_ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1>;
using ABlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;

// B-tensor (GK0, GN0, GN10, GN11, GK1) blockwise-copy configuration.
using BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1 =
    Sequence<CK_PARAM_BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1 =
    Sequence<CK_PARAM_BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferThreadClusterArrangeOrder = Sequence<0, 4, 1, 2, 3>;
using BBlockTransferSrcAccessOrder            = Sequence<4, 3, 2, 0, 1>;
using BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
    Sequence<CK_PARAM_BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1 =
    Sequence<CK_PARAM_BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1>;
using BBlockTransferSrcVectorTensorContiguousDimOrder = Sequence<0, 1, 2, 3, 4>;

// C-tensor threadwise-copy configuration.
using CThreadTransferSrcDstAccessOrder = Sequence<3, 4, 5, 0, 1, 2>;

constexpr index_t CThreadTransferSrcDstVectorDim    = 5;
constexpr index_t CThreadTransferDstScalarPerVector = CK_PARAM_CThreadTransferDstScalarPerVector;

constexpr bool HasMainKBlockLoop = static_cast<bool>(CK_PARAM_HasMainKBlockLoop);
constexpr bool HasDoubleTailKBlockLoop = static_cast<bool>(CK_PARAM_HasDoubleTailKBlockLoop);
// "Prepare" kernel for the v6r1 dlops contraction: builds all grid descriptors
// for a runtime-sized NCHW/KCYX/NKHW forward convolution and stores them as a
// single tuple into the caller-provided buffer for the compute kernel to read.
extern "C" __global__ void convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw_prepare(
    int N_,
    int C_,
    int Hi_,
    int Wi_,
    int K_,
    int Y_,
    int X_,
    int ConvStrideH_,
    int ConvStrideW_,
    int ConvDilationH_,
    int ConvDilationW_,
    int InLeftPadH_,
    int InLeftPadW_,
    int InRightPadH_,
    int InRightPadW_,
    void* p_desc_tuple)
{
    // Widen the host-supplied ints to the library index type.
    index_t N             = static_cast<index_t>(N_);
    index_t C             = static_cast<index_t>(C_);
    index_t Hi            = static_cast<index_t>(Hi_);
    index_t Wi            = static_cast<index_t>(Wi_);
    index_t K             = static_cast<index_t>(K_);
    index_t Y             = static_cast<index_t>(Y_);
    index_t X             = static_cast<index_t>(X_);
    index_t ConvStrideH   = static_cast<index_t>(ConvStrideH_);
    index_t ConvStrideW   = static_cast<index_t>(ConvStrideW_);
    index_t ConvDilationH = static_cast<index_t>(ConvDilationH_);
    index_t ConvDilationW = static_cast<index_t>(ConvDilationW_);
    index_t InLeftPadH    = static_cast<index_t>(InLeftPadH_);
    index_t InLeftPadW    = static_cast<index_t>(InLeftPadW_);
    index_t InRightPadH   = static_cast<index_t>(InRightPadH_);
    index_t InRightPadW   = static_cast<index_t>(InRightPadW_);

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};

    // Standard convolution output-extent formula (floor division).
    const index_t Ho =
        (Hi + InLeftPadH + InRightPadH - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1;
    const index_t Wo =
        (Wi + InLeftPadW + InRightPadW - ConvDilationW * (X - 1) - 1) / ConvStrideW + 1;

    const auto in_n_c_hi_wi_desc  = make_naive_tensor_descriptor_packed(make_tuple(N, C, Hi, Wi));
    const auto wei_k_c_y_x_desc   = make_naive_tensor_descriptor_packed(make_tuple(K, C, Y, X));
    const auto out_n_k_ho_wo_desc = make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo));

    const auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        make_tuple(ConvStrideH, ConvStrideW),
        make_tuple(ConvDilationH, ConvDilationW),
        make_tuple(InLeftPadH, InLeftPadW),
        make_tuple(InRightPadH, InRightPadW),
        GN0,
        GK1);

    const auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
    const auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
    const auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];

    using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
    using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
    using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);

    // Step-hack flag tables for the gridwise contraction; 0/1/2 values are
    // tied to the descriptor-transform layout and preserved verbatim.
    using AGridStepHacks = decltype(make_tuple(
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 0+: GK0
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 1+: GM0
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 2+: GM10
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 3+: GM11
                   Sequence<0, 0, 0, 0, 0, 0, 0>{}),  // 4+: GK1
        make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 0-: GK0
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 1-: GM0
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 2-: GM10
                   Sequence<0, 0, 0, 0, 0, 0, 0>{},   // 3-: GM11
                   Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1

    using BGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: GK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 1+: GN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 2+: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 3+: GN11
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 4+: GK1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: GK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 1-: GN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 2-: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 3-: GN11
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1

    using CGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0+: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 1+: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{}, // 2+: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3+: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}, // 4+: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 0-: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 1-: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{}, // 2-: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}, // 3-: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}, // 4-: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{}))); // 5-: GN1

    using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;
    using BGridMoveSliceWindowStepHacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;

    using GridwiseContraction =
        GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            AGridDesc_GK0_GM0_GM1_GK1,
            BGridDesc_GK0_GN0_GN1_GK1,
            CGridDesc_GM0_GM1_GN0_GN1,
            GM1PerBlockGM11,
            GN1PerBlockGN11,
            GK0PerBlock,
            BM1PerThreadBM11,
            BN1PerThreadBN11,
            BK0PerThread,
            BM10BN10ThreadClusterBM10Xs,
            BM10BN10ThreadClusterBN10Xs,
            ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterArrangeOrder,
            ABlockTransferSrcAccessOrder,
            ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferSrcVectorTensorContiguousDimOrder,
            BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterArrangeOrder,
            BBlockTransferSrcAccessOrder,
            BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferSrcVectorTensorContiguousDimOrder,
            CThreadTransferSrcDstAccessOrder,
            CThreadTransferSrcDstVectorDim,
            CThreadTransferDstScalarPerVector,
            AGridStepHacks,
            BGridStepHacks,
            CGridStepHacks,
            AGridMoveSliceWindowStepHacks,
            BGridMoveSliceWindowStepHacks>;

    // Only the first thread of the first block persists the descriptor tuple.
    if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
    {
        auto desc_tuple =
            make_tuple(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
                           a_grid_desc_gk0_gm0_gm1_gk1),
                       GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
                           b_grid_desc_gk0_gn0_gn1_gk1),
                       GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
                           c_grid_desc_gm0_gm1_gn0_gn1),
                       GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
                           c_grid_desc_gm0_gm1_gn0_gn1));

        *static_cast<decltype(desc_tuple)*>(p_desc_tuple) = desc_tuple;
    }
};
// GPU entry point: forward convolution (NCHW activations, KCYX weights, NKHW
// output) lowered onto the v6r1 DLOPS gridwise tensor-contraction kernel.
//
// p_a_grid / p_b_grid are the two contraction operands, p_c_grid the output.
// p_desc_tuple points at a tuple of four runtime tensor descriptors stored in
// CONSTANT device memory (written by the companion "prepare" functor, which
// stores a make_tuple of the same four Make*() results); it is read back by
// value below so every thread works from the transformed descriptors.
extern "C" __global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
    convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
        const FloatAB* __restrict__ p_a_grid,
        const FloatAB* __restrict__ p_b_grid,
        FloatC* __restrict__ p_c_grid,
        const void CONSTANT* p_desc_tuple)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // Compile-time *reference* problem shape (N=256, C=256, Hi=Wi=28; K=256,
    // 3x3 filter). These descriptors only exist so that decltype below can
    // derive the descriptor TYPES; the actual runtime descriptors are loaded
    // from p_desc_tuple further down.
    constexpr auto in_n_c_hi_wi_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    constexpr auto wei_k_c_y_x_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 3, 3));

    constexpr auto out_n_k_ho_wo_desc =
        make_naive_tensor_descriptor_packed(make_tuple(256, 256, 28, 28));

    // Turn the convolution into a GEMM-like contraction: A/B/C descriptors.
    // The four (1, 1) tuples are per-spatial-dim parameters (presumably conv
    // strides, dilations, left pads, right pads in the transform's order --
    // TODO(review): confirm against the transform's signature).
    constexpr auto descs = transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad(
        wei_k_c_y_x_desc,
        in_n_c_hi_wi_desc,
        out_n_k_ho_wo_desc,
        make_tuple(1, 1),
        make_tuple(1, 1),
        make_tuple(1, 1),
        make_tuple(1, 1),
        GN0,
        GK1);

    constexpr auto a_grid_desc_gk0_gm0_gm1_gk1 = descs[I0];
    constexpr auto b_grid_desc_gk0_gn0_gn1_gk1 = descs[I1];
    constexpr auto c_grid_desc_gm0_gm1_gn0_gn1 = descs[I2];

    using AGridDesc_GK0_GM0_GM1_GK1 = decltype(a_grid_desc_gk0_gm0_gm1_gk1);
    using BGridDesc_GK0_GN0_GN1_GK1 = decltype(b_grid_desc_gk0_gn0_gn1_gk1);
    using CGridDesc_GM0_GM1_GN0_GN1 = decltype(c_grid_desc_gm0_gm1_gn0_gn1);

    // "Step hacks": per-transform compile-time hints consumed by the tensor
    // coordinate-update machinery (one Sequence per visible dimension; the
    // first make_tuple is for forward '+' steps, the second for backward '-'
    // steps). All-zero sequences mean "no hack" for that dimension.
    using AGridStepHacks =
        decltype(make_tuple(make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 0+: GK0
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 1+: GM0
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 2+: GM10
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 3+: GM11
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{}),   // 4+: GK1
                            make_tuple(Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 0-: GK0
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 1-: GM0
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 2-: GM10
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{},    // 3-: GM11
                                       Sequence<0, 0, 0, 0, 0, 0, 0>{}))); // 4-: GK1

    // B-side hacks carry non-zero hints (1 = forward, 2 = backward --
    // TODO(review): confirm encoding against the coordinate-step helpers).
    using BGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0>{},   // 0+: GK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 1+: GN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 2+: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0>{},   // 3+: GN11
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{}),  // 4+: GK1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0>{},   // 0-: GK0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 1-: GN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 2-: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0>{},   // 3-: GN11
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{})));// 4-: GK1

    using CGridStepHacks = decltype(make_tuple(
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 0+: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},  // 1+: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0>{},  // 2+: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 3+: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{},  // 4+: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0>{}), // 5+: GN1
        make_tuple(
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 0-: GM10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},  // 1-: BM0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0>{},  // 2-: BM1
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>{},  // 3-: GN10
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{},  // 4-: BN0
            Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0>{})));// 5-: GN1

    // Hacks applied when the block's slice window is moved along K.
    using AGridMoveSliceWindowStepHacks = Sequence<0, 0, 0, 0, 0, 0, 0>;

    using BGridMoveSliceWindowStepHacks =
        Sequence<0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0>;

    // The gridwise contraction kernel, fully configured by the tunable
    // template parameters supplied by the surrounding translation unit.
    using GridwiseContraction =
        GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1<
            BlockSize,
            FloatAB,
            FloatAcc,
            FloatC,
            InMemoryDataOperationEnum_t::Set,
            AGridDesc_GK0_GM0_GM1_GK1,
            BGridDesc_GK0_GN0_GN1_GK1,
            CGridDesc_GM0_GM1_GN0_GN1,
            GM1PerBlockGM11,
            GN1PerBlockGN11,
            GK0PerBlock,
            BM1PerThreadBM11,
            BN1PerThreadBN11,
            BK0PerThread,
            BM10BN10ThreadClusterBM10Xs,
            BM10BN10ThreadClusterBN10Xs,
            ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferThreadClusterArrangeOrder,
            ABlockTransferSrcAccessOrder,
            ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1,
            ABlockTransferSrcVectorTensorContiguousDimOrder,
            BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferThreadClusterArrangeOrder,
            BBlockTransferSrcAccessOrder,
            BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1,
            BBlockTransferSrcVectorTensorContiguousDimOrder,
            CThreadTransferSrcDstAccessOrder,
            CThreadTransferSrcDstVectorDim,
            CThreadTransferDstScalarPerVector,
            AGridStepHacks,
            BGridStepHacks,
            CGridStepHacks,
            AGridMoveSliceWindowStepHacks,
            BGridMoveSliceWindowStepHacks>;

    // Types of the four block-level descriptors the kernel actually consumes;
    // these must match what the prepare functor stored into p_desc_tuple.
    using AGridDesc_GK0_GM0_GM10_GM11_GK1 =
        decltype(GridwiseContraction::MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(
            a_grid_desc_gk0_gm0_gm1_gk1));

    using BGridDesc_GK0_GN0_GN10_GN11_GK1 =
        decltype(GridwiseContraction::MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(
            b_grid_desc_gk0_gn0_gn1_gk1));

    using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 =
        decltype(GridwiseContraction::MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(
            c_grid_desc_gm0_gm1_gn0_gn1));

    using CGridBlockCluster_BlockId_To_GM10_GN10 =
        decltype(GridwiseContraction::MakeCGridBlockCluster_BlockId_To_GM10_GN10(
            c_grid_desc_gm0_gm1_gn0_gn1));

    using DescTuple = decltype(make_tuple(AGridDesc_GK0_GM0_GM10_GM11_GK1{},
                                          BGridDesc_GK0_GN0_GN10_GN11_GK1{},
                                          CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1{},
                                          CGridBlockCluster_BlockId_To_GM10_GN10{}));

    // Copy the runtime descriptor tuple out of CONSTANT memory (converted to a
    // generic address-space pointer first so it can be dereferenced normally).
    const auto desc_tuple =
        *reinterpret_cast<const DescTuple*>(cast_pointer_to_generic_address_space(p_desc_tuple));

    const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = desc_tuple[I0];
    const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = desc_tuple[I1];
    const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = desc_tuple[I2];
    const auto c_grid_block_cluster_blockid_to_gm10_gn10 = desc_tuple[I3];

    // LDS scratch sized in FloatAB elements (the gridwise kernel reports bytes).
    constexpr index_t shared_block_size =
        GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB);

    __shared__ FloatAB p_shared_block[shared_block_size];

    GridwiseContraction::Run(p_a_grid,
                             p_b_grid,
                             p_c_grid,
                             p_shared_block,
                             a_grid_desc_gk0_gm0_gm10_gm11_gk1,
                             b_grid_desc_gk0_gn0_gn10_gn11_gk1,
                             c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1,
                             c_grid_block_cluster_blockid_to_gm10_gn10,
                             integral_constant<bool, HasMainKBlockLoop>{},
                             integral_constant<bool, HasDoubleTailKBlockLoop>{});
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_all_dims.cpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;

// Concrete data types for this kernel instantiation, selected at compile time
// from integer enum values injected by the build (CK_PARAM_* macros).
using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
// Accumulation/comparison type used inside the reduction.
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

// Rank of the source tensor (this wrapper reduces ALL of its dimensions).
constexpr index_t srcDims = CK_PARAM_IN_DIMS;

// Reduction operation (add/max/min/...) as a compile-time enum value.
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

// Whether the flattened 2-D source / 1-D destination views need padding so
// their lengths divide the per-block copy slice evenly.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

// Indices are only produced when the op supports them (e.g. max/min) AND the
// caller asked for them.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments

// Expand the compile-time index pack Ns into a tuple whose elements are the
// corresponding entries of the runtime `lengths` array, cast to index_t.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
// Build a tuple of index_t lengths from the first `arraySize` entries of a
// runtime int array. Tensors of rank 1..6 are supported.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6,
                  "The tensor should have 1 to 6 dimensions");

    // Generate the index sequence 0, 1, ..., arraySize-1 and delegate.
    using index_seq_t = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, index_seq_t{});
};
// Convert a compile-time Sequence<Ns...> into a tuple holding the same values.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
// Prepare kernel (all-dims reduction): builds the flattened 2-D source
// descriptor [1, prod(srcLengths)] and the 1-D destination descriptor, and
// stores them into the workspace buffer for the main reduction kernel to read.
// Workspace layout: src2dDesc at byte offset 0, dst1dDesc at byte offset 2048.
// Only thread 0 writes; GridSize/BlkGroupSize are unused here.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    // Up to 6 dimensions are passed as scalars; only the first srcDims are used.
    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});

    // All dims are reduced, so the destination collapses to a single element.
    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc      = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    // Merge all source dims into one.
    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    // View as [invariant=1, toReduce=total] so the generic 2-D reducer applies.
    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    // One block consumes BlockSize * GredAccessesPerThreadInBlock elements per pass.
    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        // Pad the reduced dimension up to a multiple of copySliceLen.
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
// Computes, at compile time, *reference* descriptor types whose structure
// matches what gridwise_generic_reduce_1_prepare stores in the workspace.
// Only the types matter; lengths/strides are dummies (uniform 8s), so the
// runtime bytes written by the prepare kernel can be reinterpreted as these
// types by the main kernel.
template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));

    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    // Mirror of the merge-all-dims transform done in the prepare kernel.
    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    // Mirror of the [1, total] unmerge.
    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(
            make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};
// File-scope shorthands for the reference descriptor types at this kernel's
// configured source rank (padded and unpadded variants).
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
// Reload the 2-D source descriptor that the prepare kernel stored in the
// workspace. The stored object's concrete type depends on whether padding was
// applied, so the template flag selects which reference type to reinterpret
// the bytes as.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(!need_padding)
        return *reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
    else
        return *reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc);
};
// Reload the 1-D destination descriptor stored by the prepare kernel; the
// template flag picks the padded or unpadded reference type for the
// reinterpretation.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(!need_padding)
        return *reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
    else
        return *reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
};
// Main reduction kernel (all-dims, blockwise method). Reads the descriptors
// the prepare kernel stored in ws_global (src2dDesc at offset 0, dst1dDesc at
// byte offset 2048), then dispatches to the generic blockwise 2-D reducer.
// alpha/beta scale the result and the pre-existing destination respectively
// (semantics owned by the gridwise implementation). indices_global receives
// flattened indices when the op and options require them.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Not used by the single-block-per-output blockwise method.
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    // Reinterpret the workspace bytes as the (possibly padded) descriptor types.
    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    // Run<2> produces indices alongside values, Run<1> values only.
    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_blockwise_reduce_partial_dims.cpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_blockwise.hpp"
using namespace ck;

// Concrete data types for this kernel instantiation, selected at compile time
// from integer enum values injected by the build (CK_PARAM_* macros).
using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
// Accumulation/comparison type used inside the reduction.
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

// Tensor ranks: source, destination, and how many trailing source dims are
// reduced. The invariant (kept) dims are the leading ones.
constexpr index_t srcDims = CK_PARAM_IN_DIMS;
constexpr index_t dstDims = CK_PARAM_OUT_DIMS;

constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;

// Dimension index sets: [0, num_invariantDims) are kept,
// [num_invariantDims, srcDims) are reduced.
using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;

// Reduction operation (add/max/min/...) as a compile-time enum value.
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

// Whether the flattened 2-D source / 1-D destination views need padding so
// their lengths divide the per-block copy slice evenly.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

// This wrapper is the partial-dims variant; the all-dims case is handled by a
// separate kernel wrapper. (Fixed typo in the diagnostic message.)
static_assert(num_invariantDims > 0, "Not all dimensions are reduced for this kernel !!");

// Indices are only produced when the op supports them (e.g. max/min) AND the
// caller asked for them.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments

// Expand the compile-time index pack Ns into a tuple whose elements are the
// corresponding entries of the runtime `lengths` array, cast to index_t.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};
// Build a tuple of index_t lengths from the first `arraySize` entries of a
// runtime int array. Tensors of rank 1..6 are supported.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6,
                  "The tensor should have 1 to 6 dimensions");

    // Generate the index sequence 0, 1, ..., arraySize-1 and delegate.
    using index_seq_t = typename arithmetic_sequence_gen<0, arraySize, 1>::type;

    return make_tuple_from_array_and_index_seq(lengths, index_seq_t{});
};
// Convert a compile-time Sequence<Ns...> into a tuple holding the same values.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
// Prepare kernel (partial-dims reduction): merges the invariant dims into one
// axis and the to-reduce dims into another to form a 2-D source view, merges
// the destination dims into a 1-D view, and stores both descriptors into the
// workspace for the main reduction kernel. Workspace layout: src2dDesc at
// byte offset 0, dst1dDesc at byte offset 2048. Only thread 0 writes.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;
    (void)BlkGroupSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    // Up to 6 dimensions are passed as scalars; only the leading srcDims /
    // dstDims entries are consumed.
    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {
        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});

    // The destination keeps the invariant dims, which are the FIRST dstDims
    // entries of srcLengths (invariantDims = [0, num_invariantDims)), so the
    // source length array is deliberately reused here.
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths =
        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    // 2-D view: axis 0 = merged invariant dims, axis 1 = merged reduce dims.
    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths),
                   make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    // 1-D view of the destination.
    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    // One block consumes BlockSize * GredAccessesPerThreadInBlock elements per pass.
    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;

    if constexpr(src2d_need_padding)
    {
        // Pad the reduced dimension up to a multiple of copySliceLen.
        const auto srcPad =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 = transform_tensor_descriptor(
            src2dDesc,
            make_tuple(make_pass_through_transform(invariantLen),
                       make_pad_transform(toReduceLen, 0, srcPad)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
// Computes, at compile time, *reference* descriptor types whose structure
// matches what gridwise_generic_reduce_1_prepare stores in the workspace.
// Only the types matter; lengths/strides are dummies (uniform 8s), so the
// runtime bytes written by the prepare kernel can be reinterpreted as these
// types by the main kernel.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};

    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    // Mirror of the invariant/reduce merge done in the prepare kernel.
    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    // Mirror of the destination merge-to-1-D.
    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 = decltype(transform_tensor_descriptor(
        ref_src2dDesc,
        make_tuple(make_pass_through_transform(ref_invariantLen),
                   make_pad_transform(ref_toReduceLen, 0, 2)),
        make_tuple(Sequence<0>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded = decltype(transform_tensor_descriptor(
        ref_dst1dDesc,
        make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// File-scope shorthands for the reference descriptor types at this kernel's
// configured ranks and dim partitioning (padded and unpadded variants).
using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
        refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
        refType_dst1dDesc_padded;
// Reload the 2-D source descriptor that the prepare kernel stored in the
// workspace. The stored object's concrete type depends on whether padding was
// applied, so the template flag selects which reference type to reinterpret
// the bytes as.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_src2dDesc)
{
    if constexpr(!need_padding)
        return *reinterpret_cast<const refType_src2dDesc*>(p_src2dDesc);
    else
        return *reinterpret_cast<const refType_src2dDesc_padded_34*>(p_src2dDesc);
};
// Reload the 1-D destination descriptor stored by the prepare kernel; the
// template flag picks the padded or unpadded reference type for the
// reinterpretation.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_dst1dDesc)
{
    if constexpr(!need_padding)
        return *reinterpret_cast<const refType_dst1dDesc*>(p_dst1dDesc);
    else
        return *reinterpret_cast<const refType_dst1dDesc_padded*>(p_dst1dDesc);
};
// Main reduction kernel (partial-dims, blockwise method). Reads the
// descriptors the prepare kernel stored in ws_global (src2dDesc at offset 0,
// dst1dDesc at byte offset 2048), then dispatches to the generic blockwise
// 2-D reducer. alpha/beta scale the result and the pre-existing destination
// respectively (semantics owned by the gridwise implementation).
// indices_global receives flattened indices when the op and options require.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Not used by the single-block-per-output blockwise method.
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    // Reinterpret the workspace bytes as the (possibly padded) descriptor types.
    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_blockwise<BlockSize,
                                                                   srcDataType,
                                                                   dstDataType,
                                                                   compType,
                                                                   decltype(src2dDesc),
                                                                   decltype(dst1dDesc),
                                                                   op,
                                                                   nanPropaOpt,
                                                                   reduceIndicesOpt,
                                                                   true,
                                                                   true,
                                                                   GredAccessesPerThreadInBlock>;

    // Run<2> produces indices alongside values, Run<1> values only.
    constexpr int RunId = need_indices ? 2 : 1;

    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_all_dims.cpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;

// Concrete data types for this kernel instantiation, selected at compile time
// from integer enum values injected by the build (CK_PARAM_* macros).
using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
// Accumulation/comparison type used inside the reduction.
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable

// Rank of the source tensor (this wrapper reduces ALL of its dimensions,
// using the multiblock method per the included header).
constexpr index_t srcDims = CK_PARAM_IN_DIMS;

// Reduction operation (add/max/min/...) as a compile-time enum value.
constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

// Whether the flattened 2-D source / 1-D destination views need padding so
// their lengths divide the per-block copy slice evenly.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

// Indices are only produced when the op supports them (e.g. max/min) AND the
// caller asked for them.
constexpr bool indexable = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
// Lift the array entries selected by the compile-time index Sequence into a
// CK Tuple of index_t values.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* values, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(values[Ns])...);
};
// Convert the first arraySize entries of a runtime int array into a Tuple of
// index_t. Only 1..6 dimensional tensors are supported by these wrappers.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* values, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    return make_tuple_from_array_and_index_seq(
        values, typename arithmetic_sequence_gen<0, arraySize, 1>::type{});
};
// Turn a compile-time Sequence into an equivalent Tuple of its values.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
// Device-side "prepare" step for the all-dims MultiBlock reduction: rebuilds
// the source/destination tensor descriptors from host-passed lengths/strides
// and stores them into the workspace so the main kernel can reload them.
// Workspace layout (byte offsets): 0 = 2-D source descriptor,
// 2048 = 1-D destination descriptor.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    // Only the first srcDims entries are meaningful; the rest are ignored.
    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    // All dimensions are reduced, so the destination is a single element.
    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    // Merge all source dims into one, ...
    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    // ... then unmerge into (1, total) so the generic 2-D reduction kernel sees
    // an invariant dimension of length 1 and a to-reduce dimension.
    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    // Amount of the reduce dimension consumed per block iteration.
    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    // Per-block share of the reduce dimension, rounded up to a multiple of
    // copySliceLen (each of the BlkGroupSize blocks handles one share).
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        // Pad the reduce dimension so every block sees a full multiple of
        // copySliceLen.
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        // A single thread persists the descriptor into the workspace.
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
};
// Rebuilds, at compile time, descriptor *types* with the same shape as the
// ones produced at runtime by gridwise_generic_reduce_1_prepare(), so the main
// kernel can reinterpret the raw workspace bytes back into typed descriptors.
// The lengths/strides used here (all 8) are placeholders — only the resulting
// types matter, not their runtime values.
template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    // Mirrors the merge-then-unmerge pipeline of the prepare kernel.
    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(
            make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pass_through_transform(ref_invariantLen),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dstDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};
// Concrete descriptor types for this kernel's srcDims, used below to
// reinterpret the raw workspace bytes written by the prepare kernel.
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
// Reinterpret the workspace bytes as the 2-D source descriptor; the padded
// type is selected when the prepare kernel stored a padded descriptor.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_src2dDesc_padded_34*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_src2dDesc*>(p_desc_buf);
    }
};
// Reinterpret the workspace bytes as the 1-D destination descriptor, padded or
// not depending on how the prepare kernel stored it.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_dst1dDesc_padded*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_dst1dDesc*>(p_desc_buf);
    }
};
// First call of the MultiBlock all-dims reduction. Reloads the descriptors
// written by *_prepare from the workspace (offsets 0 and 2048) and writes
// per-block partial results into the workspace (offset 4096 onward); a later
// kernel finishes the reduction, so p_dst_global/indices_global are unused here.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    // CONSTANT is a project address-space macro; convert to a generic pointer
    // before doing byte arithmetic.
    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    // Workspace buffer 1 holds the per-block partial accumulator values.
    void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    // Workspace buffer 2 (optional) holds per-block partial indices; a
    // non-positive offset means no index buffer was allocated.
    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    // RunId 2 = value+index reduction, RunId 1 = value-only.
    constexpr int RunId = need_indices ? 2 : 1;

    // Partial results are stored in srcDataType; conversion to dstDataType
    // happens in the second (final) reduction call.
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_multiblock_reduce_partial_dims.cpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_multiblock.hpp"
using namespace ck;

// Compile-time kernel configuration (CK_PARAM_* macros injected by the host).
using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
// Accumulation type used inside the reduction.
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims   = CK_PARAM_IN_DIMS;
constexpr index_t dstDims   = CK_PARAM_OUT_DIMS;

// This wrapper assumes the dims to reduce are the trailing ones: invariant
// dims are [0, num_invariantDims), to-reduce dims are [num_invariantDims, srcDims).
constexpr index_t num_toReduceDims  = CK_PARAM_NUM_TOREDUCE_DIMS;
constexpr index_t num_invariantDims = srcDims - num_toReduceDims;

using invariantDims = typename arithmetic_sequence_gen<0, num_invariantDims, 1>::type;
using toReduceDims  = typename arithmetic_sequence_gen<num_invariantDims, srcDims, 1>::type;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

// The all-dims case is handled by a separate wrapper.
static_assert(num_invariantDims > 0, "Not all dimensins are reduced for this kernel !!");

constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredAccessesPerThreadInBlock = CK_PARAM_ACCESSES_PER_THREAD_INBLOCK; // tunable
// helper functions using variadic template arguments
// Lift the array entries selected by the compile-time index Sequence into a
// CK Tuple of index_t values.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* values, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(values[Ns])...);
};
// Convert the first arraySize entries of a runtime int array into a Tuple of
// index_t. Only 1..6 dimensional tensors are supported by these wrappers.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* values, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    return make_tuple_from_array_and_index_seq(
        values, typename arithmetic_sequence_gen<0, arraySize, 1>::type{});
};
// Turn a compile-time Sequence into an equivalent Tuple of its values.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
// Device-side "prepare" step for the partial-dims MultiBlock reduction: builds
// the (invariant, to-reduce) 2-D source descriptor and the merged 1-D
// destination descriptor from host-passed lengths/strides and persists them in
// the workspace (byte offset 0 = src descriptor, 2048 = dst descriptor).
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             int outStride0,
                                                             int outStride1,
                                                             int outStride2,
                                                             int outStride3,
                                                             int outStride4,
                                                             int outStride5,
                                                             void* __restrict__ ws_global)
{
    (void)GridSize;

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    // Only the first srcDims / dstDims entries are meaningful.
    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};
    const int dstStrides[6] = {
        outStride0, outStride1, outStride2, outStride3, outStride4, outStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    // No out-lengths are passed: the destination lengths are taken from the
    // leading (invariant) source lengths — assumes invariant dims come first
    // and dstDims == num_invariantDims (TODO confirm against host caller).
    const auto tupleDstLengths = make_tuple_from_array(srcLengths, Number<dstDims>{});
    const auto tupleDstStrides = make_tuple_from_array(dstStrides, Number<dstDims>{});

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    const auto dstDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    const auto toReduceDimLengths  = make_tuple_from_array_and_index_seq(srcLengths, toReduceDims{});
    const auto invariantDimLengths =
        make_tuple_from_array_and_index_seq(srcLengths, invariantDims{});

    // Merge invariant dims -> dim 0 and to-reduce dims -> dim 1.
    auto src2dDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(invariantDimLengths),
                   make_merge_transform(toReduceDimLengths)),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    // Flatten all destination dims into one.
    auto dst1dDesc = transform_tensor_descriptor(
        dstDesc,
        make_tuple(make_merge_transform(tupleDstLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    const auto invariantLen = src2dDesc.GetLength(Number<0>{});
    const auto toReduceLen  = src2dDesc.GetLength(Number<1>{});

    // Amount of the reduce dimension consumed per block iteration.
    constexpr auto copySliceLen = BlockSize * GredAccessesPerThreadInBlock;
    // Per-block share of the reduce dimension, rounded up to a multiple of
    // copySliceLen (each of the BlkGroupSize blocks handles one share).
    const index_t reduceSizePerBlock =
        (((toReduceLen + BlkGroupSize - 1) / BlkGroupSize + copySliceLen - 1) / copySliceLen) *
        copySliceLen;

    if constexpr(src2d_need_padding)
    {
        // Pad the reduce dimension so every block sees a full multiple of
        // copySliceLen.
        const auto srcPad = reduceSizePerBlock * BlkGroupSize - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pass_through_transform(invariantLen),
                                                   make_pad_transform(toReduceLen, 0, srcPad)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        // A single thread persists the descriptor into the workspace.
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    // NOTE(review): unlike the threadwise wrapper, no dst1d padding branch is
    // applied here — presumably dst1d_need_padding is never set for the
    // MultiBlock first call; verify against the host dispatcher.
    if(get_thread_local_1d_id() == 0)
        *static_cast<decltype(dst1dDesc)*>(p_dst1dDesc) = dst1dDesc;
};
// Rebuilds, at compile time, descriptor *types* with the same shape as the
// ones produced at runtime by gridwise_generic_reduce_1_prepare(), so the main
// kernel can reinterpret the raw workspace bytes back into typed descriptors.
// Placeholder lengths/strides (all 8) are used — only the types matter.
template <index_t srcDims, index_t dstDims, typename invariantDims, typename toReduceDims>
struct get_ref_desc_types
{
    static constexpr auto ref_toReduceDimLengths =
        typename uniform_sequence_gen<toReduceDims::Size(), 8>::type{};
    static constexpr auto ref_invariantDimLengths =
        typename uniform_sequence_gen<invariantDims::Size(), 8>::type{};
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};
    static constexpr auto ref_dstLengths = typename uniform_sequence_gen<dstDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_dstLengths), make_tuple_from_seq(ref_dstLengths));

    // Mirrors the invariant/to-reduce merge done in the prepare kernel.
    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_invariantDimLengths)),
                   make_merge_transform(make_tuple_from_seq(ref_toReduceDimLengths))),
        make_tuple(invariantDims{}, toReduceDims{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    static constexpr auto ref_dst1dDesc = transform_tensor_descriptor(
        ref_dstDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_dstLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, dstDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the BlockWise and MultiBlock method
    using refType_src2dDesc_padded_34 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pass_through_transform(ref_invariantLen),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dst1dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dst1dDesc);
};
// Concrete descriptor types for this kernel's dimension configuration, used
// below to reinterpret the raw workspace bytes written by the prepare kernel.
using refType_src2dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_src2dDesc;
using refType_dst1dDesc =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_34 =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
        refType_src2dDesc_padded_34;
using refType_dst1dDesc_padded =
    typename get_ref_desc_types<srcDims, dstDims, invariantDims, toReduceDims>::
        refType_dst1dDesc_padded;
// Reinterpret the workspace bytes as the 2-D source descriptor; the padded
// type is selected when the prepare kernel stored a padded descriptor.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_src2dDesc_padded_34*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_src2dDesc*>(p_desc_buf);
    }
};
// Reinterpret the workspace bytes as the 1-D destination descriptor, padded or
// not depending on how the prepare kernel stored it.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_dst1dDesc_padded*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_dst1dDesc*>(p_desc_buf);
    }
};
// First call of the MultiBlock partial-dims reduction. Reloads the descriptors
// written by *_prepare from the workspace (offsets 0 and 2048) and writes
// per-block partial results into the workspace (offset 4096 onward); a later
// kernel finishes the reduction, so p_dst_global/indices_global are unused here.
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    (void)p_dst_global;
    (void)indices_global;

    // CONSTANT is a project address-space macro; convert to a generic pointer
    // before doing byte arithmetic.
    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;
    // Workspace buffer 1 holds the per-block partial accumulator values.
    void* ws_buf1_global = const_cast<char*>(static_cast<const char*>(p_src2dDesc) + 4096);

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_multiblock<BlockSize,
                                                                    srcDataType,
                                                                    dstDataType,
                                                                    compType,
                                                                    decltype(src2dDesc),
                                                                    decltype(dst1dDesc),
                                                                    op,
                                                                    nanPropaOpt,
                                                                    reduceIndicesOpt,
                                                                    GredAccessesPerThreadInBlock>;

    // Workspace buffer 2 (optional) holds per-block partial indices; a
    // non-positive offset means no index buffer was allocated.
    void* const ws_buf2_global =
        ws_buf2_bytes_offset > 0
            ? static_cast<void*>(static_cast<char*>(ws_buf1_global) + ws_buf2_bytes_offset)
            : nullptr;

    // RunId 2 = value+index reduction, RunId 1 = value-only.
    constexpr int RunId = need_indices ? 2 : 1;

    // Partial results are stored in srcDataType; conversion to dstDataType
    // happens in the second (final) reduction call.
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        BlkGroupSize,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<srcDataType* const __restrict__>(ws_buf1_global),
        static_cast<int* const __restrict__>(ws_buf2_global));
};
composable_kernel/src/kernel_wrapper/gridwise_generic_reduction_first_call_threadwise_reduce_all_dims.cpp
deleted
100644 → 0
View file @
3cc57101
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#include "config.hpp"
#include "number.hpp"
#include "sequence.hpp"
#include "tensor_descriptor_helper.hpp"
#include "data_type_enum_helper.hpp"
#include "reduction_common.hpp"
#include "gridwise_generic_2d_reduction_direct_threadwise.hpp"
using namespace ck;

// Compile-time kernel configuration (CK_PARAM_* macros injected by the host).
using srcDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_SRC_DATATYPE)>::type;
using dstDataType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_DST_DATATYPE)>::type;
// Accumulation type used inside the reduction.
using compType =
    typename get_datatype_from_enum<static_cast<DataTypeEnum_t>(CK_PARAM_REDUCE_COMPTYPE)>::type;

constexpr index_t BlockSize = CK_PARAM_BLOCKSIZE; // tunable
constexpr index_t srcDims   = CK_PARAM_IN_DIMS;

constexpr ReduceTensorOp_t op = static_cast<ReduceTensorOp_t>(CK_PARAM_REDUCE_OP);

constexpr NanPropagation_t nanPropaOpt = CK_PARAM_NAN_PROPAGATE == 0
                                             ? NanPropagation_t::NOT_PROPAGATE_NAN
                                             : NanPropagation_t::PROPAGATE_NAN;

constexpr ReduceTensorIndices_t reduceIndicesOpt = CK_PARAM_REDUCE_INDICES == 0
                                                       ? ReduceTensorIndices_t::NO_INDICES
                                                       : ReduceTensorIndices_t::FLATTENED_INDICES;

// Whether the runtime-built 2-D src / 1-D dst descriptors need pad transforms.
constexpr bool src2d_need_padding = static_cast<bool>(CK_PARAM_SRC2D_PADDING);
constexpr bool dst1d_need_padding = static_cast<bool>(CK_PARAM_DST1D_PADDING);

// Indices are only produced when the operation supports them AND the caller
// asked for them.
constexpr bool indexable    = reduce_binary_operator<compType, op>::indexable;
constexpr bool need_indices = indexable && (reduceIndicesOpt != ReduceTensorIndices_t::NO_INDICES);

constexpr index_t GredThreadBufferLength = CK_PARAM_THREAD_BUFFER_LENGTH; // tunable
// helper functions using variadic template arguments
// Lift the array entries selected by the compile-time index Sequence into a
// CK Tuple of index_t values.
template <index_t... Ns>
__device__ static auto make_tuple_from_array_and_index_seq(const int* values, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(values[Ns])...);
};
// Convert the first arraySize entries of a runtime int array into a Tuple of
// index_t. Only 1..6 dimensional tensors are supported by these wrappers.
template <index_t arraySize>
__device__ static auto make_tuple_from_array(const int* values, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    return make_tuple_from_array_and_index_seq(
        values, typename arithmetic_sequence_gen<0, arraySize, 1>::type{});
};
// Turn a compile-time Sequence into an equivalent Tuple of its values.
template <index_t... Ns>
__device__ static constexpr auto make_tuple_from_seq(Sequence<Ns...>)
{
    return make_tuple(Ns...);
};
// Device-side "prepare" step for the all-dims DirectThreadWise reduction:
// rebuilds the source/destination tensor descriptors from host-passed
// lengths/strides and persists them into the workspace (byte offset 0 = 2-D
// source descriptor, 2048 = 1-D destination descriptor).
//
// Fix: the dst1d padding branch referenced the undeclared identifier
// `dstdDesc` (typo for `dstDesc`). Because this is a non-template function,
// even the discarded `if constexpr` branch is semantically checked, so the
// typo was a hard compile error regardless of dst1d_need_padding.
extern "C" __global__ void gridwise_generic_reduce_1_prepare(int GridSize,
                                                             int BlkGroupSize,
                                                             int inLength0,
                                                             int inLength1,
                                                             int inLength2,
                                                             int inLength3,
                                                             int inLength4,
                                                             int inLength5,
                                                             int inStride0,
                                                             int inStride1,
                                                             int inStride2,
                                                             int inStride3,
                                                             int inStride4,
                                                             int inStride5,
                                                             void* __restrict__ ws_global)
{
    (void)BlkGroupSize; // unused by the threadwise method

    void* p_src2dDesc = ws_global;
    void* p_dst1dDesc = static_cast<char*>(ws_global) + 2048;

    // Only the first srcDims entries are meaningful; the rest are ignored.
    const int srcLengths[6] = {inLength0, inLength1, inLength2, inLength3, inLength4, inLength5};
    const int srcStrides[6] = {inStride0, inStride1, inStride2, inStride3, inStride4, inStride5};

    const auto tupleSrcLengths = make_tuple_from_array(srcLengths, Number<srcDims>{});
    const auto tupleSrcStrides = make_tuple_from_array(srcStrides, Number<srcDims>{});
    // All dimensions are reduced, so the destination is a single element.
    const auto tupleDstLengths = make_tuple(1);
    const auto tupleDstStrides = make_tuple(1);

    const auto srcDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);
    auto dstDesc       = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

    // Merge all source dims into one, ...
    const auto one_dim_srcDesc = transform_tensor_descriptor(
        srcDesc,
        make_tuple(make_merge_transform(tupleSrcLengths)),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    // ... then unmerge into (1, total) so the generic 2-D reduction kernel sees
    // an invariant dimension of length 1 and a to-reduce dimension.
    auto src2dDesc = transform_tensor_descriptor(
        one_dim_srcDesc,
        make_tuple(make_unmerge_transform(make_tuple(1, one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    constexpr int invariantLen = 1;
    const auto toReduceLen     = src2dDesc.GetLength(Number<1>{});

    // Amount of the reduce dimension consumed per thread iteration.
    constexpr auto copySliceLen = GredThreadBufferLength;

    if constexpr(src2d_need_padding)
    {
        // Pad dim 0 up to the total thread count and dim 1 up to a multiple of
        // the per-thread buffer length.
        const auto srcPad1 = GridSize * BlockSize - invariantLen;
        const auto srcPad2 =
            ((toReduceLen + copySliceLen - 1) / copySliceLen) * copySliceLen - toReduceLen;

        auto src2dDesc_2 =
            transform_tensor_descriptor(src2dDesc,
                                        make_tuple(make_pad_transform(invariantLen, 0, srcPad1),
                                                   make_pad_transform(toReduceLen, 0, srcPad2)),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

        // A single thread persists the descriptor into the workspace.
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc_2)*>(p_src2dDesc) = src2dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(src2dDesc)*>(p_src2dDesc) = src2dDesc;
    }

    if constexpr(dst1d_need_padding)
    {
        // Pad the (single-element) destination up to the total thread count.
        const auto dstPad = GridSize * BlockSize - invariantLen;

        auto dst1dDesc_2 =
            transform_tensor_descriptor(dstDesc, // was `dstdDesc`: undeclared-identifier typo
                                        make_tuple(make_pad_transform(invariantLen, 0, dstPad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));

        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dst1dDesc_2)*>(p_dst1dDesc) = dst1dDesc_2;
    }
    else
    {
        if(get_thread_local_1d_id() == 0)
            *static_cast<decltype(dstDesc)*>(p_dst1dDesc) = dstDesc;
    }
};
// Rebuilds, at compile time, descriptor *types* with the same shape as the
// ones produced at runtime by gridwise_generic_reduce_1_prepare(), so the main
// kernel can reinterpret the raw workspace bytes back into typed descriptors.
// Placeholder lengths/strides (all 8) are used — only the types matter.
template <index_t srcDims>
struct get_ref_desc_types
{
    static constexpr auto ref_srcLengths = typename uniform_sequence_gen<srcDims, 8>::type{};

    // don't have to use accurate strides to get an expected reference type
    static constexpr auto ref_srcDesc = make_naive_tensor_descriptor(
        make_tuple_from_seq(ref_srcLengths), make_tuple_from_seq(ref_srcLengths));
    static constexpr auto ref_dstDesc =
        make_naive_tensor_descriptor(make_tuple(1), make_tuple(1));

    // Mirrors the merge-then-unmerge pipeline of the prepare kernel.
    static constexpr auto ref_one_dim_srcDesc = transform_tensor_descriptor(
        ref_srcDesc,
        make_tuple(make_merge_transform(make_tuple_from_seq(ref_srcLengths))),
        make_tuple(typename arithmetic_sequence_gen<0, srcDims, 1>::type{}),
        make_tuple(Sequence<0>{}));

    static constexpr auto ref_src2dDesc = transform_tensor_descriptor(
        ref_one_dim_srcDesc,
        make_tuple(
            make_unmerge_transform(make_tuple(1, ref_one_dim_srcDesc.GetLength(Number<0>{})))),
        make_tuple(Sequence<0>{}),
        make_tuple(Sequence<0, 1>{}));

    static constexpr auto ref_invariantLen = ref_src2dDesc.GetLength(Number<0>{});
    static constexpr auto ref_toReduceLen  = ref_src2dDesc.GetLength(Number<1>{});

    // used by the DirectThreadWise and DirectWarpWise method
    using refType_src2dDesc_padded_12 =
        decltype(transform_tensor_descriptor(ref_src2dDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2),
                                                        make_pad_transform(ref_toReduceLen, 0, 2)),
                                             make_tuple(Sequence<0>{}, Sequence<1>{}),
                                             make_tuple(Sequence<0>{}, Sequence<1>{})));

    using refType_dst1dDesc_padded =
        decltype(transform_tensor_descriptor(ref_dstDesc,
                                             make_tuple(make_pad_transform(ref_invariantLen, 0, 2)),
                                             make_tuple(Sequence<0>{}),
                                             make_tuple(Sequence<0>{})));

    using refType_src2dDesc = decltype(ref_src2dDesc);
    using refType_dst1dDesc = decltype(ref_dstDesc);
};
// Concrete descriptor types for this kernel's srcDims, used below to
// reinterpret the raw workspace bytes written by the prepare kernel.
using refType_src2dDesc = typename get_ref_desc_types<srcDims>::refType_src2dDesc;
using refType_dst1dDesc = typename get_ref_desc_types<srcDims>::refType_dst1dDesc;
using refType_src2dDesc_padded_12 =
    typename get_ref_desc_types<srcDims>::refType_src2dDesc_padded_12;
using refType_dst1dDesc_padded = typename get_ref_desc_types<srcDims>::refType_dst1dDesc_padded;
// Reinterpret the workspace bytes as the 2-D source descriptor; the padded
// type is selected when the prepare kernel stored a padded descriptor.
template <bool need_padding>
static __device__ auto get_reduction_src2d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_src2dDesc_padded_12*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_src2dDesc*>(p_desc_buf);
    }
};
// Reinterpret the workspace bytes as the 1-D destination descriptor, padded or
// not depending on how the prepare kernel stored it.
template <bool need_padding>
static __device__ auto get_reduction_dst1d_descriptor(const void* p_desc_buf)
{
    if constexpr(need_padding)
    {
        return *reinterpret_cast<const refType_dst1dDesc_padded*>(p_desc_buf);
    }
    else
    {
        return *reinterpret_cast<const refType_dst1dDesc*>(p_desc_buf);
    }
};
// Single-pass DirectThreadWise all-dims reduction: each thread reduces its
// slice directly to the final destination, so no workspace result buffers are
// needed — only the descriptors written by *_prepare (offsets 0 and 2048).
extern "C" __global__ void gridwise_generic_reduce_1(int origReduceLen,
                                                     int BlkGroupSize,
                                                     float alpha,
                                                     const void* __restrict__ p_src_global,
                                                     float beta,
                                                     void* __restrict__ p_dst_global,
                                                     const void CONSTANT* ws_global,
                                                     long ws_buf2_bytes_offset,
                                                     void* __restrict__ indices_global)
{
    // Not used by the threadwise method (kept for a uniform kernel signature).
    (void)BlkGroupSize;
    (void)ws_buf2_bytes_offset;

    // CONSTANT is a project address-space macro; convert to a generic pointer
    // before doing byte arithmetic.
    const void* p_src2dDesc = cast_pointer_to_generic_address_space(ws_global);
    const void* p_dst1dDesc = static_cast<const char*>(p_src2dDesc) + 2048;

    const auto src2dDesc = get_reduction_src2d_descriptor<src2d_need_padding>(p_src2dDesc);
    const auto dst1dDesc = get_reduction_dst1d_descriptor<dst1d_need_padding>(p_dst1dDesc);

    using gridwise_2d_reduce = GridwiseReduction_xy_to_x_direct_threadwise<BlockSize,
                                                                           srcDataType,
                                                                           dstDataType,
                                                                           compType,
                                                                           decltype(src2dDesc),
                                                                           decltype(dst1dDesc),
                                                                           op,
                                                                           nanPropaOpt,
                                                                           reduceIndicesOpt,
                                                                           true,
                                                                           true,
                                                                           GredThreadBufferLength>;

    // RunId 2 = value+index reduction, RunId 1 = value-only.
    constexpr int RunId = need_indices ? 2 : 1;

    // No previous-pass index buffer exists for the first call, hence nullptr.
    gridwise_2d_reduce::template Run<RunId>(
        src2dDesc,
        dst1dDesc,
        origReduceLen,
        alpha,
        static_cast<const srcDataType* const __restrict__>(p_src_global),
        beta,
        static_cast<dstDataType* const __restrict__>(p_dst_global),
        static_cast<const int* const __restrict__>(nullptr),
        static_cast<int* const __restrict__>(indices_global));
};
Prev
1
2
3
4
5
…
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment