composable_kernel, commit 07a673c6
Authored Apr 14, 2022 by carlushuang
Merge remote-tracking branch 'origin/develop' into cpu_avx2
Parents: c0f698d5, ac0d8066

Changes: 307 changed files in the full commit; this page shows 20 changed files with 178 additions and 275 deletions (+178 / -275).
Changed files shown on this page:

example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp  (+5 / -4)
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp  (+1 / -1)
example/17_convnd_bwd_data_xdl/README.md  (+3 / -36)
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp  (+27 / -92)
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp  (+1 / -1)
example/CMakeLists.txt  (+2 / -3)
include/ck/config.hpp  (+66 / -93)
include/ck/tensor/static_tensor.hpp  (+4 / -4)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp  (+2 / -2)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp  (+2 / -2)
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp  (+3 / -3)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r1.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r2.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r3.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp  (+49 / -21)
include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp  (+1 / -1)
include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp  (+6 / -6)
example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp
@@ -4,6 +4,8 @@
 #include <cstdlib>
 #include <stdlib.h>
 #include <half.hpp>
+#include "check_err.hpp"
 #include "config.hpp"
 #include "print.hpp"
 #include "device.hpp"

@@ -40,9 +42,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;

-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 // static constexpr auto GemmMNPadding =
-//     ck::tensor_operation::device::GemmSpecialization_t::MNPadding;
+//     ck::tensor_operation::device::GemmSpecialization::MNPadding;

 // clang-format off
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl

@@ -225,8 +227,7 @@ int main(int argc, char* argv[])
                                                   c_element_op);

            ref_invoker.Run(ref_argument);

-           check_error(c_host_tensors[i], c_device_tensors[i]);
+           ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
        }
    }
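The call-site change above swaps the example's old check_error helper for ck::utils::check_err, comparing the device result buffer against the host reference buffer (both passed as the tensors' mData containers). A minimal stand-alone sketch of that comparison pattern, purely illustrative and not a copy of the library's check_err (which has more overloads and tolerance handling):

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for ck::utils::check_err: element-wise comparison of the
// device output against the host reference within an absolute tolerance.
bool check_err_sketch(const std::vector<float>& device_result,
                      const std::vector<float>& host_reference,
                      float atol = 1e-3f)
{
    if(device_result.size() != host_reference.size())
    {
        std::cerr << "size mismatch\n";
        return false;
    }
    for(std::size_t i = 0; i < device_result.size(); ++i)
    {
        if(std::fabs(device_result[i] - host_reference[i]) > atol)
        {
            std::cerr << "mismatch at element " << i << "\n";
            return false;
        }
    }
    return true;
}
```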
example/16_gemm_reduce/gemm_reduce_xdl_fp16.cpp
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
 using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;

 static constexpr auto GemmSpecialization =
-    ck::tensor_operation::device::GemmSpecialization_t::Default;
+    ck::tensor_operation::device::GemmSpecialization::Default;

 // clang-format off
 using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
example/17_convnd_bwd_data_xdl/README.md
-# Instructions for ```convnd_bwd_data_xdl``` Example
+# Instructions for ```example_convnd_bwd_data_xdl```

 ## Docker script
 ```bash
 docker run \
 -it \
 --rm \
 --privileged \
 --group-add sudo \
 -w /root/workspace \
 -v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
 rocm/tensorflow:rocm4.3.1-tf2.6-dev \
 /bin/bash
 ```

 ## Build ```convnd_bwd_data_xdl```
 ```bash
 mkdir build && cd build
 ```

 ```bash
 # Need to specify target ID, example below is gfx908
 cmake \
 -D BUILD_DEV=OFF \
 -D CMAKE_BUILD_TYPE=Release \
 -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
 -D CMAKE_PREFIX_PATH=/opt/rocm \
 ..
 ```

 ```bash
 make -j convnd_bwd_data_xdl
 ```

-## Run ```example_convnd_bwd_data_xdl```
+## Run ```example_example_convnd_bwd_data_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4: num_dim_spatial(1|2|3)
 #arg5 to ...: N, K, C, [Z,] [Y,] X, [Di,] [Hi,] Wi, S[z,] [Sy,] Sx, [Dz,] [Dy,] Dx, [LeftPz,] [LeftPy,] LeftPx, [RightPy,] [RightPy,] RightPx
-./bin/convnd_bwd_data_xdl 0 1 5
+./bin/example_convnd_bwd_data_xdl 0 1 5
 ```
-Result
...
example/17_convnd_bwd_data_xdl/convnd_bwd_data_xdl.cpp
@@ -6,7 +6,7 @@
 #include <half.hpp>
 #include "config.hpp"
-#include "conv_utils.hpp"
+#include "conv_fwd_util.hpp"
 #include "print.hpp"
 #include "device.hpp"
 #include "host_tensor.hpp"

@@ -29,7 +29,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

 static constexpr auto ConvBwdDefault =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;

 using DeviceConvBwdDataBasePtr =
     ck::tensor_operation::device::DeviceConvBwdDataPtr<InElementOp, WeiElementOp, OutElementOp>;

@@ -44,7 +44,7 @@ using DeviceConvNDBwdDataInstance = ck::tensor_operation::device::
        InElementOp,    // InElementwiseOperation
        WeiElementOp,   // WeiElementwiseOperation
        OutElementOp,   // OutElementwiseOperation
-       ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
+       ConvBwdDefault, // ConvolutionBackwardDataSpecialization
        NumDimSpatial,  // NumDimSpatial
        256,            // BlockSize
        128,            // MPerBlock

@@ -83,7 +83,7 @@ using ReferenceConvBwdDataInstance =
                                      OutElementOp,
                                      NumDimSpatial>;

-void PrintUseMsg()
+void print_use_msg()
 {
     std::cout << "arg1: verification (0=no, 1=yes)\n"
               << "arg2: initialization (0=no init, 1=random value, 2= init to 1 )\n"

@@ -99,10 +99,10 @@ void PrintUseMsg()
               << " <right padding>, (ie RightPy, RightPx for 2D)\n"
               << std::endl;
 }

-ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, char* argv[])
+ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[])
 {
     // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right)
-    ck::conv_util::ConvParams params;
+    ck::utils::conv::ConvParams params;
     int arg_idx = 5;

     params.num_dim_spatial = num_dim_spatial;

@@ -144,73 +144,7 @@ ck::conv_util::ConvParams ParseConvParams(int num_dim_spatial, char* argv[])
     return params;
 }

-HostTensorDescriptor GetInputHostTensorDescriptor(const std::vector<std::size_t>& dims,
-                                                  int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWC{});
-    }
-    case 2: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWC{});
-    }
-    case 1: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWC{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-HostTensorDescriptor GetFiltersHostTensorDescriptor(const std::vector<std::size_t>& dims,
-                                                    int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KZYXC{});
-    }
-    case 2: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KYXC{});
-    }
-    case 1: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::KXC{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-HostTensorDescriptor GetOutputHostTensorDescriptor(const std::vector<std::size_t>& dims,
-                                                   int num_dim_spatial = 2)
-{
-    namespace tl = ck::tensor_layout::convolution;
-
-    switch(num_dim_spatial)
-    {
-    case 3: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NDHWK{});
-    }
-    case 2: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NHWK{});
-    }
-    case 1: {
-        return ck::conv_util::GetHostTensorDescriptor(dims, tl::NWK{});
-    }
-    default: {
-        throw std::runtime_error("Unsupported number of spatial dimensions provided!");
-    }
-    }
-}
-
-DeviceConvBwdDataBasePtr GetConvInstance(int num_dim_spatial)
+DeviceConvBwdDataBasePtr get_conv_instance(int num_dim_spatial)
 {
     switch(num_dim_spatial)
     {

@@ -236,7 +170,7 @@ int main(int argc, char* argv[])
     int nrepeat         = 5;
     int num_dim_spatial = 2;

-    ck::conv_util::ConvParams params;
+    ck::utils::conv::ConvParams params;
     params.C = 128;

     if(argc == 4)

@@ -256,15 +190,15 @@ int main(int argc, char* argv[])
         int cmdline_nargs = conv_args + 5;
         if(cmdline_nargs != argc)
         {
-            PrintUseMsg();
+            print_use_msg();
             exit(1);
         }

-        params = ParseConvParams(num_dim_spatial, argv);
+        params = parse_conv_params(num_dim_spatial, argv);
     }
     else if(argc != 1)
     {
-        PrintUseMsg();
+        print_use_msg();
         exit(1);
     }

@@ -288,11 +222,13 @@ int main(int argc, char* argv[])
                                              std::end(output_spatial_lengths));

     Tensor<InDataType> in_n_c_hi_wi_host_result(
-        GetInputHostTensorDescriptor(input_dims, num_dim_spatial));
+        ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
     Tensor<InDataType> in_n_c_hi_wi_device_result(
-        GetInputHostTensorDescriptor(input_dims, num_dim_spatial));
-    Tensor<WeiDataType> wei_k_c_y_x(GetFiltersHostTensorDescriptor(filter_dims, num_dim_spatial));
-    Tensor<OutDataType> out_n_k_ho_wo(GetOutputHostTensorDescriptor(output_dims, num_dim_spatial));
+        ck::utils::conv::get_input_host_tensor_descriptor(input_dims, num_dim_spatial));
+    Tensor<WeiDataType> wei_k_c_y_x(
+        ck::utils::conv::get_filters_host_tensor_descriptor(filter_dims, num_dim_spatial));
+    Tensor<OutDataType> out_n_k_ho_wo(
+        ck::utils::conv::get_output_host_tensor_descriptor(output_dims, num_dim_spatial));

     std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl;
     std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;

@@ -318,11 +254,10 @@ int main(int argc, char* argv[])
     out_device_buf.ToDevice(out_n_k_ho_wo.mData.data());
     wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());

     // reset input to zero
-    in_n_c_hi_wi_device_result.GenerateTensorValue(GeneratorTensor_1<InDataType>{0});
-    in_device_buf.ToDevice(in_n_c_hi_wi_device_result.mData.data());
+    in_device_buf.SetZero();

     // do GEMM
-    auto conv     = GetConvInstance(num_dim_spatial);
+    auto conv     = get_conv_instance(num_dim_spatial);
     auto invoker  = conv->MakeInvokerPointer();
     auto argument = conv->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),

@@ -351,15 +286,15 @@ int main(int argc, char* argv[])
     float ave_time = invoker->Run(argument.get(), nrepeat);

-    std::size_t flop = ck::conv_util::GetFlops(
+    std::size_t flop = ck::utils::conv::get_flops(
         params.N, params.C, params.K, params.filter_spatial_lengths, output_spatial_lengths);
-    std::size_t num_btype = ck::conv_util::GetBtype<InDataType, WeiDataType, OutDataType>(
+    std::size_t num_btype = ck::utils::conv::get_btype<InDataType, WeiDataType, OutDataType>(
         params.N,
         params.C,
         params.K,
         params.input_spatial_lengths,
         params.filter_spatial_lengths,
         output_spatial_lengths);

     float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
     float gb_per_sec = num_btype / 1.E6 / ave_time;
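For the performance report at the end of this example, ck::utils::conv::get_flops receives N, C, K, the filter spatial lengths and the output spatial lengths, and the result is divided by the measured average time. A hedged sketch of the usual convolution FLOP count such a helper computes (the 2 * N * K * C * prod(filter) * prod(output) convention; this is an assumption about the formula, not a copy of CK's implementation):

```cpp
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Hypothetical stand-in for a convolution FLOP counter:
// 2 multiply-accumulate ops per output element per input-channel filter tap.
std::size_t conv_flops_sketch(std::size_t N,
                              std::size_t C,
                              std::size_t K,
                              const std::vector<std::size_t>& filter_spatial_lengths,
                              const std::vector<std::size_t>& output_spatial_lengths)
{
    auto prod = [](const std::vector<std::size_t>& v) {
        return std::accumulate(
            v.begin(), v.end(), std::size_t{1}, std::multiplies<std::size_t>{});
    };
    return std::size_t{2} * N * K * C * prod(filter_spatial_lengths) *
           prod(output_spatial_lengths);
}
```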
example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
 using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;

 static constexpr auto GemmSpecialization =
-    ck::tensor_operation::device::GemmSpecialization_t::Default;
+    ck::tensor_operation::device::GemmSpecialization::Default;

 // clang-format off
 using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle
example/CMakeLists.txt
@@ -13,6 +13,7 @@ include_directories(BEFORE
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu
     ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/gpu
+    ${PROJECT_SOURCE_DIR}/library/include/ck/library/utility
     ${PROJECT_SOURCE_DIR}/external/include/half
 )

@@ -29,13 +30,11 @@ add_subdirectory(01_gemm)
 add_subdirectory(02_gemm_alpha_beta)
 add_subdirectory(03_gemm_bias_relu)
 add_subdirectory(04_gemm_bias_relu_add)
 add_subdirectory(05_conv2d_fwd)
 add_subdirectory(06_conv2d_fwd_bias_relu)
 add_subdirectory(07_conv2d_fwd_bias_relu_add)
 add_subdirectory(08_conv3d_fwd)
 add_subdirectory(09_convnd_fwd)
 add_subdirectory(10_conv2d_bwd_data)
-add_subdirectory(11_conv2d_bwd_wgt)
+add_subdirectory(11_conv2d_bwd_weight)
 add_subdirectory(12_reduce)
 add_subdirectory(13_pool2d_fwd)
 add_subdirectory(14_gemm_xdl_requant_relu_requant)
include/ck/config.hpp
@@ -6,15 +6,9 @@
 #include "hip/hip_fp16.h"
 #endif

-// "Constant" address space for kernel parameter
-#define CONSTANT __attribute__((address_space(4)))
-
-// GPU target
-// should enable one and only one GPU target
-#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
-      defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
-#error Need to define (only) one GPU target
-#endif
+// constant address space for kernel parameter
+// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
+#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))

 // launch bounds
 #define CK_USE_LAUNCH_BOUNDS 1

@@ -24,144 +18,122 @@
 #define CK_MIN_BLOCK_PER_CU 2
 #endif

-// GPU-specific parameters
-#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
-    defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A)
-// buffer resourse
+// check GPU target
+#ifdef __HIP_DEVICE_COMPILE__
+#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
+      defined(__gfx90a__) || defined(__gfx1030__))
+#error Not supported target
+#endif
+#endif
+
+// buffer resourse, wave size
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_BUFFER_RESOURCE_3RD_DWORD -1
+#define CK_GPU_WAVE_SIZE -1
+#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
+    defined(__gfx90a__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
-// wave size
 #define CK_GPU_WAVE_SIZE 64
-#elif defined(CK_AMD_GPU_GFX1030)
+#elif defined(__gfx1030__) // for GPU code
 #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
 #define CK_GPU_WAVE_SIZE 32
 #endif

 // FMA instruction
-#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
+#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing
+#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
 #define CK_USE_AMD_V_MAC_F32
-#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \
-    defined(CK_AMD_GPU_GFX1030)
+#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx1030__) // for GPU code
 #define CK_USE_AMD_V_FMAC_F32
 #define CK_USE_AMD_V_DOT2_F32_F16
 #define CK_USE_AMD_V_DOT4_I32_I8
 #endif

-// multi index
-#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
-
-// AMD inline asm
-#ifndef CK_USE_AMD_INLINE_ASM
-#define CK_USE_AMD_INLINE_ASM 1
-#endif
-
-// AMD inner product (DLOP)
-#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
-#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
+// MFMA instruction
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_USE_AMD_MFMA
+#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_MFMA
+#endif
+
+#if defined(__gfx90a__)
+#define CK_USE_AMD_MFMA_BF16_1K_OP
 #endif

-// AMD buffer_load
-#ifndef CK_USE_AMD_BUFFER_LOAD
+// buffer load
 #define CK_USE_AMD_BUFFER_LOAD 1
-#endif

-// AMD buffer_store
-#ifndef CK_USE_AMD_BUFFER_STORE
+// buffer store
 #define CK_USE_AMD_BUFFER_STORE 1
-#endif

-// AMD buffer_atomic_add
-#ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
-#define CK_USE_AMD_BUFFER_ATOMIC_ADD 1
-#endif
+// buffer atomic add: integer
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1

-// AMD XDLOPS
-#ifndef CK_USE_AMD_XDLOPS
-#define CK_USE_AMD_XDLOPS 0
+// buffer atomic add: floating point
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
+#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
+#else // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
 #endif

+// inline asm
+#define CK_USE_AMD_INLINE_ASM 1
+
+// inner product (DLOP)
+#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
+
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
-#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
-#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
-#endif
+#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1

+// experimental feature: multi index implemented as array
+#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
+
+// experimental feature: static tensor descriptor
+#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
+
-// experimental implementation for buffer load/store/atomic
-#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+// experimental feature: buffer load/store/atomic-add OOB trick
 #define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
-#endif
-#ifndef CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
 #define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
-#endif
-#ifndef CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
 #define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
-#endif

-// experimental implementation for in-regsiter sub-dword transpose
-#ifndef CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE
+// experimental feature: in-regsiter sub-dword transpose
 #define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
-#endif
-
-#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0

-// merge transformation use magic number division
-#ifndef CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION
+// experimental feature: merge transformation use magic number division
 #define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
-#endif

-// use __builtin_memcpy instead of pointer cast to access a vector from
-// pointer of scalar
-#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
+// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
+// pointer of scalar
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
-#endif

-// use __builtin_memcpy instead of union to do bit_cast
-#ifndef CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST
+// experimental feature: use __builtin_memcpy instead of union to do bit_cast
 #define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
-#endif

-// hack: have underlying assumption that need to be satsified, otherwise it's a bug
+// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
+// thread-invariant, otherwise it's a bug
 // TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
-#ifndef CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
 #define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
-#endif

-// workaround for compiler crash when compiling recursive lambda
-#ifndef CK_WORKAROUND_SWDEV_275126
+// workaround: compiler crash when compiling recursive lambda
 #define CK_WORKAROUND_SWDEV_275126 1
-#endif

-// workaround for compiler crash when using buffer load/store for i8
-#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE
+// workaround: compiler crash when using buffer load/store for i8
 #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
-#endif

-// workaround for compiler gnerating inefficient ds_write instructions
-#ifndef CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
+// workaround: compiler gnerating inefficient ds_write instructions
 #define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
-#endif

-// workaround for register spill due to compiler issue, when casting type between fp32 and fp16
-#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE
-#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R4_TYPE_CONVERT_ISSUE 1
-#endif
-#ifndef CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE
-#define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1
-#endif

-// workaround for verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
+// workaround: verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
 // tuning parameter
-#ifndef CK_WORKAROUND_SWDEV_325164
 #define CK_WORKAROUND_SWDEV_325164 1
-#endif

-// workaround for verification failure ConvNd forward
-// https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/135
-#ifndef CK_WORKAROUND_GITHUB_135
-#define CK_WORKAROUND_GITHUB_135 1
-#endif
-
 #ifndef CK_USE_X86_INLINE_ASM
 #define CK_USE_X86_INLINE_ASM 1

@@ -169,14 +141,15 @@
 namespace ck {

-enum struct InMemoryDataOperationEnum_t
+enum struct InMemoryDataOperationEnum
 {
     Set,
     AtomicAdd,
     Add
 };

-enum struct ActivTypeEnum_t
+// TODO: no longer needed, remove this
+enum struct ActivTypeEnum
 {
     None,
     LeakyRelu,
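The reworked config.hpp above moves the target check from build-time CK_AMD_GPU_* definitions to the compiler's own __gfx*__ architecture macros, and gives host compilation passes placeholder values (CK_BUFFER_RESOURCE_3RD_DWORD and CK_GPU_WAVE_SIZE are -1 when __HIP_DEVICE_COMPILE__ is not defined). A small illustrative sketch of how such host/device-split macros are typically consumed; the include path and function below are hypothetical and not part of the commit:

```cpp
#include <hip/hip_runtime.h>

#include "ck/config.hpp" // assumed include path for this sketch

// Returns the wave size the current compilation pass targets. In the host pass
// the macro is the -1 placeholder, so nothing wave-size-dependent should be
// decided there at run time.
__host__ __device__ constexpr int wave_size_sketch()
{
#ifdef __HIP_DEVICE_COMPILE__
    return CK_GPU_WAVE_SIZE; // 64 on gfx803/900/906/908/90a, 32 on gfx1030
#else
    return -1;               // host pass: placeholder only
#endif
}
```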
include/ck/tensor/static_tensor.hpp
@@ -4,7 +4,7 @@
 namespace ck {

 // StaticTensor for Scalar
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename T,
           typename TensorDesc,
           bool InvalidElementUseNumericalZeroValue,

@@ -80,7 +80,7 @@ struct StaticTensor
 };

 // StaticTensor for vector
-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename S,
           index_t ScalarPerVector,
           typename TensorDesc,

@@ -245,7 +245,7 @@ struct StaticTensorTupleOfVectorBuffer
     S ignored_element_scalar_;
 };

-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename T,
           typename TensorDesc,
           typename enable_if<TensorDesc::IsKnownAtCompileTime(), bool>::type = false>

@@ -255,7 +255,7 @@ __host__ __device__ constexpr auto make_static_tensor(TensorDesc)
 }

-template <AddressSpaceEnum_t AddressSpace,
+template <AddressSpaceEnum AddressSpace,
           typename T,
           typename TensorDesc,
           typename X,
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp
@@ -207,9 +207,9 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
                       CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0,
                   "wrong");

-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatA>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
             a_k_m0_m1_thread_desc_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
             b_k_n0_n1_thread_desc_.GetElementSpaceSize());

         constexpr auto threadwise_gemm =
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r3.hpp
@@ -220,9 +220,9 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                       CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0,
                   "wrong");

-        auto a_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatA>(
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
             a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
-        auto b_thread_buf = make_static_buffer<AddressSpaceEnum_t::Vgpr, FloatB>(
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
             b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());

         constexpr auto threadwise_contraction =
include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp
@@ -119,7 +119,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
         constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{};

         // thread A buffer for GEMM
-        StaticBuffer<AddressSpaceEnum_t::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
+        StaticBuffer<AddressSpaceEnum::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
             a_thread_buf;

         constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3<FloatA,
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
07a673c6
...
...
@@ -42,7 +42,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
static
constexpr
index_t
MWaves
=
MPerBlock
/
(
MRepeat
*
MPerXDL
);
static
constexpr
index_t
NWaves
=
NPerBlock
/
(
NRepeat
*
NPerXDL
);
StaticBufferTupleOfVector
<
AddressSpaceEnum
_t
::
Vgpr
,
StaticBufferTupleOfVector
<
AddressSpaceEnum
::
Vgpr
,
FloatAcc
,
MRepeat
*
NRepeat
,
xdlops_gemm
.
GetRegSizePerXdlops
(),
...
...
@@ -250,9 +250,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
const
BBlockBuffer
&
b_block_buf
,
CThreadBuffer
&
c_thread_buf
)
const
{
auto
a_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
_t
::
Vgpr
,
FloatAB
>
(
auto
a_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatAB
>
(
a_thread_desc_
.
GetElementSpaceSize
());
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
_t
::
Vgpr
,
FloatAB
>
(
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatAB
>
(
b_thread_desc_
.
GetElementSpaceSize
());
static_for
<
0
,
MRepeat
,
1
>
{}([
&
](
auto
m0
)
{
...
...
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v4r1.hpp
@@ -16,7 +16,7 @@ namespace ck {
 template <index_t BlockSize,
           typename SrcElementwiseOperation,
           typename DstElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename BlockSliceLengths,
           typename ThreadClusterLengths,
           typename ThreadClusterArrangeOrder,
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp
@@ -14,7 +14,7 @@ namespace ck {
 // 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
 // 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename BlockSliceLengths,
           typename ThreadSliceLengths,
           typename ThreadClusterLengths,
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r1.hpp
@@ -15,7 +15,7 @@ namespace ck {
 // 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename ElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename BlockSliceLengths,
           typename ThreadClusterLengths,
           typename ThreadClusterArrangeOrder,
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r2.hpp
@@ -15,7 +15,7 @@ namespace ck {
 // 3. Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename ElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename BlockSliceLengths,
           typename ThreadClusterLengths,
           typename ThreadClusterArrangeOrder,
include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v6r3.hpp
@@ -15,7 +15,7 @@ namespace ck {
 // 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
 template <index_t BlockSize,
           typename ElementwiseOperation,
-          InMemoryDataOperationEnum_t DstInMemOp,
+          InMemoryDataOperationEnum DstInMemOp,
           typename BlockSliceLengths,
           typename ThreadClusterLengths,
           typename ThreadClusterArrangeOrder,
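All five blockwise_tensor_slice_transfer headers above change only the type of the DstInMemOp non-type template parameter from InMemoryDataOperationEnum_t to InMemoryDataOperationEnum. A self-contained sketch of how a call site spells such a parameter after the rename; the enumerators mirror the ones defined in config.hpp above, but the transfer class here is a hypothetical stand-in, not CK's real template:

```cpp
// Self-contained analogue of the renamed non-type template parameter. The real
// ck::InMemoryDataOperationEnum has enumerators Set, AtomicAdd and Add.
enum struct InMemoryDataOperationEnum
{
    Set,
    AtomicAdd,
    Add
};

template <InMemoryDataOperationEnum DstInMemOp>
struct BlockwiseTransferSketch
{
    // Whether the destination write needs atomic hardware support.
    static constexpr bool uses_atomics = (DstInMemOp == InMemoryDataOperationEnum::AtomicAdd);
};

// Call sites now write the scoped enum without the old _t suffix.
static_assert(!BlockwiseTransferSketch<InMemoryDataOperationEnum::Set>::uses_atomics,
              "plain Set stores do not use atomics");
```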
include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -26,16 +26,20 @@
 #ifndef CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP
 #define CK_REDUCTION_FUNCTIONS_BLOCKWISE_HPP

 #include "data_type.hpp"
 #include "reduction_common.hpp"
 #include "reduction_operator.hpp"
 #include "reduction_functions_accumulate.hpp"
+#include "cluster_descriptor.hpp"

 namespace ck {

+// clang-format off
+// Assume:
+//  1) work_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data
+//  2) work_buffer has AccDataType elements, and space size is no less than BlockSize
+//  3) in_out_value is the input data in vgpr from each thread
+//  4) in_out_value is the over-written reduced output in vgpr for each thread
+// clang-format on
 template <typename AccDataType,
           index_t BlockSize,
           typename ThreadClusterLengths_M_K,

@@ -61,8 +65,11 @@ struct PartitionedBlockwiseReduction
     using Accumulation = detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>;

     template <typename BufferType>
-    __device__ static void Reduce(BufferType& block_buffer, AccDataType& accuData)
+    __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
     {
         static_assert(is_same<typename BufferType::type, AccDataType>{},
                       "Buffer data type should be consistent as AccDataType!");

         constexpr auto cluster_len_shift = get_shift<BufferLength_K>();

         const auto thread_cluster_idx =

@@ -71,6 +78,10 @@ struct PartitionedBlockwiseReduction
         const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
         const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];

+        work_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value;
+
+        __syncthreads();
+
         static_for<0, cluster_len_shift, 1>{}([&](auto I) {
             constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());

@@ -80,10 +91,10 @@ struct PartitionedBlockwiseReduction
                 index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
                                                                      make_tuple(0, indOffset));

-                AccDataType opData1 = type_convert<AccDataType>(block_buffer[offset1]);
-                AccDataType opData2 = type_convert<AccDataType>(block_buffer[offset2]);
+                AccDataType opData1 = work_buffer[offset1];
+                AccDataType opData2 = work_buffer[offset2];
                 Accumulation::Calculate(opData1, opData2);
-                block_buffer(offset1) = type_convert<AccDataType>(opData1);
+                work_buffer(offset1) = opData1;
             }

             __syncthreads();

@@ -91,10 +102,17 @@ struct PartitionedBlockwiseReduction
         index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));

-        accuData = type_convert<AccDataType>(block_buffer[offset]);
+        in_out_value = work_buffer[offset];
     };
 };

+// clang-format off
+// Assume:
+//  1) work_val_buffer/work_idx_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data
+//  2) work_val_buffer/work_idx_buffer has AccDataType/IndexDataType elements, and space size is no less than BlockSize
+//  3) in_out_value/in_out_index is the input data in vgpr from each thread
+//  4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread
+// clang-format on
 template <typename AccDataType,
           typename IndexDataType,
           index_t BlockSize,

@@ -123,11 +141,16 @@ struct PartitionedBlockwiseReductionWithIndex
     // This interface accumulates on both data values and indices
     template <typename BufferType, typename IdxBufferType>
     __device__ static void
-    Reduce(BufferType& block_val_buffer,
-           IdxBufferType& block_idx_buffer,
-           AccDataType& accuData,
-           IndexDataType& accuIndex)
+    Reduce(BufferType& work_val_buffer,
+           IdxBufferType& work_idx_buffer,
+           AccDataType& in_out_value,
+           IndexDataType& in_out_index)
     {
         static_assert(is_same<typename BufferType::type, AccDataType>{},
                       "Buffer data type should be consistent as AccDataType!");
         static_assert(is_same<typename IdxBufferType::type, IndexDataType>{},
                       "Buffer data type should be consistent as IndexDataType!");

         constexpr auto cluster_len_shift = get_shift<BufferLength_K>();

         const auto thread_cluster_idx =

@@ -136,6 +159,11 @@ struct PartitionedBlockwiseReductionWithIndex
         const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
         const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];

+        work_val_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value;
+        work_idx_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_index;
+
+        __syncthreads();
+
         static_for<0, cluster_len_shift, 1>{}([&](auto I) {
             constexpr index_t indOffset = 1 << I();

@@ -145,14 +173,14 @@ struct PartitionedBlockwiseReductionWithIndex
                 index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
                                                                      make_tuple(0, indOffset));

-                AccDataType opData1      = type_convert<AccDataType>(block_val_buffer[offset1]);
-                AccDataType opData2      = type_convert<AccDataType>(block_val_buffer[offset2]);
-                IndexDataType currIndex1 = block_idx_buffer[offset1];
-                IndexDataType currIndex2 = block_idx_buffer[offset2];
+                AccDataType opData1      = work_val_buffer[offset1];
+                AccDataType opData2      = work_val_buffer[offset2];
+                IndexDataType currIndex1 = work_idx_buffer[offset1];
+                IndexDataType currIndex2 = work_idx_buffer[offset2];

                 Accumulation::Calculate(opData1, opData2, currIndex1, currIndex2);
-                block_val_buffer(offset1) = type_convert<AccDataType>(opData1);
-                block_idx_buffer(offset1) = currIndex1;
+                work_val_buffer(offset1) = opData1;
+                work_idx_buffer(offset1) = currIndex1;
             }

             __syncthreads();

@@ -160,9 +188,9 @@ struct PartitionedBlockwiseReductionWithIndex
         index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));

-        accuData  = type_convert<AccDataType>(block_val_buffer[offset]);
-        accuIndex = block_idx_buffer[offset];
-    }
+        in_out_value = work_val_buffer[offset];
+        in_out_index = work_idx_buffer[offset];
+    };
 };
 }; // end of namespace ck
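The comment block added at the top of reduction_functions_blockwise.hpp spells out the contract of Reduce: each thread brings one partial value in registers (in_out_value), writes it into a caller-provided LDS workspace (work_buffer) of at least BlockSize elements, and after a log2-depth sequence of accumulate-and-synchronize steps reads the reduced value back. A conceptual HIP sketch of that pattern, not CK's templated implementation; the kernel name, the 256-thread block size and the plain float/sum choice are assumptions for illustration:

```cpp
#include <hip/hip_runtime.h>

// Conceptual blockwise tree reduction over a 256-thread block, mirroring the
// write -> __syncthreads -> halve-and-accumulate loop used by Reduce above.
__global__ void block_sum_sketch(const float* in, float* out)
{
    __shared__ float work_buffer[256]; // LDS workspace, >= BlockSize elements

    const unsigned tid = threadIdx.x;
    float in_out_value = in[blockIdx.x * blockDim.x + tid]; // partial value in registers

    work_buffer[tid] = in_out_value; // 1) every thread publishes its partial value
    __syncthreads();

    // 2) tree reduction: at each step the active half accumulates its partner's slot
    for(unsigned stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
        if(tid < stride)
            work_buffer[tid] += work_buffer[tid + stride];
        __syncthreads();
    }

    in_out_value = work_buffer[0]; // 3) reduced value read back into registers
    if(tid == 0)
        out[blockIdx.x] = in_out_value;
}
```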
include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp
@@ -5,7 +5,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-enum struct ConvolutionBackwardDataSpecialization_t
+enum struct ConvolutionBackwardDataSpecialization
 {
     Default,
     Filter1x1Stride1Pad0,
include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp
@@ -7,7 +7,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-enum struct ConvolutionForwardSpecialization_t
+enum struct ConvolutionForwardSpecialization
 {
     Default,
     Filter1x1Pad0,

@@ -15,14 +15,14 @@ enum struct ConvolutionForwardSpecialization_t
     OddC,
 };

-inline std::string getConvFwdSpecializationStr(const ConvolutionForwardSpecialization_t& s)
+inline std::string getConvFwdSpecializationStr(const ConvolutionForwardSpecialization& s)
 {
     switch(s)
     {
-    case ConvolutionForwardSpecialization_t::Default: return "Default";
-    case ConvolutionForwardSpecialization_t::Filter1x1Pad0: return "Filter1x1Pad0";
-    case ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
-    case ConvolutionForwardSpecialization_t::OddC: return "OddC";
+    case ConvolutionForwardSpecialization::Default: return "Default";
+    case ConvolutionForwardSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
+    case ConvolutionForwardSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
+    case ConvolutionForwardSpecialization::OddC: return "OddC";
     default: return "Unrecognized specialization!";
     }
 }
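Since the commit drops the _t suffix from these specialization enums, call sites now spell the scoped enum as in the cases above. A minimal hypothetical usage of the renamed type and the string helper, assuming the header is available on the include path:

```cpp
#include <iostream>
#include <string>

// Assumed include path for this sketch, matching the file location above.
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"

int main()
{
    using ck::tensor_operation::device::ConvolutionForwardSpecialization;

    constexpr auto spec = ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;

    // Prints "Filter1x1Stride1Pad0".
    std::cout << ck::tensor_operation::device::getConvFwdSpecializationStr(spec) << std::endl;
    return 0;
}
```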