gaoqiong / composable_kernel · Commits

Commit 4cd8f454, authored Jun 17, 2020 by Chao Liu

sync with miopen

Parent: 6fc49f91

Showing 13 changed files with 943 additions and 280 deletions (+943 -280)
Changed files (13):

CMakeLists.txt  +2 -0
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp  +16 -17
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +6 -6
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +9 -3
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +3 -1
composable_kernel/include/utility/amd_buffer_addressing.hpp  +628 -175
composable_kernel/include/utility/common_header.hpp  +1 -4
composable_kernel/include/utility/config.amd.hpp.in  +22 -5
composable_kernel/include/utility/float_type.amd.hpp.in  +94 -0
composable_kernel/include/utility/in_memory_operation.amd.hpp.in  +126 -58
composable_kernel/include/utility/synchronization.amd.hpp.in (new file)  +25 -0
driver/src/conv_bwd_data_driver.cpp  +3 -3
driver/src/conv_driver.cpp  +8 -8
CMakeLists.txt  (+2 -0)

@@ -56,10 +56,12 @@ if(DEVICE_BACKEND STREQUAL "AMD")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.amd.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
 elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/config.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/config.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/float_type.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/float_type.hpp")
     configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/in_memory_operation.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/in_memory_operation.hpp")
+    configure_file("${PROJECT_SOURCE_DIR}/composable_kernel/include/utility/synchronization.nvidia.hpp.in" "${PROJECT_BINARY_DIR}/composable_kernel/include/utility/synchronization.hpp")
 endif()

 add_subdirectory(driver)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp  (+16 -17)

@@ -36,8 +36,8 @@ template <index_t GridSize,
           index_t ThreadGemmDataPerRead_GemmN,
           typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
           typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
-          index_t GemmABlockCopySrcDataPerRead_GemmN,
-          index_t GemmABlockCopyDstDataPerWrite_GemmN,
+          index_t GemmABlockCopySrcDataPerRead_GemmM,
+          index_t GemmABlockCopyDstDataPerWrite_GemmM,
           typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
           typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
           index_t GemmBBlockCopySrcDataPerRead_GemmN,

@@ -82,13 +82,6 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
         constexpr auto wei_gemmk_gemmm_global_desc =
             unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);

-        // output tensor
-        constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
-            unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
-            make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
-            make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
-            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
         // input tensor
         constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
             in_n_c_hi_wi_global_desc,

@@ -98,16 +91,15 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));

+        constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
+        constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
+
         constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
             in_n_c_hip_wip_global_desc,
             make_tuple(PassThrough<N>{},
                        PassThrough<C>{},
-                       Embed<Hi + InLeftPads::At(0) + InRightPads::At(0),
-                             Sequence<Y, Ho>,
-                             Sequence<ConvDilationH, ConvStrideH, 0>>{},
-                       Embed<Wi + InLeftPads::At(1) + InRightPads::At(1),
-                             Sequence<X, Wo>,
-                             Sequence<ConvDilationW, ConvStrideW, 0>>{}),
+                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
+                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

@@ -117,6 +109,13 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
             make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}));

+        // output tensor
+        constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
+            unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
+            make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
+            make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
         // GEMM
         // \todo there are more combinations of Y, ConvDilationH and ConvStrideH that don't need
         // atomic, find out all of them

@@ -152,8 +151,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
             Sequence<0, 1>,
             Sequence<0, 1>,
             1,
-            GemmABlockCopySrcDataPerRead_GemmN,
-            GemmABlockCopyDstDataPerWrite_GemmN,
+            GemmABlockCopySrcDataPerRead_GemmM,
+            GemmABlockCopyDstDataPerWrite_GemmM,
             GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
             GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
             Sequence<0, 1>,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+6 -6)

@@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 #endif

         // weight tensor
-        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+        constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
             unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

         // input tensor

@@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-        constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
+        constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
             in_n_c_y_ho_x_wo_global_desc,
             make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
             make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}));

         // output tensor
-        constexpr auto out_k_b_global_desc =
+        constexpr auto out_gemmm_gemmn_global_desc =
             transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
                                         make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
                                         make_tuple(Sequence<1>{}, Sequence<0, 2>{}),

@@ -129,9 +129,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             BlockSize,
             Float,
             AccFloat,
-            decltype(wei_e_k_global_desc),
-            decltype(in_e_b_global_desc),
-            decltype(out_k_b_global_desc),
+            decltype(wei_gemmk_gemmm_global_desc),
+            decltype(in_gemmk_gemmn_global_desc),
+            decltype(out_gemmm_gemmn_global_desc),
             InMemoryDataOperation::Set,
             GemmMPerBlock,
             GemmNPerBlock,
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+9 -3)

@@ -31,7 +31,9 @@ template <index_t BlockSize,
           AddressSpace SrcAddressSpace          = AddressSpace::Generic,
           AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic,
           AddressSpace DstAddressSpace          = AddressSpace::Generic,
-          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::Set>
+          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::Set,
+          index_t SrcDataStride                 = 1,
+          index_t DstDataStride                 = 1>
 struct BlockwiseGenericTensorSliceCopy_v4
 {
     static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();

@@ -178,7 +180,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                                1,
                                                                SrcAddressSpace,
                                                                ThreadBufferAddressSpace,
-                                                               InMemoryDataOperation::Set>;
+                                                               InMemoryDataOperation::Set,
+                                                               SrcDataStride,
+                                                               1>;

     using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
                                                                   BlockDstDesc,

@@ -189,7 +193,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                                   DstDataPerWrite,
                                                                   ThreadBufferAddressSpace,
                                                                   DstAddressSpace,
-                                                                  DstInMemOp>;
+                                                                  DstInMemOp,
+                                                                  1,
+                                                                  DstDataStride>;

     static constexpr auto mThreadClusterDesc =
         make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+3 -1)

@@ -23,7 +23,9 @@ template <typename SrcDesc,
           index_t DstDataPerWrite,
           AddressSpace SrcAddressSpace     = AddressSpace::Generic,
           AddressSpace DstAddressSpace     = AddressSpace::Generic,
-          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set>
+          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set,
+          index_t SrcDataStride            = 1,
+          index_t DstDataStride            = 1>
 struct ThreadwiseGenericTensorSliceCopy_v4r2
 {
     static constexpr index_t nDim = SliceLengths::Size();
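The intent of the two new stride parameters, sketched outside the diff (the function name below is illustrative and not part of the copy classes): a DataPerAccess-wide access with a non-unit stride degenerates into DataPerAccess scalar moves whose source and destination elements sit SrcDataStride / DstDataStride apart, which is exactly the fallback loop this commit adds to transfer_data() further down.

template <typename T, ck::index_t DataPerAccess, ck::index_t SrcDataStride, ck::index_t DstDataStride>
__device__ void strided_access_sketch(const T* p_src, T* p_dst)
{
    // one "access" becomes DataPerAccess strided scalar copies
    for(ck::index_t i = 0; i < DataPerAccess; ++i)
    {
        p_dst[i * DstDataStride] = p_src[i * SrcDataStride];
    }
}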
composable_kernel/include/utility/amd_buffer_addressing.hpp  (+628 -175)

This file is largely rewritten: the BufferLoadStoreDwordConfig union is renamed to BufferAddressConfig; the untyped __llvm_amdgcn_buffer_load/loadx2/loadx4, __llvm_amdgcn_buffer_store/storex2/storex4 and __llvm_amdgcn_buffer_atomic_add declarations are replaced by explicitly typed f32/f16/bf16 variants; the amd_intrinsic_buffer_load/store/atomic_add entry points are renamed to amd_buffer_load/amd_buffer_store/amd_buffer_atomic_add and now take a plain const T* source instead of a vector reference; the CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC inline-asm fallback paths are deleted; and fp16/bf16 specializations are added behind the new CK_WORKAROUND_SWDEV_231101 switch. The changed regions read as follows after the commit.

@@ -8,65 +8,149 @@ namespace ck {

// For 128bit SGPRs in buffer_load and buffer_store instructions
// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
template <typename T>
union BufferAddressConfig
{
    int32x4_t data;
    T* address[2];
    int32_t range[4];
};

__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.f32");
__device__ float2_t __llvm_amdgcn_buffer_load_f32x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f32");
__device__ float4_t __llvm_amdgcn_buffer_load_f32x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f32");
__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.f16");
__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f16");
__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f16");
__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.bf16");
__device__ ushort2_t __llvm_amdgcn_buffer_load_bf16x2(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16");
__device__ ushort4_t __llvm_amdgcn_buffer_load_bf16x4(int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16");

__device__ void __llvm_amdgcn_buffer_store_f32(float vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f32");
__device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f32");
__device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f32");
__device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f16");
__device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f16");
__device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f16");
__device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.bf16");
__device__ void __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2bf16");
__device__ void __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4bf16");

__device__ void __llvm_amdgcn_buffer_atomic_add_f32(float vdata, int32x4_t rsrc, index_t vindex, index_t offset, bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32");

// buffer_load requires:
//   1) p_src must be in global memory space, d_dst must be vgpr
//   2) p_src to be a block-invariant pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
amd_buffer_load(const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset);

@@ -74,30 +158,44 @@

// buffer_store requires:
//   1) p_src must be in vgpr space, d_dst must be global memory
//   2) p_dst to be a block-invariant pointer.
// It is user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store(const T* p_src, T* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset);

template <typename T, index_t VectorSize>
__device__ void
amd_buffer_atomic_add(const T* p_src, T* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset);

template <>
__device__ float amd_buffer_load<float, 1>(const float* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<float> src_block_config;
    src_block_config.address[0] = const_cast<float*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                              // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                      // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    return __llvm_amdgcn_buffer_load_f32(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}

@@ -106,102 +204,283 @@

template <>
__device__ float2_t amd_buffer_load<float, 2>(const float* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<float> src_block_config;
    src_block_config.address[0] = const_cast<float*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                              // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                      // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    return __llvm_amdgcn_buffer_load_f32x2(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}

template <>
__device__ float4_t amd_buffer_load<float, 4>(const float* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<float> src_block_config;
    src_block_config.address[0] = const_cast<float*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                              // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                      // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(float);

    return __llvm_amdgcn_buffer_load_f32x4(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
}

template <>
__device__ half_t amd_buffer_load<half_t, 1>(const half_t* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<half_t> src_block_config;
    src_block_config.address[0] = const_cast<half_t*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

    return __llvm_amdgcn_buffer_load_f16(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    return p_src_block[src_thread_data_offset + src_const_data_offset];
#endif
}

template <>
__device__ half2_t amd_buffer_load<half_t, 2>(const half_t* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<half_t> src_block_config;
    src_block_config.address[0] = const_cast<half_t*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_WORKAROUND_SWDEV_231101
    return __llvm_amdgcn_buffer_load_f16x2(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<half2_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ half4_t amd_buffer_load<half_t, 4>(const half_t* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<half_t> src_block_config;
    src_block_config.address[0] = const_cast<half_t*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_WORKAROUND_SWDEV_231101
    return __llvm_amdgcn_buffer_load_f16x4(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<half4_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ half8_t amd_buffer_load<half_t, 8>(const half_t* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<half_t> src_block_config;
    src_block_config.address[0] = const_cast<half_t*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(half_t);

#if !CK_WORKAROUND_SWDEV_231101
    static_assert(false, "wrong! not supported");
#else
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<half8_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ ushort amd_buffer_load<ushort, 1>(const ushort* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<ushort> src_block_config;
    src_block_config.address[0] = const_cast<ushort*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

#if !CK_WORKAROUND_SWDEV_231101
    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

    return __llvm_amdgcn_buffer_load_bf16(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    return p_src_block[src_thread_data_offset + src_const_data_offset];
#endif
}

template <>
__device__ ushort2_t amd_buffer_load<ushort, 2>(const ushort* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<ushort> src_block_config;
    src_block_config.address[0] = const_cast<ushort*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_WORKAROUND_SWDEV_231101
    return __llvm_amdgcn_buffer_load_bf16x2(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    float dst_out_tmp = __llvm_amdgcn_buffer_load_f32(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<ushort2_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ ushort4_t amd_buffer_load<ushort, 4>(const ushort* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<ushort> src_block_config;
    src_block_config.address[0] = const_cast<ushort*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_WORKAROUND_SWDEV_231101
    return __llvm_amdgcn_buffer_load_bf16x4(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
#else
    float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<ushort4_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ ushort8_t amd_buffer_load<ushort, 8>(const ushort* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset)
{
    BufferAddressConfig<ushort> src_block_config;
    src_block_config.address[0] = const_cast<ushort*>(p_src_block); // fill in byte 0 - 1
    src_block_config.range[2]   = -1;                               // fill in byte 2
    src_block_config.range[3]   = 0x00027000;                       // fill in byte 3

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort);
    index_t src_const_addr_offset  = src_const_data_offset * sizeof(ushort);

#if !CK_WORKAROUND_SWDEV_231101
    static_assert(false, "wrong! not implemented");
#else
    float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4(src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false);
    return *reinterpret_cast<ushort8_t*>(&dst_out_tmp);
#endif
}

template <>
__device__ void amd_buffer_store<float, 1>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<float> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    __llvm_amdgcn_buffer_store_f32(*p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
}

@@ -210,35 +489,50 @@

template <>
__device__ void amd_buffer_store<float, 2>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<float> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast<const float2_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
}

template <>
__device__ void amd_buffer_store<float, 4>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<float> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast<const float4_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
}

@@ -247,35 +541,68 @@

template <>
__device__ void amd_buffer_store<half_t, 1>(const half_t* p_src, half_t* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<half_t> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

    __llvm_amdgcn_buffer_store_f16(*p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
#endif
}

template <>
__device__ void amd_buffer_store<half_t, 2>(const half_t* p_src, half_t* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<half_t> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

#if !CK_WORKAROUND_SWDEV_231101
    __llvm_amdgcn_buffer_store_f16x2(*reinterpret_cast<const half2_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

@@ -284,35 +611,99 @@

template <>
__device__ void amd_buffer_store<half_t, 4>(const half_t* p_src, half_t* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<half_t> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(half_t);

#if !CK_WORKAROUND_SWDEV_231101
    __llvm_amdgcn_buffer_store_f16x4(*reinterpret_cast<const half4_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 1>(const ushort* p_src, ushort* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<ushort> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

#if !CK_WORKAROUND_SWDEV_231101
    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

    __llvm_amdgcn_buffer_store_bf16(*p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src;
#endif
}

template <>
__device__ void amd_buffer_store<ushort, 2>(const ushort* p_src, ushort* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<ushort> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

#if !CK_WORKAROUND_SWDEV_231101
    __llvm_amdgcn_buffer_store_bf16x2(*reinterpret_cast<const ushort2_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    const float* p_src_tmp = reinterpret_cast<const float*>(p_src);
    __llvm_amdgcn_buffer_store_f32(*p_src_tmp, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

@@ -321,13 +712,75 @@

template <>
__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src, ushort* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<ushort> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);

#if !CK_WORKAROUND_SWDEV_231101
    __llvm_amdgcn_buffer_store_bf16x4(*reinterpret_cast<const ushort4_t*>(p_src), dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#else
    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false, false);
#endif
}

template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    BufferAddressConfig<float> dst_block_config;
    dst_block_config.address[0] = p_dst_block; // fill in byte 0 - 1
    dst_block_config.range[2]   = -1;          // fill in byte 2
    dst_block_config.range[3]   = 0x00027000;  // fill in byte 3

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);

    __llvm_amdgcn_buffer_atomic_add_f32(*p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false);
}

template <>
__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    for(index_t i = 0; i < 2; ++i)
    {
        amd_buffer_atomic_add<float, 1>(&p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
    }
}

template <>
__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src, float* p_dst_block, index_t dst_thread_data_offset, index_t dst_const_data_offset)
{
    for(index_t i = 0; i < 4; ++i)
    {
        amd_buffer_atomic_add<float, 1>(&p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
    }
}

} // namespace ck
#endif
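For orientation, a minimal usage sketch of the renamed entry points — not part of the commit; the function, pointer and offset names are made up, and it assumes p_global_src / p_global_dst are block-invariant global pointers as the comments in the header require:

__device__ void example_copy4(const float* p_global_src, float* p_global_dst, ck::index_t thread_id)
{
    using namespace ck;

    // read 4 consecutive floats at element offset thread_id * 4 into VGPRs
    auto v = amd_buffer_load<float, 4>(p_global_src, thread_id * 4, 0);

    // write them back out through the buffer-store path at the same element offset
    amd_buffer_store<float, 4>(reinterpret_cast<const float*>(&v), p_global_dst, thread_id * 4, 0);
}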
composable_kernel/include/utility/common_header.hpp  (+1 -4)

@@ -16,15 +16,12 @@
 #include "functional3.hpp"
 #include "functional4.hpp"
 #include "in_memory_operation.hpp"
+#include "synchronization.hpp"

 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
 #endif

-#if CK_USE_AMD_BUFFER_ADDRESSING
-#include "amd_buffer_addressing.hpp"
-#endif
-
 #if CK_USE_AMD_XDLOPS
 #include "amd_xdlops.hpp"
 #endif
composable_kernel/include/utility/config.amd.hpp.in  (+22 -5)

@@ -25,11 +25,7 @@
 #define CK_USE_AMD_BUFFER_ADDRESSING 1
 #endif

-#ifndef CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC
-#define CK_USE_AMD_BUFFER_ADDRESSING_INTRINSIC 1
-#endif
-
-// only support gfx908
+// only gfx908 support native floating point atomic add
 #ifndef CK_USE_AMD_BUFFER_ATOMIC_ADD
 #define CK_USE_AMD_BUFFER_ATOMIC_ADD 0
 #endif

@@ -47,6 +43,11 @@
 #define CK_USE_AMD_XDLOPS_EMULATE 0 // For internal debug purposes
 #endif

+// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
+#ifndef CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
+#define CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
+#endif
+
 // experimental implementation
 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
 #define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0

@@ -54,8 +55,24 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0

+#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
+#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
+#endif
+
+#ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK
+#define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_INPUT_SKIP_OUT_OF_BOUND_CHECK 0
+#endif
+
+// workaround: put all workaround here
+// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
+#ifndef CK_WORKAROUND_SWDEV_229564
+#define CK_WORKAROUND_SWDEV_229564 1
+#endif
+
+// workaround for buffer load/store fp16/bfp16 intrinsic bug
+#ifndef CK_WORKAROUND_SWDEV_231101
+#define CK_WORKAROUND_SWDEV_231101 1
+#endif
+
 namespace ck {
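A small illustration of how these switches are meant to be consumed — not part of the commit, and the translation-unit layout is hypothetical. Because every macro here is guarded by #ifndef, a build can override a default with a compiler definition such as -DCK_WORKAROUND_SWDEV_231101=0 once the underlying intrinsic bug is fixed:

#include "config.hpp" // generated from config.amd.hpp.in by CMake

#if CK_WORKAROUND_SWDEV_231101
// default: fp16/bfp16 buffer accesses are routed through the f32 buffer intrinsics
#else
// override: the native f16/bf16 buffer intrinsics in amd_buffer_addressing.hpp are used directly
#endif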
composable_kernel/include/utility/float_type.amd.hpp.in  (+94 -0)

@@ -14,10 +14,12 @@ typedef float float32_t __attribute__((ext_vector_type(32)));
 typedef _Float16 half_t;
 typedef _Float16 half2_t __attribute__((ext_vector_type(2)));
 typedef _Float16 half4_t __attribute__((ext_vector_type(4)));
+typedef _Float16 half8_t __attribute__((ext_vector_type(8)));

 // bfloat16
 typedef ushort ushort2_t __attribute__((ext_vector_type(2)));
 typedef ushort ushort4_t __attribute__((ext_vector_type(4)));
+typedef ushort ushort8_t __attribute__((ext_vector_type(8)));

 template <class T, index_t N>
 struct vector_type

@@ -152,6 +154,25 @@ struct vector_type<half_t, 4>
     }
 };

+template <>
+struct vector_type<half_t, 8>
+{
+    using MemoryType = half8_t;
+
+    union DataType
+    {
+        MemoryType vector;
+        half_t scalar[8];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, half_t s, Number<I>)
+    {
+        static_assert(I < 8, "wrong");
+        *(reinterpret_cast<half_t*>(&v) + I) = s;
+    }
+};
+
 template <>
 struct vector_type<ushort, 1>
 {

@@ -221,6 +242,25 @@ struct vector_type<ushort, 4>
     }
 };

+template <>
+struct vector_type<ushort, 8>
+{
+    using MemoryType = ushort8_t;
+
+    union DataType
+    {
+        MemoryType vector;
+        ushort scalar[8];
+    };
+
+    template <index_t I>
+    __host__ __device__ static void SetScalar(MemoryType& v, ushort s, Number<I>)
+    {
+        static_assert(I < 8, "wrong");
+        *(reinterpret_cast<ushort*>(&v) + I) = s;
+    }
+};
+
 // data type conversion
 template <typename T>
 struct type_convert

@@ -251,6 +291,34 @@ struct inner_product_with_conversion
 {
     static constexpr auto convert = type_convert<T>();

+    __device__ T operator()(float4_t a, float4_t b) const
+    {
+        const float* p_a_float = reinterpret_cast<const float*>(&a);
+        const float* p_b_float = reinterpret_cast<const float*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 4; ++v)
+        {
+            acc += convert(p_a_float[v]) * convert(p_b_float[v]);
+        }
+        return acc;
+    }
+
+    __device__ T operator()(float2_t a, float2_t b) const
+    {
+        const float* p_a_float = reinterpret_cast<const float*>(&a);
+        const float* p_b_float = reinterpret_cast<const float*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 2; ++v)
+        {
+            acc += convert(p_a_float[v]) * convert(p_b_float[v]);
+        }
+        return acc;
+    }
+
     __device__ T operator()(float a, float b) const { return convert(a) * convert(b); }

     __device__ T operator()(half2_t a, half2_t b) const

@@ -280,6 +348,19 @@ struct inner_product_with_conversion
         return acc;
     }

+    __device__ T operator()(half8_t a, half8_t b) const
+    {
+        const half_t* p_a_half = reinterpret_cast<const half_t*>(&a);
+        const half_t* p_b_half = reinterpret_cast<const half_t*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 8; ++v)
+        {
+            acc += convert(p_a_half[v]) * convert(p_b_half[v]);
+        }
+        return acc;
+    }
+
     __device__ T operator()(ushort2_t a, ushort2_t b) const
     {
         const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);

@@ -306,6 +387,19 @@ struct inner_product_with_conversion
         }
         return acc;
     }

+    __device__ T operator()(ushort8_t a, ushort8_t b) const
+    {
+        const ushort* p_a_bfloat16 = reinterpret_cast<const ushort*>(&a);
+        const ushort* p_b_bfloat16 = reinterpret_cast<const ushort*>(&b);
+
+        T acc = 0;
+        for(index_t v = 0; v < 8; ++v)
+        {
+            acc += convert(p_a_bfloat16[v]) * convert(p_b_bfloat16[v]);
+        }
+        return acc;
+    }
 };

 } // namespace ck
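A hedged sketch — not part of the commit — of how the new 8-wide vector_type and the half8_t overload of inner_product_with_conversion fit together; the function name and the use of the DataType union to pack operands are illustrative assumptions:

__device__ float example_dot8(const _Float16* p_a, const _Float16* p_b)
{
    using namespace ck;

    // pack 8 scalars into the MemoryType/scalar union added by this commit
    vector_type<_Float16, 8>::DataType a, b;
    for(int i = 0; i < 8; ++i)
    {
        a.scalar[i] = p_a[i];
        b.scalar[i] = p_b[i];
    }

    // the new half8_t overload converts each lane and accumulates in float
    return inner_product_with_conversion<float>{}(a.vector, b.vector);
}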
composable_kernel/include/utility/in_memory_operation.amd.hpp.in  (+126 -58)

The free functions set_data() and atomic_add_data(), which dispatched on address space through nested static_if, are replaced by SetData<T, DataPerAccess> and AtomicAddData<T, DataPerAccess> functors with per-address-space Run() specializations; an atomic_add_impl() helper is added for vector float types; and transfer_data() gains SrcDataStride/DstDataStride parameters with a scalar fallback loop. The changed region reads as follows after the commit.

@@ -2,91 +2,159 @@
#define CK_IN_MEMORY_OPERATION_AMD_HPP

#include "float_type.hpp"

#if CK_USE_AMD_BUFFER_ADDRESSING
#include "amd_buffer_addressing.hpp"
#endif

namespace ck {

template <typename T>
__device__ void atomic_add_impl(T* p_dst, T src)
{
    atomicAdd(p_dst, src);
}

// atomicAdd for float does not support vector type
template <>
__device__ void atomic_add_impl<float2_t>(float2_t* p_dst, float2_t src)
{
    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
    const float* p_src_float = reinterpret_cast<const float*>(&src);

    for(index_t i = 0; i < 2; ++i)
    {
        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
    }
}

template <>
__device__ void atomic_add_impl<float4_t>(float4_t* p_dst, float4_t src)
{
    float* p_dst_float       = reinterpret_cast<float*>(p_dst);
    const float* p_src_float = reinterpret_cast<const float*>(&src);

    for(index_t i = 0; i < 4; ++i)
    {
        atomicAdd(&(p_dst_float[i]), p_src_float[i]);
    }
}

template <typename T, index_t DataPerAccess>
struct SetData
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

    // This version is only for compatibility, don't use this version if possible
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
    }

#if CK_USE_AMD_BUFFER_ADDRESSING
    // buffer_load requires:
    //   1) p_src must be in global memory space, d_dst must be vgpr
    //   2) p_src to be a block-invariant pointer.
    // It is user's responsibility to make sure that is true.
    template <>
    __device__ void Run<AddressSpace::Global, AddressSpace::Vgpr>(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
            amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
    }

    // buffer_store requires:
    //   1) p_src must be in vgpr space, d_dst must be global memory
    //   2) p_dst to be a block-invariant pointer.
    // It is user's responsibility to make sure that is true.
    template <>
    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
    }
#endif
};

template <typename T, index_t DataPerAccess>
struct AtomicAddData
{
    using vector_t = typename vector_type<T, DataPerAccess>::MemoryType;

    // This version is only for compatibility, don't use this version if possible
    template <AddressSpace SrcAddressSpace, AddressSpace DstAddressSpace>
    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
    }

#if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
    // buffer_atomic_add requires:
    //   1) p_src must be in vgpr space, d_dst must be global memory
    //   2) p_dst to be a block-invariant pointer.
    // It is user's responsibility to make sure that is true.
    template <>
    __device__ void Run<AddressSpace::Vgpr, AddressSpace::Global>(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
    {
        amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
    }
#endif
};

template <typename T,
          index_t DataPerAccess,
          AddressSpace SrcAddressSpace,
          AddressSpace DstAddressSpace,
          InMemoryDataOperation DstInMemOp,
          index_t SrcDataStride = 1,
          index_t DstDataStride = 1>
__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
{
    static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                      DstInMemOp == InMemoryDataOperation::AtomicAdd,
                  "wrong! InMemoryDataOperation not supported!");

    // keep it simple, don't use static_if here, otherwise compiler will do weird things
    if(SrcDataStride == 1 && DstDataStride == 1)
    {
        // TODO: use static_if::ElseIf
        static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
            SetData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, p_dst, dst_offset);
        });

        static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
            AtomicAddData<T, DataPerAccess>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                p_src, src_offset, p_dst, dst_offset);
        });
    }
    else
    {
        for(index_t i = 0; i < DataPerAccess; i++)
        {
            // TODO: use static_if::ElseIf
            static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
                SetData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
            });

            static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
                AtomicAddData<T, 1>{}.template Run<SrcAddressSpace, DstAddressSpace>(
                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
            });
        }
    }
}

} // namespace ck
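A minimal sketch of a call that exercises the new stride parameters — not part of the commit; the pointer names and the stride of 2 are made up. With DstDataStride != 1, transfer_data falls back to the scalar loop above instead of a single vector access:

__device__ void example_strided_write(const float* p_thread_buf, float* p_global)
{
    using namespace ck;

    // write 4 floats from a thread-private buffer to global memory,
    // leaving a gap of one element between consecutive destinations
    transfer_data<float,
                  4,                          // DataPerAccess
                  AddressSpace::Vgpr,         // SrcAddressSpace
                  AddressSpace::Global,       // DstAddressSpace
                  InMemoryDataOperation::Set, // DstInMemOp
                  1,                          // SrcDataStride
                  2>                          // DstDataStride
        (p_thread_buf, 0, p_global, 0);
}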
composable_kernel/include/utility/synchronization.amd.hpp.in  (new file, 0 → 100644, +25 -0)

#ifndef CK_SYNCHRONIZATION_AMD_HPP
#define CK_SYNCHRONIZATION_AMD_HPP

#include "config.hpp"

namespace ck {

__device__ void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");

__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
    asm volatile("\
    s_waitcnt lgkmcnt(0) \n \
    s_barrier \
    " ::);
#else
    __llvm_amdgcn_s_barrier();
#endif
}

__device__ void block_sync_lds_vmem() { __llvm_amdgcn_s_barrier(); }

} // namespace ck
#endif
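A usage sketch — not part of the commit; the kernel and buffer names are invented — showing where block_sync_lds() is meant to sit, between an LDS write and the dependent LDS read, so that with CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM enabled the barrier only waits on LDS traffic and leaves outstanding global loads in flight:

__global__ void example_reverse_tile(const float* p_in, float* p_out)
{
    __shared__ float lds_buf[256];

    const int base = blockIdx.x * 256;

    lds_buf[threadIdx.x] = p_in[base + threadIdx.x]; // write LDS

    ck::block_sync_lds(); // s_waitcnt lgkmcnt(0) + s_barrier; vmcnt is not waited on

    p_out[base + threadIdx.x] = lds_buf[255 - threadIdx.x]; // read LDS written by another lane
}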
driver/src/conv_bwd_data_driver.cpp  (+3 -3)

@@ -160,10 +160,10 @@ int main(int argc, char* argv[])
 #elif 0
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N  = 128;
-    constexpr index_t C  = 128;
+    constexpr index_t C  = 1024;
     constexpr index_t HI = 17;
     constexpr index_t WI = 17;
-    constexpr index_t K  = 128;
+    constexpr index_t K  = 1024;
     constexpr index_t Y  = 1;
     constexpr index_t X  = 7;

@@ -247,7 +247,7 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw
-#elif 0
+#elif 1
     device_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw
 #elif 0
     device_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw
driver/src/conv_driver.cpp  (+8 -8)

@@ -19,7 +19,7 @@ int main(int argc, char* argv[])
 {
     using namespace ck;

-#if 1
+#if 0
     // 1x1, 17x17
     constexpr index_t N = 128;
     constexpr index_t C = 1024;

@@ -97,10 +97,10 @@ int main(int argc, char* argv[])
 #elif 0
     // 7x1, 17x17
     constexpr index_t N  = 128;
-    constexpr index_t C  = 256;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 17;
     constexpr index_t WI = 17;
-    constexpr index_t K  = 320;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 7;
     constexpr index_t X  = 1;

@@ -109,13 +109,13 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7, 17x17
     constexpr index_t N  = 128;
-    constexpr index_t C  = 224;
+    constexpr index_t C  = 128;
     constexpr index_t HI = 17;
     constexpr index_t WI = 17;
-    constexpr index_t K  = 224;
+    constexpr index_t K  = 128;
     constexpr index_t Y  = 1;
     constexpr index_t X  = 7;

@@ -124,7 +124,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 3>;
     using RightPads = Sequence<0, 3>;
-#elif 1
+#elif 0
     // 3x3, 299x299 stride=2
     constexpr index_t N = 128;
     constexpr index_t C = 3;

@@ -565,7 +565,7 @@ int main(int argc, char* argv[])
 #endif
     }

-#if 0
+#if 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, ...