yangql / composable_kernel-1 · Commits · 9f46cdf5
"tests/git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "6e2b1067bafc131c74d64f89cc01478211823bc3"
Commit 9f46cdf5, authored Sep 18, 2019 by Chao Liu
Parent: f58bf384

    experimenting global and buffer load/store

12 changed files with 501 additions and 104 deletions (+501, -104)
Changed files:

  +1   -1    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
  +58  -23   composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp
  +2   -2    composable_kernel/include/tensor_description/tensor_coordinate.hpp
  +2   -2    composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
  +16  -4    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
  +209 -2    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
  +108 -0    composable_kernel/include/utility/amd_inline_asm.hpp
  +2   -2    composable_kernel/include/utility/array_helper.hpp
  +6   -6    composable_kernel/include/utility/config_amd.hpp.in
  +4   -6    composable_kernel/include/utility/config_nvidia.hpp.in
  +82  -45   driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
  +11  -11   driver/src/driver.cpp
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -225,7 +225,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
-            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<N1 * N2>{});
+            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmNPerThreadSubC>{});

         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
             BlockSize,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -224,14 +224,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
-        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
-            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmMPerThreadSubC>{});
+        constexpr auto c_k0k1_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
+            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmNPerThreadSubC>{});

         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
             BlockSize,
             decltype(a_e_k_block_mtx_desc),
             decltype(b_e_n1bn2_block_mtx_desc),
-            decltype(c_k0k2_n1n2_thread_mtx_desc),
+            decltype(c_k0k1_n1n2_thread_mtx_desc),
             GemmMPerThreadSubC,
             GemmNPerThreadSubC,
             GemmMLevel0Cluster,

@@ -258,12 +258,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         __shared__ Float p_wei_block_double[2 * wei_block_space];

         // register allocation for output
-        Float p_out_thread[c_k0k2_n1n2_thread_mtx_desc.GetElementSpace()];
+        Float p_out_thread[c_k0k1_n1n2_thread_mtx_desc.GetElementSpace()];

         // zero out threadwise output
-        threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread);
-
-        const Float* p_wei_block_on_global = p_wei_global;
+        threadwise_matrix_set_zero(c_k0k1_n1n2_thread_mtx_desc, p_out_thread);

         // LDS double buffer: preload data into LDS
         {

@@ -294,14 +292,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
             blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
-                                                     p_wei_register_buffer);
+            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

@@ -319,13 +316,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

                 blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-                p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+                blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

                 __syncthreads();

                 // LDS doubel buffer: load next data from device mem
                 blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
+                blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);

                 // LDS double buffer: GEMM on current data
                 blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

@@ -347,6 +344,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // copy output: register to global memory
         {
+#if 0
             constexpr index_t K2 = GemmMPerThreadSubC;
             constexpr index_t K1 = GemmMLevel0Cluster * GemmMLevel1Cluster;

@@ -392,17 +390,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                     k_thread_data_on_global, 0, b_thread_data_on_global, 0);

-#if 0
-            ThreadwiseGenericTensorSliceCopy_v1r2<
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
-                decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
-                arithmetic_sequence_gen<0, 8, 1>::type,
-                7,
-                1,
-                1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run(p_out_thread, p_out_thread_on_global);
-#elif 1
             ThreadwiseGenericTensorSliceCopy_v2r1<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),

@@ -414,6 +401,54 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 1,
                 1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
                 .Run(p_out_thread, p_out_thread_on_global);
+#else
+            constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster;
+
+            // define tensor descriptor for threadwise copy
+
+            // output memory layout descriptor in register, src of threadwise copy
+            constexpr auto out_k0_k1_n1_b_n2_thread_mem_desc = make_ConstantTensorDescriptor_packed(
+                Sequence<GemmMRepeat, GemmMPerThreadSubC, N1, 1, N2>{});
+
+            // output memory layout descriptor in device memory
+            constexpr auto out_n0_n1_n2_k0_k1_h_w_global_mem_desc =
+                out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}).Fold(I0, Number<N1>{}, Number<N2>{});
+
+            // output merged global tensor descriptor, dst of threadwise copy
+            constexpr auto out_k0_k1_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor(
+                out_n0_n1_n2_k0_k1_h_w_global_mem_desc,
+                Sequence<3>{},
+                Sequence<4>{},
+                Sequence<1>{},
+                Sequence<0, 5, 6>{},
+                Sequence<2>{});
+
+            // calculate origin of thread output tensor on global memory
+            // blockwise GEMM c matrix starting index
+            const auto c_thread_mtx_on_block = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());
+
+            const index_t k_thread_data_on_global = k_block_data_on_global + c_thread_mtx_on_block.row;
+
+            const index_t b_thread_data_on_global = b_block_data_on_global + c_thread_mtx_on_block.col / N2;
+
+            ThreadwiseGenericTensorSliceCopy_v2r1<decltype(out_k0_k1_n1_b_n2_thread_mem_desc),
+                                                  decltype(out_k0_k1_n1_b_n2_global_merged_desc),
+                                                  decltype(out_k0_k1_n1_b_n2_thread_mem_desc.GetLengths()),
+                                                  arithmetic_sequence_gen<0, 5, 1>::type,
+                                                  arithmetic_sequence_gen<0, 5, 1>::type,
+                                                  3,
+                                                  3,
+                                                  1,
+                                                  1>({0, 0, 0, 0, 0},
+                                                     {k_thread_data_on_global / K1,
+                                                      k_thread_data_on_global % K1,
+                                                      0,
+                                                      b_thread_data_on_global,
+                                                      0})
+                .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
 #endif
         }
     }
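The destination origin of the new output copy splits the thread's K coordinate into (k0, k1) = (k_thread_data_on_global / K1, k_thread_data_on_global % K1), with K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster. A hypothetical worked example, borrowing the 4/4/2 cluster values from the BlockSize-64 branch of the driver purely for illustration:

    // Illustration only: cluster sizes taken from the driver's BlockSize-64 branch.
    constexpr int GemmMPerThreadSubC = 4, GemmMLevel0Cluster = 4, GemmMLevel1Cluster = 2;
    constexpr int K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster; // 32
    constexpr int k_thread_data_on_global = 70;                                      // example value
    static_assert(k_thread_data_on_global / K1 == 2, "k0 = 2");
    static_assert(k_thread_data_on_global % K1 == 6, "k1 = 6");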
composable_kernel/include/tensor_description/tensor_coordinate.hpp
(changes not expanded in this view)

composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
(changes not expanded in this view)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -439,11 +439,10 @@ struct BlockwiseGenericTensorSliceCopy_v2
 {
     static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

-    using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
-    using DstCoordinate = typename TensorCoordinate<DstDesc>::type;
+    using Index = MultiIndex<nDim>;

-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
-                                                            DstCoordinate dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
     {
         static_assert(
             nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&

@@ -485,13 +484,21 @@ struct BlockwiseGenericTensorSliceCopy_v2
     template <typename TData>
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
+#if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
+#else
+        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+#endif
     }

     template <typename TData>
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
+#if 0
         mThreadwiseStore.Run(p_buffer, p_dst);
+#else
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+#endif
     }

     template <typename TData>

@@ -499,8 +506,13 @@ struct BlockwiseGenericTensorSliceCopy_v2
     {
         TData p_buffer[GetRegisterBufferSize()];

+#if 0
         mThreadwiseLoad.Run(p_src, p_buffer);
         mThreadwiseStore.Run(p_buffer, p_dst);
+#else
+        mThreadwiseLoad.template Run_amd_experiment<TData, 2, 0>(p_src, p_buffer);
+        mThreadwiseStore.template Run_amd_experiment<TData, 0, 2>(p_buffer, p_dst);
+#endif
     }

     template <typename T, bool PositiveDirection>
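The Run_amd_experiment<TData, SrcMemorySpace, DstMemorySpace> calls above use the numeric memory-space convention introduced in threadwise_generic_tensor_slice_copy.hpp (0: VGPR, 1: LDS, 2: global memory): the load path is invoked as <TData, 2, 0> (source treated as global memory, register buffer as destination) and the store path as <TData, 0, 2>. A minimal host-side sketch of that kind of compile-time dispatch, not the repository's implementation, could look like this:

    // Sketch only: branch on the destination memory space at compile time,
    // mirroring the 0 = VGPR, 1 = LDS, 2 = global convention of this commit.
    #include <cstddef>
    #include <cstring>

    enum : int { MemVgpr = 0, MemLds = 1, MemGlobal = 2 };

    template <typename T, int SrcMemorySpace, int DstMemorySpace>
    void copy_experiment(const T* p_src, T* p_dst, std::size_t n)
    {
        if constexpr (DstMemorySpace == MemGlobal)
        {
            // Where the real kernel would select a global_store / buffer_store path.
            std::memcpy(p_dst, p_src, n * sizeof(T));
        }
        else
        {
            // Generic element-wise path for register or LDS destinations.
            for (std::size_t i = 0; i < n; ++i)
                p_dst[i] = p_src[i];
        }
    }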
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -452,11 +452,13 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
 {
     static constexpr index_t nDim = SliceLengths::GetSize();

+    using Index = MultiIndex<nDim>;
+
     using SrcCoordinate = typename TensorCoordinate<SrcDesc>::type;
     using DstCoordinate = typename TensorCoordinate<DstDesc>::type;

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(SrcCoordinate src_slice_origin,
-                                                               DstCoordinate dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2r1(const Index& src_slice_origin,
+                                                               const Index& dst_slice_origin)
         : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
     {
         static_assert(nDim == SrcDesc::GetNumOfDimension() &&

@@ -755,6 +757,211 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1
         }
     }

+    // memory-space
+    // 0: VGPR
+    // 1: LDS
+    // 2: global-memory
+    template <class TData, index_t SrcMemorySpace, index_t DstMemorySpace>
+    __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const
+    {
+        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
+
+        TData p_buffer_[buffer_desc.GetElementSpace()];
+        TData* p_buffer = p_buffer_;
+
+        // copy data from src into buffer
+        {
+            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
+
+            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
+            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};
+
+            constexpr auto src_access_lengths = SliceLengths::Modify(
+                src_vector_access_dim,
+                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
+
+            // Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
+            // normal dimensions is known at compile time.
+            // Below is a hack to isolate merged dimension id from normal dimension id, so the
+            // corresponding offset can be calculated seperately at run-time and compile-time.
+            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
+            // merged dimensions, and has value = 1 on normal dimensions;
+            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
+            // normal dimensions, and has value = 1 on merged dimensions;
+            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
+                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};
+
+            constexpr auto src_normal_dim_access_lengths =
+                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;
+
+            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
+                [&](auto src_merged_dim_access_id) {
+                    auto src_merged_dim_data_id = src_merged_dim_access_id;
+                    src_merged_dim_data_id(src_vector_access_dim) =
+                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;
+
+                    // offset w.r.t. merged dimension need be computed at run-time,
+                    const index_t src_merged_offset =
+                        (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();
+
+                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
+                        [&](auto src_normal_dim_access_id) {
+                            auto src_normal_dim_data_id = src_normal_dim_access_id;
+                            src_normal_dim_data_id(src_vector_access_dim) =
+                                src_normal_dim_access_id[src_vector_access_dim] *
+                                src_data_per_access;
+
+                            // offset w.r.t. normal dimension is known at compile-time
+                            const index_t src_normal_offset =
+                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);
+
+                            src_vector_t vector_data;
+
+                            static_if<SrcMemorySpace == 2>{}([&](auto) {
+#if 1 // source code
+                                // Load vector from src.
+                                // src can be all kinds of memory-space.
+                                // In order for optimized global_load to be emitted by compiler,
+                                // need to assume:
+                                //   1. p_src need to be block-invariant (assumption)
+                                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
+                                //   3. src_merged_offset can be runtime value (no assumption imposed)
+                                vector_data = *reinterpret_cast<const src_vector_t*>(
+                                    &p_src[src_normal_offset + src_merged_offset]);
+#else // inline asm using buffer_load
+                                // Load vector from src.
+                                // src's memory-space can only be global-memory (buffer_load
+                                // inline-asm is used).
+                                // In order for buffer_load to be valid, need to assume:
+                                //   1. p_src need to be block-invariant (assumption)
+                                //   2. src_normal_offset must be calculatd at compile time (guaranteed)
+                                //   3. src_merged_offset can be runtime value (no assumption imposed)
+                                vector_data = buffer_load<TData, SrcDataPerAccess>(
+                                    p_src,
+                                    static_cast<uint32_t>(src_merged_offset),
+                                    static_cast<uint32_t>(src_normal_offset));
+#endif
+                            }).Else([&](auto) {
+                                // Load vector from src.
+                                // src can be all kinds of memory-space.
+                                vector_data = *reinterpret_cast<const src_vector_t*>(
+                                    &p_src[src_normal_offset + src_merged_offset]);
+                            });
+
+                            // unpack vector into buffer
+                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
+                            {
+                                auto scalar_id                   = make_zero_array<index_t, nDim>();
+                                scalar_id(src_vector_access_dim) = i;
+
+                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);
+
+                                p_buffer[buffer_offset] =
+                                    reinterpret_cast<const TData*>(&vector_data)[i];
+                            }
+                        });
+                });
+        }
+
+        // copy data from buffer into dst
+        {
+            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
+
+            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
+            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};
+
+            constexpr auto dst_access_lengths = SliceLengths::Modify(
+                dst_vector_access_dim,
+                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);
+
+            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
+                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};
+
+            constexpr auto dst_normal_dim_access_lengths =
+                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;
+
+            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
+                [&](auto dst_merged_dim_access_id) {
+                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
+                    dst_merged_dim_data_id(dst_vector_access_dim) =
+                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;
+
+                    // offset w.r.t. merged dimension need be computed at run-time,
+                    const index_t dst_merged_offset =
+                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();
+
+                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
+                        [&](auto dst_normal_dim_access_id) {
+                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
+                            dst_normal_dim_data_id(dst_vector_access_dim) =
+                                dst_normal_dim_access_id[dst_vector_access_dim] *
+                                dst_data_per_access;
+
+                            dst_vector_t vector_data;
+
+                            // pack vector from buffer
+                            for(index_t i = 0; i < DstDataPerAccess; ++i)
+                            {
+                                auto scalar_id                   = make_zero_array<index_t, nDim>();
+                                scalar_id(dst_vector_access_dim) = i;
+
+                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
+                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);
+
+                                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
+                            }
+
+                            // offset w.r.t. normal dimension is known at compile-time
+                            const index_t dst_normal_offset =
+                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);
+
+                            static_if<DstMemorySpace == 2>{}([&](auto) {
+#if 1 // source code
+                                // Write vector into dst.
+                                // dst can be all kinds of memory-space.
+                                // In order for optmized global_store to be emitted by compiler,
+                                // need to assume:
+                                //   1. p_dst need to be block-invariant (assumption)
+                                //   2. dst_normal_offset must be calculatd at compile time (guaranteed)
+                                //   3. dst_merged_offset can be runtime value (no assumption imposed)
+                                *reinterpret_cast<dst_vector_t*>(
+                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+#else // inline asm using buffer_store
+                                // Write vector into dst.
+                                // dst's memory-space need to be global-memory (buffer_store is used).
+                                buffer_store<TData, DstDataPerAccess>(
+                                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
+#endif
+                            }).Else([&](auto) {
+                                // Write vector into dst.
+                                // dst can be all kinds of memory-space.
+                                *reinterpret_cast<dst_vector_t*>(
+                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
+                            });
+                        });
+                });
+        }
+    }
+
     // T can be Sequence or Array
     template <class T, bool PositiveDirection>
     __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
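Per its own comments, the new Run_amd_experiment splits every address into a part coming from "normal" dimensions, which can be folded at compile time, and a part coming from "merged" dimensions, which stays a run-time value, so that the backend can emit an optimized global_load/global_store (or the buffer_load/buffer_store inline asm below). A self-contained host-side sketch of that decomposition, with made-up sizes that have nothing to do with the real descriptors:

    // Illustration only: stage a row slice through a small buffer, splitting each
    // offset into a run-time "merged" part (row) and a constant-foldable "normal"
    // part (column within the slice), using 4-wide vector accesses.
    #include <array>
    #include <cstdio>
    #include <cstring>

    constexpr int RowStride = 64; // hypothetical leading dimension
    constexpr int Cols      = 8;  // slice width copied by one thread
    constexpr int VecWidth  = 4;  // elements per vector access

    void copy_row_slice(const float* src, float* dst, int runtime_row)
    {
        float buffer[Cols]; // stand-in for the register staging buffer

        const int merged_offset = runtime_row * RowStride;  // run-time part
        for(int c = 0; c < Cols; c += VecWidth)             // c is constant-foldable
            std::memcpy(&buffer[c], &src[merged_offset + c], VecWidth * sizeof(float));

        for(int c = 0; c < Cols; c += VecWidth)
            std::memcpy(&dst[c], &buffer[c], VecWidth * sizeof(float));
    }

    int main()
    {
        std::array<float, RowStride * 4> src{};
        std::array<float, Cols> dst{};
        for(int i = 0; i < RowStride * 4; ++i)
            src[i] = float(i);

        copy_row_slice(src.data(), dst.data(), 2);
        std::printf("dst[0] = %.0f\n", dst[0]); // prints 128 (row 2, column 0)
        return 0;
    }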
composable_kernel/include/utility/amd_inline_asm.hpp

@@ -8,6 +8,114 @@ namespace ck {
 // cast a pointer of LDS to its address
 extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);

+// buffer_load and buffer_store
+template <typename T, index_t VectorSize>
+__device__ typename vector_type<T, VectorSize>::MemoryType
+buffer_load(const T* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset);
+
+template <typename T, index_t VectorSize>
+__device__ void buffer_store(const typename vector_type<T, VectorSize>::MemoryType& src,
+                             T* p_dst_block,
+                             uint32_t dst_thread_offset,
+                             uint32_t dst_const_offset);
+
+template <>
+__device__ float
+buffer_load<float, 1>(const float* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset)
+{
+    float dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dword %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ vector_type<float, 2>::MemoryType
+buffer_load<float, 2>(const float* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset)
+{
+    vector_type<float, 2>::MemoryType dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dwordx2 %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ vector_type<float, 4>::MemoryType
+buffer_load<float, 4>(const float* p_src_block, uint32_t src_thread_offset, uint32_t src_const_offset)
+{
+    vector_type<float, 4>::MemoryType dst;
+
+    int32x4_t src_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&src_block_setting) = const_cast<float*>(p_src_block);
+    // fill in byte 2
+    reinterpret_cast<int*>(&src_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&src_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_load_dwordx4 %0, %1, %2, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 : "=v"(dst)
+                 : "v"(src_thread_offset), "s"(src_block_setting), "s"(src_const_offset));
+
+    return dst;
+}
+
+template <>
+__device__ void buffer_store<float, 1>(const float& src,
+                                       float* p_dst_block,
+                                       uint32_t dst_thread_offset,
+                                       uint32_t dst_const_offset)
+{
+    int32x4_t dst_block_setting{0};
+    // fill in byte 0 - 1
+    *reinterpret_cast<float**>(&dst_block_setting) = p_dst_block;
+    // fill in byte 2
+    reinterpret_cast<int*>(&dst_block_setting)[2] = -1;
+    // fill in byte 3
+    reinterpret_cast<int*>(&dst_block_setting)[3] = 0x00027000;
+
+    asm volatile("\n \
+    buffer_store_dword %1, %2, %0, %3 offen offset:0 \n \
+    s_waitcnt 0 \n \
+    "
+                 :
+                 : "s"(dst_block_setting), "v"(src), "v"(dst_thread_offset), "s"(dst_const_offset));
+}
+
 __device__ void vmcnt(index_t cnt)
 {
     if(cnt == 0)
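The four 32-bit values packed into src_block_setting/dst_block_setting form the 128-bit buffer resource descriptor that buffer_load/buffer_store take in a scalar-register quad: the code stores the raw buffer pointer in dwords 0-1, -1 in dword 2, and 0x00027000 in dword 3. Interpreting these positions as the base address, the num_records limit, and a format/configuration word follows the AMD GCN ISA documentation and is an assumption here, not something the commit itself states. A host-compilable sketch of that layout:

    // Illustrative layout of the buffer resource descriptor built inline above.
    // Field names follow the GCN ISA docs and are an assumption, not repo code.
    #include <cstdint>

    struct BufferResource
    {
        uint64_t base_address; // dwords 0-1: the commit writes the raw pointer here
        uint32_t num_records;  // dword 2  : set to -1 (0xFFFFFFFF), i.e. no range limit
        uint32_t config;       // dword 3  : 0x00027000, data-format/stride flags
    };

    static_assert(sizeof(BufferResource) == 16, "a buffer descriptor is 128 bits");

With the descriptor in scalar registers, the instruction adds the per-thread offset passed in a VGPR ("offen") and the scalar constant offset, which is why the wrappers take src_thread_offset and src_const_offset as separate arguments.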
composable_kernel/include/utility/array_helper.hpp

@@ -6,7 +6,7 @@
 namespace ck {

 template <index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
+__host__ __device__ void print_array(const char* s, Array<uint32_t, NSize> a)
 {
     constexpr index_t nsize = a.GetSize();

@@ -90,7 +90,7 @@ __host__ __device__ void print_array(const char* s, Array<unsigned_t, NSize> a)
 }

 template <index_t NSize>
-__host__ __device__ void print_array(const char* s, Array<signed_t, NSize> a)
+__host__ __device__ void print_array(const char* s, Array<int32_t, NSize> a)
 {
     constexpr index_t nsize = a.GetSize();
composable_kernel/include/utility/config_amd.hpp.in

@@ -5,6 +5,7 @@
 #include "hip/hip_fp16.h"

 #define CK_DEVICE_BACKEND_AMD 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0

@@ -13,13 +14,10 @@
 namespace ck {

-using unsigned_t = uint32_t;
-using signed_t   = int;
-
-#if 0 // debug
-using index_t = unsigned_t;
+#if CK_USE_UNSIGNED_INDEX_TYPE
+using index_t = uint32_t;
 #else
-using index_t = signed_t;
+using index_t = int32_t;
 #endif

 // For some reason, HIP compiler need this definition to generate optimal load and store

@@ -27,6 +25,8 @@
 typedef float float2_t __attribute__((ext_vector_type(2)));
 typedef float float4_t __attribute__((ext_vector_type(4)));

+typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
+
 template <class T>
 __device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
 {
composable_kernel/include/utility/config_nvidia.hpp.in

@@ -7,6 +7,7 @@
 #include "helper_cuda.h"

 #define CK_DEVICE_BACKEND_NVIDIA 1
+#define CK_USE_UNSIGNED_INDEX_TYPE 0
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0

@@ -15,13 +16,10 @@
 namespace ck {

-using unsigned_t = uint32_t;
-using signed_t   = int;
-
-#if 0 // debug
-using index_t = unsigned_t;
+#if CK_USE_UNSIGNED_INDEX_TYPE
+using index_t = uint32_t;
 #else
-using index_t = signed_t;
+using index_t = int32_t;
 #endif

 // For some reason, CUDA need this definition, otherwise
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -48,7 +48,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

 #if 1
-    // each thread hold 64 data
+    // BlockSize = 256, blockwise-GEMM 128x128, each thread hold 64 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;

@@ -82,10 +82,47 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #elif 0
-    // each thread hold 32 data
+    // BlockSize = 64, blockwise-GEMM 64x64, each thread hold 64 data
+    constexpr index_t BlockSize = 64;
+
+    constexpr index_t BPerBlock = 8;
+    constexpr index_t KPerBlock = 64;
+    constexpr index_t EPerBlock = 8;
+
+    constexpr index_t GemmNRepeat = 2;
+
+    constexpr index_t GemmMPerThreadSubC = 4;
+    constexpr index_t GemmNPerThreadSubC = 4;
+    constexpr index_t GemmMLevel0Cluster = 4;
+    constexpr index_t GemmNLevel0Cluster = 4;
+    constexpr index_t GemmMLevel1Cluster = 2;
+    constexpr index_t GemmNLevel1Cluster = 2;
+    constexpr index_t GemmKPerThreadLoop = 1;
+    constexpr index_t GemmDataPerReadA   = 4;
+    constexpr index_t GemmDataPerReadB   = 4;
+
+    using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 2, 1, 4>;
+    using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 1, 8, 1>;
+
+    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
+    using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]
+
+    constexpr index_t InBlockCopySrcDataPerRead_B   = 1;
+    constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4;
+
+    using WeiBlockCopySubLengths_E_K     = Sequence<4, 2>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 32>;
+
+    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
+    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]
+
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
+#elif 1
+    // BlockSize = 256, blockwise-GEMM 64x128, each thread hold 32 data
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 16;

@@ -107,7 +144,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using InBlockCopySubLengths_E_N1_B_N2     = Sequence<1, 1, 1, 4>;
     using InBlockCopyClusterLengths_E_N1_B_N2 = Sequence<8, 2, 16, 1>;

     using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
-    using InBlockCopySrcAccessOrder            = Sequence<0, 1, 3, 2>; // [E, N1, N2, B]
+    using InBlockCopySrcAccessOrder            = Sequence<0, 2, 1, 3>; // [E, B, N1, N2]
     using InBlockCopyDstAccessOrder            = Sequence<0, 1, 2, 3>; // [E, N1, B, N2]

     constexpr index_t InBlockCopySrcDataPerRead_B = 1;

@@ -119,7 +156,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif

@@ -133,8 +170,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

-    for(index_t i = 0; i < nrepeat; ++i)
-    {
         constexpr auto gridwise_conv =
 #if 0
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw

@@ -177,6 +212,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
             WeiBlockCopySrcDataPerRead_E,
             WeiBlockCopyDstDataPerWrite_K>{};

+    for(index_t i = 0; i < nrepeat; ++i)
+    {
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
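The rewritten comments make the register budget of each configuration branch explicit, and the numbers are consistent: the count of C values held by one thread is the blockwise GEMM tile size divided by BlockSize. A quick compile-time check (illustration only, constants taken from the branch comments):

    // Per-thread C-tile size = (M tile) x (N tile) / BlockSize.
    static_assert(128 * 128 / 256 == 64, "128x128 tile on 256 threads -> 64 values per thread");
    static_assert( 64 * 128 / 256 == 32, "64x128 tile on 256 threads -> 32 values per thread");
    static_assert( 64 *  64 /  64 == 64, "64x64 tile on 64 threads -> 64 values per thread");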
driver/src/driver.cpp

@@ -75,13 +75,13 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 0
-    constexpr index_t N  = 8;
-    constexpr index_t C  = 8;
-    constexpr index_t HI = 2;
-    constexpr index_t WI = 8;
+    constexpr index_t N  = 64;
+    constexpr index_t C  = 16;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
     constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
+    constexpr index_t Y  = 3;
+    constexpr index_t X  = 3;

     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
     // 3x3, 34x34
     constexpr index_t N  = 64;
     constexpr index_t C  = 256;
-    constexpr index_t HI = 32;
-    constexpr index_t WI = 32;
+    constexpr index_t HI = 34;
+    constexpr index_t WI = 34;
     constexpr index_t K  = 128;
     constexpr index_t Y  = 3;
     constexpr index_t X  = 3;

@@ -101,8 +101,8 @@ int main(int argc, char* argv[])
     using ConvStrides   = Sequence<1, 1>;
     using ConvDilations = Sequence<1, 1>;

-    using LeftPads  = Sequence<1, 1>;
-    using RightPads = Sequence<1, 1>;
+    using LeftPads  = Sequence<0, 0>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%

@@ -434,7 +434,7 @@ int main(int argc, char* argv[])
 #elif 0
     device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                          in_nchw,
                                                          wei_kcyx_desc,
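With the now-enabled 3x3, 34x34 branch and the padding switched from 1,1 to 0,0, the output stays 32x32: for stride 1 and dilation 1 the output extent is HO = HI + pad_left + pad_right - (Y - 1). A quick check of the new configuration (illustration only):

    // Enabled driver problem: N=64, C=256, HI=WI=34, K=128, Y=X=3,
    // stride 1, dilation 1, zero padding.
    constexpr int HI = 34, Y = 3, pad = 0, stride = 1, dilation = 1;
    constexpr int HO = (HI + 2 * pad - dilation * (Y - 1) - 1) / stride + 1;
    static_assert(HO == 32, "3x3 filter on a 34x34 unpadded input -> 32x32 output");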