gaoqiong / composable_kernel · Commits

Commit 9ba3b491
authored Jul 28, 2019 by Chao Liu
parent 8669e242

    adding implicit gemm v4r4

Showing 11 changed files with 1005 additions and 27 deletions (+1005 −27)
+1   −1    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp
+344 −0    composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+2   −10   composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp
+329 −0    composable_kernel/include/tensor_description/tensor_coordinate.hpp
+59  −0    composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+71  −0    composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+18  −2    composable_kernel/include/utility/Array.hpp
+2   −1    composable_kernel/include/utility/Sequence.hpp
+4   −4    driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp
+152 −0    driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
+23  −9    driver/src/driver.cpp
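For orientation: v4r4 lowers the NCHW forward convolution onto an implicit GEMM whose reduction dimension is E = C * Y * X and whose merged output dimension is B = N * Ho * Wo (see the "B = merge(N, H, W)" comment in the new gridwise header below). A standalone sketch of that bookkeeping, with hypothetical problem sizes chosen only for illustration (not values from this commit):

#include <cstdio>

// Standalone model of the v4r4 problem-size bookkeeping; all sizes below are
// hypothetical and chosen only for illustration.
int main()
{
    const int N = 64, C = 256, Hi = 8, Wi = 8;  // input NCHW
    const int K = 128, Y = 1, X = 1;            // weight KCYX, 1x1 filter
    const int Ho = Hi - Y + 1, Wo = Wi - X + 1; // stride 1, dilation 1, no pad

    const int E = C * Y * X;   // GEMM reduction dimension
    const int B = N * Ho * Wo; // merged GEMM output dimension

    // grid decomposition used by the gridwise kernel
    const int BPerBlock = 128, KPerBlock = 128;
    const int GridSize =
        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    std::printf("E %d, B %d, GridSize %d\n", E, B, GridSize); // E 256, B 4096, GridSize 32
    return 0;
}

Each block then owns one KPerBlock x BPerBlock tile of the output GEMM, which is exactly the grid decomposition the new driver header computes.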
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -332,7 +332,7 @@ struct GridwiseConvolutionImplicitGemm_v4r3_nchw_kcyx_nkhw_lds_double_buffer
         blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
                                                     p_wei_register_clipboard);

-#if 1
+#if 0
        if(get_block_1d_id() == 0)
        {
            printf("tid (%d %d), %f %f %f %f\n",
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (new file, 0 → 100644)

#ifndef CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP
#define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"

namespace ck {

// B = merge(N, H, W)
template <index_t GridSize,
          index_t BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          class ConvStrides,
          class ConvDilations,
          index_t BPerBlock,
          index_t KPerBlock,
          index_t EPerBlock,
          index_t GemmMPerThreadSubC,
          index_t GemmNPerThreadSubC,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          index_t GemmKPerThreadLoop,
          index_t GemmDataPerReadA,
          index_t GemmDataPerReadB,
          class InBlockCopySubLengths_E_B,
          class InBlockCopyClusterLengths_E_B,
          class InBlockCopyThreadClusterArrangeOrder,
          class InBlockCopySrcAccessOrder,
          class InBlockCopyDstAccessOrder,
          index_t InBlockCopyDataPerAccess_B,
          class WeiBlockCopySubLengths_E_K,
          class WeiBlockCopyClusterLengths_E_K,
          class WeiBlockCopyThreadClusterArrangeOrder,
          class WeiBlockCopySrcAccessOrder,
          class WeiBlockCopyDstAccessOrder,
          index_t WeiBlockCopySrcDataPerRead_E,
          index_t WeiBlockCopyDstDataPerWrite_K>
struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};
        constexpr auto I5 = Number<5>{};

        constexpr auto True = integral_constant<bool, true>{};

        constexpr auto in_n_c_h_w_global_desc  = InGlobalDesc{};
        constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
        constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{};

        constexpr index_t N = in_n_c_h_w_global_desc.GetLengths()[0];
        constexpr index_t C = in_n_c_h_w_global_desc.GetLengths()[1];

        constexpr index_t K  = out_n_k_h_w_global_desc.GetLengths()[1];
        constexpr index_t Ho = out_n_k_h_w_global_desc.GetLengths()[2];
        constexpr index_t Wo = out_n_k_h_w_global_desc.GetLengths()[3];

        constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
        constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];

        constexpr index_t ConvStrideH = ConvStrides{}[0];
        constexpr index_t ConvStrideW = ConvStrides{}[1];

        constexpr index_t ConvDilationH = ConvDilations{}[0];
        constexpr index_t ConvDilationW = ConvDilations{}[1];

        constexpr index_t E = C * Y * X;
        constexpr index_t B = N * Ho * Wo;

        static_assert((X == 1 || ConvDilationW % InBlockCopyDataPerAccess_B == 0),
                      "wrong! alignment requirement for vectorized global load of input tensor "
                      "will be violated");

        // divide block work by [K, B]
        static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t KBlockWork = K / KPerBlock;
        constexpr index_t BBlockWork = B / BPerBlock;

        constexpr auto block_work_desc =
            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});

        const auto block_work_multi_id =
            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());

        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;

        // input tensor
        //     tensor descriptor in device memory [N, Ho, Wo]
        constexpr auto in_n_ho_wo_global_desc =
            in_n_c_h_w_global_desc.Extract(I0, I2, I3)
                .StridedSlice(I1, Number<Ho>{}, Number<ConvStrideH>{})
                .StridedSlice(I2, Number<Wo>{}, Number<ConvStrideW>{});

        //     batch descriptor for device memory
        constexpr auto in_c_y_x_global_desc =
            in_n_c_h_w_global_desc.StridedSlice(I2, Number<Y>{}, Number<ConvDilationH>{})
                .StridedSlice(I3, Number<X>{}, Number<ConvDilationW>{})
                .Extract(Sequence<1, 2, 3>{});

        //     merged tensor descriptor in device memory [E, B], src of blockwise copy
        constexpr auto in_e_b_global_desc =
            make_ConstantMergedTensorDescriptor(in_c_y_x_global_desc.Embed(in_n_ho_wo_global_desc),
                                                Sequence<0, 1, 2>{},
                                                Sequence<3, 4, 5>{});

        //     memory layout descriptor in LDS [E, B], dst of blockwise copy
        //     be careful of LDS alignment
        constexpr auto in_e_b_block_desc =
            make_ConstantTensorDescriptor_packed(Sequence<EPerBlock, BPerBlock>{});

        //     input blockwise copy
        //     slice a merged tensor, reorder and copy to a normal tensor
        //     this copy operator already has blockwise offset built-in
        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
            BlockSize,
            Float,
            decltype(in_e_b_global_desc),
            decltype(in_e_b_block_desc),
            MergedTensorCoordinate<decltype(in_e_b_global_desc)>,
            NormalTensorCoordinate<decltype(in_e_b_block_desc)>,
            decltype(in_e_b_block_desc.GetLengths()),
            InBlockCopySubLengths_E_B,
            InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder>({0, b_block_data_on_global}, {0, 0});

        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
        constexpr auto wei_e_k_global_desc =
            wei_k_c_y_x_global_desc.Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});

        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment
        constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned(
            Sequence<EPerBlock, KPerBlock>{},
            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        //     slice a tensor, and copy it into another tensor
        //     this copy operator already has blockwise offset built-in
        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
            BlockSize,
            Float,
            decltype(wei_e_k_global_desc),
            decltype(wei_e_k_block_desc),
            MergedTensorCoordinate<decltype(wei_e_k_global_desc)>,
            NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
            decltype(wei_e_k_block_desc.GetLengths()),
            WeiBlockCopySubLengths_E_K,
            WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});

        // GEMM definition
        //   c_mtx += transpose(a_mtx) * b_mtx
        //     a_mtx[EPerBlock, KPerBlock] is in LDS
        //     b_mtx[EPerBlock, BPerBlock] is in LDS
        //     c_mtx[KPerBlock, BPerBlock] is distributed among threads, and saved in
        //       register
        constexpr auto a_e_k_block_mtx_desc =
            make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(wei_e_k_block_desc);

        constexpr auto b_e_b_block_mtx_desc =
            make_ConstantMatrixDescriptor_from_ConstantTensorDescriptor(in_e_b_block_desc);

        // sanity check
        static_assert(
            KPerBlock % (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster) == 0 &&
                BPerBlock % (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == 0,
            "wrong!");

        constexpr index_t GemmMRepeat =
            KPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster);

        constexpr index_t GemmNRepeat =
            BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster);

        // c_thread_mtx definition: this is a mess
        // TODO: more elegant way of defining c_thread_mtx
        constexpr auto c_k0k1_b0b1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
            Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<GemmNRepeat * GemmNPerThreadSubC>{});

        const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
            BlockSize,
            decltype(a_e_k_block_mtx_desc),
            decltype(b_e_b_block_mtx_desc),
            decltype(c_k0k1_b0b1_thread_mtx_desc),
            GemmMPerThreadSubC,
            GemmNPerThreadSubC,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmKPerThreadLoop,
            GemmDataPerReadA,
            GemmDataPerReadB>{};

        // LDS allocation for input and weight: be careful of alignment
        constexpr index_t max_align = math::lcm(InBlockCopyDataPerAccess_B,
                                                WeiBlockCopyDstDataPerWrite_K,
                                                GemmDataPerReadA,
                                                GemmDataPerReadB);

        constexpr index_t in_block_space =
            math::integer_least_multiple(in_e_b_block_desc.GetElementSpace(), max_align);

        constexpr index_t wei_block_space =
            math::integer_least_multiple(wei_e_k_block_desc.GetElementSpace(), max_align);

        __shared__ Float p_in_block[in_block_space];
        __shared__ Float p_wei_block[wei_block_space];

        // register allocation for output
        Float p_out_thread[c_k0k1_b0b1_thread_mtx_desc.GetElementSpace()];

        // zero out threadwise output
        threadwise_matrix_set_zero(c_k0k1_b0b1_thread_mtx_desc, p_out_thread);

        const Float* p_wei_block_on_global = p_wei_global;

        for(index_t e_block_data_begin = 0; e_block_data_begin < E;
            e_block_data_begin += EPerBlock)
        {
            blockwise_in_copy.Run(p_in_global, p_in_block);
            blockwise_wei_copy.Run(p_wei_global, p_wei_block);

            __syncthreads();

            blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread);

            __syncthreads();

            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
        }

        // copy output: register to global memory
        {
            constexpr index_t K1 = GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster;
            constexpr index_t B1 = GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster;

            // define tensor descriptor for threadwise copy
            //     output global descriptor, for calculating origin of thread tensor
            //     in global memory
            constexpr auto out_k_b_global_desc = make_ConstantMergedTensorDescriptor(
                out_n_k_h_w_global_desc, Sequence<1>{}, Sequence<0, 2, 3>{});

            // calculate origin of thread output tensor on global memory
            //     blockwise GEMM c matrix starting index
            const auto c_thread_mtx_on_block =
                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

            const index_t k_thread_data_on_global =
                k_block_data_on_global + c_thread_mtx_on_block.row;

            const index_t b_thread_data_on_global =
                b_block_data_on_global + c_thread_mtx_on_block.col;

#if 0
            // origin of dst in device memory
            Float* p_out_thread_on_global = p_out_global +
                out_k_b_global_desc.GetOffsetFromMultiIndex(
                    k_thread_data_on_global, b_thread_data_on_global);

            // dst descriptor
            constexpr auto out_k0_k1_b0_b1_global_desc =
                out_k_b_global_desc.Fold(I1, Number<B1>{}).Fold(I0, Number<K1>{});

            // src descriptor
            constexpr auto out_k0_k1_b0_b1_thread_desc = make_ConstantTensorDescriptor_packed(
                Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat, GemmNPerThreadSubC>{});

            const auto threadwise_out_copy =
                ThreadwiseGenericTensorSliceCopy_v2<Float,
                                                    decltype(out_k0_k1_b0_b1_thread_desc),
                                                    decltype(out_k0_k1_b0_b1_global_desc),
                                                    decltype(
                                                        out_k0_k1_b0_b1_thread_desc.GetLengths()),
                                                    arithmetic_sequence_gen<0, 4, 1>::type,
                                                    1,
                                                    1>({0, 0, 0, 0},
                                                       {k_thread_data_on_global / K1,
                                                        k_thread_data_on_global % K1,
                                                        b_thread_data_on_global / B1,
                                                        b_thread_data_on_global % B1});

            threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
#else
            // This is a hack, because slicing a merged dimension is not supported yet.
            // This should be replaced with the logic above, once support for slicing a
            // merged dimension becomes available.

            // dst descriptor
            constexpr auto out_k0_k1_b_global_desc = make_ConstantMergedTensorDescriptor(
                out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
                Sequence<1>{},
                Sequence<2>{},
                Sequence<0, 3, 4>{});

            // src descriptor
            constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
                Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});

            auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
                Float,
                decltype(out_k0_k1_b_thread_desc),
                decltype(out_k0_k1_b_global_desc),
                NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
                MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
                Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
                {0, 0, 0},
                {k_thread_data_on_global / K1,
                 k_thread_data_on_global % K1,
                 b_thread_data_on_global});

            for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
            {
                threadwise_out_copy.Run(p_out_thread, p_out_global);

                threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
                threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
            }
#endif
        }
    }
};

} // namespace ck
#endif
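A quick cross-check of the register-tile arithmetic above, mirroring the constants the new driver header selects; the expected values in the static_asserts are worked out by hand and are illustrative only:

// Standalone sketch: per-thread C-tile arithmetic for the default v4r4
// configuration (values mirror device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp).
constexpr int KPerBlock = 128, BPerBlock = 128;
constexpr int GemmMPerThreadSubC = 4, GemmNPerThreadSubC = 4;
constexpr int GemmMLevel0Cluster = 4, GemmNLevel0Cluster = 4;
constexpr int GemmMLevel1Cluster = 4, GemmNLevel1Cluster = 4;

constexpr int GemmMRepeat =
    KPerBlock / (GemmMPerThreadSubC * GemmMLevel0Cluster * GemmMLevel1Cluster); // 128/64 = 2
constexpr int GemmNRepeat =
    BPerBlock / (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster); // 128/64 = 2

// each thread accumulates an (MRepeat*MPerThreadSubC) x (NRepeat*NPerThreadSubC) tile
constexpr int CThreadTile =
    (GemmMRepeat * GemmMPerThreadSubC) * (GemmNRepeat * GemmNPerThreadSubC); // 8 x 8 = 64

static_assert(GemmMRepeat == 2 && GemmNRepeat == 2 && CThreadTile == 64,
              "64 accumulator registers per thread, 256 threads per block");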
composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp

@@ -49,17 +49,9 @@ struct ConstantTensorDescriptor
     __host__ __device__ static constexpr auto GetStrides() { return Strides{}; }

-    __host__ __device__ static constexpr auto GetLength(index_t IDim) { return Lengths{}[IDim]; }
+    template <class IDim>
+    __host__ __device__ static constexpr auto GetLength(IDim) { return Lengths::Get(IDim{}); }

-    __host__ __device__ static constexpr auto GetStride(index_t IDim) { return Strides{}[IDim]; }
+    template <class IDim>
+    __host__ __device__ static constexpr auto GetStride(IDim) { return Strides::Get(IDim{}); }

     struct lambda_AreDimensionsContinuous
     {
...
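The motivation for this signature change: subscripting Lengths{}[IDim] with a runtime index_t loses compile-time knowledge of the result, while keying the lookup on a Number<> type keeps it usable in constant expressions. A minimal model of the pattern, with simplified stand-ins for the repo's Number/Sequence (not the actual ck definitions):

// Minimal model of the Number<>-keyed accessor pattern adopted here.
template <int N>
struct Number
{
    static constexpr int value = N;
};

template <int... Is>
struct Sequence
{
    template <int I>
    static constexpr int Get(Number<I>)
    {
        constexpr int arr[] = {Is...}; // lookup table materialized at compile time
        return arr[I];
    }
};

template <class Lengths>
struct Desc
{
    template <class IDim>
    static constexpr auto GetLength(IDim)
    {
        return Lengths::Get(IDim{});
    }
};

// the length is a constant expression, so it can size arrays, feed static_assert, etc.
static_assert(Desc<Sequence<64, 16, 8, 8>>::GetLength(Number<2>{}) == 8, "compile-time lookup");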
composable_kernel/include/tensor_description/tensor_coordinate.hpp (new file, 0 → 100644)

#ifndef CK_TENSOR_COORDINATE_HPP
#define CK_TENSOR_COORDINATE_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

template <class TensorDesc>
struct NormalTensorCoordinate
{
    using type             = NormalTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mIndex{tensor_index}, mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }

    template <class... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate(Xs... xs)
        : NormalTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }

    __host__ __device__ constexpr Array<unsigned, nDim> GetIndex() const { return mIndex; }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    template <class IDim, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim, index_t step_size, integral_constant<bool, PositiveDirection>)
    {
        if(PositiveDirection)
        {
            mIndex(idim) += step_size;
            mOffset += step_size * tensor_desc_type::GetStride(idim);
        }
        else
        {
            mIndex(idim) -= step_size;
            mOffset -= step_size * tensor_desc_type::GetStride(idim);
        }
    }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
#if 0
        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
        static_assert(T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
        });

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
#if 0
        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
        static_assert(T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
        });

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    // private:
    Array<index_t, nDim> mIndex;
    index_t mOffset;
};

template <class TensorDesc>
struct MergedTensorCoordinate
{
    using type             = MergedTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mIndex{tensor_index},
          mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });

        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    template <class... Xs>
    __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
        : MergedTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }

    __host__ __device__ constexpr Array<index_t, nDim> GetIndex() const { return mIndex; }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    // step_size should be known at compile time
    template <class IDim, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim, index_t step_size, integral_constant<bool, PositiveDirection>)
    {
        constexpr auto idim = IDim{};

        // update multi-index
        if(PositiveDirection)
        {
            mIndex(idim) += step_size;
        }
        else
        {
            mIndex(idim) -= step_size;
        }

        // update rest
        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            const auto partial_original_step_sizes =
                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

            // update partial original multi-id
            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

            static_if<PositiveDirection>{}([&](auto) {
                partial_original_id += partial_original_step_sizes;

                bool carry = false;

                // do carry check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(carry)
                    {
                        ++partial_original_id(i);
                    }

                    carry = false;

                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) -= partial_original_desc.GetLength(i);
                        carry = true;
                    }
                });
            }).Else([&](auto) {
                // shift up multi-id to avoid unsigned integer underflow during intermediate
                // calculations. After the shift, should have new_multi_id[...] >= 1
                partial_original_id +=
                    partial_original_desc.GetLengths() - partial_original_step_sizes;

                bool borrow = false;

                // do borrow check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(borrow)
                    {
                        --partial_original_id(i);
                    }

                    borrow = false;

                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) += partial_original_desc.GetLength(i);
                        borrow = true;
                    }
                });

                // shift back down multi-id
                // here, should have new_multi_id[...] >= GetLengths()
                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
            });

            // update "mOriginalIndex"
            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
                constexpr auto idim_original = partial_original_dims[I];

                mOriginalIndex(idim_original) = partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_partial_offset = mPartialOffsets[idim];

            mPartialOffsets(idim) =
                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
        }).Else([&](auto) {
            constexpr auto idim_original =
                tensor_desc_type::GetContainedOriginalDimensions(idim).Front();

            static_if<PositiveDirection>{}([&](auto fwd) {
                mOriginalIndex(idim_original) += step_size;
                mPartialOffsets(idim) += step_size * fwd(tensor_desc_type{}).GetStride(idim);
                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
            }).Else([&](auto fwd) {
                mOriginalIndex(idim_original) -= step_size;
                mPartialOffsets(idim) -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
                mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim);
            });
        });
    }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
#if 0
        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
        static_assert(T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
        });

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
#if 0
        static_assert(is_same<typename T::data_type, index_t>, "wrong!");
#endif
        static_assert(T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
        });

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    // private:
    Array<index_t, nDim> mIndex;
    Array<index_t, nOriginalDim> mOriginalIndex;

    // mPartialOffsets is needed for unsigned index type
    Array<index_t, nDim> mPartialOffsets;

    index_t mOffset;
};

#if 0
// implementation of MergedTensorCoordinate, when index_t is signed integer
// mPartialOffsets is not needed, if index_t is signed integer type
template <>
struct TensorCoordinate<signed_t>
{
    private:
    Array<_t, nDim> mIndex;
    Array<_t, nOriginalDim> mOriginalIndex;
    index_t mOffset;
};
#endif

} // namespace ck
#endif
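The carry/borrow machinery above is the core of MergedTensorCoordinate: stepping along a merged dimension is a mixed-radix addition over its contained original dimensions. A host-side model of just the positive-direction branch, with hypothetical dimension lengths (the repo's static_for/static_if plumbing is replaced by plain loops):

#include <cassert>

// Host-side model of the positive-direction carry propagation in
// MergedTensorCoordinate::MoveOnDimension. lengths[] are the lengths of the
// original dimensions contained in one merged dimension (hypothetical values).
void move_merged_index(unsigned idx[3], const unsigned step[3], const unsigned lengths[3])
{
    for(int i = 0; i < 3; ++i)
        idx[i] += step[i];

    bool carry = false;
    // carry check in reversed order, starting from the lowest dimension
    for(int i = 2; i >= 0; --i)
    {
        if(carry)
            ++idx[i];
        carry = false;
        if(idx[i] >= lengths[i])
        {
            idx[i] -= lengths[i];
            carry = true;
        }
    }
}

int main()
{
    unsigned idx[3]           = {0, 2, 6};
    const unsigned lengths[3] = {4, 3, 8}; // e.g. a merged B = N * Ho * Wo
    const unsigned step[3]    = {0, 0, 3}; // advance the lowest original dimension by 3

    move_merged_index(idx, step, lengths);
    // 6 + 3 = 9 wraps to 1 with a carry; 2 + 1 = 3 wraps to 0 with a carry; 0 + 1 = 1
    assert(idx[0] == 1 && idx[1] == 0 && idx[2] == 1);
    return 0;
}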
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_coordinate.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
...
@@ -373,6 +374,64 @@ struct BlockwiseGenericTensorSliceCopy_v1
     }
 };

+template <index_t BlockSize,
+          class TData,
+          class SrcDesc,
+          class DstDesc,
+          class SrcCoordinate,
+          class DstCoordinate,
+          class SliceLengths,
+          class SubLengths,
+          class DataClusterLengths,
+          class ThreadClusterArrangeOrder>
+struct BlockwiseGenericTensorSliceCopy_v2
+{
+    using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                                               SrcDesc,
+                                                               DstDesc,
+                                                               SrcCoordinate,
+                                                               DstCoordinate,
+                                                               SubLengths>;
+
+    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
+                                                            DstCoordinate dst_block_slice_origin)
+    {
+        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
+            DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        const auto thread_cluster_multi_id =
+            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
+
+        const auto data_cluster_multi_id =
+            reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{});
+
+        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};
+
+        mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
+        mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
+    }
+
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        mThreadwiseCopy.Run(p_src, p_dst);
+    }
+
+    __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
+    {
+        mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction);
+    }
+
+    __device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
+    {
+        mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction);
+    }
+
+    // private:
+    ThreadwiseCopy mThreadwiseCopy;
+};
+
 } // namespace ck
 #endif
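The constructor above decides which slice of the block tile each thread copies: the 1d thread id is decomposed on the cluster lengths reordered into ThreadClusterArrangeOrder, mapped back to data order, then scaled by the per-thread SubLengths. A host-side model for the weight-copy configuration in the new driver header (the reorder conventions here are my reading of the code):

#include <cstdio>

int main()
{
    // Model of BlockwiseGenericTensorSliceCopy_v2's thread decomposition for the
    // weight copy in the new driver header (BlockSize 256):
    //   SubLengths_E_K = {4, 1}, ClusterLengths_E_K = {2, 128}, arrange order {1, 0} ([K, E])
    const int sub[2]     = {4, 1};
    const int cluster[2] = {2, 128};
    const int order[2]   = {1, 0};

    // cluster lengths reordered into arrange order: {cluster[1], cluster[0]} = {128, 2}
    const int arranged[2] = {cluster[order[0]], cluster[order[1]]};

    for(int tid = 0; tid < arranged[0] * arranged[1]; tid += 85) // sample a few thread ids
    {
        // packed decomposition of tid on the arranged cluster (last dimension fastest)
        const int a0 = tid / arranged[1];
        const int a1 = tid % arranged[1];

        // map back to [E, K] data order: data[order[i]] = arranged coordinate i
        int data[2];
        data[order[0]] = a0; // K cluster coordinate
        data[order[1]] = a1; // E cluster coordinate

        // scale by per-thread sub-lengths to get this thread's slice origin
        std::printf("tid %3d -> slice origin [E %d, K %3d]\n",
                    tid, data[0] * sub[0], data[1] * sub[1]);
    }
    return 0;
}

Together the 2 x 128 thread cluster and the 4 x 1 sub-lengths tile the whole EPerBlock x KPerBlock = 8 x 128 block slice exactly once.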
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_coordinate.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
...
@@ -105,5 +106,75 @@ __device__ void threadwise_generic_tensor_slice_copy_v1(
 #endif
 }

+template <class TData,
+          class SrcDesc,
+          class DstDesc,
+          class SrcCoordinate,
+          class DstCoordinate,
+          class SliceLengths>
+struct ThreadwiseGenericTensorSliceCopy_v2
+{
+    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2()
+        : mSrcSliceOrigin(make_zero_array<index_t, nDim>()),
+          mDstSliceOrigin(make_zero_array<index_t, nDim>())
+    {
+    }
+
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v2(SrcCoordinate src_slice_origin,
+                                                             DstCoordinate dst_slice_origin)
+        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
+    {
+    }
+
+    __device__ void SetSrcSliceOrigin(SrcCoordinate src_slice_origin)
+    {
+        mSrcSliceOrigin = src_slice_origin;
+    }
+
+    __device__ void SetDstSliceOrigin(DstCoordinate dst_slice_origin)
+    {
+        mDstSliceOrigin = dst_slice_origin;
+    }
+
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        static_ford<SliceLengths>{}([&](auto data_id) {
+            p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
+                p_src[(mSrcSliceOrigin + data_id).GetOffset()];
+        });
+    }
+
+    __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
+    {
+        if(positive_direction)
+        {
+            mSrcSliceOrigin += step_sizes;
+        }
+        else
+        {
+            mSrcSliceOrigin -= step_sizes;
+        }
+    }
+
+    __device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
+    {
+        if(positive_direction)
+        {
+            mDstSliceOrigin += step_sizes;
+        }
+        else
+        {
+            mDstSliceOrigin -= step_sizes;
+        }
+    }
+
+    // private:
+    SrcCoordinate mSrcSliceOrigin;
+    DstCoordinate mDstSliceOrigin;
+};
+
 } // namespace ck
 #endif
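Run() above is a fully unrolled elementwise copy: static_ford visits every multi-index in SliceLengths, and adding the multi-index to a coordinate yields that side's linear offset under its own descriptor. The same semantics as plain nested loops, with hypothetical shapes and strides:

#include <cassert>

// Host-side model of ThreadwiseGenericTensorSliceCopy_v2::Run for a 2d slice:
// each side computes its own linear offset from its own strides
// (the descriptor shapes below are hypothetical).
int main()
{
    float src[8 * 16] = {};
    float dst[4 * 8]  = {};

    const int slice[2]      = {2, 4};  // SliceLengths
    const int src_stride[2] = {16, 1}; // src descriptor: 8x16 row-major
    const int dst_stride[2] = {8, 1};  // dst descriptor: 4x8 row-major
    const int src_org[2]    = {3, 5};  // mSrcSliceOrigin
    const int dst_org[2]    = {1, 2};  // mDstSliceOrigin

    for(int i = 0; i < 8 * 16; ++i)
        src[i] = float(i);

    for(int i0 = 0; i0 < slice[0]; ++i0)
        for(int i1 = 0; i1 < slice[1]; ++i1)
            dst[(dst_org[0] + i0) * dst_stride[0] + (dst_org[1] + i1) * dst_stride[1]] =
                src[(src_org[0] + i0) * src_stride[0] + (src_org[1] + i1) * src_stride[1]];

    assert(dst[1 * 8 + 2] == 3 * 16 + 5); // first copied element landed where expected
    return 0;
}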
composable_kernel/include/utility/Array.hpp

@@ -9,7 +9,8 @@ namespace ck {
 template <class TData, index_t NSize>
 struct Array
 {
     using Type = Array<TData, NSize>;
+    using data_type = TData;

     static constexpr index_t nSize = NSize;
...
@@ -20,7 +21,7 @@ struct Array
     {
     }

-    __host__ __device__ constexpr index_t GetSize() const { return NSize; }
+    __host__ __device__ static constexpr index_t GetSize() { return NSize; }

     template <index_t I>
     __host__ __device__ constexpr TData operator[](Number<I>) const
...
@@ -208,6 +209,21 @@ __host__ __device__ constexpr auto operator-(Array<TData, NSize> a, Array<TData,
     return result;
 }

+// Array += Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator+=(Array<TData, NSize>& a, Array<TData, NSize> b)
+{
+    a = a + b;
+    return a;
+}
+
+// Array -= Array
+template <class TData, index_t NSize>
+__host__ __device__ constexpr auto operator-=(Array<TData, NSize>& a, Array<TData, NSize> b)
+{
+    a = a - b;
+    return a;
+}
+
 // Array = Array + Sequence
 template <class TData, index_t NSize, index_t... Is>
 __host__ __device__ constexpr auto operator+(Array<TData, NSize> a, Sequence<Is...> b)
...
composable_kernel/include/utility/Sequence.hpp

@@ -12,7 +12,8 @@ struct is_valid_sequence_map;
 template <index_t... Is>
 struct Sequence
 {
     using Type = Sequence;
+    using data_type = index_t;

     static constexpr index_t mSize = sizeof...(Is);
...
driver/include/device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp

@@ -90,14 +90,14 @@ void device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(InDesc,
     constexpr index_t InBlockCopyDataPerAccess_W2 = 4;

-    using WeiBlockCopySubLengths_E_K     = Sequence<2, 2>;
-    using WeiBlockCopyClusterLengths_E_K = Sequence<4, 64>;
+    using WeiBlockCopySubLengths_E_K     = Sequence<4, 1>;
+    using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>;

     using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
     using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
     using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

-    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
-    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 2;
+    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
+    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
 #endif

     constexpr index_t N0 = N / (N1 * N2);
...
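A plausible reading of this retune (my inference; the commit does not say): giving each thread 4 consecutive E elements (SubLengths<4, 1> with ClusterLengths<2, 128>) is what allows WeiBlockCopySrcDataPerRead_E to rise from 1 to 4, since a thread's contiguous run along E must be a multiple of the vector-read width. A tiny checker expressing that constraint (hypothetical helper, not repo code):

// Per-thread slice is sub_e x sub_k; a vector load of width data_per_read_e along E
// is only legal when the thread's E run is a multiple of that width.
constexpr bool can_vector_read_e(int sub_e, int data_per_read_e)
{
    return sub_e % data_per_read_e == 0;
}

static_assert(can_vector_read_e(4, 4), "SubLengths<4, 1> admits 4-wide reads along E");
static_assert(!can_vector_read_e(2, 4), "the old SubLengths<2, 2> did not");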
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (new file, 0 → 100644)

#pragma once
#include <unistd.h>
#include "device.hpp"
#include "tensor.hpp"
#include "gridwise_convolution_kernel_wrapper.hpp"
#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

using namespace ck;

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations>
void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
                                                          const Tensor<T>& in_nchw,
                                                          WeiDesc,
                                                          const Tensor<T>& wei_kcyx,
                                                          OutDesc,
                                                          Tensor<T>& out_nkhw,
                                                          ConvStrides,
                                                          ConvDilations,
                                                          index_t nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcyx_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr index_t Hi = in_nchw_desc.GetLength(I2);
    constexpr index_t Wi = in_nchw_desc.GetLength(I3);

    constexpr index_t N  = out_nkhw_desc.GetLength(I0);
    constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
    constexpr index_t Wo = out_nkhw_desc.GetLength(I3);

    constexpr index_t K = wei_kcyx_desc.GetLength(I0);
    constexpr index_t C = wei_kcyx_desc.GetLength(I1);
    constexpr index_t Y = wei_kcyx_desc.GetLength(I2);
    constexpr index_t X = wei_kcyx_desc.GetLength(I3);

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

#if 1
    // 1x1 filter, 8x8 image
    constexpr index_t BlockSize = 256;

    constexpr index_t BPerBlock = 128;
    constexpr index_t KPerBlock = 128;
    constexpr index_t EPerBlock = 8;

    constexpr index_t GemmMPerThreadSubC = 4;
    constexpr index_t GemmNPerThreadSubC = 4;
    constexpr index_t GemmMLevel0Cluster = 4;
    constexpr index_t GemmNLevel0Cluster = 4;
    constexpr index_t GemmMLevel1Cluster = 4;
    constexpr index_t GemmNLevel1Cluster = 4;
    constexpr index_t GemmKPerThreadLoop = 1;
    constexpr index_t GemmDataPerReadA   = 4;
    constexpr index_t GemmDataPerReadB   = 4;

    using InBlockCopySubLengths_E_B            = Sequence<4, 1>;
    using InBlockCopyClusterLengths_E_B        = Sequence<2, 128>;
    using InBlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [E, B]
    using InBlockCopySrcAccessOrder            = Sequence<0, 1>; // [E, B]
    using InBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, B]

    constexpr index_t InBlockCopyDataPerAccess_B = 1;

    using WeiBlockCopySubLengths_E_K            = Sequence<4, 1>;
    using WeiBlockCopyClusterLengths_E_K        = Sequence<2, 128>;
    using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E]
    using WeiBlockCopySrcAccessOrder            = Sequence<1, 0>; // [K, E]
    using WeiBlockCopyDstAccessOrder            = Sequence<0, 1>; // [E, K]

    constexpr index_t WeiBlockCopySrcDataPerRead_E  = 1;
    constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
#endif

    constexpr index_t B = N * Ho * Wo;

    constexpr index_t GridSize =
        ((B + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv = GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw<
            GridSize,
            BlockSize,
            T,
            decltype(in_nchw_desc),
            decltype(wei_kcyx_desc),
            decltype(out_nkhw_desc),
            ConvStrides,
            ConvDilations,
            BPerBlock,
            KPerBlock,
            EPerBlock,
            GemmMPerThreadSubC,
            GemmNPerThreadSubC,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
            GemmKPerThreadLoop,
            GemmDataPerReadA,
            GemmDataPerReadB,
            InBlockCopySubLengths_E_B,
            InBlockCopyClusterLengths_E_B,
            InBlockCopyThreadClusterArrangeOrder,
            InBlockCopySrcAccessOrder,
            InBlockCopyDstAccessOrder,
            InBlockCopyDataPerAccess_B,
            WeiBlockCopySubLengths_E_K,
            WeiBlockCopyClusterLengths_E_K,
            WeiBlockCopyThreadClusterArrangeOrder,
            WeiBlockCopySrcAccessOrder,
            WeiBlockCopyDstAccessOrder,
            WeiBlockCopySrcDataPerRead_E,
            WeiBlockCopyDstDataPerWrite_K>{};

        float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                   dim3(GridSize),
                                   dim3(BlockSize),
                                   0,
                                   static_cast<T*>(in_nchw_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                                   static_cast<T*>(out_nkhw_device_buf.GetDeviceBuffer()));

        printf("Elapsed time : %f ms, %f TFlop/s\n",
               time,
               (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                   (std::size_t(1000) * 1000 * 1000) / time);

        usleep(std::min(time * 1000, float(10000)));
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
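One subtlety in the elapsed-time printf above: the expression divides by only 10^9 yet the label says TFlop/s. The units work out because flops / 10^9 is GFlop, time is in milliseconds, and GFlop/ms equals TFlop/s. A quick check with hypothetical numbers:

#include <cassert>
#include <cstddef>

// Unit check for the driver's TFlop/s print: flops / 1e9 yields GFlop, and
// GFlop per millisecond equals TFlop per second. Numbers are hypothetical.
int main()
{
    const std::size_t flops = std::size_t(2) * 1000 * 1000 * 1000; // 2 GFlop of conv work
    const float time_ms     = 4.0f;                                // elapsed kernel time

    const float tflops = (float)flops / (std::size_t(1000) * 1000 * 1000) / time_ms;
    assert(tflops == 0.5f); // 2 GFlop in 4 ms is 0.5 TFlop/s
    return 0;
}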
driver/src/driver.cpp

@@ -16,6 +16,7 @@
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"

 struct GeneratorTensor_1
 {
...
@@ -71,13 +72,16 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 0
-    constexpr index_t N  = 8;
+    constexpr index_t N  = 2;
     constexpr index_t C  = 16;
-    constexpr index_t HI = 3;
-    constexpr index_t WI = 18;
+    constexpr index_t HI = 8;
+    constexpr index_t WI = 8;
     constexpr index_t K  = 128;
-    constexpr index_t Y  = 3;
-    constexpr index_t X  = 3;
+    constexpr index_t Y  = 1;
+    constexpr index_t X  = 1;
+
+    using ConvStrides   = Sequence<1, 1>;
+    using ConvDilations = Sequence<1, 1>;

     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
...
@@ -249,7 +253,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
...
@@ -265,7 +269,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 77%, ck@V100 76%, ck@P100 79%, ck@VII 51%
     constexpr index_t N = 128;
...
@@ -491,7 +495,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
         wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
...
@@ -548,7 +552,7 @@ int main(int argc, char* argv[])
                                                              ConvStrides{},
                                                              ConvDilations{},
                                                              nrepeat);
-#elif 1
+#elif 0
         device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw(in_nchw_desc,
                                                              in_nchw,
                                                              wei_kcyx_desc,
...
@@ -558,6 +562,16 @@ int main(int argc, char* argv[])
                                                              ConvStrides{},
                                                              ConvDilations{},
                                                              nrepeat);
+#elif 1
+        device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
+                                                             in_nchw,
+                                                             wei_kcyx_desc,
+                                                             wei_kcyx,
+                                                             out_nkhw_desc,
+                                                             out_nkhw_device,
+                                                             ConvStrides{},
+                                                             ConvDilations{},
+                                                             nrepeat);
 #elif 0
         device_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(in_nchw_desc,
                                                                  in_nchw,
...