yangql / composable_kernel-1

Commit 39d92e7d, authored Sep 25, 2019 by Chao Liu
Parent: 012b5253

removing old implementation of tensor descriptor
Showing 11 changed files with 128 additions and 64 deletions (+128 -64):
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp  (+28 -38)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp  (+1 -1)
composable_kernel/include/tensor_description/multi_index_transform.hpp  (+1 -2)
composable_kernel/include/tensor_description/tensor_coordinate.hpp  (+2 -2)
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  (+5 -4)
composable_kernel/include/tensor_description/tensor_descriptor.hpp  (+1 -1)
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp  (+75 -0)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+4 -4)
composable_kernel/include/utility/config_amd.hpp.in  (+8 -3)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp  (+1 -7)
driver/src/driver.cpp  (+2 -2)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -2,7 +2,6 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
@@ -73,12 +72,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr auto True = integral_constant<bool, true>{};

-        constexpr auto in_n_c_hi_wi_global_desc =
-            make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides());
-        constexpr auto wei_k_c_y_x_global_desc =
-            make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides());
-        constexpr auto out_n_k_ho_wo_global_desc =
-            make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides());
+        constexpr auto in_n_c_hi_wi_global_desc  = InGlobalDesc{};
+        constexpr auto wei_k_c_y_x_global_desc   = WeiGlobalDesc{};
+        constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};

         constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
         constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
...
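Note: the global descriptors above are native (strided) descriptors, fully determined by a set of lengths and strides. A minimal stand-alone sketch of the offset arithmetic such a descriptor encodes, with hypothetical names and made-up sizes (not part of the commit):

    #include <array>
    #include <cstddef>

    // offset of element (n, c, h, w) in a strided 4-d tensor:
    // offset = n*stride[0] + c*stride[1] + h*stride[2] + w*stride[3]
    constexpr std::size_t offset_4d(const std::array<std::size_t, 4>& stride,
                                    std::size_t n, std::size_t c, std::size_t h, std::size_t w)
    {
        return n * stride[0] + c * stride[1] + h * stride[2] + w * stride[3];
    }

    // a packed NCHW tensor of lengths {N, 16, 28, 28} has strides {16*28*28, 28*28, 28, 1}
    static_assert(offset_4d({16 * 28 * 28, 28 * 28, 28, 1}, 1, 2, 0, 3) == 14115, "");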
@@ -119,11 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t KBlockWork = K / KPerBlock;
         constexpr index_t BBlockWork = B / BPerBlock;

-        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
+        constexpr auto block_work_desc = transform_tensor_descriptor(
+            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
+            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
+            make_tuple(Sequence<0, 1>{}),
+            make_tuple(Sequence<0>{}));

-        const auto block_work_multi_id =
-            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
+        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());

         const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
         const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
...
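The new block_work_desc merges the (KBlockWork, BBlockWork) grid into a single dimension, so CalculateLowerIndex maps a flat block id back to a (k, b) pair. A rough stand-alone sketch of that index math, using hypothetical names rather than the library's API:

    #include <cstdint>
    #include <utility>

    // decompose a flat block id over a KBlockWork x BBlockWork grid, row-major in b
    constexpr std::pair<std::int32_t, std::int32_t> block_id_to_kb(std::int32_t block_id,
                                                                   std::int32_t b_block_work)
    {
        return {block_id / b_block_work, block_id % b_block_work};
    }

    // e.g. BBlockWork = 8: block 13 works on the tile at (k_block, b_block) = (1, 5)
    static_assert(block_id_to_kb(13, 8).first == 1 && block_id_to_kb(13, 8).second == 5, "");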
@@ -139,7 +137,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
             in_n_c_hip_wip_global_desc,
-            make_tuple(Unmerge<Sequence<N0, N1, N2>>{},
+            make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
                        PassThrough<C>{},
                        Embed<Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                        Embed<Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
...
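The Embed transforms above encode the usual implicit-GEMM input indexing: a padded input row is reconstructed from a filter tap and an output position. A small illustrative calculation (the function below is a hypothetical stand-in, not part of the kernel):

    // hi = y * ConvDilationH + ho * ConvStrideH (+ constant 0), and likewise for wi
    constexpr int embedded_input_row(int y, int ho, int dilation_h, int stride_h)
    {
        return y * dilation_h + ho * stride_h;
    }

    // 3x3 filter, stride 2, dilation 1: filter tap y = 2 at output row ho = 5 reads input row 12
    static_assert(embedded_input_row(2, 5, 1, 2) == 12, "");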
@@ -185,21 +183,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 InBlockCopyDstDataPerWrite_N2>(
                 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

-#if 0
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
-        constexpr auto wei_e_k_global_desc =
-            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
-                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
-                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-#else
-        // hack
-        constexpr auto wei_e_k_global_desc_old =
-            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
-        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
-            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
-#endif
+        constexpr auto wei_k_e_global_desc =
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
+
+        constexpr auto wei_e_k_global_desc =
+            reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{});

         //     tensor descriptor in LDS, dst of blockwise copy
         //     be careful of LDS alignment
...
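Here the K x C x Y x X weight descriptor is unfolded into K x E (with E = C*Y*X) and then reordered to E x K, the matrix shape the blockwise GEMM consumes. A sketch of the resulting lengths for a packed weight tensor, with illustrative numbers and a hypothetical helper name (not from the commit):

    #include <array>

    // packed KxCxYxX strides are {C*Y*X, Y*X, X, 1}; unfolding dims 1..3 keeps the
    // innermost stride, giving a K x E view with strides {C*Y*X, 1}
    constexpr std::array<int, 2> unfolded_k_e_lengths(int K, int C, int Y, int X)
    {
        return {K, C * Y * X};
    }

    // K = 256, C = 64, Y = X = 3  ->  a 256 x 576 view; the reorder then swaps the two
    // dimensions so the descriptor reads as E x K = 576 x 256
    constexpr std::array<int, 2> k_e_example = unfolded_k_e_lengths(256, 64, 3, 3);
    static_assert(k_e_example[1] == 576, "");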
@@ -340,10 +330,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                 blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

                 // LDS double buffer: store next data to LDS
-                blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                    p_in_register_buffer, p_in_block_next);
-                blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                    p_wei_register_buffer, p_wei_block_next);
+                blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+                blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
             }
         }
...
@@ -368,10 +356,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_in_register_buffer, p_in_block_double + in_block_space);
-            blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_wei_register_buffer, p_wei_block_double + wei_block_space);
+            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
+                                                     p_in_block_double + in_block_space);
+            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
+                                                      p_wei_block_double + wei_block_space);

             // odd iteration
             __syncthreads();
...
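For context, the surrounding loop implements LDS double buffering: while the GEMM consumes one half of the buffer, the copies refill the other half, and the roles swap every iteration. A host-side schematic of that alternation, purely illustrative (the real kernel also synchronizes with __syncthreads between phases):

    // ping-pong over two halves of a buffer: compute on one half while filling the other
    template <typename ComputeFn, typename LoadFn>
    void double_buffered_loop(float* buf, int half_size, int num_iter, ComputeFn compute, LoadFn load_next)
    {
        for(int i = 0; i < num_iter; ++i)
        {
            float* now  = buf + (i % 2) * half_size;       // half being consumed this iteration
            float* next = buf + ((i + 1) % 2) * half_size; // half being refilled for the next one

            load_next(next, i + 1);
            compute(now);
        }
    }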
@@ -393,12 +381,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
             Sequence<GemmMRepeat, GemmMPerThreadSubC, N1, 1, N2>{});

         //     output memory layout descriptor in device memory
-        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc_old =
-            OutGlobalDesc::Fold(I1, Number<K1>{}).Fold(I0, Number<N1>{}, Number<N2>{});
-
-        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = make_native_tensor_descriptor(
-            out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetLengths(),
-            out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetStrides());
+        constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor(
+            out_n_k_ho_wo_global_desc,
+            make_tuple(UnMerge<Sequence<N0, N1, N2>>{},
+                       UnMerge<Sequence<K0, K1>>{},
+                       PassThrough<Ho>{},
+                       PassThrough<Wo>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{}));

         //     output merged global tensor descriptor, dst of threadwise copy
         constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor(
...
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -384,7 +384,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor(
             out_k_b_global_desc,
-            make_tuple(Unmerge<Sequence<K0, K1>>{}, Unmerge<Sequence<B0, B1>>{}),
+            make_tuple(UnMerge<Sequence<K0, K1>>{}, UnMerge<Sequence<B0, B1>>{}),
             make_tuple(Sequence<0>{}, Sequence<1>{}),
             make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));
...
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -252,7 +252,6 @@ struct Merge
         });

         // highest dimension, no out-of-bound check
         if(borrow)
         {
             --idx_low_new(0);
...
@@ -273,7 +272,7 @@ struct Merge
 // UpperLengths: Sequence<...>
 template <typename UpperLengths>
-struct Unmerge
+struct UnMerge
 {
     static constexpr index_t nDimLow = 1;
     static constexpr index_t nDimUp  = UpperLengths::Size();
...
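UnMerge is the inverse of Merge: it exposes one lower dimension as several upper dimensions, so the lower index is recovered from the upper multi-index by the usual mixed-radix arithmetic. A small illustrative calculation (hypothetical function, not the struct's interface):

    // lower = (i0 * N1 + i1) * N2 + i2 for an UnMerge over Sequence<N0, N1, N2>
    constexpr int unmerge_lower_index(int i0, int i1, int i2, int N1, int N2)
    {
        return (i0 * N1 + i1) * N2 + i2;
    }

    // N1 = 2, N2 = 4: upper index (3, 1, 2) corresponds to lower index 30
    static_assert(unmerge_lower_index(3, 1, 2, 2, 4) == 30, "");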
composable_kernel/include/tensor_description/tensor_coordinate.hpp
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp

@@ -81,7 +81,7 @@ struct NativeTensorCoordinate
     __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

     private:
-    // mIndex may be saved and update, however, the value of some (or all) of its entries may
+    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
     // never be used. Compiler should be able to remove these entries as well as its calculation
     // as dead code.
     // TODO: make sure compiler indeed remove these dead code
...
@@ -178,7 +178,8 @@ struct TransformedTensorCoordinate
     }

     private:
-    // mIndexUp may be calculated and update, however, the value of some (or all) of its entries may
+    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
+    // may
     // never be used. Compiler should be able to remove these entries as well as its calculation
     // as dead code.
     // TODO: make sure compiler indeed remove these dead code
...
composable_kernel/include/tensor_description/tensor_descriptor.hpp

@@ -299,7 +299,7 @@ struct TransformedTensorDescriptor
         return GetLowerTensorDescriptor().GetElementSpace();
     }

-    // TODO: right now return value is constexpr because use of non-constepxr lambda
+    // TODO: right now return value is not constexpr because use of non-constexpr lambda
     __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
     {
         LowerIndex idx_low;
...
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -96,6 +96,81 @@ __host__ __device__ constexpr auto
         LowerTensorDescriptor{}, typename sequence_map_inverse<MapUpper2Lower>::type{});
 }

+template <typename Lengths, typename Strides>
+__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides)
+{
+    static_assert(Lengths::Size() == Strides::Size(), "wrong!");
+
+    bool flag = true;
+
+    for(index_t i = 0; i < Lengths::Size() - 1; ++i)
+    {
+        flag = flag && Strides::At(i) == Strides::At(i + 1) * Lengths::At(i + 1);
+    }
+
+    return flag;
+}
+
+// unfold only supports NativeTensorDescriptor, for now
+template <index_t FirstUnfoldDim, index_t LastUnfoldDim, typename... Ts>
+__host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor<Ts...> desc,
+                                                            Number<FirstUnfoldDim>,
+                                                            Number<LastUnfoldDim>)
+{
+    constexpr index_t nDim = desc.GetNumOfDimension();
+
+    static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim && FirstUnfoldDim <= LastUnfoldDim,
+                  "wrong! should have FirstUnfoldDim <= LastUnfoldDim!");
+
+    // left and right
+    constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{};
+    constexpr auto middle =
+        typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
+    constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};
+
+    // sanity-check if unfoldable
+    static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
+                  "wrong! not unfoldable");
+
+    // unfolded length, stride
+    constexpr index_t unfold_length =
+        reduce_on_sequence(desc.GetLengths(middle), math::multiplies<index_t>{}, Number<1>{});
+
+    constexpr index_t unfold_stride = desc.GetStride(Number<LastUnfoldDim>{});
+
+    // new lengths, strides
+    constexpr auto new_lengths =
+        desc.GetLengths(left).PushBack(Number<unfold_length>{}).PushBack(desc.GetLengths(right));
+
+    constexpr auto new_strides =
+        desc.GetStrides(left).PushBack(Number<unfold_stride>{}).PushBack(desc.GetStrides(right));
+
+    return make_native_tensor_descriptor(new_lengths, new_strides);
+}
+
+#if 0
+template <typename LowerTensorDescriptor,
+          typename PadDimensionIds,
+          typename LeftPads,
+          typename RightPads>
+__host__ __device__ constexpr auto
+pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads)
+{
+    constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension();
+
+    constexpr auto non_pad_low_dim_ids = xxx;
+
+    return transform_tensor_descriptor(
+        LowerTensorDescriptor{},
+        make_tuple(Pad<decltype(LowerTensorDescriptor::GetLengths(PadLowerDimensionIds{})),
+                       LeftPads,
+                       RightPads>{})
+            .PushBack(PassThrough<xxxx>...),
+        make_tuple(PadLowerDimensionIds{}).PushBack(xxxx),
+        sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}));
+}
+#endif
+
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
...
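The unfoldability test added above requires each pair of neighbouring dimensions in the folded range to be contiguous in memory, i.e. stride[i] == stride[i+1] * length[i+1]. The same condition checked on concrete numbers (stand-alone sketch, not part of the commit):

    constexpr bool contiguous_pair(int stride_i, int stride_next, int length_next)
    {
        return stride_i == stride_next * length_next;
    }

    // a packed 64 x 3 x 3 block with strides {9, 3, 1} can be unfolded into one length-576 dim
    static_assert(contiguous_pair(9, 3, 3) && contiguous_pair(3, 1, 3), "");
    // a padded layout with strides {12, 3, 1} cannot (12 != 3 * 3)
    static_assert(!contiguous_pair(12, 3, 3), "");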
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -738,12 +738,12 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
     {
 #if 1
-        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::vgpr>(
+        mThreadwiseLoad.template Run_generic<TData, SrcAddressSpace, address_space_t::generic>(
             p_src, p_buffer);
 #else
-        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData, SrcAddressSpace, address_space_t::vgpr>(
+        mThreadwiseLoad.template Run_optimized_src_address_calculation<TData, SrcAddressSpace, address_space_t::generic>(
             p_src, p_buffer);
 #endif
     }
...
@@ -752,11 +752,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
     __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
     {
 #if 1
-        mThreadwiseStore.template Run_generic<TData, address_space_t::vgpr, DstAddressSpace>(
+        mThreadwiseStore.template Run_generic<TData, address_space_t::generic, DstAddressSpace>(
             p_buffer, p_dst);
 #else
-        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData, address_space_t::vgpr, DstAddressSpace>(
+        mThreadwiseStore.template Run_optimized_dst_address_calculation<TData, address_space_t::generic, DstAddressSpace>(
             p_buffer, p_dst);
 #endif
...
composable_kernel/include/utility/config_amd.hpp.in

@@ -37,11 +37,16 @@ typedef float float4_t __attribute__((ext_vector_type(4)));
 typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));

 // data type conversion
 template <class T>
-__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1)
+struct type_convert
 {
-    d += s0 * s1;
-}
+    template <class X>
+    __device__ T operator()(X x) const
+    {
+        return static_cast<T>(x);
+    }
+};

 } // namespace ck
...
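type_convert is a small conversion functor: an element of any source type is cast to the target type T. A stand-alone sketch of the same shape and how it might be applied (the name and usage below are illustrative, not taken from the commit):

    template <class T>
    struct type_convert_sketch
    {
        template <class X>
        constexpr T operator()(X x) const
        {
            return static_cast<T>(x);
        }
    };

    // e.g. narrowing a float accumulator to the output element type
    static_assert(type_convert_sketch<int>{}(3.7f) == 3, "");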
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp

@@ -33,18 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
     constexpr auto I2 = Number<2>{};
     constexpr auto I3 = Number<3>{};

-#if 1
-    constexpr auto in_nchw_desc  = InDesc{};
-    constexpr auto wei_kcyx_desc = WeiDesc{};
-    constexpr auto out_nkhw_desc = OutDesc{};
-#else
     constexpr auto in_nchw_desc =
         make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
     constexpr auto wei_kcyx_desc =
         make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
     constexpr auto out_nkhw_desc =
-        make_native_tensor_descriptor(OutDesc::GetLegnths(), OutDesc::GetStrides());
-#endif
+        make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());

     constexpr index_t N = out_nkhw_desc.GetLength(I0);
     constexpr index_t K = out_nkhw_desc.GetLength(I1);
...
driver/src/driver.cpp

@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;

-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
...
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;

-#elif 0
+#elif 1
    // 1x7 filter, 0x3 pad, 17x17 input
    constexpr index_t N = 128;
    constexpr index_t C = 128;
...