Commit 51a9fa1d (gaoqiong / composable_kernel_ROCM)
Authored Sep 26, 2019 by Chao Liu
Parent: 0f52c4c0

    removing dependency on old tensor descriptor
Showing 10 changed files with 160 additions and 149 deletions:

  +8  -14  composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
  +40 -48  composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
  +1  -1   composable_kernel/include/tensor_description/multi_index_transform.hpp
  +37 -0   composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp
  +11 -35  composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
  +5  -4   composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
  +7  -0   composable_kernel/include/utility/sequence.hpp
  +42 -42  driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
  +1  -1   driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
  +8  -4   driver/src/driver.cpp
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -6,8 +6,8 @@
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

@@ -115,16 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t KBlockWork = K / KPerBlock;
         constexpr index_t BBlockWork = B / BPerBlock;

-        constexpr auto block_work_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
-            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
+        constexpr auto block_work_desc =
+            make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});

-        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

-        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
-        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
+        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;

         // input tensor
         //     global memory

@@ -185,11 +182,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
-        constexpr auto wei_k_e_global_desc =
-            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
-
-        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
-            wei_k_e_global_desc, Sequence<1, 0>{});
+        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

         // tensor descriptor in LDS, dst of blockwise copy
         //     be careful of LDS alignment
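At the call sites the change is mechanical: the handwritten Merge-based block_work_desc becomes a make_cluster_descriptor, and the resulting 2-d work id is still scaled by the per-block tile sizes to locate the block's data in the K and B dimensions. A runnable worked example of that offset arithmetic, with illustrative values in place of the kernel's template parameters:

#include <cassert>

int main()
{
    // Illustrative tile sizes; in the kernel these are the KPerBlock and
    // BPerBlock template parameters.
    constexpr int KPerBlock = 128;
    constexpr int BPerBlock = 64;

    // Suppose CalculateClusterIndex(get_block_1d_id()) produced {2, 3}.
    const int block_work_id[2] = {2, 3};

    // Same scaling as in the hunk above: work coordinate -> global offset.
    const int k_block_data_on_global = block_work_id[0] * KPerBlock; // 2 * 128 = 256
    const int b_block_data_on_global = block_work_id[1] * BPerBlock; // 3 * 64  = 192

    assert(k_block_data_on_global == 256);
    assert(b_block_data_on_global == 192);
    return 0;
}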
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -2,12 +2,12 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

@@ -103,13 +103,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t BBlockWork = B / BPerBlock;

-        constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
+        constexpr auto block_work_desc =
+            make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});

-        const auto block_work_multi_id =
-            block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

-        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
-        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
+        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;

         // input tensor
         //     global mem

@@ -157,21 +156,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                                            InBlockCopyDataPerAccess_B>(
                 {0, b_block_data_on_global}, {0, 0});

         // weight tensor
         //     global mem
-#if 0
-        constexpr auto wei_e_k_global_desc =
-            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
-                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
-                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-#else
-        // hack
-        constexpr auto wei_e_k_global_desc_old =
-            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
-
-        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
-            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
-#endif
+        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

         // LDS
         //     be careful of LDS alignment

@@ -267,9 +255,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
+            blockwise_in_copy.template Run<Float, Float, address_space_t::global>(
                 p_in_global, p_in_block_double);
-            blockwise_wei_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
+            blockwise_wei_copy.template Run<Float, Float, address_space_t::global>(
                 p_wei_global, p_wei_block_double);
         }

@@ -292,8 +280,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

@@ -301,26 +289,26 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

         // even iteration
         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

@@ -329,19 +317,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_wei_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_wei_global, p_wei_thread_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-            p_in_register_buffer, p_in_block_double + in_block_space);
-        blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-            p_wei_register_buffer, p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();

@@ -402,10 +390,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                                    b_thread_data_on_global / B1,
                                    b_thread_data_on_global % B1})
 #if 1
-            .template Run_generic<Float, address_space_t::generic, address_space_t::global>
-#else // tweaking
-            .template Run_optimized_dst_address_calculation<Float,
-                                                            address_space_t::generic,
-                                                            address_space_t::global>
+            .template Run_generic<Float,
+                                  Float,
+                                  address_space_t::generic,
+                                  address_space_t::global>
+#elif 1
+            .template Run_optimized_dst_address_calculation<Float,
+                                                            Float,
+                                                            address_space_t::vgpr,
+                                                            address_space_t::generic,
+                                                            address_space_t::global>
 #endif
             (p_out_thread, p_out_global);
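The RegisterBuffer -> ThreadBuffer renames and the slimmer Run template arguments above do not change the pipelining scheme itself: while a GEMM consumes the "current" LDS buffer, the next tile is staged from global memory through a per-thread buffer into the other LDS buffer, and a tail step drains the last tile. Below is a minimal, runnable host-side sketch of that schedule; the helper names are hypothetical stand-ins for the blockwise copy and GEMM calls, and the real kernel separates the stages with __syncthreads():

#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the blockwise copy's RunLoadThreadBuffer: global -> per-thread buffer.
void load_thread_buffer(const std::vector<float>& global, std::size_t offset,
                        std::vector<float>& thread_buf)
{
    for(std::size_t i = 0; i < thread_buf.size(); ++i)
        thread_buf[i] = global[offset + i];
}

// Stand-in for RunStoreThreadBuffer: per-thread buffer -> LDS.
void store_thread_buffer(const std::vector<float>& thread_buf, float* lds)
{
    for(std::size_t i = 0; i < thread_buf.size(); ++i)
        lds[i] = thread_buf[i];
}

// Stand-in for blockwise_gemm.Run: "consume" one LDS tile.
float gemm_tile(const float* lds, std::size_t n)
{
    float acc = 0;
    for(std::size_t i = 0; i < n; ++i)
        acc += lds[i];
    return acc;
}

int main()
{
    constexpr std::size_t tile = 4, num_tiles = 6;
    std::vector<float> global(tile * num_tiles, 1.0f); // all tiles of the K loop
    std::vector<float> lds_double(2 * tile);           // the two LDS buffers
    std::vector<float> thread_buf(tile);               // per-thread staging buffer

    // preload: first tile into LDS buffer 0
    load_thread_buffer(global, 0, thread_buf);
    store_thread_buffer(thread_buf, lds_double.data());

    float acc = 0;
    for(std::size_t t = 0; t + 1 < num_tiles; ++t)
    {
        float* now  = lds_double.data() + (t % 2) * tile;       // buffer being consumed
        float* next = lds_double.data() + ((t + 1) % 2) * tile; // buffer being filled

        load_thread_buffer(global, (t + 1) * tile, thread_buf); // load next tile
        acc += gemm_tile(now, tile);                            // GEMM on current tile
        store_thread_buffer(thread_buf, next);                  // store next tile to LDS
    }

    // tail: consume the last tile
    acc += gemm_tile(lds_double.data() + ((num_tiles - 1) % 2) * tile, tile);

    std::printf("acc = %.1f (expected %.1f)\n", acc, 1.0f * tile * num_tiles);
    return 0;
}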
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -132,7 +132,7 @@ struct Merge
     __host__ __device__ static constexpr auto GetUpperLengths()
     {
-        return Sequence<accumulate_on_sequence(
+        return Sequence<reduce_on_sequence(
             LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
     }
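The only change here is the rename of the compile-time fold helper (accumulate_on_sequence -> reduce_on_sequence); what it computes for Merge is unchanged: the single upper length is the product of all lower lengths. A standalone sketch of that fold using a C++17 pack expansion in place of the ck Sequence machinery (Seq and reduce_multiplies are stand-ins, not ck names):

#include <cstdio>

template <int... Ls>
struct Seq
{
};

// Stand-in for reduce_on_sequence(LowerLengths{}, math::multiplies<index_t>{}, Number<1>{}).
template <int... Ls>
constexpr int reduce_multiplies(Seq<Ls...>)
{
    int product = 1;
    ((product *= Ls), ...); // fold the multiplication over the pack
    return product;
}

int main()
{
    // Merging lower lengths (C=8, Y=3, X=3) yields one upper length of 72.
    constexpr int upper = reduce_multiplies(Seq<8, 3, 3>{});
    static_assert(upper == 72, "unexpected merged length");
    std::printf("merged upper length = %d\n", upper);
    return 0;
}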
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -149,6 +149,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
 }

 #if 0
+// not implemented
 template <typename LowerTensorDescriptor,
           typename PadDimensionIds,
           typename LeftPads,

@@ -171,6 +172,42 @@ __host__ __device__ constexpr auto
 }
 #endif

+// a cluster map 1d index to N-d index
+template <typename Lengths, typename ArrangeOrder>
+struct ClusterDescriptor
+{
+    static constexpr index_t nDim = Lengths::Size();
+
+    static constexpr auto mDesc = transform_tensor_descriptor(
+        make_native_tensor_descriptor_packed(Lengths{}),
+        make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
+        make_tuple(ArrangeOrder{}),
+        make_tuple(Sequence<0>{}));
+
+    __host__ __device__ constexpr ClusterDescriptor()
+    {
+        static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
+                      "wrong! size not the same");
+
+        static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return mDesc.GetElementSize();
+    }
+
+    __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
+    {
+        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
+    }
+};
+
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
+__host__ __device__ constexpr auto make_cluster_descriptor(
+    Lengths,
+    ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
+{
+    return ClusterDescriptor<Lengths, ArrangeOrder>{};
+}
+
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
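What the two gridwise-kernel diffs above rely on is exactly this wrapper's behavior: a flat block or thread id is delinearized into an N-d coordinate, with ArrangeOrder choosing which dimension varies fastest. A standalone 2-d sketch of the assumed semantics (plain C++, not the ck implementation):

#include <array>
#include <cstdio>

// Assumed semantics of ClusterDescriptor<Lengths, ArrangeOrder>::CalculateClusterIndex
// for 2 dimensions: delinearize in ArrangeOrder, write back in original dimension order.
std::array<int, 2> calculate_cluster_index(int idx_1d,
                                           std::array<int, 2> lengths,
                                           std::array<int, 2> arrange_order)
{
    const int slow_dim = arrange_order[0]; // varies slowest
    const int fast_dim = arrange_order[1]; // varies fastest

    std::array<int, 2> idx{};
    idx[slow_dim] = idx_1d / lengths[fast_dim];
    idx[fast_dim] = idx_1d % lengths[fast_dim];
    return idx;
}

int main()
{
    // A 2x4 cluster; arrange order {1, 0} makes dimension 0 the fastest-varying one.
    for(int i = 0; i < 8; ++i)
    {
        const auto id = calculate_cluster_index(i, {2, 4}, {1, 0});
        std::printf("%d -> (%d, %d)\n", i, id[0], id[1]);
    }
    return 0;
}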
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -2,13 +2,10 @@
 #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
 #include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "tensor_coordinate_v2.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1

@@ -16,6 +13,8 @@
 namespace ck {

+#if 0
+
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst.
 // This functions assume each thread is reading and writing a normal (not merged) tensor,

@@ -677,6 +676,8 @@ struct BlockwiseGenericTensorSliceCopy_v3
     ThreadwiseStore mThreadwiseStore;
 };
+#endif
+
 template <index_t BlockSize,
           typename BlockSrcDesc,
           typename BlockDstDesc,

@@ -710,42 +711,17 @@ struct BlockwiseGenericTensorSliceCopy_v4
             is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
             "wrong! threads should be mapped to cover entire slicing window");

-#if 1
         // map threads to cluster
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
-#else
-        constexpr auto thread_cluster_lengths_in_arrange_order =
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{});
-
-        constexpr auto thread_cluster_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(thread_cluster_lengths_in_arrange_order),
-            make_tuple(Merge<decltype(thread_cluster_lengths_in_arrange_order)>{}),
-            ...
-
-        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
-                      "wrong! BlockSize not consistent with ThreadClusterLengths");
-
-        constexpr auto thread_cluster_id = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
-            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
-
-        const auto block_work_multi_id =
-            block_work_desc.CalculateLowerIndex(get_block_1d_id());
-#endif
+        constexpr auto thread_cluster_desc =
+            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

         static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
                       "wrong! BlockSize not consistent with ThreadClusterLengths");

-        const auto thread_cluster_id =
-            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
-
-        const auto data_cluster_id =
-            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
-
-        const auto thread_data_id_begin = data_cluster_id * ThreadSliceLengths{};
+        const auto thread_cluster_id =
+            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
+
+        const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

         mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
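After the cleanup, the mapping in BlockwiseGenericTensorSliceCopy_v4 reads directly: delinearize the thread id into a cluster coordinate, then scale element-wise by ThreadSliceLengths to find the origin of the sub-tile that thread copies. A runnable illustration with made-up lengths (a 4x16 thread cluster, each thread owning a 2x4 slice, covering an 8x64 block slice; identity arrange order assumed):

#include <array>
#include <cstdio>
#include <initializer_list>

int main()
{
    constexpr std::array<int, 2> thread_cluster_lengths{4, 16}; // 64 threads
    constexpr std::array<int, 2> thread_slice_lengths{2, 4};    // data per thread

    for(int tid : {0, 1, 17, 63})
    {
        // thread_cluster_desc.CalculateClusterIndex(tid), identity arrange order
        const std::array<int, 2> cluster_id{tid / thread_cluster_lengths[1],
                                            tid % thread_cluster_lengths[1]};

        // thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{}
        const std::array<int, 2> begin{cluster_id[0] * thread_slice_lengths[0],
                                       cluster_id[1] * thread_slice_lengths[1]};

        std::printf("thread %2d copies a 2x4 tile starting at (%d, %d)\n",
                    tid, begin[0], begin[1]);
    }
    return 0;
}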
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -2,11 +2,8 @@
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
 #include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "tensor_coordinate_v2.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1

@@ -23,6 +20,8 @@
 namespace ck {

+#if 0
+
 // This threadwise copy allow vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.

@@ -1121,6 +1120,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
     DstSlice mDstSlice;
 };
+#endif
+
 // This version use multi-index transformation
 // This threadwise copy allow vector access of src and dst.
 // It allows the vector size to be different on src and dst.
composable_kernel/include/utility/sequence.hpp

@@ -473,6 +473,13 @@ struct sequence_sort_impl<Sequence<Value>, Sequence<Id>, Compare>
     using sorted_ids = Sequence<Id>;
 };

+template <typename Compare>
+struct sequence_sort_impl<Sequence<>, Sequence<>, Compare>
+{
+    using sorted_values = Sequence<>;
+    using sorted_ids    = Sequence<>;
+};
+
 template <typename Values, typename Compare>
 struct sequence_sort
 {
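The added specialization is the empty-sequence base case: when sequence_sort_impl's recursion reaches an empty sub-sequence, this instantiation terminates it with empty sorted_values/sorted_ids. Any recursion over value packs needs the same anchor; a tiny standalone analogue (Seq and min_value are stand-ins, not ck names):

#include <cstdio>

template <int... Vs>
struct Seq
{
};

// Base case for the empty pack: the analogue of the added
// sequence_sort_impl<Sequence<>, Sequence<>, Compare> specialization.
constexpr int min_value(Seq<>) { return 2147483647; }

template <int V, int... Vs>
constexpr int min_value(Seq<V, Vs...>)
{
    const int rest = min_value(Seq<Vs...>{}); // eventually hits Seq<>
    return V < rest ? V : rest;
}

int main()
{
    static_assert(min_value(Seq<3, 1, 2>{}) == 1, "recursion needs the empty base case");
    std::printf("%d\n", min_value(Seq<3, 1, 2>{}));
    return 0;
}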
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp

@@ -177,52 +177,52 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

-    for(index_t i = 0; i < nrepeat; ++i)
-    {
-        constexpr auto gridwise_conv =
+    constexpr auto gridwise_conv =
 #if 0
         GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
 #else
         GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer
 #endif
         <GridSize,
          BlockSize,
          T,
          decltype(in_nchw_desc),
          decltype(wei_kcyx_desc),
          decltype(out_nkhw_desc),
          ConvStrides,
          ConvDilations,
          LeftPads,
          RightPads,
          BPerBlock,
          KPerBlock,
          EPerBlock,
          GemmNRepeat,
          GemmMPerThreadSubC,
          GemmNPerThreadSubC,
          GemmMLevel0Cluster,
          GemmNLevel0Cluster,
          GemmMLevel1Cluster,
          GemmNLevel1Cluster,
          GemmKPerThreadLoop,
          GemmDataPerReadA,
          GemmDataPerReadB,
          InBlockCopySubLengths_E_N1_B_N2,
          InBlockCopyClusterLengths_E_N1_B_N2,
          InBlockCopyThreadClusterArrangeOrder,
          InBlockCopySrcAccessOrder,
          InBlockCopyDstAccessOrder,
          InBlockCopySrcDataPerRead_B,
          InBlockCopyDstDataPerWrite_N2,
          WeiBlockCopySubLengths_E_K,
          WeiBlockCopyClusterLengths_E_K,
          WeiBlockCopyThreadClusterArrangeOrder,
          WeiBlockCopySrcAccessOrder,
          WeiBlockCopyDstAccessOrder,
          WeiBlockCopySrcDataPerRead_E,
          WeiBlockCopyDstDataPerWrite_K>{};

+    for(index_t i = 0; i < nrepeat; ++i)
+    {
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
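The only change in this file is structural: gridwise_conv is hoisted out of the for loop. Since the object is an empty constexpr value whose template parameters carry all of the configuration, constructing it once before the timing loop is equivalent, and each iteration now only launches and times the kernel. A minimal analogue of the new shape (GridwiseConv here is a stand-in type):

#include <cstdio>

template <int GridSize, int BlockSize>
struct GridwiseConv // stand-in: an empty tag type carrying config as template parameters
{
    void Run(int iteration) const
    {
        std::printf("launch %d: grid %d, block %d\n", iteration, GridSize, BlockSize);
    }
};

int main()
{
    constexpr int nrepeat = 3;

    // Built once, before the loop, as in the new code; nothing per-iteration
    // depended on it being constructed inside the loop.
    constexpr auto gridwise_conv = GridwiseConv<16, 256>{};

    for(int i = 0; i < nrepeat; ++i)
    {
        gridwise_conv.Run(i); // each iteration only launches and times the kernel
    }
    return 0;
}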
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp

@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
driver/src/driver.cpp

@@ -14,7 +14,7 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;

@@ -366,6 +366,10 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("RightPads", RightPads{});
+    print_sequence("ConvStrides", ConvStrides{});
+    print_sequence("ConvDilations", ConvDilations{});

     using in_data_t  = float;
     using out_data_t = float;

@@ -444,7 +448,7 @@ int main(int argc, char* argv[])
                                                                 ConvStrides{},
                                                                 ConvDilations{},
                                                                 nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,

@@ -486,7 +490,7 @@ int main(int argc, char* argv[])
                                                                 ConvStrides{},
                                                                 ConvDilations{},
                                                                 nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
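driver.cpp picks both the problem configuration and the device-side entry point with chained #if/#elif blocks, so flipping a single 0/1, as this commit does, re-targets the whole driver at compile time (here from the v4r1 padded path to the v4r4 padded path). A condensed sketch of the pattern:

#include <cstdio>

int main()
{
// Condensed sketch of driver.cpp's selection pattern (branches illustrative).
#if 0
    std::printf("v4r1 padded\n"); // disabled by this commit
#elif 1
    std::printf("v4r4 padded\n"); // enabled by this commit
#endif
    return 0;
}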