gaoqiong / composable_kernel · Commits · 51a9fa1d

Commit 51a9fa1d, authored Sep 26, 2019 by Chao Liu
Parent: 0f52c4c0

    removing dependency on old tensor descriptor

Changes: 10 changed files, with 160 additions and 149 deletions.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp   +8   -14
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp   +40  -48
composable_kernel/include/tensor_description/multi_index_transform.hpp   +1   -1
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp   +37  -0
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp   +11  -35
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp   +5   -4
composable_kernel/include/utility/sequence.hpp   +7   -0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp   +42  -42
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp   +1   -1
driver/src/driver.cpp   +8   -4
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -6,8 +6,8 @@
 #include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

@@ -115,16 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t KBlockWork = K / KPerBlock;
         constexpr index_t BBlockWork = B / BPerBlock;

-        constexpr auto block_work_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
-            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
+        constexpr auto block_work_desc = make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});

-        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

-        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
-        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
+        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;

         // input tensor
         //     global memory

@@ -185,11 +182,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
         // weight tensor
         //     tensor descriptor in device memory, src of blockwise copy
-        constexpr auto wei_k_e_global_desc = unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
-
-        constexpr auto wei_e_k_global_desc =
-            reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{});
+        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

         //     tensor descriptor in LDS, dst of blockwise copy
         //     be careful of LDS alignment
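A minimal sketch of what the new helper computes for the block-work split above (illustrative only, not part of the commit; it assumes the packed, row-major decomposition implied by make_native_tensor_descriptor_packed inside the cluster descriptor):

    // Suppose KBlockWork = 4 and BBlockWork = 8, i.e. 32 workgroups in total.
    constexpr auto block_work_desc = make_cluster_descriptor(Sequence<4, 8>{});

    // Block id 13 decomposes as 13 = 1 * 8 + 5, so its cluster index is {1, 5}:
    // this workgroup handles K-tile 1 and B-tile 5.
    const auto block_work_id = block_work_desc.CalculateClusterIndex(13);

    // k_block_data_on_global = block_work_id[0] * KPerBlock
    // b_block_data_on_global = block_work_id[1] * BPerBlock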
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp

@@ -2,12 +2,12 @@
 #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R4_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "tensor_descriptor.hpp"
+#include "tensor_descriptor_helper.hpp"
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_generic_tensor_slice_copy.hpp"
-#include "blockwise_gemm.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_gemm.hpp"

 namespace ck {

@@ -103,13 +103,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         constexpr index_t BBlockWork = B / BPerBlock;

         constexpr auto block_work_desc =
-            make_ConstantTensorDescriptor_packed(Sequence<KBlockWork, BBlockWork>{});
+            make_cluster_descriptor(Sequence<KBlockWork, BBlockWork>{});

-        const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id());
+        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

-        const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock;
-        const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock;
+        const index_t k_block_data_on_global = block_work_id[0] * KPerBlock;
+        const index_t b_block_data_on_global = block_work_id[1] * BPerBlock;

         // input tensor
         //     global mem

@@ -157,21 +156,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                                           InBlockCopyDataPerAccess_B>(
             {0, b_block_data_on_global}, {0, 0});

-        // weight tensor
-        //     global mem
-#if 0
-        constexpr auto wei_e_k_global_desc =
-            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
-                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
-                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
-                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
-#else
-        // hack
-        constexpr auto wei_e_k_global_desc_old =
-            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
-
-        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
-            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
-#endif
+        // weight tensor
+        //     global mem
+        constexpr auto wei_e_k_global_desc = reorder_tensor_descriptor_given_upper2lower(
+            unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});

         // LDS
         //     be careful of LDS alignment

@@ -267,9 +255,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
-                p_in_global, p_in_block_double);
-            blockwise_wei_copy.template Run<Float, address_space_t::global, address_space_t::lds>(
-                p_wei_global, p_wei_block_double);
+            blockwise_in_copy.template Run<Float, Float, address_space_t::global>(
+                p_in_global, p_in_block_double);
+            blockwise_wei_copy.template Run<Float, Float, address_space_t::global>(
+                p_wei_global, p_wei_block_double);
         }

@@ -292,8 +280,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

@@ -301,26 +289,26 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-                p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-                p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

         // even iteration
         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

@@ -329,19 +317,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.template RunLoadRegisterBuffer<Float, address_space_t::global>(
-            p_wei_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
+            p_wei_global, p_wei_thread_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-            p_in_register_buffer, p_in_block_double + in_block_space);
-        blockwise_wei_copy.template RunStoreRegisterBuffer<Float, address_space_t::lds>(
-            p_wei_register_buffer, p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();

@@ -402,10 +390,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf
                      b_thread_data_on_global / B1,
                      b_thread_data_on_global % B1})
 #if 1
-                .template Run_generic<Float, address_space_t::generic, address_space_t::global>
+                .template Run_generic<Float, Float, address_space_t::generic, address_space_t::global>
 #elif 1
 #else // tweaking
                 .template Run_optimized_dst_address_calculation<Float,
                                                                 address_space_t::vgpr,
                                                                 Float,
                                                                 address_space_t::generic,
                                                                 address_space_t::global>
 #endif
                 (p_out_thread, p_out_global);
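The renamed copy methods keep the same LDS double-buffer pipeline; a condensed, single-tensor sketch of one main-loop iteration, using only the calls that appear in this diff (the weight path is analogous):

    // advance the source window to the slice that will be prefetched
    blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

    __syncthreads();

    // 1) load the *next* slice from global memory into a per-thread buffer
    blockwise_in_copy.template RunLoadThreadBuffer<Float, Float, address_space_t::global>(
        p_in_global, p_in_thread_buffer);

    // 2) GEMM on the *current* LDS buffer while the prefetch is in flight
    blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

    // 3) commit the prefetched slice into the other LDS buffer for the next iteration
    blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);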
composable_kernel/include/tensor_description/multi_index_transform.hpp

@@ -132,7 +132,7 @@ struct Merge
     __host__ __device__ static constexpr auto GetUpperLengths()
     {
-        return Sequence<accumulate_on_sequence(
+        return Sequence<reduce_on_sequence(
             LowerLengths{}, math::multiplies<index_t>{}, Number<1>{})>{};
     }
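Since reduce_on_sequence folds the lower lengths with math::multiplies starting from Number<1>, the single upper length of a Merge is simply the product of its lower lengths. A small sketch (the concrete lengths are made up for illustration):

    // Merging lower lengths {8, 3, 3} yields one upper dimension of length 8 * 3 * 3 = 72.
    using merged_lengths = decltype(Merge<Sequence<8, 3, 3>>::GetUpperLengths());
    static_assert(is_same<merged_lengths, Sequence<72>>{}, "upper length is the product of lower lengths");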
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -149,6 +149,7 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
 }

 #if 0
+// not implemented
 template <typename LowerTensorDescriptor,
           typename PadDimensionIds,
           typename LeftPads,

@@ -171,6 +172,42 @@ __host__ __device__ constexpr auto
 }
 #endif

+// a cluster map 1d index to N-d index
+template <typename Lengths, typename ArrangeOrder>
+struct ClusterDescriptor
+{
+    static constexpr index_t nDim = Lengths::Size();
+
+    static constexpr auto mDesc = transform_tensor_descriptor(
+        make_native_tensor_descriptor_packed(Lengths{}),
+        make_tuple(Merge<decltype(Lengths::ReorderGivenNew2Old(ArrangeOrder{}))>{}),
+        make_tuple(ArrangeOrder{}),
+        make_tuple(Sequence<0>{}));
+
+    __host__ __device__ constexpr ClusterDescriptor()
+    {
+        static_assert(Lengths::Size() == nDim && ArrangeOrder::Size() == nDim,
+                      "wrong! size not the same");
+
+        static_assert(is_valid_sequence_map<ArrangeOrder>{}, "wrong! ArrangeOrder is wrong");
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSize() { return mDesc.GetElementSize(); }
+
+    __host__ __device__ static constexpr auto CalculateClusterIndex(index_t idx_1d)
+    {
+        return mDesc.CalculateLowerIndex(MultiIndex<1>{idx_1d});
+    }
+};
+
+template <typename Lengths,
+          typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
+__host__ __device__ constexpr auto make_cluster_descriptor(
+    Lengths, ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{})
+{
+    return ClusterDescriptor<Lengths, ArrangeOrder>{};
+}
+
 template <typename... NativeDimensions>
 __host__ __device__ void
 print_tensor_descriptor(const char* s, const NativeTensorDescriptor<NativeDimensions...>& desc)
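A usage sketch of the new helper with a non-default arrangement (the lengths and order below are invented for illustration, and the comment about which dimension varies fastest assumes the usual row-major unmerge convention of Merge):

    // An 8 x 32 thread cluster. With the default ArrangeOrder the last dimension
    // varies fastest; passing Sequence<1, 0> makes dimension 0 vary fastest instead.
    constexpr auto cluster_desc = make_cluster_descriptor(Sequence<8, 32>{}, Sequence<1, 0>{});

    static_assert(cluster_desc.GetElementSize() == 8 * 32, "256 threads in the cluster");

    // Decompose a flat thread id into a 2-d index over the original Sequence<8, 32> dimensions.
    const auto cluster_id = cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());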
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -2,13 +2,10 @@
 #define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_coordinate_v2.hpp"
+#include "threadwise_generic_tensor_slice_copy.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1

@@ -16,6 +13,8 @@
 namespace ck {

+#if 0
 // Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
 // memory layout (ordering of dimensions) can be different between src and dst.
 // This functions assume each thread is reading and writing a normal (not merged) tensor,

@@ -677,6 +676,8 @@ struct BlockwiseGenericTensorSliceCopy_v3
     ThreadwiseStore mThreadwiseStore;
 };
+#endif

 template <index_t BlockSize,
           typename BlockSrcDesc,
           typename BlockDstDesc,

@@ -710,42 +711,17 @@ struct BlockwiseGenericTensorSliceCopy_v4
             is_same<BlockSliceLengths, decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
             "wrong! threads should be mapped to cover entire slicing window");

-#if 1
-        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
-#else
-        constexpr auto thread_cluster_lengths_in_arrange_order =
-            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{});
-
-        constexpr auto thread_cluster_desc = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(thread_cluster_lengths_in_arrange_order),
-            make_tuple(Merge<decltype(thread_cluster_lengths_in_arrange_order)>{}),
-            make_tuple(arithmetic)::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
-
-        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
-                      "wrong! BlockSize not consistent with ThreadClusterLengths");
-
-        constexpr auto thread_cluster_id = transform_tensor_descriptor(
-            make_native_tensor_descriptor_packed(Sequence<KBlockWork, BBlockWork>{}),
-            make_tuple(Merge<Sequence<KBlockWork, BBlockWork>>{}),
-            make_tuple(Sequence<0, 1>{}),
-            make_tuple(Sequence<0>{}));
-
-        const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id());
-#endif
-
-        const auto thread_cluster_id =
-            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
-
-        const auto data_cluster_id =
-            reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
-
-        const auto thread_data_id_begin = data_cluster_id * ThreadSliceLengths{};
+        // map threads to cluster
+        constexpr auto thread_cluster_desc =
+            make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");
+
+        const auto thread_cluster_id =
+            thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());
+
+        const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

         mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
         mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
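The net effect in BlockwiseGenericTensorSliceCopy_v4 is that a thread's slice origin now comes straight from its cluster index; a condensed sketch built only from the calls visible in this hunk:

    // map threads to the cluster described by ThreadClusterLengths / ThreadClusterArrangeOrder
    constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});

    // flat thread id -> N-d cluster index; the separate reorder_array_given_old2new step is gone
    const auto thread_cluster_id =
        thread_cluster_desc.CalculateClusterIndex(get_thread_local_1d_id());

    // each thread owns a ThreadSliceLengths-sized sub-window of the block's slice
    const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};

    mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);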
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -2,11 +2,8 @@
 #define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
-#include "tensor_view.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_coordinate_v2.hpp"

 #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1

@@ -23,6 +20,8 @@
 namespace ck {

+#if 0
 // This threadwise copy allow vector access of src and dst.
 // It allows the dimensions of vector access to be different on src and dst.
 // It also allows the vector size to be different on src and dst.

@@ -1121,6 +1120,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1
     DstSlice mDstSlice;
 };
+#endif

 // This version use multi-index transformation
 // This threadwise copy allow vector access of src and dst.
 // It allows the vector size to be different on src and dst.
composable_kernel/include/utility/sequence.hpp

@@ -473,6 +473,13 @@ struct sequence_sort_impl<Sequence<Value>, Sequence<Id>, Compare>
     using sorted_ids = Sequence<Id>;
 };

+template <typename Compare>
+struct sequence_sort_impl<Sequence<>, Sequence<>, Compare>
+{
+    using sorted_values = Sequence<>;
+    using sorted_ids    = Sequence<>;
+};
+
 template <typename Values, typename Compare>
 struct sequence_sort
 {
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp

@@ -177,8 +177,6 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

-    for(index_t i = 0; i < nrepeat; ++i)
-    {
         constexpr auto gridwise_conv =
 #if 0
             GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded

@@ -223,6 +221,8 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc,
                                               WeiBlockCopySrcDataPerRead_E,
                                               WeiBlockCopyDstDataPerWrite_K>{};

+    for(index_t i = 0; i < nrepeat; ++i)
+    {
         float time = launch_kernel(run_gridwise_convolution_kernel<decltype(gridwise_conv), T>,
                                    dim3(GridSize),
                                    dim3(BlockSize),
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp

@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

-#if 1
+#if 0
     constexpr index_t BlockSize = 256;

     constexpr index_t BPerBlock = 128;
driver/src/driver.cpp

@@ -14,7 +14,7 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"

@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;

@@ -366,6 +366,10 @@ int main(int argc, char* argv[])
     ostream_ConstantTensorDescriptor(in_nchw_desc, std::cout << "in_nchw_desc: ");
     ostream_ConstantTensorDescriptor(wei_kcyx_desc, std::cout << "wei_kcyx_desc: ");
     ostream_ConstantTensorDescriptor(out_nkhw_desc, std::cout << "out_nkhw_desc: ");
+    print_sequence("LeftPads", LeftPads{});
+    print_sequence("RightPads", RightPads{});
+    print_sequence("ConvStrides", ConvStrides{});
+    print_sequence("ConvDilations", ConvDilations{});

     using in_data_t  = float;
     using out_data_t = float;

@@ -444,7 +448,7 @@ int main(int argc, char* argv[])
                                                                 ConvStrides{},
                                                                 ConvDilations{},
                                                                 nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,

@@ -486,7 +490,7 @@ int main(int argc, char* argv[])
                                                                 ConvStrides{},
                                                                 ConvDilations{},
                                                                 nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,