gaoqiong / composable_kernel

Commit a915f574
authored Feb 26, 2021 by Chao Liu

testing navi

parent 12225618
Showing 6 changed files with 431 additions and 1 deletion.
composable_kernel/include/kernel_algorithm/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp   +162 -0
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp   +28 -0
composable_kernel/include/utility/config.amd.hpp.in   +1 -1
composable_kernel/include/utility/in_memory_operation.amd.hpp.in   +5 -0
driver/include/device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp   +208 -0
driver/src/conv_driver.cpp   +27 -0
composable_kernel/include/kernel_algorithm/gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp (new file, 0 → 100644)
#ifndef CK_GRIDWISE_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_NHWC_KYXC_NHWK_HPP
#define CK_GRIDWISE_CONVOLUTION_FORWARD_IMPLICIT_GEMM_V4R4_NHWC_KYXC_NHWK_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "gridwise_gemm.hpp"

namespace ck {

// GemmM = K
// GemmN = N * Ho * Wo
// GemmK = C * Y * X
template <index_t GridSize,
          index_t BlockSize,
          typename Float,
          typename AccFloat,
          typename InGlobalDesc,
          typename WeiGlobalDesc,
          typename OutGlobalDesc,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads,
          index_t GemmMPerBlock,
          index_t GemmNPerBlock,
          index_t GemmKPerBlock,
          index_t GemmMPerThread,
          index_t GemmNPerThread,
          index_t GemmKPerThread,
          index_t GemmMLevel0Cluster,
          index_t GemmNLevel0Cluster,
          index_t GemmMLevel1Cluster,
          index_t GemmNLevel1Cluster,
          index_t ThreadGemmDataPerRead_GemmM,
          index_t ThreadGemmDataPerRead_GemmN,
          typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
          typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
          index_t GemmABlockCopySrcDataPerRead_GemmK,
          index_t GemmABlockCopyDstDataPerWrite_GemmM,
          typename GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
          typename GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
          index_t GemmBBlockCopySrcDataPerRead_GemmK,
          index_t GemmBBlockCopyDstDataPerWrite_GemmN,
          index_t GemmCThreadCopyDstDataPerWrite_GemmM1>
struct GridwiseConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk
{
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
    {
        constexpr auto I0 = Number<0>{};
        constexpr auto I1 = Number<1>{};
        constexpr auto I2 = Number<2>{};
        constexpr auto I3 = Number<3>{};

        constexpr auto in_n_hi_wi_c_global_desc  = InGlobalDesc{};
        constexpr auto wei_k_y_x_c_global_desc   = WeiGlobalDesc{};
        constexpr auto out_n_ho_wo_k_global_desc = OutGlobalDesc{};

        constexpr index_t N  = in_n_hi_wi_c_global_desc.GetLengths()[I0];
        constexpr index_t Hi = in_n_hi_wi_c_global_desc.GetLengths()[I1];
        constexpr index_t Wi = in_n_hi_wi_c_global_desc.GetLengths()[I2];
        constexpr index_t C  = in_n_hi_wi_c_global_desc.GetLengths()[I3];

        constexpr index_t K  = out_n_ho_wo_k_global_desc.GetLengths()[I3];
        constexpr index_t Ho = out_n_ho_wo_k_global_desc.GetLengths()[I1];
        constexpr index_t Wo = out_n_ho_wo_k_global_desc.GetLengths()[I2];

        constexpr index_t Y = wei_k_y_x_c_global_desc.GetLengths()[I1];
        constexpr index_t X = wei_k_y_x_c_global_desc.GetLengths()[I2];

        constexpr index_t ConvStrideH = ConvStrides{}[I0];
        constexpr index_t ConvStrideW = ConvStrides{}[I1];

        constexpr index_t ConvDilationH = ConvDilations{}[I0];
        constexpr index_t ConvDilationW = ConvDilations{}[I1];

        // weight tensor
        constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
            unfold_tensor_descriptor(wei_k_y_x_c_global_desc, I1, I3), Sequence<1, 0>{});

        // input tensor
        constexpr auto in_n_hip_wip_c_global_desc = transform_tensor_descriptor(
            in_n_hi_wi_c_global_desc,
            make_tuple(PassThrough<N>{},
                       Pad<Sequence<Hi, Wi>, InLeftPads, InRightPads>{},
                       PassThrough<C>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

        constexpr index_t Hip = in_n_hip_wip_c_global_desc.GetLengths()[I1];
        constexpr index_t Wip = in_n_hip_wip_c_global_desc.GetLengths()[I2];

        constexpr auto in_n_y_ho_x_wo_c_global_desc = transform_tensor_descriptor(
            in_n_hip_wip_c_global_desc,
            make_tuple(PassThrough<N>{},
                       Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
                       Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{},
                       PassThrough<C>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

        constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
            in_n_y_ho_x_wo_c_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // output tensor
        constexpr auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(
            unfold_tensor_descriptor(out_n_ho_wo_k_global_desc, I0, I2),
            make_tuple(PassThrough<K>{}, Merge<Sequence<N * Ho * Wo>>{}),
            make_tuple(Sequence<1>{}, Sequence<0>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // GEMM
        constexpr auto gridwise_gemm =
            GridwiseGemmTransposedANormalBNormalC_v1<GridSize,
                                                     BlockSize,
                                                     Float,
                                                     AccFloat,
                                                     decltype(wei_gemmk_gemmm_global_desc),
                                                     decltype(in_gemmk_gemmn_global_desc),
                                                     decltype(out_gemmm_gemmn_global_desc),
                                                     InMemoryDataOperation::Set,
                                                     GemmMPerBlock,
                                                     GemmNPerBlock,
                                                     GemmKPerBlock,
                                                     GemmMPerThread,
                                                     GemmNPerThread,
                                                     GemmKPerThread,
                                                     GemmMLevel0Cluster,
                                                     GemmNLevel0Cluster,
                                                     GemmMLevel1Cluster,
                                                     GemmNLevel1Cluster,
                                                     ThreadGemmDataPerRead_GemmM,
                                                     ThreadGemmDataPerRead_GemmN,
                                                     GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
                                                     GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
                                                     Sequence<1, 0>,
                                                     Sequence<1, 0>,
                                                     0,
                                                     GemmABlockCopySrcDataPerRead_GemmK,
                                                     GemmABlockCopyDstDataPerWrite_GemmM,
                                                     GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
                                                     GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
                                                     Sequence<1, 0>,
                                                     Sequence<1, 0>,
                                                     0,
                                                     GemmBBlockCopySrcDataPerRead_GemmK,
                                                     GemmBBlockCopyDstDataPerWrite_GemmN,
                                                     Sequence<2, 3, 0, 1>,
                                                     1,
                                                     GemmCThreadCopyDstDataPerWrite_GemmM1>{};

        gridwise_gemm.Run(p_wei_global, p_in_global, p_out_global);
    }
};

} // namespace ck
#endif
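Note: the GemmM/GemmN/GemmK comment at the top of this kernel is the whole implicit-GEMM mapping. The weight tensor supplies the GemmK x GemmM A operand (wei_gemmk_gemmm_global_desc), the transformed input supplies the GemmK x GemmN B operand, and the output is the GemmM x GemmN C matrix, which is why Run hands p_wei_global to gridwise_gemm.Run first. A minimal standalone sketch of that mapping follows; the helper names are hypothetical and not part of this commit.

#include <cstdint>
using index_t = int32_t;

// Hypothetical constexpr helpers (illustration only) restating the comment above:
//   GemmM = K, GemmN = N * Ho * Wo, GemmK = C * Y * X
constexpr index_t GemmMOf(index_t K) { return K; }
constexpr index_t GemmNOf(index_t N, index_t Ho, index_t Wo) { return N * Ho * Wo; }
constexpr index_t GemmKOf(index_t C, index_t Y, index_t X) { return C * Y * X; }

// Each output element out[n, ho, wo, k] is a dot product of length GemmK over (c, y, x);
// the descriptor transforms above express that view without materializing an im2col matrix.
static_assert(GemmKOf(4, 3, 3) == 36, "C*Y*X for the C=4, 3x3 case used in conv_driver.cpp");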
composable_kernel/include/utility/amd_buffer_addressing_v2.hpp
@@ -90,7 +90,11 @@ __device__ float amd_buffer_load_v2<float, 1>(const float* p_src_wave,
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    src_wave_buffer_resource.config[3] = 0x00027000;
#else
    src_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

@@ -120,7 +124,11 @@ __device__ float2_t amd_buffer_load_v2<float, 2>(const float* p_src_wave,
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    src_wave_buffer_resource.config[3] = 0x00027000;
#else
    src_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

@@ -150,7 +158,11 @@ __device__ float4_t amd_buffer_load_v2<float, 4>(const float* p_src_wave,
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    src_wave_buffer_resource.config[3] = 0x00027000;
#else
    src_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

@@ -180,7 +192,11 @@ __device__ float8_t amd_buffer_load_v2<float, 8>(const float* p_src_wave,
    // wavewise range (32 bit)
    src_wave_buffer_resource.range[2] = src_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    src_wave_buffer_resource.config[3] = 0x00027000;
#else
    src_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float);

@@ -226,7 +242,11 @@ __device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    dst_wave_buffer_resource.config[3] = 0x00027000;
#else
    dst_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

@@ -261,7 +281,11 @@ __device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    dst_wave_buffer_resource.config[3] = 0x00027000;
#else
    dst_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);

@@ -296,7 +320,11 @@ __device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
    // wavewise range (32 bit)
    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);

    // wavewise setting (32 bit)
#if 0
    dst_wave_buffer_resource.config[3] = 0x00027000;
#else
    dst_wave_buffer_resource.config[3] = 0x31014000;
#endif

    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
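All seven hunks above make the same change: the fourth dword of the wave buffer resource is switched from the hard-coded 0x00027000 to 0x31014000, presumably the setting required for the gfx10 (Navi) target named in the commit message, with the old value kept behind #if 0. For orientation, a rough sketch of the 128-bit resource these functions fill in follows; the layout comments are an assumption for illustration, not taken from this commit.

#include <cstdint>

// Illustration only: the buffer resource handed to the AMD buffer load/store
// intrinsics is four 32-bit dwords.
struct WaveBufferResourceSketch
{
    uint64_t base_address; // dwords 0-1: wave-wise base pointer
    uint32_t range;        // dword 2: addressable range in bytes
                           //          (set above as src_data_range * sizeof(float))
    uint32_t config;       // dword 3: format/config bits
                           //          0x00027000 before this commit, 0x31014000 after
};
static_assert(sizeof(WaveBufferResourceSketch) == 16, "buffer resource is 128 bits");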
composable_kernel/include/utility/config.amd.hpp.in
@@ -32,7 +32,7 @@
 // AMD buffer addressing
 #ifndef CK_USE_AMD_BUFFER_ADDRESSING
-#define CK_USE_AMD_BUFFER_ADDRESSING 0
+#define CK_USE_AMD_BUFFER_ADDRESSING 1
 #endif

 // only gfx908 support native floating point atomic add
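This flips the default for CK_USE_AMD_BUFFER_ADDRESSING from 0 to 1, so the buffer-addressing path is on unless the macro is defined elsewhere. Because the definition sits behind #ifndef, the old behavior can still be selected from the compile line without editing this template; a sketch of how that interacts is below (the hipcc flag shown is illustrative, not taken from the repo's build scripts).

// Illustration only: a -D on the compile line wins over the generated default, e.g.
//   hipcc -DCK_USE_AMD_BUFFER_ADDRESSING=0 ...
// restores the non-buffer path without touching config.amd.hpp.in.
#ifndef CK_USE_AMD_BUFFER_ADDRESSING
#define CK_USE_AMD_BUFFER_ADDRESSING 1 // new default from this commit
#endif

#if CK_USE_AMD_BUFFER_ADDRESSING
// buffer-resource based loads/stores (amd_buffer_load_v2 / amd_buffer_store_v2)
#else
// ordinary pointer loads/stores
#endif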
composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -61,8 +61,13 @@ struct SetData
    {
        if(src_valid)
        {
#if 0
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
#else
            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
                *reinterpret_cast<const vector_t*>(&p_src[0x3fffffff & src_offset]);
#endif
        }
        else
        {
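The changed SetData path masks the source element offset with 0x3fffffff before dereferencing, clamping it to its low 30 bits, i.e. to the range [0, 0x40000000); the unmasked version stays behind #if 0. A standalone check of what the mask does (illustration only, not code from the commit):

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t src_offset = 0x92345678u;             // hypothetical offset with high bits set
    const uint32_t masked     = 0x3fffffff & src_offset; // keep only the low 30 bits
    assert(masked == 0x12345678u);
    assert(masked < 0x40000000u);
    return 0;
}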
driver/include/device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp (new file, 0 → 100644)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "gridwise_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
void device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(InDesc,
                                                                   const Tensor<T>& in_nchw,
                                                                   WeiDesc,
                                                                   const Tensor<T>& wei_kcyx,
                                                                   OutDesc,
                                                                   Tensor<T>& out_nkhw,
                                                                   ConvStrides,
                                                                   ConvDilations,
                                                                   InLeftPads,
                                                                   InRightPads,
                                                                   ck::index_t nrepeat)
{
    std::cout << "device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk" << std::endl;

    using namespace ck;

    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;

    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto N = OutDesc::GetLengths()[I0];
    constexpr auto K = OutDesc::GetLengths()[I1];
    constexpr auto C = WeiDesc::GetLengths()[I1];

    constexpr auto Hi = InDesc::GetLengths()[I2];
    constexpr auto Wi = InDesc::GetLengths()[I3];

    constexpr auto Ho = OutDesc::GetLengths()[I2];
    constexpr auto Wo = OutDesc::GetLengths()[I3];

    constexpr auto Y = WeiDesc::GetLengths()[I2];
    constexpr auto X = WeiDesc::GetLengths()[I3];

    // compile-time variables
    constexpr auto in_n_hi_wi_c_desc  = make_native_tensor_descriptor_packed(Sequence<N, Hi, Wi, C>{});
    constexpr auto wei_k_y_x_c_desc   = make_native_tensor_descriptor_packed(Sequence<K, Y, X, C>{});
    constexpr auto out_n_ho_wo_k_desc = make_native_tensor_descriptor_packed(Sequence<N, Ho, Wo, K>{});

    Tensor<float> in_nhwc(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Hi, Wi, C>{})));
    Tensor<float> wei_kyxc(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<K, Y, X, C>{})));
    Tensor<float> out_nhwk(
        make_HostTensorDescriptor(make_native_tensor_descriptor_packed(Sequence<N, Ho, Wo, K>{})));

    auto f_nchw2nhwc = [&](auto n, auto hi, auto wi, auto c) {
        in_nhwc(n, hi, wi, c) = in_nchw(n, c, hi, wi);
    };

    auto f_kcyx2kyxc = [&](auto k, auto y, auto x, auto c) {
        wei_kyxc(k, y, x, c) = wei_kcyx(k, c, y, x);
    };

    auto f_nkhw2nhwk = [&](auto n, auto ho, auto wo, auto k) {
        out_nhwk(n, ho, wo, k) = out_nkhw(n, k, ho, wo);
    };

    make_ParallelTensorFunctor(f_nchw2nhwc, N, Hi, Wi, C)(std::thread::hardware_concurrency());
    make_ParallelTensorFunctor(f_kcyx2kyxc, K, Y, X, C)(std::thread::hardware_concurrency());
    make_ParallelTensorFunctor(f_nkhw2nhwk, N, Ho, Wo, K)(std::thread::hardware_concurrency());

    std::size_t data_sz = sizeof(T);
    DeviceMem in_nhwc_device_buf(data_sz * in_nhwc.mDesc.GetElementSpace());
    DeviceMem wei_kyxc_device_buf(data_sz * wei_kyxc.mDesc.GetElementSpace());
    DeviceMem out_nhwk_device_buf(data_sz * out_nhwk.mDesc.GetElementSpace());

    in_nhwc_device_buf.ToDevice(in_nhwc.mData.data());
    wei_kyxc_device_buf.ToDevice(wei_kyxc.mData.data());
    out_nhwk_device_buf.ToDevice(out_nhwk.mData.data());

#if 1
    // cdata = 16, BlockSize = 64, 16x64x4
    constexpr index_t BlockSize = 64;

    constexpr index_t GemmMPerBlock = 16;
    constexpr index_t GemmNPerBlock = 64;
    constexpr index_t GemmKPerBlock = 4;

    constexpr index_t GemmMPerThread     = 2;
    constexpr index_t GemmNPerThread     = 2;
    constexpr index_t GemmKPerThread     = 1;
    constexpr index_t GemmMLevel0Cluster = 2;
    constexpr index_t GemmNLevel0Cluster = 2;
    constexpr index_t GemmMLevel1Cluster = 2;
    constexpr index_t GemmNLevel1Cluster = 8;

    constexpr index_t ThreadGemmDataPerReadM = 2;
    constexpr index_t ThreadGemmDataPerReadN = 2;

    using GemmABlockCopyThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
    using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

    constexpr index_t GemmABlockCopySrcDataPerRead_GemmK  = 1;
    constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 1;

    using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN   = Sequence<4, 1>;
    using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;

    constexpr index_t GemmBBlockCopySrcDataPerRead_GemmK  = 4;
    constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;

    constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmM1 = 2;
#endif

    constexpr index_t GemmM = K;
    constexpr index_t GemmN = N * Ho * Wo;

    constexpr index_t GridSize = math::integer_divide_ceil(GemmM, GemmMPerBlock) *
                                 math::integer_divide_ceil(GemmN, GemmNPerBlock);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    using gridwise_conv = GridwiseConvolutionForwardImplicitGemm_v4r4_nhwc_kyxc_nhwk<
        GridSize,
        BlockSize,
        TDevice,
        TDevice,
        decltype(in_n_hi_wi_c_desc),
        decltype(wei_k_y_x_c_desc),
        decltype(out_n_ho_wo_k_desc),
        ConvStrides,
        ConvDilations,
        InLeftPads,
        InRightPads,
        GemmMPerBlock,
        GemmNPerBlock,
        GemmKPerBlock,
        GemmMPerThread,
        GemmNPerThread,
        GemmKPerThread,
        GemmMLevel0Cluster,
        GemmNLevel0Cluster,
        GemmMLevel1Cluster,
        GemmNLevel1Cluster,
        ThreadGemmDataPerReadM,
        ThreadGemmDataPerReadN,
        GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
        GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
        GemmABlockCopySrcDataPerRead_GemmK,
        GemmABlockCopyDstDataPerWrite_GemmM,
        GemmBBlockCopyThreadSliceLengths_GemmK_GemmN,
        GemmBBlockCopyThreadClusterLengths_GemmK_GemmN,
        GemmBBlockCopySrcDataPerRead_GemmK,
        GemmBBlockCopyDstDataPerWrite_GemmN,
        GemmCThreadCopyDstDataPerWrite_GemmM1>;

    for(index_t i = 0; i < 5; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<gridwise_conv,
                                                 const TDevice* const __restrict__,
                                                 const TDevice* const __restrict__,
                                                 TDevice* const __restrict__>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          static_cast<TDevice*>(in_nhwc_device_buf.GetDeviceBuffer()),
                          static_cast<TDevice*>(wei_kyxc_device_buf.GetDeviceBuffer()),
                          static_cast<TDevice*>(out_nhwk_device_buf.GetDeviceBuffer()));
        }

        timer.End();

        float ave_time = timer.GetElapsedTime() / nrepeat;

        float perf = (float)calculate_convolution_flops(InDesc{}, WeiDesc{}, OutDesc{}) /
                     (std::size_t(1000) * 1000 * 1000) / ave_time;

        std::cout << "Average time : " << ave_time << " ms, " << perf << " TFlop/s" << std::endl;
    }

    out_nhwk_device_buf.FromDevice(out_nhwk.mData.data());

    auto f_nhwk2nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_nkhw(n, k, ho, wo) = out_nhwk(n, ho, wo, k);
    };

    make_ParallelTensorFunctor(f_nhwk2nkhw, N, K, Ho, Wo)(std::thread::hardware_concurrency());
}
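A note on the timing math above: GetElapsedTime is divided by nrepeat to give an average per-iteration time in milliseconds (per the "ms" print), and dividing the convolution FLOP count by 1e9 and then by that millisecond figure yields TFlop/s directly, since GFlop per ms equals TFlop per s. A standalone sketch of the arithmetic follows, assuming calculate_convolution_flops counts the usual 2 * N * K * C * Ho * Wo * Y * X multiply-accumulate operations (an assumption about that helper, not verified from this diff).

#include <cstdio>

int main()
{
    // FLOP count for the N=1, C=4, 1080x1920, K=16, 3x3 case enabled in conv_driver.cpp below,
    // assuming 2 FLOPs per multiply-accumulate.
    const double flops   = 2.0 * 1 * 16 * 4 * 1080 * 1920 * 3 * 3;
    const double time_ms = 0.5; // hypothetical measured average time
    const double tflops  = flops / (1000.0 * 1000.0 * 1000.0) / time_ms; // GFlop/ms == TFlop/s
    std::printf("%.3f TFlop/s\n", tflops);
    return 0;
}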
driver/src/conv_driver.cpp
@@ -13,6 +13,7 @@
#include "device_tensor.hpp"
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk.hpp"

@@ -74,6 +75,20 @@ int main(int argc, char* argv[])
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
#elif 1
    constexpr index_t N  = 1;
    constexpr index_t C  = 4;
    constexpr index_t HI = 1080;
    constexpr index_t WI = 1920;
    constexpr index_t K  = 16;
    constexpr index_t Y  = 3;
    constexpr index_t X  = 3;

    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;

    using LeftPads  = Sequence<1, 1>;
    using RightPads = Sequence<1, 1>;
#elif 0

@@ -686,6 +701,18 @@ int main(int argc, char* argv[])
        LeftPads{},
        RightPads{},
        nrepeat);
#elif 0
    device_convolution_forward_implicit_gemm_v4r4_nhwc_kyxc_nhwk(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
                                                                 wei_kcyx,
                                                                 out_nkhw_desc,
                                                                 out_nkhw_device,
                                                                 ConvStrides{},
                                                                 ConvDilations{},
                                                                 LeftPads{},
                                                                 RightPads{},
                                                                 nrepeat);
#elif 1
    device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(in_nchw_desc,
                                                                         in_nchw,
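For the N=1, C=4, 1080x1920, K=16, 3x3 case enabled above (stride 1, dilation 1, pad 1 on each side), the implicit-GEMM problem handed to the new kernel works out to GemmM = 16, GemmN = 1 * 1080 * 1920 = 2,073,600, GemmK = 4 * 3 * 3 = 36, and with the 16x64x4 block tile chosen in the device driver the grid is 32,400 workgroups. A standalone arithmetic sketch (illustration only, not code from the commit):

#include <cstdio>

int main()
{
    // Output size for a 3x3 filter, stride 1, dilation 1, pad 1 per side ("same" convolution).
    const int Hi = 1080, Wi = 1920, Y = 3, X = 3;
    const int Ho = (Hi + 1 + 1 - 1 * (Y - 1) - 1) / 1 + 1; // 1080
    const int Wo = (Wi + 1 + 1 - 1 * (X - 1) - 1) / 1 + 1; // 1920

    // Implicit-GEMM sizes per the kernel's mapping (GemmM = K, GemmN = N*Ho*Wo, GemmK = C*Y*X).
    const long N = 1, C = 4, K = 16;
    const long GemmM = K;
    const long GemmN = N * Ho * Wo;
    const long GemmK = C * Y * X;

    // Grid size with the GemmMPerBlock=16, GemmNPerBlock=64 tile from the device driver.
    const long GridSize = ((GemmM + 16 - 1) / 16) * ((GemmN + 64 - 1) / 64);

    std::printf("GemmM %ld GemmN %ld GemmK %ld GridSize %ld\n", GemmM, GemmN, GemmK, GridSize);
    return 0;
}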