Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
aba0880d
Unverified
Commit
aba0880d
authored
Aug 21, 2023
by
zjing14
Committed by
GitHub
Aug 21, 2023
Browse files
Merge branch 'develop' into lwpck-850
parents
c3d720cc
d52ec016
Changes
91
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1734 additions
and
421 deletions
+1734
-421
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
+62
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
+147
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
+62
-0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
+62
-0
include/ck/ck.hpp
include/ck/ck.hpp
+3
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
.../ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
+370
-0
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
.../ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
+1
-1
include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
...ude/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
+39
-0
include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
+10
-7
include/ck/tensor_operation/gpu/device/device_put_element.hpp
...ude/ck/tensor_operation/gpu/device/device_put_element.hpp
+1
-1
include/ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp
include/ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp
+18
-0
include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
...tion/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
+575
-0
include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
...e_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
+3
-2
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
...de/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
+24
-10
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
.../tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
+133
-0
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
...tion/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
+4
-2
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
...sor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
+5
-2
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
...u/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+4
-2
include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
...operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
+58
-295
include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
...eration/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
+153
-99
No files found.
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using
DOutDataType
=
ck
::
bhalf_t
;
using
DInDataType
=
ck
::
bhalf_t
;
using
ComputeDataType
=
float
;
#if 1
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
#else
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
#endif
using
DevicePoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceAvgPool3dBwd_NDHWC_NDHWC
<
DOutDataType
,
DInDataType
,
ComputeDataType
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
1
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
int
main
()
{
std
::
vector
<
ck
::
index_t
>
window_lengths
=
{
5
,
5
,
5
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
window_dilations
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
dinput_left_pads
=
{
0
,
0
,
0
};
std
::
vector
<
ck
::
index_t
>
dinput_right_pads
=
{
0
,
0
,
0
};
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
16
;
ck
::
index_t
Di
=
40
;
ck
::
index_t
Hi
=
40
;
ck
::
index_t
Wi
=
40
;
pool3d_bwd_test
<
DevicePoolBwdInstance
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
(
true
,
false
,
N
,
C
,
Di
,
Hi
,
Wi
,
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
}
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"
template
<
typename
TensorLayout
>
std
::
vector
<
ck
::
index_t
>
f_tensor_strides_ncdhw
(
ck
::
index_t
N_
,
ck
::
index_t
C_
,
ck
::
index_t
D
,
ck
::
index_t
H
,
ck
::
index_t
W
,
TensorLayout
layout
)
{
using
namespace
ck
::
literals
;
(
void
)
N_
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
return
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
};
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
return
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
};
};
template
<
typename
TensorLayout
>
HostTensorDescriptor
f_host_tensor_descriptor
(
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
,
TensorLayout
layout
)
{
using
namespace
ck
::
literals
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
}
};
template
<
typename
DevicePoolBwdInstance
,
typename
DOutDataType
,
typename
DInDataType
,
typename
DOutLayout
,
typename
DInLayout
>
bool
pool3d_bwd_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Di
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
dinput_left_pads
,
std
::
vector
<
ck
::
index_t
>
dinput_right_pads
)
{
auto
OutSpatialLength
=
[
&
](
auto
InSpatialLength
,
int
index
)
{
ck
::
index_t
left_pad
=
dinput_left_pads
[
index
];
ck
::
index_t
right_pad
=
dinput_right_pads
[
index
];
ck
::
index_t
window_len
=
window_lengths
[
index
];
ck
::
index_t
stride
=
window_strides
[
index
];
ck
::
index_t
dilation
=
window_dilations
[
index
];
ck
::
index_t
eff
=
(
window_len
-
1
)
*
dilation
+
1
;
return
(
InSpatialLength
+
left_pad
+
right_pad
-
eff
)
/
stride
+
1
;
};
ck
::
index_t
Do
=
OutSpatialLength
(
Di
,
0
);
ck
::
index_t
Ho
=
OutSpatialLength
(
Hi
,
1
);
ck
::
index_t
Wo
=
OutSpatialLength
(
Wi
,
2
);
Tensor
<
DOutDataType
>
dout
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
DOutLayout
{}));
Tensor
<
DInDataType
>
din_dev
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}));
Tensor
<
DInDataType
>
din_host
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}));
std
::
cout
<<
"dout: "
<<
dout
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"din_host: "
<<
din_host
.
mDesc
<<
std
::
endl
;
dout
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
0.0
,
1.0
});
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_dev
.
mDesc
.
GetElementSpaceSize
());
dout_device_buf
.
ToDevice
(
dout
.
mData
.
data
());
din_device_buf
.
SetZero
();
auto
pool
=
DevicePoolBwdInstance
{};
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
N
,
C
,
Di
,
Hi
,
Wi
},
f_tensor_strides_ncdhw
(
N
,
C
,
Do
,
Ho
,
Wo
,
DOutLayout
{}),
f_tensor_strides_ncdhw
(
N
,
C
,
Di
,
Hi
,
Wi
,
DInLayout
{}),
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
if
(
!
pool
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! device_op with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Perf: "
<<
ave_time
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
auto
ref_pool
=
ck
::
tensor_operation
::
host
::
ReferenceAvgPoolBwd
<
3
,
DInDataType
,
DOutDataType
>
();
auto
ref_invoker
=
ref_pool
.
MakeInvoker
();
auto
ref_argument
=
ref_pool
.
MakeArgument
(
din_host
,
dout
,
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
ref_invoker
.
Run
(
ref_argument
);
din_device_buf
.
FromDevice
(
din_dev
.
mData
.
data
());
pass
=
ck
::
utils
::
check_err
(
din_dev
,
din_host
);
}
return
pass
;
}
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using
DOutDataType
=
ck
::
half_t
;
using
DInDataType
=
ck
::
half_t
;
using
ComputeDataType
=
float
;
#if 1
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
#else
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
#endif
using
DevicePoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceAvgPool3dBwd_NDHWC_NDHWC
<
DOutDataType
,
DInDataType
,
ComputeDataType
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
1
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
int
main
()
{
std
::
vector
<
ck
::
index_t
>
window_lengths
=
{
5
,
5
,
5
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
window_dilations
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
dinput_left_pads
=
{
0
,
0
,
0
};
std
::
vector
<
ck
::
index_t
>
dinput_right_pads
=
{
0
,
0
,
0
};
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
16
;
ck
::
index_t
Di
=
40
;
ck
::
index_t
Hi
=
40
;
ck
::
index_t
Wi
=
40
;
pool3d_bwd_test
<
DevicePoolBwdInstance
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
(
true
,
false
,
N
,
C
,
Di
,
Hi
,
Wi
,
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
}
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"
#include "avgpool3d_bwd_common.hpp"
using
DOutDataType
=
float
;
using
DInDataType
=
float
;
using
ComputeDataType
=
float
;
#if 1
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWC
;
#else
using
DOutLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
using
DInLayout
=
ck
::
tensor_layout
::
convolution
::
NCDHW
;
#endif
using
DevicePoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceAvgPool3dBwd_NDHWC_NDHWC
<
DOutDataType
,
DInDataType
,
ComputeDataType
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
1
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
int
main
()
{
std
::
vector
<
ck
::
index_t
>
window_lengths
=
{
5
,
5
,
5
};
std
::
vector
<
ck
::
index_t
>
window_strides
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
window_dilations
=
{
2
,
2
,
2
};
std
::
vector
<
ck
::
index_t
>
dinput_left_pads
=
{
0
,
0
,
0
};
std
::
vector
<
ck
::
index_t
>
dinput_right_pads
=
{
0
,
0
,
0
};
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
16
;
ck
::
index_t
Di
=
40
;
ck
::
index_t
Hi
=
40
;
ck
::
index_t
Wi
=
40
;
pool3d_bwd_test
<
DevicePoolBwdInstance
,
DOutDataType
,
DInDataType
,
DOutLayout
,
DInLayout
>
(
true
,
false
,
N
,
C
,
Di
,
Hi
,
Wi
,
window_lengths
,
window_strides
,
window_dilations
,
dinput_left_pads
,
dinput_right_pads
);
}
include/ck/ck.hpp
View file @
aba0880d
...
...
@@ -125,6 +125,9 @@
// `s_nop`s to avoid hazard
#define CK_USE_AMD_V_DOT_INLINE_ASM 0
// inner product using V_DOT with DPP8 modifiers
#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
...
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"
namespace
ck
{
/**
* DPP8 version of blockwise GEMM algorithm. It uses DPP8 instruction modifier to limit
* the data loaded from LDS to registers.
*
* The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits the matrix C
* between them in such a way that threads from the same group need the same chunk of either
* matrix A (or B, respectively). Without the usage of DPP8, each thread would need to load the
* whole chunk from LDS to its own register space.
* Usage of DPP8 modifiers allow each thread to load less data, exactly `1 / dpp8::lane_group_size`
* of the chunk, and then share that data with other threads from the same lane group.
*
* Assumptions coming from the usage of DPP8:
* 1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
* `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size` -
* - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
* matrix A or B;
* - based on these values we determine which matrix to share.
* 2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
* `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B) -
* - we have to make sure that the data to split is divisible by the number of
* threads in the group.
*
* General algorithm:
* C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
* A and B are visible to the whole block, C is distributed among each thread
* Assume:
* 1. A:
* 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
* 2. ABlockBuffer is DynamicBuffer
* 2. B:
* 1. BBlockDesc_BK0_BN_BK1 is known at compile-time
* 2. BBlockBuffer is DynamicBuffer
* 3. C:
* 1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
* 2. CThreadBuffer is StaticBuffer
* 4. BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2
*/
template
<
index_t
BlockSize
,
typename
FloatA
,
typename
FloatB
,
typename
FloatC
,
typename
ABlockDesc_BK0_BM_BK1
,
typename
BBlockDesc_BK0_BN_BK1
,
index_t
BM1PerThreadBM11
,
index_t
BN1PerThreadBN11
,
index_t
BK0PerThread
,
typename
BM10BN10ThreadClusterBM10Xs
,
// Sequence<BM10BN10ThreadClusterBM100,
// BM10BN10ThreadClusterBM101, ...>
typename
BM10BN10ThreadClusterBN10Xs
,
// Sequence<BM10BN10ThreadClusterBN100,
// BM10BN10ThreadClusterBN101, ...>
index_t
AThreadCopyScalarPerVector_BM11
,
index_t
BThreadCopyScalarPerVector_BN11
,
typename
enable_if
<
ABlockDesc_BK0_BM_BK1
::
IsKnownAtCompileTime
()
&&
BBlockDesc_BK0_BN_BK1
::
IsKnownAtCompileTime
(),
bool
>
::
type
=
false
>
struct
BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
{
using
AIndex
=
MultiIndex
<
4
>
;
using
BIndex
=
MultiIndex
<
4
>
;
using
CIndex
=
MultiIndex
<
4
>
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
index_t
BK0
=
ABlockDesc_BK0_BM_BK1
{}.
GetLength
(
I0
);
static
constexpr
index_t
BK1
=
ABlockDesc_BK0_BM_BK1
{}.
GetLength
(
I2
);
static
constexpr
index_t
BM
=
ABlockDesc_BK0_BM_BK1
{}.
GetLength
(
I1
);
static
constexpr
index_t
BN
=
BBlockDesc_BK0_BN_BK1
{}.
GetLength
(
I1
);
static
constexpr
index_t
BM100
=
BM10BN10ThreadClusterBM10Xs
{}[
I0
];
static
constexpr
index_t
BN100
=
BM10BN10ThreadClusterBN10Xs
{}[
I0
];
static
constexpr
index_t
BM101
=
BM10BN10ThreadClusterBM10Xs
{}[
I1
];
static
constexpr
index_t
BN101
=
BM10BN10ThreadClusterBN10Xs
{}[
I1
];
static
constexpr
index_t
BM11
=
BM1PerThreadBM11
;
static
constexpr
index_t
BN11
=
BN1PerThreadBN11
;
static
constexpr
index_t
BM1
=
BM100
*
BM101
*
BM11
;
static
constexpr
index_t
BN1
=
BN100
*
BN101
*
BN11
;
static
constexpr
index_t
BM0
=
BM
/
BM1
;
static
constexpr
index_t
BN0
=
BN
/
BN1
;
// We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. It makes all
// threads in a lane group need the same chunk of B or A matrices and we can share them using
// DPP.
static_assert
(
BM101
==
dpp8
::
lane_group_size
||
BN101
==
dpp8
::
lane_group_size
);
static
constexpr
bool
ShareB
=
BM101
==
dpp8
::
lane_group_size
?
true
:
false
;
static
constexpr
bool
ShareA
=
!
ShareB
;
// If DPP shares A (B, respectively), lane group gets `BM1PerThreadBM11` (`BN1PerThreadBN11`,
// respectively) elements, so we split them between threads in lane group so each thread loads
// less data from LDS.
static
constexpr
index_t
BM1PerThread
=
ShareA
?
BM1PerThreadBM11
/
dpp8
::
lane_group_size
:
BM1PerThreadBM11
;
static
constexpr
index_t
BN1PerThread
=
ShareB
?
BN1PerThreadBN11
/
dpp8
::
lane_group_size
:
BN1PerThreadBN11
;
__host__
__device__
static
constexpr
auto
MakeABlockDescriptor_BK0_BM0_BM1_BK1
(
const
ABlockDesc_BK0_BM_BK1
&
a_block_desc_bk0_bm_bk1
)
{
const
auto
a_block_bk0_bm0_bm1_bk1
=
transform_tensor_descriptor
(
a_block_desc_bk0_bm_bk1
,
make_tuple
(
make_pass_through_transform
(
Number
<
BK0
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
BM0
>
{},
Number
<
BM1
>
{})),
make_pass_through_transform
(
Number
<
BK1
>
{})),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
return
a_block_bk0_bm0_bm1_bk1
;
}
__host__
__device__
static
constexpr
auto
MakeBBlockDescriptor_BK0_BN0_BN1_BK1
(
const
BBlockDesc_BK0_BN_BK1
&
b_block_desc_bk0_bn_bk1
)
{
const
auto
b_block_desc_bk0_bn0_bn1_bk1
=
transform_tensor_descriptor
(
b_block_desc_bk0_bn_bk1
,
make_tuple
(
make_pass_through_transform
(
Number
<
BK0
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
BN0
>
{},
Number
<
BN1
>
{})),
make_pass_through_transform
(
Number
<
BK1
>
{})),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
>
{}));
return
b_block_desc_bk0_bn0_bn1_bk1
;
}
__host__
__device__
static
constexpr
auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN
()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM, BN]
constexpr
auto
c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_unmerge_transform
(
make_tuple
(
Number
<
BM0
>
{},
Number
<
BM100
>
{},
Number
<
BM101
>
{},
Number
<
BM11
>
{})),
make_unmerge_transform
(
make_tuple
(
Number
<
BN0
>
{},
Number
<
BN100
>
{},
Number
<
BN101
>
{},
Number
<
BN11
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{},
Sequence
<
4
,
5
,
6
,
7
>
{}));
return
c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n
;
}
__host__
__device__
static
constexpr
auto
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1
()
{
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// lower: [BM0, BM1, BN0, BN1]
constexpr
auto
c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_pass_through_transform
(
Number
<
BM0
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
BM100
>
{},
Number
<
BM101
>
{},
Number
<
BM11
>
{})),
make_pass_through_transform
(
Number
<
BN0
>
{}),
make_unmerge_transform
(
make_tuple
(
Number
<
BN100
>
{},
Number
<
BN101
>
{},
Number
<
BN11
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
,
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
,
6
,
7
>
{}));
return
c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1
;
}
__host__
__device__
static
constexpr
auto
GetCThreadTensorLengths_BM0_BM1_BN0_BN1
()
{
return
Sequence
<
BM0
,
BM11
,
BN0
,
BN11
>
{};
}
static
constexpr
auto
a_block_desc_bk0_bm0_bm1_bk1_
=
MakeABlockDescriptor_BK0_BM0_BM1_BK1
(
ABlockDesc_BK0_BM_BK1
{});
static
constexpr
auto
b_block_desc_bk0_bn0_bn1_bk1_
=
MakeBBlockDescriptor_BK0_BN0_BN1_BK1
(
BBlockDesc_BK0_BN_BK1
{});
public:
__device__
BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
()
:
c_thread_origin_data_idx_
{
CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1
(
get_thread_local_1d_id
())},
a_thread_copy_
{
CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1
()},
b_thread_copy_
{
CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1
()}
{
static_assert
(
ABlockDesc_BK0_BM_BK1
::
IsKnownAtCompileTime
()
&&
BBlockDesc_BK0_BN_BK1
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
static_assert
(
BM
%
BM1
==
0
&&
BN
%
BN1
==
0
,
"wrong!"
);
static_assert
(
ABlockDesc_BK0_BM_BK1
{}.
GetLength
(
I0
)
==
BBlockDesc_BK0_BN_BK1
{}.
GetLength
(
I0
),
"wrong! K dimension not consistent"
);
static_assert
(
BM10BN10ThreadClusterBM10Xs
::
Size
()
==
2
&&
BM10BN10ThreadClusterBN10Xs
::
Size
()
==
2
,
"wrong!"
);
}
__device__
static
CIndex
CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1
(
index_t
thread_id
)
{
// lower: [BM0, BM1, BN0, BN1]
// upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
constexpr
auto
adaptor0
=
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1
();
// lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
// upper: [Tid, BM0, BM11, BN0, BN11]
constexpr
auto
adaptor1
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
BM100
,
BN100
,
BM101
,
BN101
)),
make_pass_through_transform
(
BM0
),
make_pass_through_transform
(
BM11
),
make_pass_through_transform
(
BN0
),
make_pass_through_transform
(
BN11
)),
make_tuple
(
Sequence
<
1
,
5
,
2
,
6
>
{},
Sequence
<
0
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
7
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
constexpr
auto
adaptor
=
chain_tensor_adaptors
(
adaptor0
,
adaptor1
);
return
adaptor
.
CalculateBottomIndex
(
make_multi_index
(
thread_id
,
0
,
0
,
0
,
0
));
}
__device__
AIndex
CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1
()
{
const
auto
offsetBM0
=
c_thread_origin_data_idx_
[
I0
];
// If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
const
auto
offsetBM1
=
ShareA
?
c_thread_origin_data_idx_
[
I1
]
+
dpp8
::
get_thread_idx_in_lane_group
()
*
BM1PerThread
:
c_thread_origin_data_idx_
[
I1
];
return
make_tuple
(
0
,
offsetBM0
,
offsetBM1
,
0
);
}
__device__
BIndex
CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1
()
{
const
auto
offsetBN0
=
c_thread_origin_data_idx_
[
I2
];
// If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
const
auto
offsetBN1
=
ShareB
?
c_thread_origin_data_idx_
[
I3
]
+
dpp8
::
get_thread_idx_in_lane_group
()
*
BN1PerThread
:
c_thread_origin_data_idx_
[
I3
];
return
make_tuple
(
0
,
offsetBN0
,
offsetBN1
,
0
);
}
template
<
typename
CThreadDesc_BM0_BM11_BN0_BN11
,
typename
ABlockBuffer
,
typename
BBlockBuffer
,
typename
CThreadBuffer
>
__device__
void
Run
(
const
CThreadDesc_BM0_BM11_BN0_BN11
&
,
const
ABlockBuffer
&
a_block_buf
,
const
BBlockBuffer
&
b_block_buf
,
CThreadBuffer
&
c_thread_buf
)
const
{
static_assert
(
CThreadDesc_BM0_BM11_BN0_BN11
::
IsKnownAtCompileTime
(),
"wrong! Desc should be known at compile-time"
);
auto
a_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatA
>
(
a_thread_desc_bk0_bm0_bm1_bk1_
.
GetElementSpaceSize
());
auto
b_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatB
>
(
b_thread_desc_bk0_bn0_bn1_bk1_
.
GetElementSpaceSize
());
constexpr
auto
threadwise_contraction
=
ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
<
FloatA
,
FloatB
,
FloatC
,
decltype
(
a_thread_desc_bk0_bm0_bm1_bk1_
),
decltype
(
b_thread_desc_bk0_bn0_bn1_bk1_
),
CThreadDesc_BM0_BM11_BN0_BN11
,
Sequence
<
BK0PerThread
,
BK1
>
,
Sequence
<
1
,
BM1PerThreadBM11
>
,
Sequence
<
1
,
BN1PerThreadBN11
>
,
ShareA
>
{};
static_for
<
0
,
BN0
,
1
>
{}([
&
](
auto
bn0
)
{
static_for
<
0
,
BM0
,
1
>
{}([
&
](
auto
bm0
)
{
a_thread_copy_
.
Run
(
a_block_desc_bk0_bm0_bm1_bk1_
,
make_tuple
(
I0
,
bm0
,
I0
,
I0
),
a_block_buf
,
a_thread_desc_bk0_bm0_bm1_bk1_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
a_thread_buf
);
b_thread_copy_
.
Run
(
b_block_desc_bk0_bn0_bn1_bk1_
,
make_tuple
(
I0
,
bn0
,
I0
,
I0
),
b_block_buf
,
b_thread_desc_bk0_bn0_bn1_bk1_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_buf
);
threadwise_contraction
.
Run
(
a_thread_buf
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_buf
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
make_tuple
(
bm0
,
I0
,
bn0
,
I0
));
static_for
<
BK0PerThread
,
BK0
,
BK0PerThread
>
{}([
&
](
auto
bk0
)
{
a_thread_copy_
.
Run
(
a_block_desc_bk0_bm0_bm1_bk1_
,
make_tuple
(
bk0
,
bm0
,
I0
,
I0
),
a_block_buf
,
a_thread_desc_bk0_bm0_bm1_bk1_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
a_thread_buf
);
b_thread_copy_
.
Run
(
b_block_desc_bk0_bn0_bn1_bk1_
,
make_tuple
(
bk0
,
bn0
,
I0
,
I0
),
b_block_buf
,
b_thread_desc_bk0_bn0_bn1_bk1_
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_buf
);
threadwise_contraction
.
Run
(
a_thread_buf
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
b_thread_buf
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c_thread_buf
,
make_tuple
(
bm0
,
I0
,
bn0
,
I0
));
});
});
});
}
private:
// A[BK0, BM0, BM1, BK1]
static
constexpr
auto
a_thread_desc_bk0_bm0_bm1_bk1_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
BK0PerThread
>
{},
Number
<
BM0
>
{},
Number
<
BM1PerThread
>
{},
Number
<
BK1
>
{}));
// B[BK0, BN0, BN1, BK1]
static
constexpr
auto
b_thread_desc_bk0_bn0_bn1_bk1_
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
BK0PerThread
>
{},
Number
<
BN0
>
{},
Number
<
BN1PerThread
>
{},
Number
<
BK1
>
{}));
using
AThreadCopy
=
ThreadwiseTensorSliceTransfer_v4r1
<
FloatA
,
FloatA
,
decltype
(
a_block_desc_bk0_bm0_bm1_bk1_
),
decltype
(
a_thread_desc_bk0_bm0_bm1_bk1_
),
Sequence
<
BK0PerThread
,
1
,
BM1PerThread
,
BK1
>
,
// SliceLengths
Sequence
<
0
,
1
,
2
,
3
>
,
// DimAccessOrder
Sequence
<
1
,
1
,
BM1PerThread
,
BK1
>
,
// SrcVectorTensorLengths
Sequence
<
0
,
1
,
2
,
3
>>
;
// SrcVectorTensorContiguousDimOrder
using
BThreadCopy
=
ThreadwiseTensorSliceTransfer_v4r1
<
FloatB
,
FloatB
,
decltype
(
b_block_desc_bk0_bn0_bn1_bk1_
),
decltype
(
b_thread_desc_bk0_bn0_bn1_bk1_
),
Sequence
<
BK0PerThread
,
1
,
BN1PerThread
,
BK1
>
,
// SliceLengths
Sequence
<
0
,
1
,
2
,
3
>
,
// DimAccessOrder
Sequence
<
1
,
1
,
BN1PerThread
,
BK1
>
,
// SrcVectorTensorLengths
Sequence
<
0
,
1
,
2
,
3
>>
;
// SrcVectorTensorContiguousDimOrder
CIndex
c_thread_origin_data_idx_
;
AThreadCopy
a_thread_copy_
;
BThreadCopy
b_thread_copy_
;
};
}
// namespace ck
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp
View file @
aba0880d
...
...
@@ -11,7 +11,7 @@
namespace
ck
{
// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
// A and B are vis
a
ble to the whole block, C is distributed among each thread
// A and B are vis
i
ble to the whole block, C is distributed among each thread
// Assume:
// 1. A:
// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time
...
...
include/ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
index_t
NDimSpatial
,
typename
DOutDataType
,
typename
DInDataType
,
typename
DOutLayout
,
typename
DInLayout
>
struct
DeviceAvgPoolBwd
:
public
BaseOperator
{
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
void
*
p_din
,
std
::
vector
<
ck
::
index_t
>
dout_n_k_wos_lengths
,
std
::
vector
<
ck
::
index_t
>
dout_n_k_wos_strides
,
std
::
vector
<
ck
::
index_t
>
din_n_k_wos_length
,
std
::
vector
<
ck
::
index_t
>
din_n_k_wos_strides
,
std
::
vector
<
ck
::
index_t
>
window_k_c_xs_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_pool_fwd.hpp
View file @
aba0880d
...
...
@@ -17,6 +17,8 @@ template <index_t InOutRank,
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
InLayout
,
typename
OutLayout
,
ReduceTensorOp
ReduceOpId
,
bool
OutputIndex
>
struct
DevicePoolFwd
:
public
BaseOperator
...
...
@@ -25,13 +27,14 @@ struct DevicePoolFwd : public BaseOperator
MakeArgumentPointer
(
const
void
*
p_in_dev
,
void
*
p_out_dev
,
void
*
p_out_indices_dev
,
std
::
vector
<
ck
::
index_t
>
input_lengths
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
output_lengths
,
std
::
vector
<
ck
::
index_t
>
input_stride
,
std
::
vector
<
ck
::
index_t
>
output_stride
,
std
::
vector
<
ck
::
index_t
>
indices_stride
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
input_n_c_wis_lengths
,
std
::
vector
<
ck
::
index_t
>
window_xs_lengths
,
std
::
vector
<
ck
::
index_t
>
output_n_c_wos_lengths
,
std
::
vector
<
ck
::
index_t
>
input_n_c_wis_stride
,
std
::
vector
<
ck
::
index_t
>
output_n_c_wis_stride
,
std
::
vector
<
ck
::
index_t
>
indices_n_c_wis_stride
,
std
::
vector
<
ck
::
index_t
>
window_xs_strides
,
std
::
vector
<
ck
::
index_t
>
window_xs_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
,
std
::
vector
<
ck
::
index_t
>
pooling_dims
)
=
0
;
...
...
include/ck/tensor_operation/gpu/device/device_put_element.hpp
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
library/src
/tensor_operation
_instance/gpu/pool_fwd/device_avg_pool2d_fwd_nhwc_f16_instance.c
pp
→
include/ck
/tensor_operation
/gpu/device/gemm_dl_algorithm.h
pp
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#
include "pool_fwd_instance_common.hpp"
#
pragma once
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
static
constexpr
auto
ReduceOpId
=
ck
::
ReduceTensorOp
::
AVG
;
void
add_device_pool2d_fwd_nhwc_f16_instances
(
std
::
vector
<
std
::
unique_ptr
<
DevicePoolFwd
<
4
,
2
,
F16
,
F16
,
I32
,
ReduceOpId
,
false
>>>&
instances
)
enum
struct
GemmDlAlgorithm
{
add_device_operation_instances
(
instances
,
device_pool2d_fwd_nhwc_instances
<
F16
,
F16
,
I32
,
F32
,
ReduceOpId
,
false
>
{});
}
Default
,
// Uses DOT vector instructions
Dpp8
,
// Uses DOT vector instructions with DPP8 SEL modifier to reduce data loads from LDS
}
;
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_avgpool_bwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
// In and Din = [N, C, Di, Hi, Wi]
// Out and Dout = [N, C, Do, Ho, Wo]
// Out = AvgPoolFwd(In)
// Din = AvgPoolBwd(Dout)
// Pooling dimension = D, H, W
template
<
typename
DOutDataType
,
typename
DInDataType
,
typename
ComputeDataType
,
ck
::
index_t
BlockSize
,
ck
::
index_t
MThreadClusterSize
,
ck
::
index_t
KThreadClusterSize
,
ck
::
index_t
MThreadSliceSize
,
ck
::
index_t
KThreadSliceSize
,
ck
::
index_t
InSrcOutDstVectorSize
>
struct
DeviceAvgPool3dBwd_NDHWC_NDHWC
:
public
DeviceAvgPoolBwd
<
3
,
DOutDataType
,
DInDataType
,
tensor_layout
::
convolution
::
NDHWC
,
tensor_layout
::
convolution
::
NDHWC
>
{
static
constexpr
ck
::
index_t
NDimSpatial
=
3
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
ck
::
index_t
M_BlockTileSize
=
MThreadClusterSize
*
MThreadSliceSize
;
static
constexpr
ck
::
index_t
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
static
auto
Make3DGridDescriptor_Out_M_K_In_M
(
const
std
::
vector
<
ck
::
index_t
>&
dout_n_c_wos_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
din_n_c_wos_length
,
const
std
::
vector
<
ck
::
index_t
>&
dout_n_c_wos_strides
,
const
std
::
vector
<
ck
::
index_t
>&
din_n_c_wos_strides
,
const
std
::
vector
<
ck
::
index_t
>&
window_lengths
,
const
std
::
vector
<
ck
::
index_t
>&
window_strides
,
const
std
::
vector
<
ck
::
index_t
>&
window_dilations
,
const
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
const
std
::
vector
<
ck
::
index_t
>&
input_right_pads
,
const
std
::
vector
<
ck
::
index_t
>&
tildes
)
{
index_t
i_ztilde
=
tildes
[
0
];
index_t
i_ytilde
=
tildes
[
1
];
index_t
i_xtilde
=
tildes
[
2
];
const
index_t
N
=
dout_n_c_wos_lengths
[
0
];
const
index_t
C
=
dout_n_c_wos_lengths
[
1
];
const
index_t
Di
=
din_n_c_wos_length
[
2
];
const
index_t
Hi
=
din_n_c_wos_length
[
3
];
const
index_t
Wi
=
din_n_c_wos_length
[
4
];
const
index_t
Do
=
dout_n_c_wos_lengths
[
2
];
const
index_t
Ho
=
dout_n_c_wos_lengths
[
3
];
const
index_t
Wo
=
dout_n_c_wos_lengths
[
4
];
const
index_t
Z
=
window_lengths
[
0
];
const
index_t
Y
=
window_lengths
[
1
];
const
index_t
X
=
window_lengths
[
2
];
const
index_t
InLeftPadD
=
input_left_pads
[
0
];
const
index_t
InLeftPadH
=
input_left_pads
[
1
];
const
index_t
InLeftPadW
=
input_left_pads
[
2
];
const
index_t
InRightPadD
=
input_right_pads
[
0
];
const
index_t
InRightPadH
=
input_right_pads
[
1
];
const
index_t
InRightPadW
=
input_right_pads
[
2
];
const
index_t
ConvStrideD
=
window_strides
[
0
];
const
index_t
ConvStrideH
=
window_strides
[
1
];
const
index_t
ConvStrideW
=
window_strides
[
2
];
const
index_t
ConvDilationD
=
window_dilations
[
0
];
const
index_t
ConvDilationH
=
window_dilations
[
1
];
const
index_t
ConvDilationW
=
window_dilations
[
2
];
const
auto
out_n_do_ho_wo_c_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
,
C
),
make_tuple
(
dout_n_c_wos_strides
[
0
],
dout_n_c_wos_strides
[
2
],
dout_n_c_wos_strides
[
3
],
dout_n_c_wos_strides
[
4
],
dout_n_c_wos_strides
[
1
]));
const
auto
GcdStrideDilationD
=
math
::
gcd
(
ConvStrideD
,
ConvDilationD
);
const
auto
GcdStrideDilationH
=
math
::
gcd
(
ConvStrideH
,
ConvDilationH
);
const
auto
GcdStrideDilationW
=
math
::
gcd
(
ConvStrideW
,
ConvDilationW
);
const
auto
ZTilde
=
ConvStrideD
/
GcdStrideDilationD
;
const
auto
YTilde
=
ConvStrideH
/
GcdStrideDilationH
;
const
auto
XTilde
=
ConvStrideW
/
GcdStrideDilationW
;
const
auto
ZDot
=
math
::
integer_divide_ceil
(
Z
,
ZTilde
);
const
auto
YDot
=
math
::
integer_divide_ceil
(
Y
,
YTilde
);
const
auto
XDot
=
math
::
integer_divide_ceil
(
X
,
XTilde
);
const
auto
DTilde
=
Do
+
math
::
integer_divide_ceil
(
ConvDilationD
*
(
Z
-
I1
),
ConvStrideD
);
const
auto
HTilde
=
Ho
+
math
::
integer_divide_ceil
(
ConvDilationH
*
(
Y
-
I1
),
ConvStrideH
);
const
auto
WTilde
=
Wo
+
math
::
integer_divide_ceil
(
ConvDilationW
*
(
X
-
I1
),
ConvStrideW
);
// only work on Tildes that contribute to non-padding area of input tensor
const
auto
IDTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadD
-
ConvDilationD
*
(
ZTilde
-
I1
)),
ConvStrideD
);
const
auto
IHTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadH
-
ConvDilationH
*
(
YTilde
-
I1
)),
ConvStrideH
);
const
auto
IWTildeSliceBegin
=
math
::
integer_divide_floor
(
math
::
max
(
I0
,
InLeftPadW
-
ConvDilationW
*
(
XTilde
-
I1
)),
ConvStrideW
);
const
auto
IDTildeSliceEnd
=
math
::
min
(
DTilde
,
math
::
integer_divide_ceil
(
InLeftPadD
+
Di
-
I1
,
ConvStrideD
)
+
I1
);
const
auto
IHTildeSliceEnd
=
math
::
min
(
HTilde
,
math
::
integer_divide_ceil
(
InLeftPadH
+
Hi
-
I1
,
ConvStrideH
)
+
I1
);
const
auto
IWTildeSliceEnd
=
math
::
min
(
WTilde
,
math
::
integer_divide_ceil
(
InLeftPadW
+
Wi
-
I1
,
ConvStrideW
)
+
I1
);
const
auto
DTildeSlice
=
IDTildeSliceEnd
-
IDTildeSliceBegin
;
const
auto
HTildeSlice
=
IHTildeSliceEnd
-
IHTildeSliceBegin
;
const
auto
WTildeSlice
=
IWTildeSliceEnd
-
IWTildeSliceBegin
;
// ReduceK is different for each Reduce
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
Z
-
i_ztilde
,
ZTilde
);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
Y
-
i_ytilde
,
YTilde
);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
X
-
i_xtilde
,
XTilde
);
// Problem size of reduction kernel
const
index_t
MRaw
=
N
*
DTildeSlice
*
HTildeSlice
*
WTildeSlice
*
C
;
const
index_t
MPad
=
math
::
integer_least_multiple
(
MRaw
,
M_BlockTileSize
)
-
MRaw
;
const
index_t
KRaw
=
ZDotSlice
*
YDotSlice
*
XDotSlice
;
const
index_t
KPad
=
math
::
integer_least_multiple
(
KRaw
,
K_BlockTileSize
)
-
KRaw
;
// Out[ReduceM, ReduceK]
const
auto
out_n_dop_hop_wop_c_grid_desc
=
transform_tensor_descriptor
(
out_n_do_ho_wo_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Do
,
I0
,
I0
),
make_pad_transform
(
Ho
,
I0
,
I0
),
make_pad_transform
(
Wo
,
I0
,
I0
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_c_grid_desc
=
transform_tensor_descriptor
(
out_n_dop_hop_wop_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
ZDot
,
DTilde
),
make_tuple
(
-
ConvDilationD
/
GcdStrideDilationD
,
I1
)),
make_embed_transform
(
make_tuple
(
YDot
,
HTilde
),
make_tuple
(
-
ConvDilationH
/
GcdStrideDilationH
,
I1
)),
make_embed_transform
(
make_tuple
(
XDot
,
WTilde
),
make_tuple
(
-
ConvDilationW
/
GcdStrideDilationW
,
I1
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
,
6
>
{},
Sequence
<
7
>
{}));
const
auto
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_slice_transform
(
ZDot
,
I0
,
ZDotSlice
),
make_slice_transform
(
DTilde
,
IDTildeSliceBegin
,
DTildeSlice
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{},
Sequence
<
6
>
{},
Sequence
<
7
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{},
Sequence
<
6
>
{},
Sequence
<
7
>
{}));
const
auto
out_grid_desc_reducemraw_reducekraw
=
transform_tensor_descriptor
(
out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
,
C
)),
make_merge_transform
(
make_tuple
(
ZDotSlice
,
YDotSlice
,
XDotSlice
))),
make_tuple
(
Sequence
<
0
,
2
,
4
,
6
,
7
>
{},
Sequence
<
1
,
3
,
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
const
auto
out_grid_desc_reducem_reducek
=
transform_tensor_descriptor
(
out_grid_desc_reducemraw_reducekraw
,
make_tuple
(
make_right_pad_transform
(
MRaw
,
MPad
),
make_right_pad_transform
(
KRaw
,
KPad
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
// In[ReduceM]
const
auto
in_n_di_hi_wi_c_grid_desc
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
),
make_tuple
(
din_n_c_wos_strides
[
0
],
din_n_c_wos_strides
[
2
],
din_n_c_wos_strides
[
3
],
din_n_c_wos_strides
[
4
],
din_n_c_wos_strides
[
1
]));
const
auto
in_n_dip_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
in_n_di_hi_wi_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Di
,
InLeftPadD
,
InRightPadD
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
transform_tensor_descriptor
(
in_n_dip_hip_wip_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
XTilde
,
DTilde
),
make_tuple
(
ConvDilationD
,
ConvStrideD
)),
make_embed_transform
(
make_tuple
(
YTilde
,
HTilde
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
XTilde
,
WTilde
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
,
6
>
{},
Sequence
<
7
>
{}));
const
auto
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_freeze_transform
(
i_ztilde
),
make_slice_transform
(
DTilde
,
IDTildeSliceBegin
,
DTildeSlice
),
make_freeze_transform
(
i_ytilde
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_freeze_transform
(
i_xtilde
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{},
Sequence
<
6
>
{},
Sequence
<
7
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<>
{},
Sequence
<
1
>
{},
Sequence
<>
{},
Sequence
<
2
>
{},
Sequence
<>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}));
const
auto
in_grid_desc_reducemraw
=
transform_tensor_descriptor
(
in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
DTildeSlice
,
HTildeSlice
,
WTildeSlice
,
C
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
in_grid_desc_reducem
=
transform_tensor_descriptor
(
in_grid_desc_reducemraw
,
make_tuple
(
make_right_pad_transform
(
MRaw
,
MPad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
make_tuple
(
out_grid_desc_reducem_reducek
,
in_grid_desc_reducem
);
}
using
DoutDinGridDesc
=
decltype
(
Make3DGridDescriptor_Out_M_K_In_M
({
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
},
{
0
,
0
,
0
}));
using
DoutGridDesc_M_K
=
remove_cvref_t
<
tuple_element_t
<
0
,
DoutDinGridDesc
>>
;
using
DinGridDesc_M
=
remove_cvref_t
<
tuple_element_t
<
1
,
DoutDinGridDesc
>>
;
// FIXME
// for NDHWC, the dim C is the fastest dimension, and is not reduced.
// Hence, it is in M dimension for reduction kernel.
static
constexpr
index_t
OutSrcInDstVectorDim
=
0
;
// 0: M, 1: K
using
PassThrough
=
tensor_operation
::
element_wise
::
PassThrough
;
using
Div
=
tensor_operation
::
element_wise
::
UnaryDivide
;
using
gridwise_reduce
=
GridwiseReduction_mk_to_m_threadwise
<
DOutDataType
,
DInDataType
,
ComputeDataType
,
int
,
DoutGridDesc_M_K
,
DinGridDesc_M
,
reduce
::
Add
,
PassThrough
,
Div
,
InMemoryDataOperationEnum
::
Set
,
false
,
// propagate_nan
BlockSize
,
MThreadSliceSize
,
KThreadSliceSize
,
OutSrcInDstVectorDim
,
InSrcOutDstVectorSize
,
InSrcOutDstVectorSize
>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
DOutDataType
*
p_dout
,
DInDataType
*
p_din
,
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_lengths
,
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_length
,
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_strides
,
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_strides
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
:
p_dout_grid_
{
p_dout
},
p_din_grid_
{
p_din
},
dout_n_c_wos_lengths_
{
dout_n_c_wos_lengths
},
din_n_c_wos_length_
{
din_n_c_wos_length
},
dout_n_c_wos_strides_
{
dout_n_c_wos_strides
},
din_n_c_wos_strides_
{
din_n_c_wos_strides
},
num_reduce_
{
1
},
div_element_op_
{
window_lengths
[
0
]
*
window_lengths
[
1
]
*
window_lengths
[
2
]}
{
std
::
vector
<
ck
::
index_t
>
Tildes
(
NDimSpatial
);
for
(
int
i
=
0
;
i
<
NDimSpatial
;
++
i
)
{
int
GcdStrideDilation
=
math
::
gcd
(
window_strides
[
i
],
window_dilations
[
i
]);
Tildes
[
i
]
=
window_strides
[
i
]
/
GcdStrideDilation
;
num_reduce_
*=
Tildes
[
i
];
}
for
(
index_t
i_ztilde
=
0
;
i_ztilde
<
Tildes
[
0
];
++
i_ztilde
)
{
for
(
index_t
i_ytilde
=
0
;
i_ytilde
<
Tildes
[
1
];
++
i_ytilde
)
{
for
(
index_t
i_xtilde
=
0
;
i_xtilde
<
Tildes
[
2
];
++
i_xtilde
)
{
// check slice is valid
const
auto
ZDotSlice
=
math
::
integer_divide_ceil
(
window_lengths
[
0
]
-
i_ztilde
,
Tildes
[
0
]);
const
auto
YDotSlice
=
math
::
integer_divide_ceil
(
window_lengths
[
1
]
-
i_ytilde
,
Tildes
[
1
]);
const
auto
XDotSlice
=
math
::
integer_divide_ceil
(
window_lengths
[
2
]
-
i_xtilde
,
Tildes
[
2
]);
if
(
ZDotSlice
*
YDotSlice
*
XDotSlice
<=
0
)
{
continue
;
}
const
auto
dout_din_grid_desc
=
Make3DGridDescriptor_Out_M_K_In_M
(
dout_n_c_wos_lengths
,
din_n_c_wos_length
,
dout_n_c_wos_strides
,
din_n_c_wos_strides
,
window_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
{
i_ztilde
,
i_ytilde
,
i_xtilde
});
dout_grid_desc_m_k_container_
.
push_back
(
dout_din_grid_desc
[
I0
]);
din_grid_desc_m_container_
.
push_back
(
dout_din_grid_desc
[
I1
]);
}
}
}
}
const
DOutDataType
*
p_dout_grid_
;
DInDataType
*
p_din_grid_
;
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_lengths_
;
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_length_
;
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_strides_
;
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_strides_
;
int
num_reduce_
;
std
::
vector
<
DoutGridDesc_M_K
>
dout_grid_desc_m_k_container_
;
std
::
vector
<
DinGridDesc_M
>
din_grid_desc_m_container_
;
Div
div_element_op_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
float
ave_time
=
0
;
for
(
index_t
i
=
0
;
i
<
arg
.
num_reduce_
;
i
++
)
{
const
auto
kernel
=
kernel_reduce_threadwise
<
gridwise_reduce
,
false
,
false
,
false
,
// don't have index input
DOutDataType
,
DInDataType
,
ComputeDataType
,
int
,
DoutGridDesc_M_K
,
DinGridDesc_M
,
PassThrough
,
Div
>
;
ck
::
index_t
M
=
arg
.
dout_grid_desc_m_k_container_
[
i
].
GetLength
(
I0
);
const
index_t
grid_size
=
(
M
/
M_BlockTileSize
);
ave_time
+=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
dout_grid_desc_m_k_container_
[
i
],
arg
.
din_grid_desc_m_container_
[
i
],
PassThrough
{},
arg
.
div_element_op_
,
float
(
1
),
arg
.
p_dout_grid_
,
nullptr
,
float
(
0
),
arg
.
p_din_grid_
,
nullptr
);
}
return
ave_time
;
}
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
constexpr
index_t
Rank
=
NDimSpatial
+
2
;
int
doutFastestDim
=
-
1
;
int
dinFastestDim
=
-
1
;
for
(
int
i
=
0
;
i
<
Rank
;
++
i
)
{
if
(
arg
.
dout_n_c_wos_strides_
[
i
]
==
1
)
doutFastestDim
=
i
;
if
(
arg
.
din_n_c_wos_strides_
[
i
]
==
1
)
dinFastestDim
=
i
;
}
if
(
doutFastestDim
==
-
1
||
dinFastestDim
==
-
1
)
{
if
constexpr
(
InSrcOutDstVectorSize
!=
1
)
return
false
;
}
else
{
if
(
arg
.
dout_n_c_wos_lengths_
[
doutFastestDim
]
%
InSrcOutDstVectorSize
!=
0
)
return
false
;
if
(
arg
.
din_n_c_wos_length_
[
dinFastestDim
]
%
InSrcOutDstVectorSize
!=
0
)
return
false
;
}
return
true
;
}
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
return
IsSupportedArgument
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_dout
,
void
*
p_din
,
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_lengths
,
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_length
,
std
::
vector
<
ck
::
index_t
>
dout_n_c_wos_strides
,
std
::
vector
<
ck
::
index_t
>
din_n_c_wos_strides
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
override
{
constexpr
index_t
Rank
=
NDimSpatial
+
2
;
if
(
dout_n_c_wos_strides
.
size
()
!=
Rank
||
din_n_c_wos_strides
.
size
()
!=
Rank
||
dout_n_c_wos_lengths
.
size
()
!=
Rank
||
din_n_c_wos_length
.
size
()
!=
Rank
)
throw
std
::
runtime_error
(
"dimension is incorrect"
);
if
(
window_lengths
.
size
()
!=
NDimSpatial
||
window_strides
.
size
()
!=
NDimSpatial
||
window_dilations
.
size
()
!=
NDimSpatial
||
input_left_pads
.
size
()
!=
NDimSpatial
||
input_right_pads
.
size
()
!=
NDimSpatial
)
throw
std
::
runtime_error
(
"dimension is incorrect"
);
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
DOutDataType
*>
(
p_dout
),
static_cast
<
DInDataType
*>
(
p_din
),
dout_n_c_wos_lengths
,
din_n_c_wos_length
,
dout_n_c_wos_strides
,
din_n_c_wos_strides
,
window_lengths
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceAvgPool3dBwd<"
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"InSrcOutDstVectorSize_"
<<
InSrcOutDstVectorSize
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp
View file @
aba0880d
...
...
@@ -532,11 +532,12 @@ struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_
float
ave_time
=
0
;
const
auto
Run
=
[
&
](
const
auto
&
kernel
)
{
hipGetErrorString
(
hipMemset
(
hipGetErrorString
(
hipMemset
Async
(
arg
.
p_c_grid_
,
0
,
arg
.
c_grid_desc_mblock_mperblock_nblock_nperblock_
.
GetElementSpaceSize
()
*
sizeof
(
CDataType
)));
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
ave_time
=
launch_and_time_kernel
(
stream_config
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp
View file @
aba0880d
...
...
@@ -11,6 +11,7 @@
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
...
...
@@ -59,6 +60,7 @@ template <
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
GemmDlAlgorithm
GemmDlAlg
=
GemmDlAlgorithm
::
Default
,
enable_if_t
<
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
...
...
@@ -236,7 +238,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
>
;
CThreadTransferDstScalarPerVector
,
GemmDlAlg
>
;
using
AGridDesc_K0_M0_M1_K1
=
decltype
(
GridwiseGemm
::
MakeAGridDescriptor_K0_M0_M1_K1
(
AGridDesc_K0_M_K1
{}));
...
...
@@ -372,7 +375,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
true
>
;
true
,
GemmDlAlg
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
...
...
@@ -398,7 +402,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
true
,
false
>
;
false
,
GemmDlAlg
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
...
...
@@ -424,7 +429,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
true
>
;
true
,
GemmDlAlg
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
...
...
@@ -450,7 +456,8 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
remove_reference_t
<
CGridDesc_M0_M10_M11_N0_N10_N11
>
,
remove_reference_t
<
DefaultBlock2CTileMap
>
,
false
,
false
>
;
false
,
GemmDlAlg
>
;
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
...
...
@@ -485,6 +492,16 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
static
bool
IsSupportedArgument
(
const
Argument
&
arg
)
{
if
constexpr
(
GemmDlAlg
==
GemmDlAlgorithm
::
Dpp8
)
{
if
(
ck
::
get_device_name
()
==
"gfx1030"
)
{
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
);
}
return
false
;
}
if
(
ck
::
get_device_name
()
==
"gfx906"
||
ck
::
get_device_name
()
==
"gfx1030"
||
ck
::
get_device_name
()
==
"gfx1100"
||
ck
::
get_device_name
()
==
"gfx1101"
||
ck
::
get_device_name
()
==
"gfx1102"
)
...
...
@@ -492,10 +509,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
return
GridwiseGemm
::
CheckValidity
(
arg
.
a_grid_desc_k0_m_k1_
,
arg
.
b_grid_desc_k0_n_k1_
,
arg
.
c_grid_desc_m_n_
);
}
else
{
return
false
;
}
return
false
;
}
// polymorphic
...
...
@@ -572,7 +586,7 @@ struct DeviceGemmDl : public DeviceGemm<ALayout,
}
// polymorphic
std
::
string
GetTypeString
()
const
override
virtual
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp
0 → 100644
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"
#include "ck/tensor_operation/gpu/device/gemm_dl_algorithm.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ADataType
,
typename
BDataType
,
typename
CDataType
,
typename
AccDataType
,
typename
ALayout
,
typename
BLayout
,
typename
CLayout
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
GemmSpecialization
GemmSpec
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
K0PerBlock
,
index_t
K1
,
index_t
M1PerThread
,
index_t
N1PerThread
,
index_t
KPerThread
,
typename
M1N1ThreadClusterM1Xs
,
typename
M1N1ThreadClusterN1Xs
,
typename
ABlockTransferThreadSliceLengths_K0_M0_M1_K1
,
typename
ABlockTransferThreadClusterLengths_K0_M0_M1_K1
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
typename
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
,
typename
ABlockTransferSrcVectorTensorContiguousDimOrder
,
typename
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
,
typename
BBlockTransferThreadSliceLengths_K0_N0_N1_K1
,
typename
BBlockTransferThreadClusterLengths_K0_N0_N1_K1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
typename
BBlockTransferSrcAccessOrder
,
typename
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
,
typename
BBlockTransferSrcVectorTensorContiguousDimOrder
,
typename
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
,
typename
CThreadTransferSrcDstAccessOrder
,
index_t
CThreadTransferSrcDstVectorDim
,
index_t
CThreadTransferDstScalarPerVector
,
enable_if_t
<
is_same_v
<
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
is_same_v
<
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
&&
is_same_v
<
CElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
,
bool
>
=
false
>
struct
DeviceGemmDlDpp8
:
public
DeviceGemmDl
<
ADataType
,
BDataType
,
CDataType
,
AccDataType
,
ALayout
,
BLayout
,
CLayout
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
,
GemmSpec
,
BlockSize
,
MPerBlock
,
NPerBlock
,
K0PerBlock
,
K1
,
M1PerThread
,
N1PerThread
,
KPerThread
,
M1N1ThreadClusterM1Xs
,
M1N1ThreadClusterN1Xs
,
ABlockTransferThreadSliceLengths_K0_M0_M1_K1
,
ABlockTransferThreadClusterLengths_K0_M0_M1_K1
,
ABlockTransferThreadClusterArrangeOrder
,
ABlockTransferSrcAccessOrder
,
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1
,
ABlockTransferSrcVectorTensorContiguousDimOrder
,
ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1
,
BBlockTransferThreadSliceLengths_K0_N0_N1_K1
,
BBlockTransferThreadClusterLengths_K0_N0_N1_K1
,
BBlockTransferThreadClusterArrangeOrder
,
BBlockTransferSrcAccessOrder
,
BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1
,
BBlockTransferSrcVectorTensorContiguousDimOrder
,
BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
GemmDlAlgorithm
::
Dpp8
>
{
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceGemmDlDpp8"
<<
"<"
<<
BlockSize
<<
", "
<<
MPerBlock
<<
", "
<<
NPerBlock
<<
", "
<<
K0PerBlock
<<
", "
<<
K1
<<
", "
<<
M1PerThread
<<
", "
<<
N1PerThread
<<
", "
<<
KPerThread
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp
View file @
aba0880d
...
...
@@ -161,8 +161,10 @@ struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK<ALayout,
const
auto
Run
=
[
&
](
const
auto
&
kernel
)
{
if
(
kbatch
>
1
)
hipGetErrorString
(
hipMemset
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
CDataType
)));
hipGetErrorString
(
hipMemsetAsync
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
gdx
,
gdy
,
gdz
),
dim3
(
BlockSize
),
0
,
karg
,
b2c_map
);
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_streamk.hpp
View file @
aba0880d
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -147,7 +147,10 @@ struct DeviceGemmXdlStreamK : public DeviceGemmStreamK<ALayout,
if
constexpr
(
GridwiseGemm
::
Block2CTileMap
::
ReductionStrategy
==
StreamKReductionStrategy
::
Atomic
)
{
hipGetErrorString
(
hipMemset
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
CDataType
)));
hipGetErrorString
(
hipMemsetAsync
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
CDataType
),
stream_config
.
stream_id_
));
ave_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
grid_dims
,
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
View file @
aba0880d
...
...
@@ -422,8 +422,10 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
for
(
const
auto
&
trans_arg
:
arg
.
gemm_kernel_args_
)
{
const
auto
&
karg
=
trans_arg
.
karg_
;
hip_check_error
(
hipMemset
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
EDataType
)));
hip_check_error
(
hipMemsetAsync
(
karg
.
p_c_grid
,
0
,
karg
.
M
*
karg
.
N
*
sizeof
(
EDataType
),
stream_config
.
stream_id_
));
}
}
...
...
include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp
View file @
aba0880d
...
...
@@ -3,16 +3,7 @@
#pragma once
#include <iostream>
#include <sstream>
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
namespace
ck
{
namespace
tensor_operation
{
...
...
@@ -30,255 +21,32 @@ template <typename InDataType,
ck
::
index_t
ReduceMThreadSliceSize
,
ck
::
index_t
ReduceKThreadSliceSize
,
ck
::
index_t
InSrcOutDstVectorSize
>
struct
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
:
public
DevicePoolFwd
<
4
,
2
,
InDataType
,
OutDataType
,
IndexDataType
,
ReduceOpId
,
OutputIndex
>
struct
DevicePool2dFwd_NHWC_NHWC
:
public
DevicePool3dFwd_NDHWC_NDHWC
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
BlockSize
,
ReduceMThreadClusterSize
,
ReduceKThreadClusterSize
,
ReduceMThreadSliceSize
,
ReduceKThreadSliceSize
,
InSrcOutDstVectorSize
>
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
index_t
InOutRank
=
4
;
static
constexpr
index_t
WindowRank
=
2
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
static
constexpr
index_t
InSrcOutDstVectorDim
=
0
;
// for NHWC, the dim C is the vector Dim for both input and output in memory, which is
// not reduced.
static
constexpr
ck
::
index_t
ReduceM_BlockTileSize
=
ReduceMThreadClusterSize
*
ReduceMThreadSliceSize
;
static
constexpr
ck
::
index_t
ReduceK_BlockTileSize
=
ReduceKThreadClusterSize
*
ReduceKThreadSliceSize
;
static
auto
MakeABGridDescriptor_A_M_K_B_M
(
ck
::
index_t
N
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
{
const
index_t
Hi
=
input_spatial_lengths
[
0
];
const
index_t
Wi
=
input_spatial_lengths
[
1
];
const
index_t
Ho
=
output_spatial_lengths
[
0
];
const
index_t
Wo
=
output_spatial_lengths
[
1
];
const
index_t
Y
=
window_spatial_lengths
[
0
];
const
index_t
X
=
window_spatial_lengths
[
1
];
const
index_t
ConvStrideH
=
window_strides
[
0
];
const
index_t
ConvStrideW
=
window_strides
[
1
];
const
index_t
InLeftPadH
=
input_left_pads
[
0
];
const
index_t
InLeftPadW
=
input_left_pads
[
1
];
const
index_t
InRightPadH
=
input_right_pads
[
0
];
const
index_t
InRightPadW
=
input_right_pads
[
1
];
const
index_t
ReduceMRaw
=
N
*
Ho
*
Wo
*
C
;
const
index_t
ReduceMPad
=
math
::
integer_least_multiple
(
ReduceMRaw
,
ReduceM_BlockTileSize
)
-
ReduceMRaw
;
const
index_t
ReduceKRaw
=
Y
*
X
;
const
index_t
ReduceKPad
=
math
::
integer_least_multiple
(
ReduceKRaw
,
ReduceK_BlockTileSize
)
-
ReduceKRaw
;
// A[ReduceM, ReduceK]
const
auto
in_grid_desc_n_hi_wi_c
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Hi
,
Wi
,
C
));
const
auto
in_grid_desc_n_hip_wip_c
=
transform_tensor_descriptor
(
in_grid_desc_n_hi_wi_c
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_grid_desc_n_y_ho_x_wo_c
=
transform_tensor_descriptor
(
in_grid_desc_n_hip_wip_c
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Y
,
Ho
),
make_tuple
(
I1
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
I1
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
in_grid_desc_reducemraw_reducekraw
=
transform_tensor_descriptor
(
in_grid_desc_n_y_ho_x_wo_c
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Ho
,
Wo
,
C
)),
make_merge_transform
(
make_tuple
(
Y
,
X
))),
make_tuple
(
Sequence
<
0
,
2
,
4
,
5
>
{},
Sequence
<
1
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
const
auto
in_grid_desc_reducem_reducek
=
transform_tensor_descriptor
(
in_grid_desc_reducemraw_reducekraw
,
make_tuple
(
make_right_pad_transform
(
ReduceMRaw
,
ReduceMPad
),
make_right_pad_transform
(
ReduceKRaw
,
ReduceKPad
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
// B[ReduceM]
const
auto
out_grid_desc_reducemraw
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
*
Ho
*
Wo
*
C
));
const
auto
out_grid_desc_reducem
=
transform_tensor_descriptor
(
out_grid_desc_reducemraw
,
make_tuple
(
make_right_pad_transform
(
ReduceMRaw
,
ReduceMPad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
make_tuple
(
in_grid_desc_reducem_reducek
,
out_grid_desc_reducem
);
}
using
ABGridDescs
=
decltype
(
MakeABGridDescriptor_A_M_K_B_M
(
1
,
1
,
{},
{},
{},
{},
{},
{}));
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
ABGridDescs
{}[
I0
])
>
;
using
BGridDesc_M
=
remove_cvref_t
<
decltype
(
ABGridDescs
{}[
I1
])
>
;
// TODO
struct
Argument
:
public
BaseArgument
{
Argument
(
const
InDataType
*
p_in_dev
,
OutDataType
*
p_out_dev
,
IndexDataType
*
p_out_indices_dev
,
ck
::
index_t
N
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>&
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_strides
,
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
std
::
vector
<
ck
::
index_t
>&
input_right_pads
)
:
p_in_dev_
{
p_in_dev
},
p_out_dev_
{
p_out_dev
},
p_out_indices_dev_
{
p_out_indices_dev
},
a_grid_desc_m_k_
{},
b_grid_desc_m_
{}
{
const
auto
descs
=
MakeABGridDescriptor_A_M_K_B_M
(
N
,
C
,
input_spatial_lengths
,
window_spatial_lengths
,
output_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
a_grid_desc_m_k_
=
descs
[
I0
];
b_grid_desc_m_
=
descs
[
I1
];
invariant_lowest_length_
=
C
;
reduce_lowest_length_
=
window_spatial_lengths
[
1
];
int32_t
reduceLength
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
std
::
tie
(
in_element_op_
,
acc_element_op_
)
=
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
reduceLength
);
}
const
InDataType
*
p_in_dev_
;
OutDataType
*
p_out_dev_
;
IndexDataType
*
p_out_indices_dev_
;
AGridDesc_M_K
a_grid_desc_m_k_
;
BGridDesc_M
b_grid_desc_m_
;
InElementwiseOperation
in_element_op_
;
AccElementwiseOperation
acc_element_op_
;
// for checking vector load/store
ck
::
index_t
invariant_lowest_length_
;
ck
::
index_t
reduce_lowest_length_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
using
gridwise_reduce
=
GridwiseReduction_mk_to_m_threadwise
<
InDataType
,
using
DevicePool3D
=
DevicePool3dFwd_NDHWC_NDHWC
<
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
AGridDesc_M_K
,
BGridDesc_M
,
ReduceOperation
,
InElementwiseOperation
,
AccElementwiseOperation
,
InMemoryDataOperationEnum
::
Set
,
false
,
// propagate_nan
ComputeDataType
,
ReduceOpId
,
OutputIndex
,
BlockSize
,
ReduceMThreadClusterSize
,
ReduceKThreadClusterSize
,
ReduceMThreadSliceSize
,
ReduceKThreadSliceSize
,
InSrcOutDstVectorDim
,
InSrcOutDstVectorSize
,
InSrcOutDstVectorSize
>
;
const
auto
kernel
=
kernel_reduce_threadwise
<
gridwise_reduce
,
OutputIndex
,
true
,
// pooling need to return global index
false
,
// don't have index input
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
AGridDesc_M_K
,
BGridDesc_M
,
InElementwiseOperation
,
AccElementwiseOperation
>
;
ck
::
index_t
ReduceM
=
arg
.
a_grid_desc_m_k_
.
GetLength
(
I0
);
const
index_t
grid_size
=
(
ReduceM
/
ReduceM_BlockTileSize
);
return
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
grid_size
),
dim3
(
BlockSize
),
0
,
arg
.
a_grid_desc_m_k_
,
arg
.
b_grid_desc_m_
,
arg
.
in_element_op_
,
arg
.
acc_element_op_
,
float
(
1
),
arg
.
p_in_dev_
,
nullptr
,
float
(
0
),
arg
.
p_out_dev_
,
arg
.
p_out_indices_dev_
);
}
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
->
invariant_lowest_length_
%
InSrcOutDstVectorSize
!=
0
)
{
return
(
false
);
}
return
(
true
);
}
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in_dev
,
void
*
p_out_dev
,
...
...
@@ -286,62 +54,57 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
std
::
vector
<
ck
::
index_t
>
input_lengths
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
output_lengths
,
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NHWC
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NHWC
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NHWC
std
::
vector
<
ck
::
index_t
>
input_stride
,
std
::
vector
<
ck
::
index_t
>
output_stride
,
std
::
vector
<
ck
::
index_t
>
indices_stride
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
window_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
,
std
::
vector
<
ck
::
index_t
>
pooling_dims
)
override
{
static
constexpr
index_t
InOutRank
=
4
;
static
constexpr
index_t
WindowRank
=
2
;
if
(
input_lengths
.
size
()
!=
InOutRank
||
window_lengths
.
size
()
!=
WindowRank
||
input_lengths
.
size
()
!=
InOutRank
||
window_strides
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
window_dilations
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
throw
std
::
runtime_error
(
"dimension is incorrect"
);
if
(
pooling_dims
!=
std
::
vector
<
ck
::
index_t
>
{
2
,
3
})
throw
std
::
runtime_error
(
"pooling_dims only support {2, 3} in pool2d so far"
);
index_t
N
=
input_lengths
[
0
];
index_t
C
=
input_lengths
[
1
];
index_t
Hi
=
input_lengths
[
2
];
index_t
Wi
=
input_lengths
[
3
];
index_t
Ho
=
output_lengths
[
2
];
index_t
Wo
=
output_lengths
[
3
];
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
=
{
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
=
{
Ho
,
Wo
};
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
InDataType
*>
(
p_in_dev
),
static_cast
<
OutDataType
*>
(
p_out_dev
),
static_cast
<
IndexDataType
*>
(
p_out_indices_dev
),
N
,
C
,
input_spatial_lengths
,
window_lengths
,
output_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<"
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
ReduceMThreadClusterSize
<<
"_S"
<<
ReduceMThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
ReduceKThreadClusterSize
<<
"_S"
<<
ReduceKThreadSliceSize
<<
","
;
str
<<
"InSrcOutDstVectorSize_"
<<
InSrcOutDstVectorSize
<<
">"
;
// clang-format on
return
str
.
str
();
// NCHW to NCDHW
input_lengths
.
insert
(
input_lengths
.
begin
()
+
2
,
1
);
output_lengths
.
insert
(
output_lengths
.
begin
()
+
2
,
1
);
input_stride
.
insert
(
input_stride
.
begin
()
+
2
,
0
);
output_stride
.
insert
(
output_stride
.
begin
()
+
2
,
0
);
indices_stride
.
insert
(
indices_stride
.
begin
()
+
2
,
0
);
// YX to ZYX
window_lengths
.
insert
(
window_lengths
.
begin
(),
1
);
window_strides
.
insert
(
window_strides
.
begin
(),
0
);
window_dilations
.
insert
(
window_dilations
.
begin
(),
0
);
input_left_pads
.
insert
(
input_left_pads
.
begin
(),
0
);
input_right_pads
.
insert
(
input_right_pads
.
begin
(),
0
);
pooling_dims
=
{
2
,
3
,
4
};
return
DevicePool3D
::
MakeArgumentPointer
(
p_in_dev
,
p_out_dev
,
p_out_indices_dev
,
input_lengths
,
window_lengths
,
output_lengths
,
input_stride
,
output_stride
,
indices_stride
,
window_strides
,
window_dilations
,
input_left_pads
,
input_right_pads
,
pooling_dims
);
}
};
...
...
include/ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp
View file @
aba0880d
...
...
@@ -8,8 +8,10 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
...
...
@@ -30,8 +32,15 @@ template <typename InDataType,
ck
::
index_t
MThreadSliceSize
,
ck
::
index_t
KThreadSliceSize
,
ck
::
index_t
InSrcOutDstVectorSize
>
struct
DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
:
public
DevicePoolFwd
<
5
,
3
,
InDataType
,
OutDataType
,
IndexDataType
,
ReduceOpId
,
OutputIndex
>
struct
DevicePool3dFwd_NDHWC_NDHWC
:
public
DevicePoolFwd
<
5
,
3
,
InDataType
,
OutDataType
,
IndexDataType
,
tensor_layout
::
convolution
::
NDHWC
,
tensor_layout
::
convolution
::
NDHWC
,
ReduceOpId
,
OutputIndex
>
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
...
...
@@ -51,45 +60,48 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
// for NDHWC, the dim C is the vector Dim for both input and output in memory, which is not
// reduced.
static
constexpr
index_t
InSrcOutDstVectorDim
=
0
;
static
constexpr
ck
::
index_t
M_BlockTileSize
=
MThreadClusterSize
*
MThreadSliceSize
;
static
constexpr
ck
::
index_t
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
static
auto
MakeABGridDescriptor_A_M_K_B_M
(
ck
::
index_t
N
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
)
static
auto
MakeABGridDescriptor_A_M_K_B_M
(
std
::
vector
<
ck
::
index_t
>
input_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>
output_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>
input_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>
output_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>
window_spatial_zyx_lengths
,
std
::
vector
<
ck
::
index_t
>
window_zyx_strides
,
std
::
vector
<
ck
::
index_t
>
window_zyx_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_dhw_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_dhw_pads
)
{
const
index_t
Di
=
input_spatial_lengths
[
0
];
const
index_t
Hi
=
input_spatial_lengths
[
1
];
const
index_t
Wi
=
input_spatial_lengths
[
2
];
const
index_t
N
=
input_ncdhw_lengths
[
0
];
const
index_t
C
=
input_ncdhw_lengths
[
1
];
const
index_t
Di
=
input_ncdhw_lengths
[
2
];
const
index_t
Hi
=
input_ncdhw_lengths
[
3
];
const
index_t
Wi
=
input_ncdhw_lengths
[
4
];
const
index_t
Do
=
output_ncdhw_lengths
[
2
];
const
index_t
Ho
=
output_ncdhw_lengths
[
3
];
const
index_t
Wo
=
output_ncdhw_lengths
[
4
];
const
index_t
Do
=
output
_spatial_lengths
[
0
];
const
index_t
Ho
=
output
_spatial_lengths
[
1
];
const
index_t
Wo
=
output
_spatial_lengths
[
2
];
const
index_t
Z
=
window
_spatial_
zyx_
lengths
[
0
];
const
index_t
Y
=
window
_spatial_
zyx_
lengths
[
1
];
const
index_t
X
=
window
_spatial_
zyx_
lengths
[
2
];
const
index_t
Z
=
window_spatial_length
s
[
0
];
const
index_t
Y
=
window_spatial_length
s
[
1
];
const
index_t
X
=
window_spatial_length
s
[
2
];
const
index_t
WindowStrideD
=
window_zyx_stride
s
[
0
];
const
index_t
WindowStrideH
=
window_zyx_stride
s
[
1
];
const
index_t
WindowStrideW
=
window_zyx_stride
s
[
2
];
const
index_t
ConvStrideD
=
window_stride
s
[
0
];
const
index_t
ConvStrideH
=
window_stride
s
[
1
];
const
index_t
ConvStrideW
=
window_stride
s
[
2
];
const
index_t
WindowDilationD
=
window_zyx_dilation
s
[
0
];
const
index_t
WindowDilationH
=
window_zyx_dilation
s
[
1
];
const
index_t
WindowDilationW
=
window_zyx_dilation
s
[
2
];
const
index_t
InLeftPadD
=
input_left_pads
[
0
];
const
index_t
InLeftPadH
=
input_left_pads
[
1
];
const
index_t
InLeftPadW
=
input_left_pads
[
2
];
const
index_t
InLeftPadD
=
input_left_
dhw_
pads
[
0
];
const
index_t
InLeftPadH
=
input_left_
dhw_
pads
[
1
];
const
index_t
InLeftPadW
=
input_left_
dhw_
pads
[
2
];
const
index_t
InRightPadD
=
input_right_pads
[
0
];
const
index_t
InRightPadH
=
input_right_pads
[
1
];
const
index_t
InRightPadW
=
input_right_pads
[
2
];
const
index_t
InRightPadD
=
input_right_
dhw_
pads
[
0
];
const
index_t
InRightPadH
=
input_right_
dhw_
pads
[
1
];
const
index_t
InRightPadW
=
input_right_
dhw_
pads
[
2
];
const
index_t
MRaw
=
N
*
Do
*
Ho
*
Wo
*
C
;
const
index_t
MPad
=
math
::
integer_least_multiple
(
MRaw
,
M_BlockTileSize
)
-
MRaw
;
...
...
@@ -98,8 +110,15 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
const
index_t
KPad
=
math
::
integer_least_multiple
(
KRaw
,
K_BlockTileSize
)
-
KRaw
;
// A[ReduceM, ReduceK]
const
auto
in_grid_desc_n_di_hi_wi_c
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
));
const
index_t
Ni_stride
=
input_ncdhw_stride
[
0
];
const
index_t
Ci_stride
=
input_ncdhw_stride
[
1
];
const
index_t
Di_stride
=
input_ncdhw_stride
[
2
];
const
index_t
Hi_stride
=
input_ncdhw_stride
[
3
];
const
index_t
Wi_stride
=
input_ncdhw_stride
[
4
];
const
auto
in_grid_desc_n_di_hi_wi_c
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
),
make_tuple
(
Ni_stride
,
Di_stride
,
Hi_stride
,
Wi_stride
,
Ci_stride
));
const
auto
in_grid_desc_n_dip_hip_wip_c
=
transform_tensor_descriptor
(
in_grid_desc_n_di_hi_wi_c
,
...
...
@@ -113,11 +132,12 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
const
auto
in_grid_desc_n_z_do_y_ho_x_wo_c
=
transform_tensor_descriptor
(
in_grid_desc_n_dip_hip_wip_c
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Z
,
Do
),
make_tuple
(
I1
,
ConvStrideD
)),
make_embed_transform
(
make_tuple
(
Y
,
Ho
),
make_tuple
(
I1
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
I1
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
Z
,
Do
),
make_tuple
(
WindowDilationD
,
WindowStrideD
)),
make_embed_transform
(
make_tuple
(
Y
,
Ho
),
make_tuple
(
WindowDilationH
,
WindowStrideH
)),
make_embed_transform
(
make_tuple
(
X
,
Wo
),
make_tuple
(
WindowDilationW
,
WindowStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
...
...
@@ -139,8 +159,21 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
// B[ReduceM]
const
auto
out_grid_desc_reducemraw
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
N
*
Do
*
Ho
*
Wo
*
C
));
const
index_t
No_stride
=
output_ncdhw_stride
[
0
];
const
index_t
Co_stride
=
output_ncdhw_stride
[
1
];
const
index_t
Do_stride
=
output_ncdhw_stride
[
2
];
const
index_t
Ho_stride
=
output_ncdhw_stride
[
3
];
const
index_t
Wo_stride
=
output_ncdhw_stride
[
4
];
const
auto
out_grid_desc_n_do_ho_wo_c
=
make_naive_tensor_descriptor
(
make_tuple
(
N
,
Di
,
Hi
,
Wi
,
C
),
make_tuple
(
No_stride
,
Do_stride
,
Ho_stride
,
Wo_stride
,
Co_stride
));
const
auto
out_grid_desc_reducemraw
=
transform_tensor_descriptor
(
out_grid_desc_n_do_ho_wo_c
,
make_tuple
(
make_merge_transform
(
make_tuple
(
N
,
Do
,
Ho
,
Wo
,
C
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
out_grid_desc_reducem
=
transform_tensor_descriptor
(
out_grid_desc_reducemraw
,
...
...
@@ -151,7 +184,9 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
return
make_tuple
(
in_grid_desc_reducem_reducek
,
out_grid_desc_reducem
);
}
using
ABGridDescs
=
decltype
(
MakeABGridDescriptor_A_M_K_B_M
(
1
,
1
,
{},
{},
{},
{},
{},
{}));
using
ABGridDescs
=
decltype
(
MakeABGridDescriptor_A_M_K_B_M
({},
{},
{},
{},
{},
{},
{},
{},
{}));
using
AGridDesc_M_K
=
remove_cvref_t
<
decltype
(
ABGridDescs
{}[
I0
])
>
;
using
BGridDesc_M
=
remove_cvref_t
<
decltype
(
ABGridDescs
{}[
I1
])
>
;
...
...
@@ -160,36 +195,41 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
Argument
(
const
InDataType
*
p_in_dev
,
OutDataType
*
p_out_dev
,
IndexDataType
*
p_out_indices_dev
,
ck
::
index_t
N
,
ck
::
index_t
C
,
std
::
vector
<
ck
::
index_t
>&
input_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
output_spatial_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_strides
,
std
::
vector
<
ck
::
index_t
>&
input_left_pads
,
std
::
vector
<
ck
::
index_t
>&
input_right_pads
)
std
::
vector
<
ck
::
index_t
>&
input_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>&
output_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>&
input_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>&
output_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>&
,
// indices_ncdhw_stride
std
::
vector
<
ck
::
index_t
>&
window_spatial_zyx_lengths
,
std
::
vector
<
ck
::
index_t
>&
window_zyx_strides
,
std
::
vector
<
ck
::
index_t
>&
window_zyx_dilations
,
std
::
vector
<
ck
::
index_t
>&
input_left_dhw_pads
,
std
::
vector
<
ck
::
index_t
>&
input_right_dhw_pads
)
:
p_in_dev_
{
p_in_dev
},
p_out_dev_
{
p_out_dev
},
p_out_indices_dev_
{
p_out_indices_dev
},
a_grid_desc_m_k_
{},
b_grid_desc_m_
{}
b_grid_desc_m_
{},
input_ncdhw_lengths_
{
input_ncdhw_lengths
},
output_ncdhw_lengths_
{
output_ncdhw_lengths
},
input_ncdhw_stride_
{
input_ncdhw_stride
},
output_ncdhw_stride_
{
output_ncdhw_stride
}
{
const
auto
descs
=
MakeABGridDescriptor_A_M_K_B_M
(
N
,
C
,
input_spatial_lengths
,
window_spatial_lengths
,
output_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
const
auto
descs
=
MakeABGridDescriptor_A_M_K_B_M
(
input_ncdhw_lengths
,
output_ncdhw_lengths
,
input_ncdhw_stride
,
output_ncdhw_stride
,
window_spatial_zyx_lengths
,
window_zyx_strides
,
window_zyx_dilations
,
input_left_dhw_pads
,
input_right_dhw_pads
);
a_grid_desc_m_k_
=
descs
[
I0
];
b_grid_desc_m_
=
descs
[
I1
];
invariant_lowest_length_
=
C
;
int32_t
reduceLength
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
]
*
window_spatial_lengths
[
2
];
int32_t
reduceLength
=
window_spatial_zyx_lengths
[
0
]
*
window_spatial_zyx_lengths
[
1
]
*
window_spatial_zyx_lengths
[
2
];
std
::
tie
(
in_element_op_
,
acc_element_op_
)
=
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
reduceLength
);
...
...
@@ -200,17 +240,25 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
IndexDataType
*
p_out_indices_dev_
;
AGridDesc_M_K
a_grid_desc_m_k_
;
BGridDesc_M
b_grid_desc_m_
;
InElementwiseOperation
in_element_op_
;
AccElementwiseOperation
acc_element_op_
;
// for checking vector load/store
ck
::
index_t
invariant_lowest_length_
;
std
::
vector
<
ck
::
index_t
>
input_ncdhw_lengths_
;
std
::
vector
<
ck
::
index_t
>
output_ncdhw_lengths_
;
std
::
vector
<
ck
::
index_t
>
input_ncdhw_stride_
;
std
::
vector
<
ck
::
index_t
>
output_ncdhw_stride_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
// for NDHWC, the dim C is the fastest dimension, and is not reduced.
// Hence, it is in M dimension for reduction kernel.
static
constexpr
index_t
InSrcOutDstVectorDim
=
0
;
// 0: M, 1: K
using
gridwise_reduce
=
GridwiseReduction_mk_to_m_threadwise
<
InDataType
,
OutDataType
,
...
...
@@ -276,60 +324,66 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
->
invariant_lowest_length_
%
InSrcOutDstVectorSize
!=
0
)
{
// C should be fastest dimension
if
(
pArg
->
input_ncdhw_stride_
[
1
]
!=
1
)
return
false
;
for
(
int
i
=
0
;
i
<
InOutRank
;
++
i
)
{
if
(
pArg
->
input_ncdhw_stride_
[
i
]
==
1
&&
pArg
->
input_ncdhw_lengths_
[
i
]
%
InSrcOutDstVectorSize
!=
0
)
return
false
;
if
(
pArg
->
output_ncdhw_stride_
[
i
]
==
1
&&
pArg
->
output_ncdhw_lengths_
[
i
]
%
InSrcOutDstVectorSize
!=
0
)
return
false
;
}
return
true
;
}
std
::
unique_ptr
<
BaseArgument
>
virtual
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_in_dev
,
void
*
p_out_dev
,
void
*
p_out_indices_dev
,
std
::
vector
<
ck
::
index_t
>
input_lengths
,
std
::
vector
<
ck
::
index_t
>
window_lengths
,
std
::
vector
<
ck
::
index_t
>
output_lengths
,
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NDHWC
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NDHWC
std
::
vector
<
ck
::
index_t
>
,
// Suppose tensor layout = NDHWC
std
::
vector
<
ck
::
index_t
>
window_strides
,
std
::
vector
<
ck
::
index_t
>
input_left_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_pads
,
std
::
vector
<
ck
::
index_t
>
input_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>
window_zyx_lengths
,
std
::
vector
<
ck
::
index_t
>
output_ncdhw_lengths
,
std
::
vector
<
ck
::
index_t
>
input_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>
output_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>
indices_ncdhw_stride
,
std
::
vector
<
ck
::
index_t
>
window_zyx_strides
,
std
::
vector
<
ck
::
index_t
>
window_zyx_dilations
,
std
::
vector
<
ck
::
index_t
>
input_left_dhw_pads
,
std
::
vector
<
ck
::
index_t
>
input_right_dhw_pads
,
std
::
vector
<
ck
::
index_t
>
pooling_dims
)
override
{
if
(
input_lengths
.
size
()
!=
InOutRank
||
window_lengths
.
size
()
!=
WindowRank
||
input_lengths
.
size
()
!=
InOutRank
||
window_strides
.
size
()
!=
WindowRank
||
input_left_pads
.
size
()
!=
WindowRank
||
input_right_pads
.
size
()
!=
WindowRank
)
if
(
input_ncdhw_lengths
.
size
()
!=
InOutRank
||
window_zyx_lengths
.
size
()
!=
WindowRank
||
input_ncdhw_lengths
.
size
()
!=
InOutRank
||
window_zyx_strides
.
size
()
!=
WindowRank
||
window_zyx_dilations
.
size
()
!=
WindowRank
||
input_left_dhw_pads
.
size
()
!=
WindowRank
||
input_right_dhw_pads
.
size
()
!=
WindowRank
)
throw
std
::
runtime_error
(
"dimension is incorrect"
);
if
(
pooling_dims
!=
std
::
vector
<
ck
::
index_t
>
{
2
,
3
,
4
})
throw
std
::
runtime_error
(
"pooling_dims only support {2, 3, 4} in pool3d so far"
);
index_t
N
=
input_lengths
[
0
];
index_t
C
=
input_lengths
[
1
];
index_t
Di
=
input_lengths
[
2
];
index_t
Hi
=
input_lengths
[
3
];
index_t
Wi
=
input_lengths
[
4
];
index_t
Do
=
output_lengths
[
2
];
index_t
Ho
=
output_lengths
[
3
];
index_t
Wo
=
output_lengths
[
4
];
std
::
vector
<
ck
::
index_t
>
input_spatial_lengths
=
{
Di
,
Hi
,
Wi
};
std
::
vector
<
ck
::
index_t
>
output_spatial_lengths
=
{
Do
,
Ho
,
Wo
};
if
(
output_ncdhw_stride
!=
indices_ncdhw_stride
)
throw
std
::
runtime_error
(
"output_ncdhw_stride need to be equal to indices_ncdhw_stride for now"
);
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
InDataType
*>
(
p_in_dev
),
static_cast
<
OutDataType
*>
(
p_out_dev
),
static_cast
<
IndexDataType
*>
(
p_out_indices_dev
),
N
,
C
,
input_spatial_lengths
,
window_lengths
,
output_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
input_ncdhw_lengths
,
output_ncdhw_lengths
,
input_ncdhw_stride
,
output_ncdhw_stride
,
indices_ncdhw_stride
,
window_zyx_lengths
,
window_zyx_strides
,
window_zyx_dilations
,
input_left_dhw_pads
,
input_right_dhw_pads
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
override
...
...
@@ -342,7 +396,7 @@ struct DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DevicePool3dFwd_
Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_
C<"
<<
BlockSize
<<
","
;
str
<<
"DevicePool3dFwd_
NDHWC_NDHW
C<"
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"InSrcOutDstVectorSize_"
<<
InSrcOutDstVectorSize
<<
">"
;
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment