Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
fa9da1a4
Commit
fa9da1a4
authored
Jun 19, 2023
by
Jun Liu
Browse files
Merge branch 'amd-develop' into amd-master
parents
4c105089
457308e3
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
843 additions
and
291 deletions
+843
-291
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+1
-1
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+8
-1
example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
...s_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+3
-2
example/48_pool3d_fwd/CMakeLists.txt
example/48_pool3d_fwd/CMakeLists.txt
+2
-0
example/48_pool3d_fwd/pool3d_fwd_common.hpp
example/48_pool3d_fwd/pool3d_fwd_common.hpp
+187
-0
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+83
-0
example/49_maxpool2d_bwd/CMakeLists.txt
example/49_maxpool2d_bwd/CMakeLists.txt
+3
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+222
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+62
-0
example/50_put_element/CMakeLists.txt
example/50_put_element/CMakeLists.txt
+1
-0
example/50_put_element/put_element_fp16.cpp
example/50_put_element/put_element_fp16.cpp
+88
-0
include/ck/ck.hpp
include/ck/ck.hpp
+12
-8
include/ck/host_utility/device_prop.hpp
include/ck/host_utility/device_prop.hpp
+1
-1
include/ck/host_utility/hip_check_error.hpp
include/ck/host_utility/hip_check_error.hpp
+1
-1
include/ck/host_utility/io.hpp
include/ck/host_utility/io.hpp
+1
-1
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+1
-1
include/ck/host_utility/stream_utility.hpp
include/ck/host_utility/stream_utility.hpp
+43
-0
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
...ckward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+0
-275
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
...
...
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
View file @
fa9da1a4
add_example_executable
(
example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp
)
# Register the example only if at least one requested GPU target is in gpu_list.
# add_example_executable is a project-provided command defined outside this fragment.
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942)

# `target` flips to 1 after the first match so the executable is registered at most once.
set(target 0)
foreach(gpu IN LISTS GPU_TARGETS)
    if(gpu IN_LIST gpu_list AND target EQUAL 0)
        add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
        set(target 1)
    endif()
endforeach()
example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
...
...
@@ -121,7 +121,8 @@ using DeviceOpInstance =
2
,
// CShuffleNXdlPerWavePerShuffle
S
<
1
,
32
,
1
,
8
>
,
// CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8
,
// CShuffleBlockTransferScalarPerVector_NPerBlock
MaskingSpec
>
;
// MaskingSpecialization
MaskingSpec
,
// MaskingSpecialization
1
>
;
// Ref Gemm0: fp16 in, fp32 out
using
ReferenceGemm0Instance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchedGemm
<
ADataType
,
...
...
example/48_pool3d_fwd/CMakeLists.txt
0 → 100644
View file @
fa9da1a4
# fp16 3D forward pooling example (add_example_executable is defined elsewhere in the project).
add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp)
example/48_pool3d_fwd/pool3d_fwd_common.hpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
ComputeDataType
,
typename
IndexDataType
,
typename
InLayout
,
typename
OutLayout
,
ck
::
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
OutputIndex
>
bool
pool3d_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Z
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Di
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_d
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_d
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_d
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ReduceOpId
,
OutputIndex
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
4
>
;
// InSrcOutDstVectorSize
const
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Z
)
/
window_stride_d
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Z
,
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_d
,
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_d
,
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_d
,
in_right_pad_h
,
in_right_pad_w
};
// tensor layout
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
}
};
Tensor
<
InDataType
>
in_n_c_di_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_di_hi_wi: "
<<
in_n_c_di_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_do_ho_wo: "
<<
out_n_c_do_ho_wo_host
.
mDesc
<<
std
::
endl
;
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_di_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_c_do_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_indices_n_c_do_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in_n_c_di_hi_wi
.
mData
.
data
());
auto
pool
=
DevicePoolFwdInstance
{};
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
out_indices_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Di
,
Hi
,
Wi
},
{
Z
,
Y
,
X
},
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
Di
*
C
*
Hi
*
Wi
,
1
,
C
*
Hi
*
Wi
,
Wi
*
C
,
C
},
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
},
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
},
window_strides
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
if
(
!
pool
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! device_op with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Perf: "
<<
ave_time
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
5
,
3
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
;
auto
ref_pooling
=
ReferencePoolingFwdInstance
{};
auto
ref_pooling_invoker
=
ref_pooling
.
MakeInvoker
();
auto
ref_pooling_argument
=
ref_pooling
.
MakeArgument
(
in_n_c_di_hi_wi
,
out_n_c_do_ho_wo_host
,
out_indices_n_c_do_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
ref_pooling_invoker
.
Run
(
ref_pooling_argument
);
out_device_buf
.
FromDevice
(
out_n_c_do_ho_wo_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_do_ho_wo_device
,
out_n_c_do_ho_wo_host
);
if
constexpr
(
OutputIndex
)
{
out_indices_device_buf
.
FromDevice
(
out_indices_n_c_do_ho_wo_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices_n_c_do_ho_wo_device
,
out_indices_n_c_do_ho_wo_host
);
};
}
return
(
pass
);
};
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "pool3d_fwd_common.hpp"
// Element types used by this example.
using InDataType      = ck::half_t;
using OutDataType     = ck::half_t;
using ComputeDataType = float;
using IndexDataType   = int32_t;

// Both tensors use the NDHWC layout tag.
using InLayout  = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;

// Flip the condition to build the AVG variant instead of MAX.
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

/// Runs pool3d_test with a fixed problem size; exit code is 0 on success and 1 on failure.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 2;
    const ck::index_t C = 32;

    // Window lengths along D, H, W.
    const ck::index_t Z = 2;
    const ck::index_t Y = 2;
    const ck::index_t X = 2;

    // Input spatial lengths.
    const ck::index_t Di = 30;
    const ck::index_t Hi = 30;
    const ck::index_t Wi = 30;

    const ck::index_t window_stride_d = 2;
    const ck::index_t window_stride_h = 2;
    const ck::index_t window_stride_w = 2;

    const ck::index_t in_left_pad_d = 1;
    const ck::index_t in_left_pad_h = 1;
    const ck::index_t in_left_pad_w = 1;

    const ck::index_t in_right_pad_d = 1;
    const ck::index_t in_right_pad_h = 1;
    const ck::index_t in_right_pad_w = 1;

    const bool pass = pool3d_test<InDataType,
                                  OutDataType,
                                  ComputeDataType,
                                  IndexDataType,
                                  InLayout,
                                  OutLayout,
                                  ReduceOpId,
                                  PropagateNan,
                                  OutputIndex>(do_verification,
                                               time_kernel,
                                               N,
                                               C,
                                               Z,
                                               Y,
                                               X,
                                               Di,
                                               Hi,
                                               Wi,
                                               window_stride_d,
                                               window_stride_h,
                                               window_stride_w,
                                               in_left_pad_d,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_d,
                                               in_right_pad_h,
                                               in_right_pad_w);

    if(pass)
    {
        return 0;
    }
    return 1;
}
example/49_maxpool2d_bwd/CMakeLists.txt
0 → 100644
View file @
fa9da1a4
# One max-pool 2D backward example per element type
# (add_example_executable is defined elsewhere in the project).
add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
// Forward-pass element types (bf16 data, 32-bit indices, fp32 compute).
using InDataType      = ck::bhalf_t;
using OutDataType     = ck::bhalf_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;

// Gradient element types.
using DInDataType  = ck::bhalf_t;
using DOutDataType = ck::bhalf_t;

static constexpr bool PropagateNan = false;

/// Runs maxpool_bwd_test with a fixed problem size; exit code is 0 on success and 1 on failure.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 1;
    const ck::index_t C = 1;

    // Window lengths along H and W.
    const ck::index_t Y = 3;
    const ck::index_t X = 3;

    // Input spatial lengths.
    const ck::index_t Hi = 32;
    const ck::index_t Wi = 32;

    const ck::index_t window_stride_h = 1;
    const ck::index_t window_stride_w = 1;

    const ck::index_t in_left_pad_h  = 0;
    const ck::index_t in_left_pad_w  = 0;
    const ck::index_t in_right_pad_h = 0;
    const ck::index_t in_right_pad_w = 0;

    const bool pass = maxpool_bwd_test<InDataType,
                                       OutDataType,
                                       IndexDataType,
                                       ComputeDataType,
                                       DInDataType,
                                       DOutDataType,
                                       PropagateNan>(do_verification,
                                                     time_kernel,
                                                     N,
                                                     C,
                                                     Y,
                                                     X,
                                                     Hi,
                                                     Wi,
                                                     window_stride_h,
                                                     window_stride_w,
                                                     in_left_pad_h,
                                                     in_left_pad_w,
                                                     in_right_pad_h,
                                                     in_right_pad_w);

    if(pass)
    {
        return 0;
    }
    return 1;
}
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
ComputeDataType
,
typename
DInDataType
,
typename
DOutDataType
,
bool
PropagateNan
>
bool
maxpool_bwd_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ck
::
ReduceTensorOp
::
MAX
,
true
,
// OutputIndex
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
using
DeviceMaxPoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceIndexPoolBwdImpl
<
DOutDataType
,
IndexDataType
,
DInDataType
,
4
>
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_h
,
in_right_pad_w
};
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
// reference need Tensor with NCHW order
return
HostTensorDescriptor
({
N_
,
C_
,
H
,
W
},
{
C_
*
H
*
W
,
1
_uz
,
W
*
C_
,
C_
});
};
// in
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
// out
Tensor
<
OutDataType
>
out_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
OutDataType
>
out_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// indices
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// dout
Tensor
<
DOutDataType
>
dout_n_c_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// din
Tensor
<
DInDataType
>
din_n_c_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_ho_wo: "
<<
out_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"indices_n_c_ho_wo: "
<<
indices_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"dout_n_c_ho_wo: "
<<
dout_n_c_ho_wo
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"din_n_c_hi_wi: "
<<
din_n_c_hi_wi_host
.
mDesc
<<
std
::
endl
;
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
dout_n_c_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
-
1.0
,
1.0
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
indices_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
dout_device_buf
.
ToDevice
(
dout_n_c_ho_wo
.
mData
.
data
());
auto
pool_fwd
=
DevicePoolFwdInstance
{};
auto
pool_fwd_invoker_ptr
=
pool_fwd
.
MakeInvokerPointer
();
auto
pool_fwd_argument_ptr
=
pool_fwd
.
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Hi
,
Wi
},
window_spatial_lengths
,
{
N
,
C
,
Ho
,
Wo
},
{
C
*
Hi
*
Wi
,
1
,
Wi
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
window_strides
,
input_left_pads
,
input_right_pads
,
{
2
,
3
});
if
(
!
pool_fwd
.
IsSupportedArgument
(
pool_fwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_fwd with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time_fwd
=
pool_fwd_invoker_ptr
->
Run
(
pool_fwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
auto
pool_bwd
=
DeviceMaxPoolBwdInstance
{};
auto
pool_bwd_invoker_ptr
=
pool_bwd
.
MakeInvokerPointer
();
auto
pool_bwd_argument_ptr
=
pool_bwd
.
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
window_spatial_lengths
,
window_strides
);
if
(
!
pool_bwd
.
IsSupportedArgument
(
pool_bwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_bwd with the specified compilation parameters does "
"not support this problem"
);
}
size_t
pool_bwd_workspace_sz
=
pool_bwd
.
GetWorkSpaceSize
(
pool_bwd_argument_ptr
.
get
());
DeviceMem
pool_bwd_workspace_device_buf
(
pool_bwd_workspace_sz
);
pool_bwd
.
SetWorkSpacePointer
(
pool_bwd_argument_ptr
.
get
(),
pool_bwd_workspace_device_buf
.
GetDeviceBuffer
());
float
ave_time_bwd
=
pool_bwd_invoker_ptr
->
Run
(
pool_bwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Pool fwd perf: "
<<
ave_time_fwd
<<
" ms"
<<
std
::
endl
;
std
::
cout
<<
"Pool bwd perf: "
<<
ave_time_bwd
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
4
,
2
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ck
::
ReduceTensorOp
::
MAX
,
PropagateNan
,
true
>
;
auto
ref_pooling_fwd
=
ReferencePoolingFwdInstance
{};
auto
ref_pooling_fwd_invoker
=
ref_pooling_fwd
.
MakeInvoker
();
auto
ref_pooling_fwd_argument
=
ref_pooling_fwd
.
MakeArgument
(
in_n_c_hi_wi
,
out_n_c_ho_wo_host
,
indices_n_c_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
ref_pooling_fwd_invoker
.
Run
(
ref_pooling_fwd_argument
);
using
ReferencePoolingBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
PassThrough
>
;
auto
ref_pooling_bwd
=
ReferencePoolingBwdInstance
{};
auto
ref_pooling_bwd_invoker
=
ref_pooling_bwd
.
MakeInvoker
();
auto
ref_pooling_bwd_argument
=
ref_pooling_bwd
.
MakeArgument
(
dout_n_c_ho_wo
,
indices_n_c_ho_wo_host
,
din_n_c_hi_wi_host
,
PassThrough
{});
ref_pooling_bwd_invoker
.
Run
(
ref_pooling_bwd_argument
);
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
mData
.
data
());
indices_device_buf
.
FromDevice
(
indices_n_c_ho_wo_device
.
mData
.
data
());
din_device_buf
.
FromDevice
(
din_n_c_hi_wi_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
,
out_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
indices_n_c_ho_wo_device
,
indices_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
din_n_c_hi_wi_device
,
din_n_c_hi_wi_host
);
}
return
(
pass
);
};
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
// Forward-pass element types (fp16 data, 32-bit indices, fp32 compute).
using InDataType      = ck::half_t;
using OutDataType     = ck::half_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;

// Gradient element types.
using DInDataType  = ck::half_t;
using DOutDataType = ck::half_t;

static constexpr bool PropagateNan = false;

/// Runs maxpool_bwd_test with a fixed problem size; exit code is 0 on success and 1 on failure.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 1;
    const ck::index_t C = 1;

    // Window lengths along H and W.
    const ck::index_t Y = 3;
    const ck::index_t X = 3;

    // Input spatial lengths.
    const ck::index_t Hi = 32;
    const ck::index_t Wi = 32;

    const ck::index_t window_stride_h = 1;
    const ck::index_t window_stride_w = 1;

    const ck::index_t in_left_pad_h  = 0;
    const ck::index_t in_left_pad_w  = 0;
    const ck::index_t in_right_pad_h = 0;
    const ck::index_t in_right_pad_w = 0;

    const bool pass = maxpool_bwd_test<InDataType,
                                       OutDataType,
                                       IndexDataType,
                                       ComputeDataType,
                                       DInDataType,
                                       DOutDataType,
                                       PropagateNan>(do_verification,
                                                     time_kernel,
                                                     N,
                                                     C,
                                                     Y,
                                                     X,
                                                     Hi,
                                                     Wi,
                                                     window_stride_h,
                                                     window_stride_w,
                                                     in_left_pad_h,
                                                     in_left_pad_w,
                                                     in_right_pad_h,
                                                     in_right_pad_w);

    if(pass)
    {
        return 0;
    }
    return 1;
}
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using
InDataType
=
float
;
using
OutDataType
=
float
;
using
IndexDataType
=
int32_t
;
using
ComputeDataType
=
float
;
using
DInDataType
=
float
;
using
DOutDataType
=
float
;
static
constexpr
bool
PropagateNan
=
false
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
1
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Hi
=
32
;
ck
::
index_t
Wi
=
32
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
in_left_pad_h
=
0
;
ck
::
index_t
in_left_pad_w
=
0
;
ck
::
index_t
in_right_pad_h
=
0
;
ck
::
index_t
in_right_pad_w
=
0
;
bool
pass
=
maxpool_bwd_test
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
DOutDataType
,
PropagateNan
>
(
do_verification
,
time_kernel
,
N
,
C
,
Y
,
X
,
Hi
,
Wi
,
window_stride_h
,
window_stride_w
,
in_left_pad_h
,
in_left_pad_w
,
in_right_pad_h
,
in_right_pad_w
);
return
(
pass
?
0
:
1
);
}
example/50_put_element/CMakeLists.txt
0 → 100644
View file @
fa9da1a4
# fp16 put-element example (add_example_executable is defined elsewhere in the project).
add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
example/50_put_element/put_element_fp16.cpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
// Element types: fp16 source and destination, 32-bit indices.
using XDataType     = ck::half_t;
using YDataType     = ck::half_t;
using IndexDataType = int32_t;

// Element-wise op forwarded to the device op for the destination.
using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;

using DeviceInstance =
    ck::tensor_operation::device::DevicePutElementImpl<XDataType,
                                                       IndexDataType,
                                                       YDataType,
                                                       YElementwiseOp,
                                                       ck::InMemoryDataOperationEnum::Set,
                                                       1>;

/// Runs the put-element device op with identity indices and compares the result with a host
/// loop that performs y_host(indices(i)) = x(i). Exit code is 0 on success and 1 on failure.
int main()
{
    bool do_verification = true;
    bool time_kernel     = false;

    int N = 1024;

    Tensor<XDataType> x(HostTensorDescriptor{N, 1});
    Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
    Tensor<YDataType> y(HostTensorDescriptor{N, 1});

    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});

    // Identity mapping: element k of x is written to position k of y.
    for(int k = 0; k < N; ++k)
    {
        indices(k) = k;
    }

    DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
    DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());

    x_device_buf.ToDevice(x.mData.data());
    indices_device_buf.ToDevice(indices.mData.data());

    auto put_instance     = DeviceInstance{};
    auto put_invoker_ptr  = put_instance.MakeInvokerPointer();
    auto put_argument_ptr = put_instance.MakeArgumentPointer(
        static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
        N,
        N,
        YElementwiseOp{});

    if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
    {
        throw std::runtime_error("argument is not supported!");
    }

    float ave_time =
        put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    std::cout << "perf: " << ave_time << " ms" << std::endl;

    if(!do_verification)
    {
        return 0;
    }

    // Host reference: scatter x into y_host through the same index tensor.
    Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});

    for(int k = 0; k < N; ++k)
    {
        IndexDataType dst = indices(k);
        y_host(dst)       = x(k);
    }

    y_device_buf.FromDevice(y.mData.data());

    const bool pass = ck::utils::check_err(y, y_host);

    return pass ? 0 : 1;
}
include/ck/ck.hpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -31,7 +31,8 @@
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) // for GPU code
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
...
...
@@ -44,7 +45,7 @@
#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
#define CK_USE_AMD_V_MAC_F32
#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
defined(__gfx940__) // for GPU code
defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
// for GPU code
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8
...
...
@@ -53,15 +54,16 @@
// MFMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_MFMA
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_USE_AMD_MFMA
#endif
#if(defined(__gfx90a__) || defined(__gfx940__))
#if(defined(__gfx90a__) || defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
)
#define CK_USE_AMD_MFMA_BF16_1K_OP
#endif
#if defined(__gfx940__)
#if defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
#define CK_USE_AMD_MFMA_GFX940
#endif
...
...
@@ -84,13 +86,15 @@
// buffer atomic add: floating point
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#else // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
#if(defined(__gfx90a__) || defined(__gfx940__)) // for GPU code
#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
#else
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
...
...
include/ck/host_utility/device_prop.hpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/hip_check_error.hpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/io.hpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/kernel_launch.hpp
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/stream_utility.hpp
0 → 100644
View file @
fa9da1a4
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
// Returns the number of bits set in the compute-unit mask that
// hipExtStreamGetCUMask reports for stream_config.stream_id_.
//
// The mask is read into a fixed buffer of 64 dwords, so at most
// 64 * 32 = 2048 CU bits are considered.
// NOTE(review): hip_check_error presumably reports a failing HIP status;
// confirm its behaviour in hip_check_error.hpp.
static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
{
    constexpr int kMaskDwords = 64; // assume at most 64*32 = 2048 CUs

    // Zero the whole buffer up front so that any dword the runtime leaves
    // untouched contributes nothing to the count.
    uint32_t mask_words[kMaskDwords] = {};

    hip_check_error(
        hipExtStreamGetCUMask(stream_config.stream_id_, kMaskDwords, &mask_words[0]));

    // Sum the set bits of every dword, shifting each one out bit by bit.
    int enabled_cus = 0;
    for(int idx = 0; idx < kMaskDwords; ++idx)
    {
        for(uint32_t bits = mask_words[idx]; bits != 0; bits = bits >> 1)
        {
            if(bits & 0x1)
                ++enabled_cus;
        }
    }

    return enabled_cus;
}
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
deleted
100644 → 0
View file @
4c105089
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {

// Builds the three grid descriptors that express one backward-data convolution
// sub-problem (NHWC input, KYXC weight, NHWK output) as a GEMM.
//
// The problem is split into YTilde * XTilde GEMMs; the compile-time arguments
// Number<IYTildeValue> / Number<IXTildeValue> select which of them is described.
//
//   Number of GEMMs = YTilde * XTilde
//   GemmM = C
//   GemmN = N * HTildeSlice * WTildeSlice
//   GemmK = K * YDotSlice * XDotSlice   (further split into GemmK0 x GemmK1)
//
// Parameters:
//   wei_k_y_x_c_grid_desc    weight descriptor, dimensions [K, Y, X, C]
//   out_n_ho_wo_k_grid_desc  output descriptor, dimensions [N, Ho, Wo, K]
//   in_n_hi_wi_c_grid_desc   input descriptor,  dimensions [N, Hi, Wi, C]
//   conv_strides             (H, W) convolution strides
//   conv_dilations           (H, W) convolution dilations
//   in_left_pads             (H, W) left padding of the input
//   in_right_pads            (H, W) right padding of the input
//   Number<IYTildeValue>     index of this GEMM along YTilde
//   Number<IXTildeValue>     index of this GEMM along XTilde
//   Number<GemmK1Value>      length of the innermost GemmK1 dimension
//
// Returns a tuple of
//   (wei_gemmk0_gemmm_gemmk1_grid_desc,
//    out_gemmk0_gemmn_gemmk1_grid_desc,
//    in_gemmm_gemmn_grid_desc).
template <typename... Wei,
          typename... In,
          typename... Out,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads,
          index_t IYTildeValue,
          index_t IXTildeValue,
          index_t GemmK1Value>
__host__ __device__ constexpr auto
transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(
    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    Number<IYTildeValue>,
    Number<IXTildeValue>,
    Number<GemmK1Value>)
{
    // Compile-time indices.
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    // Compile-time selectors taken from the tag arguments.
    constexpr auto IYTilde = Number<IYTildeValue>{};
    constexpr auto IXTilde = Number<IXTildeValue>{};
    constexpr auto GemmK1  = Number<GemmK1Value>{};

    // Input tensor lengths: [N, Hi, Wi, C].
    const auto N  = in_n_hi_wi_c_grid_desc.GetLength(I0);
    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
    const auto C  = in_n_hi_wi_c_grid_desc.GetLength(I3);

    // Output tensor lengths: [N, Ho, Wo, K].
    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
    const auto K  = out_n_ho_wo_k_grid_desc.GetLength(I3);

    // Weight tensor filter lengths: [K, Y, X, C].
    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);

    // Convolution parameters, H first, W second.
    const auto ConvStrideH   = conv_strides[I0];
    const auto ConvStrideW   = conv_strides[I1];
    const auto ConvDilationH = conv_dilations[I0];
    const auto ConvDilationW = conv_dilations[I1];
    const auto InLeftPadH    = in_left_pads[I0];
    const auto InLeftPadW    = in_left_pads[I1];
    const auto InRightPadH   = in_right_pads[I0];
    const auto InRightPadW   = in_right_pads[I1];

    // Stride / dilation decomposition that defines the "tilde" and "dot" dimensions.
    const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
    const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

    const auto YTilde = ConvStrideH / GcdStrideDilationH;
    const auto XTilde = ConvStrideW / GcdStrideDilationW;

    const auto YDot = math::integer_divide_ceil(Y, YTilde);
    const auto XDot = math::integer_divide_ceil(X, XTilde);

    const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
    const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);

    // Only work on the HTilde / WTilde range that contributes to the non-padding
    // area of the input tensor.
    const auto HBeginNumerator = math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1));
    const auto WBeginNumerator = math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1));

    const auto IHTildeSliceBegin = math::integer_divide_floor(HBeginNumerator, ConvStrideH);
    const auto IWTildeSliceBegin = math::integer_divide_floor(WBeginNumerator, ConvStrideW);

    const auto HEndUnclamped =
        math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1;
    const auto WEndUnclamped =
        math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1;

    const auto IHTildeSliceEnd = math::min(HTilde, HEndUnclamped);
    const auto IWTildeSliceEnd = math::min(WTilde, WEndUnclamped);

    const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin;
    const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin;

    // GemmK is different for each GEMM: it depends on the selected IYTilde / IXTilde.
    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilde, YTilde);
    const auto XDotSlice = math::integer_divide_ceil(X - IXTilde, XTilde);

    // K is split as K0 x K1 with K1 fixed at compile time.
    const auto K1 = GemmK1;
    const auto K0 = K / K1;

    // ------------------------------------------------------------------
    // Weight tensor
    // ------------------------------------------------------------------

    // [K, Y, X, C] -> [K, YDot, YTilde, XDot, XTilde, C]
    const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor(
        wei_k_y_x_c_grid_desc,
        make_tuple(make_pass_through_transform(K),
                   make_embed_transform(make_tuple(YDot, YTilde),
                                        make_tuple(ConvStrideH / GcdStrideDilationH, I1)),
                   make_embed_transform(make_tuple(XDot, XTilde),
                                        make_tuple(ConvStrideW / GcdStrideDilationW, I1)),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // [K, YDot, YTilde, XDot, XTilde, C] -> [K0, K1, YDotSlice, XDotSlice, C]
    // YTilde / XTilde are frozen at the selected indices and therefore disappear.
    const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = transform_tensor_descriptor(
        wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
                   make_slice_transform(YDot, I0, YDotSlice),
                   make_slice_transform(XDot, I0, XDotSlice),
                   make_freeze_transform(IYTilde),
                   make_freeze_transform(IXTilde),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<3>{},
                   Sequence<2>{},
                   Sequence<4>{},
                   Sequence<5>{}),
        make_tuple(Sequence<0, 1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<>{},
                   Sequence<>{},
                   Sequence<4>{}));

    // [K0, K1, YDotSlice, XDotSlice, C] -> [GemmK0, GemmM, GemmK1]
    // The active branch merges GemmK0 in (YDotSlice, XDotSlice, K0) order; the
    // disabled branch keeps the alternative (K0, YDotSlice, XDotSlice) order.
#if 1
    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
                   make_pass_through_transform(C),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#else
    const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
        wei_k0_k1_ydotslice_xdotslice_c_grid_desc,
        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
                   make_pass_through_transform(C),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#endif

    // ------------------------------------------------------------------
    // Output tensor
    // ------------------------------------------------------------------

    // Zero-width pad transforms on Ho / Wo: this adds the padding check.
    const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor(
        out_n_ho_wo_k_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_pad_transform(Ho, I0, I0),
                   make_pad_transform(Wo, I0, I0),
                   make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // [N, Hop, Wop, K] -> [N, YDot, HTilde, XDot, WTilde, K]
    // Note the negative coefficient on the YDot / XDot components.
    const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor(
        out_n_hop_wop_k_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_embed_transform(make_tuple(YDot, HTilde),
                                        make_tuple(-ConvDilationH / GcdStrideDilationH, I1)),
                   make_embed_transform(make_tuple(XDot, WTilde),
                                        make_tuple(-ConvDilationW / GcdStrideDilationW, I1)),
                   make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // [N, YDot, HTilde, XDot, WTilde, K]
    //   -> [N, YDotSlice, HTildeSlice, XDotSlice, WTildeSlice, K0, K1]
    const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc =
        transform_tensor_descriptor(
            out_n_ydot_htilde_xdot_wtilde_k_grid_desc,
            make_tuple(make_pass_through_transform(N),
                       make_slice_transform(YDot, I0, YDotSlice),
                       make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
                       make_slice_transform(XDot, I0, XDotSlice),
                       make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
                       make_unmerge_transform(make_tuple(K0, K1))),
            make_tuple(Sequence<0>{},
                       Sequence<1>{},
                       Sequence<2>{},
                       Sequence<3>{},
                       Sequence<4>{},
                       Sequence<5>{}),
            make_tuple(Sequence<0>{},
                       Sequence<1>{},
                       Sequence<2>{},
                       Sequence<3>{},
                       Sequence<4>{},
                       Sequence<5, 6>{}));

    // -> [GemmK0, GemmN, GemmK1]; GemmK0 merge order matches the weight branch above.
#if 1
    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#else
    const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc,
        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#endif

    // ------------------------------------------------------------------
    // Input tensor
    // ------------------------------------------------------------------

    // [N, Hi, Wi, C] -> [N, Hip, Wip, C] with the requested left / right padding.
    const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
        in_n_hi_wi_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // [N, Hip, Wip, C] -> [N, YTilde, HTilde, XTilde, WTilde, C]
    const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor(
        in_n_hip_wip_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_embed_transform(make_tuple(YTilde, HTilde),
                                        make_tuple(ConvDilationH, ConvStrideH)),
                   make_embed_transform(make_tuple(XTilde, WTilde),
                                        make_tuple(ConvDilationW, ConvStrideW)),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // [N, YTilde, HTilde, XTilde, WTilde, C] -> [N, HTildeSlice, WTildeSlice, C]
    // YTilde / XTilde are frozen at the selected indices and therefore disappear.
    const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor(
        in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_freeze_transform(IYTilde),
                   make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice),
                   make_freeze_transform(IXTilde),
                   make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<4>{},
                   Sequence<5>{}),
        make_tuple(Sequence<0>{},
                   Sequence<>{},
                   Sequence<1>{},
                   Sequence<>{},
                   Sequence<2>{},
                   Sequence<3>{}));

    // [N, HTildeSlice, WTildeSlice, C] -> [GemmM, GemmN]
    const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor(
        in_n_htildeslice_wtildeslice_c_grid_desc,
        make_tuple(make_pass_through_transform(C),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))),
        make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc,
                      out_gemmk0_gemmn_gemmk1_grid_desc,
                      in_gemmm_gemmn_grid_desc);
}

} // namespace ck
#endif
Prev
1
…
10
11
12
13
14
15
16
17
18
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment