Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
06d2c7b1
Commit
06d2c7b1
authored
Jun 28, 2023
by
Jing Zhang
Committed by
root
Jun 28, 2023
Browse files
clean
parents
b27909a0
3b18f1e3
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
847 additions
and
291 deletions
+847
-291
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
+1
-1
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+8
-1
example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
...s_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+3
-2
example/48_pool3d_fwd/CMakeLists.txt
example/48_pool3d_fwd/CMakeLists.txt
+2
-0
example/48_pool3d_fwd/pool3d_fwd_common.hpp
example/48_pool3d_fwd/pool3d_fwd_common.hpp
+187
-0
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
+83
-0
example/49_maxpool2d_bwd/CMakeLists.txt
example/49_maxpool2d_bwd/CMakeLists.txt
+3
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
+222
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
+62
-0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
+62
-0
example/50_put_element/CMakeLists.txt
example/50_put_element/CMakeLists.txt
+1
-0
example/50_put_element/put_element_fp16.cpp
example/50_put_element/put_element_fp16.cpp
+88
-0
include/ck/ck.hpp
include/ck/ck.hpp
+16
-8
include/ck/host_utility/device_prop.hpp
include/ck/host_utility/device_prop.hpp
+1
-1
include/ck/host_utility/hip_check_error.hpp
include/ck/host_utility/hip_check_error.hpp
+1
-1
include/ck/host_utility/io.hpp
include/ck/host_utility/io.hpp
+1
-1
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+1
-1
include/ck/host_utility/stream_utility.hpp
include/ck/host_utility/stream_utility.hpp
+43
-0
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
...ckward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
+0
-275
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
example/46_gemm_add_multiply/gemm_add_multiply_xdl_fp16.cpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
...
...
example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
View file @
06d2c7b1
add_example_executable
(
example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp
)
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp
)
set
(
target 1
)
endif
()
endforeach
()
example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <vector>
...
...
@@ -121,7 +121,8 @@ using DeviceOpInstance =
2
,
// CShuffleNXdlPerWavePerShuffle
S
<
1
,
32
,
1
,
8
>
,
// CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
8
,
// CShuffleBlockTransferScalarPerVector_NPerBlock
MaskingSpec
>
;
// MaskingSpecialization
MaskingSpec
,
// MaskingSpecialization
1
>
;
// Ref Gemm0: fp16 in, fp32 out
using
ReferenceGemm0Instance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchedGemm
<
ADataType
,
...
...
example/48_pool3d_fwd/CMakeLists.txt
0 → 100644
View file @
06d2c7b1
# Registers the fp16 3D pooling forward example: target name first, source file second.
# add_example_executable is a project-provided helper defined outside this file.
add_example_executable
(
example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp
)
example/48_pool3d_fwd/pool3d_fwd_common.hpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool3d_fwd_ndhwc_ndhwc.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
ComputeDataType
,
typename
IndexDataType
,
typename
InLayout
,
typename
OutLayout
,
ck
::
ReduceTensorOp
ReduceOpId
,
bool
PropagateNan
,
bool
OutputIndex
>
bool
pool3d_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Z
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Di
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_d
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_d
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_d
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool3dFwd_Input_N_Di_Hi_Wi_C_Output_N_Do_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ReduceOpId
,
OutputIndex
,
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
4
>
;
// InSrcOutDstVectorSize
const
ck
::
index_t
Do
=
(
Di
+
in_left_pad_d
+
in_right_pad_d
-
Z
)
/
window_stride_d
+
1
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Z
,
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_d
,
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_d
,
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_d
,
in_right_pad_h
,
in_right_pad_w
};
// tensor layout
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
D
,
std
::
size_t
H
,
std
::
size_t
W
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NCDHW
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
C_
*
D
*
H
*
W
,
D
*
H
*
W
,
H
*
W
,
W
,
1
_uz
});
}
else
if
constexpr
(
ck
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
convolution
::
NDHWC
>::
value
)
{
return
HostTensorDescriptor
({
N_
,
C_
,
D
,
H
,
W
},
{
D
*
C_
*
H
*
W
,
1
_uz
,
C_
*
H
*
W
,
W
*
C_
,
C_
});
}
};
Tensor
<
InDataType
>
in_n_c_di_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Di
,
Hi
,
Wi
,
InLayout
{}));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
OutDataType
>
out_n_c_do_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
Tensor
<
IndexDataType
>
out_indices_n_c_do_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Do
,
Ho
,
Wo
,
OutLayout
{}));
std
::
cout
<<
"in_n_c_di_hi_wi: "
<<
in_n_c_di_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_do_ho_wo: "
<<
out_n_c_do_ho_wo_host
.
mDesc
<<
std
::
endl
;
in_n_c_di_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_di_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_c_do_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_indices_device_buf
(
sizeof
(
IndexDataType
)
*
out_indices_n_c_do_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in_n_c_di_hi_wi
.
mData
.
data
());
auto
pool
=
DevicePoolFwdInstance
{};
auto
invoker_ptr
=
pool
.
MakeInvokerPointer
();
auto
argument_ptr
=
pool
.
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
out_indices_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Di
,
Hi
,
Wi
},
{
Z
,
Y
,
X
},
{
N
,
C
,
Do
,
Ho
,
Wo
},
{
Di
*
C
*
Hi
*
Wi
,
1
,
C
*
Hi
*
Wi
,
Wi
*
C
,
C
},
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
},
{
Do
*
C
*
Ho
*
Wo
,
1
,
C
*
Ho
*
Wo
,
Wo
*
C
,
C
},
window_strides
,
input_left_pads
,
input_right_pads
,
{
2
,
3
,
4
});
if
(
!
pool
.
IsSupportedArgument
(
argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! device_op with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Perf: "
<<
ave_time
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
5
,
3
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ReduceOpId
,
PropagateNan
,
OutputIndex
>
;
auto
ref_pooling
=
ReferencePoolingFwdInstance
{};
auto
ref_pooling_invoker
=
ref_pooling
.
MakeInvoker
();
auto
ref_pooling_argument
=
ref_pooling
.
MakeArgument
(
in_n_c_di_hi_wi
,
out_n_c_do_ho_wo_host
,
out_indices_n_c_do_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
ref_pooling_invoker
.
Run
(
ref_pooling_argument
);
out_device_buf
.
FromDevice
(
out_n_c_do_ho_wo_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_do_ho_wo_device
,
out_n_c_do_ho_wo_host
);
if
constexpr
(
OutputIndex
)
{
out_indices_device_buf
.
FromDevice
(
out_indices_n_c_do_ho_wo_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_indices_n_c_do_ho_wo_device
,
out_indices_n_c_do_ho_wo_host
);
};
}
return
(
pass
);
};
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "pool3d_fwd_common.hpp"
// Element types used by this example instance.
using InDataType      = ck::half_t;
using OutDataType     = ck::half_t;
using ComputeDataType = float;
using IndexDataType   = int32_t;

// Layout tags handed to pool3d_test for the input and output tensors.
using InLayout  = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;

// Compile-time switch between the MAX and AVG reductions.
#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool OutputIndex  = false;
static constexpr bool PropagateNan = false;

/// Runs pool3d_test on one fixed problem; exit code is 0 when the test passes, 1 otherwise.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 2;
    const ck::index_t C = 32;

    // Window extents
    const ck::index_t Z = 2;
    const ck::index_t Y = 2;
    const ck::index_t X = 2;

    // Input spatial extents
    const ck::index_t Di = 30;
    const ck::index_t Hi = 30;
    const ck::index_t Wi = 30;

    const ck::index_t window_stride_d = 2;
    const ck::index_t window_stride_h = 2;
    const ck::index_t window_stride_w = 2;

    const ck::index_t in_left_pad_d = 1;
    const ck::index_t in_left_pad_h = 1;
    const ck::index_t in_left_pad_w = 1;

    const ck::index_t in_right_pad_d = 1;
    const ck::index_t in_right_pad_h = 1;
    const ck::index_t in_right_pad_w = 1;

    const bool pass = pool3d_test<InDataType,
                                  OutDataType,
                                  ComputeDataType,
                                  IndexDataType,
                                  InLayout,
                                  OutLayout,
                                  ReduceOpId,
                                  PropagateNan,
                                  OutputIndex>(do_verification,
                                               time_kernel,
                                               N,
                                               C,
                                               Z,
                                               Y,
                                               X,
                                               Di,
                                               Hi,
                                               Wi,
                                               window_stride_d,
                                               window_stride_h,
                                               window_stride_w,
                                               in_left_pad_d,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_d,
                                               in_right_pad_h,
                                               in_right_pad_w);

    if(!pass)
    {
        return 1;
    }
    return 0;
}
example/49_maxpool2d_bwd/CMakeLists.txt
0 → 100644
View file @
06d2c7b1
# One example executable per data type (bf16, fp16, fp32): target name first, source file second.
# add_example_executable is a project-provided helper defined outside this file.
add_example_executable
(
example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp
)
add_example_executable
(
example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp
)
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
// Element types for the bf16 instance; ComputeDataType is float.
using InDataType      = ck::bhalf_t;
using OutDataType     = ck::bhalf_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;
using DInDataType     = ck::bhalf_t;
using DOutDataType    = ck::bhalf_t;

static constexpr bool PropagateNan = false;

/// Runs maxpool_bwd_test on one fixed problem; exit code is 0 when the test passes, 1 otherwise.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 1;
    const ck::index_t C = 1;

    // Window extents
    const ck::index_t Y = 3;
    const ck::index_t X = 3;

    // Input spatial extents
    const ck::index_t Hi = 32;
    const ck::index_t Wi = 32;

    const ck::index_t window_stride_h = 1;
    const ck::index_t window_stride_w = 1;

    const ck::index_t in_left_pad_h = 0;
    const ck::index_t in_left_pad_w = 0;

    const ck::index_t in_right_pad_h = 0;
    const ck::index_t in_right_pad_w = 0;

    const bool pass = maxpool_bwd_test<InDataType,
                                       OutDataType,
                                       IndexDataType,
                                       ComputeDataType,
                                       DInDataType,
                                       DOutDataType,
                                       PropagateNan>(do_verification,
                                                     time_kernel,
                                                     N,
                                                     C,
                                                     Y,
                                                     X,
                                                     Hi,
                                                     Wi,
                                                     window_stride_h,
                                                     window_stride_w,
                                                     in_left_pad_h,
                                                     in_left_pad_w,
                                                     in_right_pad_h,
                                                     in_right_pad_w);

    if(!pass)
    {
        return 1;
    }
    return 0;
}
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"
template
<
typename
InDataType
,
typename
OutDataType
,
typename
IndexDataType
,
typename
ComputeDataType
,
typename
DInDataType
,
typename
DOutDataType
,
bool
PropagateNan
>
bool
maxpool_bwd_test
(
bool
do_verification
,
bool
time_kernel
,
ck
::
index_t
N
,
ck
::
index_t
C
,
ck
::
index_t
Y
,
ck
::
index_t
X
,
ck
::
index_t
Hi
,
ck
::
index_t
Wi
,
ck
::
index_t
window_stride_h
,
ck
::
index_t
window_stride_w
,
ck
::
index_t
in_left_pad_h
,
ck
::
index_t
in_left_pad_w
,
ck
::
index_t
in_right_pad_h
,
ck
::
index_t
in_right_pad_w
)
{
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DevicePoolFwdInstance
=
ck
::
tensor_operation
::
device
::
DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C
<
InDataType
,
// InDataType
OutDataType
,
// OutDataType
IndexDataType
,
// IndexDataType
ComputeDataType
,
// ComputeDataType
ck
::
ReduceTensorOp
::
MAX
,
true
,
// OutputIndex
64
,
// BlockSize
64
,
// ReduceMThreadClusterSize
1
,
// ReduceKThreadClusterSize
4
,
// ReduceMThreadSliceSize
1
,
// ReduceKThreadSliceSize
1
>
;
// InSrcOutDstVectorSize
using
DeviceMaxPoolBwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceIndexPoolBwdImpl
<
DOutDataType
,
IndexDataType
,
DInDataType
,
4
>
;
const
ck
::
index_t
Ho
=
(
Hi
+
in_left_pad_h
+
in_right_pad_h
-
Y
)
/
window_stride_h
+
1
;
const
ck
::
index_t
Wo
=
(
Wi
+
in_left_pad_w
+
in_right_pad_w
-
X
)
/
window_stride_w
+
1
;
const
std
::
vector
<
ck
::
index_t
>
window_spatial_lengths
{
Y
,
X
};
const
std
::
vector
<
ck
::
index_t
>
window_strides
{
window_stride_h
,
window_stride_w
};
const
std
::
vector
<
ck
::
index_t
>
input_left_pads
{
in_left_pad_h
,
in_left_pad_w
};
const
std
::
vector
<
ck
::
index_t
>
input_right_pads
{
in_right_pad_h
,
in_right_pad_w
};
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
N_
,
std
::
size_t
C_
,
std
::
size_t
H
,
std
::
size_t
W
)
{
using
namespace
ck
::
literals
;
// reference need Tensor with NCHW order
return
HostTensorDescriptor
({
N_
,
C_
,
H
,
W
},
{
C_
*
H
*
W
,
1
_uz
,
W
*
C_
,
C_
});
};
// in
Tensor
<
InDataType
>
in_n_c_hi_wi
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
// out
Tensor
<
OutDataType
>
out_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
OutDataType
>
out_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// indices
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_device
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
Tensor
<
IndexDataType
>
indices_n_c_ho_wo_host
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// dout
Tensor
<
DOutDataType
>
dout_n_c_ho_wo
(
f_host_tensor_descriptor
(
N
,
C
,
Ho
,
Wo
));
// din
Tensor
<
DInDataType
>
din_n_c_hi_wi_host
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
Tensor
<
DInDataType
>
din_n_c_hi_wi_device
(
f_host_tensor_descriptor
(
N
,
C
,
Hi
,
Wi
));
std
::
cout
<<
"in_n_c_hi_wi: "
<<
in_n_c_hi_wi
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out_n_c_ho_wo: "
<<
out_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"indices_n_c_ho_wo: "
<<
indices_n_c_ho_wo_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"dout_n_c_ho_wo: "
<<
dout_n_c_ho_wo
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"din_n_c_hi_wi: "
<<
din_n_c_hi_wi_host
.
mDesc
<<
std
::
endl
;
in_n_c_hi_wi
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
dout_n_c_ho_wo
.
GenerateTensorValue
(
GeneratorTensor_3
<
DOutDataType
>
{
-
1.0
,
1.0
});
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
indices_device_buf
(
sizeof
(
IndexDataType
)
*
indices_n_c_ho_wo_device
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
dout_device_buf
(
sizeof
(
DOutDataType
)
*
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
din_device_buf
(
sizeof
(
DInDataType
)
*
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in_n_c_hi_wi
.
mData
.
data
());
dout_device_buf
.
ToDevice
(
dout_n_c_ho_wo
.
mData
.
data
());
auto
pool_fwd
=
DevicePoolFwdInstance
{};
auto
pool_fwd_invoker_ptr
=
pool_fwd
.
MakeInvokerPointer
();
auto
pool_fwd_argument_ptr
=
pool_fwd
.
MakeArgumentPointer
(
static_cast
<
InDataType
*>
(
in_device_buf
.
GetDeviceBuffer
()),
static_cast
<
OutDataType
*>
(
out_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
{
N
,
C
,
Hi
,
Wi
},
window_spatial_lengths
,
{
N
,
C
,
Ho
,
Wo
},
{
C
*
Hi
*
Wi
,
1
,
Wi
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
{
C
*
Ho
*
Wo
,
1
,
Wo
*
C
,
C
},
window_strides
,
input_left_pads
,
input_right_pads
,
{
2
,
3
});
if
(
!
pool_fwd
.
IsSupportedArgument
(
pool_fwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_fwd with the specified compilation parameters does "
"not support this problem"
);
}
float
ave_time_fwd
=
pool_fwd_invoker_ptr
->
Run
(
pool_fwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
auto
pool_bwd
=
DeviceMaxPoolBwdInstance
{};
auto
pool_bwd_invoker_ptr
=
pool_bwd
.
MakeInvokerPointer
();
auto
pool_bwd_argument_ptr
=
pool_bwd
.
MakeArgumentPointer
(
static_cast
<
DOutDataType
*>
(
dout_device_buf
.
GetDeviceBuffer
()),
static_cast
<
IndexDataType
*>
(
indices_device_buf
.
GetDeviceBuffer
()),
static_cast
<
DInDataType
*>
(
din_device_buf
.
GetDeviceBuffer
()),
dout_n_c_ho_wo
.
mDesc
.
GetElementSpaceSize
(),
din_n_c_hi_wi_device
.
mDesc
.
GetElementSpaceSize
(),
window_spatial_lengths
,
window_strides
);
if
(
!
pool_bwd
.
IsSupportedArgument
(
pool_bwd_argument_ptr
.
get
()))
{
throw
std
::
runtime_error
(
"wrong! pool_bwd with the specified compilation parameters does "
"not support this problem"
);
}
size_t
pool_bwd_workspace_sz
=
pool_bwd
.
GetWorkSpaceSize
(
pool_bwd_argument_ptr
.
get
());
DeviceMem
pool_bwd_workspace_device_buf
(
pool_bwd_workspace_sz
);
pool_bwd
.
SetWorkSpacePointer
(
pool_bwd_argument_ptr
.
get
(),
pool_bwd_workspace_device_buf
.
GetDeviceBuffer
());
float
ave_time_bwd
=
pool_bwd_invoker_ptr
->
Run
(
pool_bwd_argument_ptr
.
get
(),
StreamConfig
{
nullptr
,
time_kernel
});
std
::
cout
<<
"Pool fwd perf: "
<<
ave_time_fwd
<<
" ms"
<<
std
::
endl
;
std
::
cout
<<
"Pool bwd perf: "
<<
ave_time_bwd
<<
" ms"
<<
std
::
endl
;
bool
pass
=
true
;
if
(
do_verification
)
{
using
ReferencePoolingFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferencePoolingFwd
<
4
,
2
,
InDataType
,
OutDataType
,
ComputeDataType
,
IndexDataType
,
ck
::
ReduceTensorOp
::
MAX
,
PropagateNan
,
true
>
;
auto
ref_pooling_fwd
=
ReferencePoolingFwdInstance
{};
auto
ref_pooling_fwd_invoker
=
ref_pooling_fwd
.
MakeInvoker
();
auto
ref_pooling_fwd_argument
=
ref_pooling_fwd
.
MakeArgument
(
in_n_c_hi_wi
,
out_n_c_ho_wo_host
,
indices_n_c_ho_wo_host
,
window_spatial_lengths
,
window_strides
,
input_left_pads
,
input_right_pads
);
ref_pooling_fwd_invoker
.
Run
(
ref_pooling_fwd_argument
);
using
ReferencePoolingBwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceMaxPoolBwd
<
DOutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
PassThrough
>
;
auto
ref_pooling_bwd
=
ReferencePoolingBwdInstance
{};
auto
ref_pooling_bwd_invoker
=
ref_pooling_bwd
.
MakeInvoker
();
auto
ref_pooling_bwd_argument
=
ref_pooling_bwd
.
MakeArgument
(
dout_n_c_ho_wo
,
indices_n_c_ho_wo_host
,
din_n_c_hi_wi_host
,
PassThrough
{});
ref_pooling_bwd_invoker
.
Run
(
ref_pooling_bwd_argument
);
out_device_buf
.
FromDevice
(
out_n_c_ho_wo_device
.
mData
.
data
());
indices_device_buf
.
FromDevice
(
indices_n_c_ho_wo_device
.
mData
.
data
());
din_device_buf
.
FromDevice
(
din_n_c_hi_wi_device
.
mData
.
data
());
pass
=
pass
&&
ck
::
utils
::
check_err
(
out_n_c_ho_wo_device
,
out_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
indices_n_c_ho_wo_device
,
indices_n_c_ho_wo_host
);
pass
=
pass
&&
ck
::
utils
::
check_err
(
din_n_c_hi_wi_device
,
din_n_c_hi_wi_host
);
}
return
(
pass
);
};
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
// Element types for the fp16 instance; ComputeDataType is float.
using InDataType      = ck::half_t;
using OutDataType     = ck::half_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;
using DInDataType     = ck::half_t;
using DOutDataType    = ck::half_t;

static constexpr bool PropagateNan = false;

/// Runs maxpool_bwd_test on one fixed problem; exit code is 0 when the test passes, 1 otherwise.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    // Pool shape
    const ck::index_t N = 1;
    const ck::index_t C = 1;

    // Window extents
    const ck::index_t Y = 3;
    const ck::index_t X = 3;

    // Input spatial extents
    const ck::index_t Hi = 32;
    const ck::index_t Wi = 32;

    const ck::index_t window_stride_h = 1;
    const ck::index_t window_stride_w = 1;

    const ck::index_t in_left_pad_h = 0;
    const ck::index_t in_left_pad_w = 0;

    const ck::index_t in_right_pad_h = 0;
    const ck::index_t in_right_pad_w = 0;

    const bool pass = maxpool_bwd_test<InDataType,
                                       OutDataType,
                                       IndexDataType,
                                       ComputeDataType,
                                       DInDataType,
                                       DOutDataType,
                                       PropagateNan>(do_verification,
                                                     time_kernel,
                                                     N,
                                                     C,
                                                     Y,
                                                     X,
                                                     Hi,
                                                     Wi,
                                                     window_stride_h,
                                                     window_stride_w,
                                                     in_left_pad_h,
                                                     in_left_pad_w,
                                                     in_right_pad_h,
                                                     in_right_pad_w);

    if(!pass)
    {
        return 1;
    }
    return 0;
}
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"
using
InDataType
=
float
;
using
OutDataType
=
float
;
using
IndexDataType
=
int32_t
;
using
ComputeDataType
=
float
;
using
DInDataType
=
float
;
using
DOutDataType
=
float
;
static
constexpr
bool
PropagateNan
=
false
;
int
main
()
{
bool
do_verification
=
true
;
bool
time_kernel
=
false
;
// Pool shape
ck
::
index_t
N
=
1
;
ck
::
index_t
C
=
1
;
ck
::
index_t
Y
=
2
;
ck
::
index_t
X
=
2
;
ck
::
index_t
Hi
=
32
;
ck
::
index_t
Wi
=
32
;
ck
::
index_t
window_stride_h
=
2
;
ck
::
index_t
window_stride_w
=
2
;
ck
::
index_t
in_left_pad_h
=
0
;
ck
::
index_t
in_left_pad_w
=
0
;
ck
::
index_t
in_right_pad_h
=
0
;
ck
::
index_t
in_right_pad_w
=
0
;
bool
pass
=
maxpool_bwd_test
<
InDataType
,
OutDataType
,
IndexDataType
,
ComputeDataType
,
DInDataType
,
DOutDataType
,
PropagateNan
>
(
do_verification
,
time_kernel
,
N
,
C
,
Y
,
X
,
Hi
,
Wi
,
window_stride_h
,
window_stride_w
,
in_left_pad_h
,
in_left_pad_w
,
in_right_pad_h
,
in_right_pad_w
);
return
(
pass
?
0
:
1
);
}
example/50_put_element/CMakeLists.txt
0 → 100644
View file @
06d2c7b1
# Registers the fp16 put-element example: target name first, source file second.
# add_example_executable is a project-provided helper defined outside this file.
add_example_executable
(
example_put_element_fp16 put_element_fp16.cpp
)
example/50_put_element/put_element_fp16.cpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
// Element types of the source (x), destination (y) and index tensors.
using XDataType     = ck::half_t;
using YDataType     = ck::half_t;
using IndexDataType = int32_t;

// Element-wise functor handed to the device op.
using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;

using DeviceInstance =
    ck::tensor_operation::device::DevicePutElementImpl<XDataType,      // XDataType
                                                       IndexDataType,  // IndexDataType
                                                       YDataType,      // YDataType
                                                       YElementwiseOp,
                                                       ck::InMemoryDataOperationEnum::Set,
                                                       1>;

/// Runs the put-element device op with indices[i] = i and checks the device result against a
/// host loop that performs y_host(indices(i)) = x(i). Exit code is 0 on success, 1 otherwise.
int main()
{
    const bool do_verification = true;
    const bool time_kernel     = false;

    const int N = 1024;

    Tensor<XDataType> x(HostTensorDescriptor{N, 1});
    Tensor<IndexDataType> indices(HostTensorDescriptor{N, 1});
    Tensor<YDataType> y(HostTensorDescriptor{N, 1});

    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});

    // Each index holds its own position.
    for(int pos = 0; pos < N; ++pos)
    {
        indices(pos) = pos;
    }

    DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
    DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());

    x_device_buf.ToDevice(x.mData.data());
    indices_device_buf.ToDevice(indices.mData.data());

    auto put_op      = DeviceInstance{};
    auto put_invoker = put_op.MakeInvokerPointer();
    auto put_params  = put_op.MakeArgumentPointer(
        static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
        N,
        N,
        YElementwiseOp{});

    if(!put_op.IsSupportedArgument(put_params.get()))
    {
        throw std::runtime_error("argument is not supported!");
    }

    const float elapsed = put_invoker->Run(put_params.get(), StreamConfig{nullptr, time_kernel});

    std::cout << "perf: " << elapsed << " ms" << std::endl;

    if(!do_verification)
    {
        return 0;
    }

    // Host reference: write every x element to the slot named by its index.
    Tensor<YDataType> y_host(HostTensorDescriptor{N, 1});

    for(int pos = 0; pos < N; ++pos)
    {
        const IndexDataType target = indices(pos);
        y_host(target)             = x(pos);
    }

    y_device_buf.FromDevice(y.mData.data());

    const bool pass = ck::utils::check_err(y, y_host);

    return pass ? 0 : 1;
}
include/ck/ck.hpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
@@ -31,7 +31,8 @@
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
defined(__gfx90a__) || defined(__gfx940__) // for GPU code
defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
...
...
@@ -44,7 +45,7 @@
#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
#define CK_USE_AMD_V_MAC_F32
#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) || \
defined(__gfx940__) // for GPU code
defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
// for GPU code
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8
...
...
@@ -53,15 +54,16 @@
// MFMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_MFMA
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_USE_AMD_MFMA
#endif
#if(defined(__gfx90a__) || defined(__gfx940__))
#if(defined(__gfx90a__) || defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
)
#define CK_USE_AMD_MFMA_BF16_1K_OP
#endif
#if defined(__gfx940__)
#if defined(__gfx940__)
|| defined(__gfx941__) || defined(__gfx942__)
#define CK_USE_AMD_MFMA_GFX940
#endif
...
...
@@ -84,13 +86,15 @@
// buffer atomic add: floating point
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) // for GPU code
#elif defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#else // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
#if(defined(__gfx90a__) || defined(__gfx940__)) // for GPU code
#if(defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx941__) || \
defined(__gfx942__)) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
#else
#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
...
...
@@ -169,6 +173,10 @@
// workaround: compiler issue on gfx908
#define CK_WORKAROUND_SWDEV_388832 1
// workaround: Grouped Conv2d_bwd_data fails for already implemented instance
#define CK_WORKAROUND_SWDEV_3318619 0
// flag to enable (1) or disable (0) the debugging output in some kernels
#define DEBUG_LOG 0
...
...
include/ck/host_utility/device_prop.hpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/hip_check_error.hpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/io.hpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/kernel_launch.hpp
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
2
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
...
...
include/ck/host_utility/stream_utility.hpp
0 → 100644
View file @
06d2c7b1
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <hip/hip_runtime.h>
#include "ck/stream_config.hpp"
#include "ck/host_utility/hip_check_error.hpp"
// Counts the bits set in the compute-unit (CU) mask of the stream held by `stream_config`.
//
// The mask is fetched with hipExtStreamGetCUMask and the HIP status is handed to
// hip_check_error. The return value is the total number of set bits over the whole mask buffer.
static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
{
    // Size of the mask buffer in 32-bit words: 64 * 32 = 2048 CUs at most.
    constexpr int MAX_MASK_DWORDS = 64;

    // Start from an all-zero mask so that words left untouched by the query add nothing.
    uint32_t cu_mask[MAX_MASK_DWORDS] = {};

    hip_check_error(hipExtStreamGetCUMask(stream_config.stream_id_, MAX_MASK_DWORDS, cu_mask));

    int enabled_bits = 0;

    for(int word = 0; word < MAX_MASK_DWORDS; ++word)
    {
        // Each pass clears the lowest set bit, so the inner loop runs once per set bit.
        for(uint32_t remaining = cu_mask[word]; remaining != 0; remaining &= remaining - 1)
        {
            ++enabled_bits;
        }
    }

    return enabled_bits;
}
include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp
deleted
100644 → 0
View file @
b27909a0
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {

// Backward-data convolution (NHWC input, KYXC weight, NHWK output) rewritten as GEMM, v4r1.
//
// Number of GEMMs = YTilde * XTilde
// GemmM = C
// GemmN = N * HTildeSlice * WTildeSlice
// GemmK = K * YDotSlice * XDotSlice
//
// A single call builds the descriptors of the one GEMM selected by the compile-time pair
// (IYTildeValue, IXTildeValue). K is split as K0 * K1 with K1 = GemmK1Value and K0 = K / K1.
//
// Returns make_tuple of three descriptors, in this order:
//   1. weight viewed as [GemmK0, GemmM, GemmK1]
//   2. output viewed as [GemmK0, GemmN, GemmK1]
//   3. input  viewed as [GemmM, GemmN]
template <typename... Wei,
          typename... In,
          typename... Out,
          typename ConvStrides,
          typename ConvDilations,
          typename InLeftPads,
          typename InRightPads,
          index_t IYTildeValue,
          index_t IXTildeValue,
          index_t GemmK1Value>
__host__ __device__ constexpr auto
transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk(
    const TensorDescriptor<Wei...>& wei_k_y_x_c_grid_desc,
    const TensorDescriptor<Out...>& out_n_ho_wo_k_grid_desc,
    const TensorDescriptor<In...>& in_n_hi_wi_c_grid_desc,
    const ConvStrides& conv_strides,
    const ConvDilations& conv_dilations,
    const InLeftPads& in_left_pads,
    const InRightPads& in_right_pads,
    Number<IYTildeValue>,
    Number<IXTildeValue>,
    Number<GemmK1Value>)
{
    // Compile-time indices and the compile-time arguments carried by the Number<> tags.
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto GemmK1  = Number<GemmK1Value>{};
    constexpr auto IYTilde = Number<IYTildeValue>{};
    constexpr auto IXTilde = Number<IXTildeValue>{};

    // Input lengths (dimension order N, Hi, Wi, C).
    const auto N  = in_n_hi_wi_c_grid_desc.GetLength(I0);
    const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1);
    const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2);
    const auto C  = in_n_hi_wi_c_grid_desc.GetLength(I3);

    // Output lengths (dimension order N, Ho, Wo, K).
    const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1);
    const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2);
    const auto K  = out_n_ho_wo_k_grid_desc.GetLength(I3);

    // Filter window lengths (dimension order K, Y, X, C).
    const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1);
    const auto X = wei_k_y_x_c_grid_desc.GetLength(I2);

    // Convolution parameters: element 0 is the H direction, element 1 the W direction.
    const auto ConvStrideH = conv_strides[I0];
    const auto ConvStrideW = conv_strides[I1];

    const auto ConvDilationH = conv_dilations[I0];
    const auto ConvDilationW = conv_dilations[I1];

    const auto InLeftPadH = in_left_pads[I0];
    const auto InLeftPadW = in_left_pads[I1];

    const auto InRightPadH = in_right_pads[I0];
    const auto InRightPadW = in_right_pads[I1];

    const auto StrideDilationGcdH = math::gcd(ConvStrideH, ConvDilationH);
    const auto StrideDilationGcdW = math::gcd(ConvStrideW, ConvDilationW);

    // Y is re-expressed as (YDot, YTilde) and X as (XDot, XTilde).
    const auto YTilde = ConvStrideH / StrideDilationGcdH;
    const auto XTilde = ConvStrideW / StrideDilationGcdW;

    const auto YDot = math::integer_divide_ceil(Y, YTilde);
    const auto XDot = math::integer_divide_ceil(X, XTilde);

    const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH);
    const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW);

    // Restrict HTilde / WTilde to the range that contributes to the non-padding area of the
    // input tensor.
    const auto HTildeSliceBegin = math::integer_divide_floor(
        math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH);
    const auto WTildeSliceBegin = math::integer_divide_floor(
        math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW);

    const auto HTildeSliceEnd =
        math::min(HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1);
    const auto WTildeSliceEnd =
        math::min(WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1);

    const auto HTildeSlice = HTildeSliceEnd - HTildeSliceBegin;
    const auto WTildeSlice = WTildeSliceEnd - WTildeSliceBegin;

    // GemmK differs from one GEMM to the next because it depends on (IYTilde, IXTilde).
    const auto YDotSlice = math::integer_divide_ceil(Y - IYTilde, YTilde);
    const auto XDotSlice = math::integer_divide_ceil(X - IXTilde, XTilde);

    const auto K1 = GemmK1;
    const auto K0 = K / K1;

    // ---------------------------------------------------------------------------------------
    // Weight tensor
    // ---------------------------------------------------------------------------------------

    // [K, Y, X, C] -> [K, YDot, YTilde, XDot, XTilde, C]
    const auto wei_k_ydot_ytilde_xdot_xtilde_c_desc = transform_tensor_descriptor(
        wei_k_y_x_c_grid_desc,
        make_tuple(make_pass_through_transform(K),
                   make_embed_transform(make_tuple(YDot, YTilde),
                                        make_tuple(ConvStrideH / StrideDilationGcdH, I1)),
                   make_embed_transform(make_tuple(XDot, XTilde),
                                        make_tuple(ConvStrideW / StrideDilationGcdW, I1)),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // Split K into (K0, K1), slice YDot / XDot to this GEMM's extent, and freeze YTilde / XTilde
    // at (IYTilde, IXTilde): -> [K0, K1, YDotSlice, XDotSlice, C]
    const auto wei_k0_k1_ydotslice_xdotslice_c_desc = transform_tensor_descriptor(
        wei_k_ydot_ytilde_xdot_xtilde_c_desc,
        make_tuple(make_unmerge_transform(make_tuple(K0, K1)),
                   make_slice_transform(YDot, I0, YDotSlice),
                   make_slice_transform(XDot, I0, XDotSlice),
                   make_freeze_transform(IYTilde),
                   make_freeze_transform(IXTilde),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<3>{},
                   Sequence<2>{},
                   Sequence<4>{},
                   Sequence<5>{}),
        make_tuple(Sequence<0, 1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<>{},
                   Sequence<>{},
                   Sequence<4>{}));

    // -> [GemmK0, GemmM, GemmK1]. The enabled branch merges in (YDotSlice, XDotSlice, K0) order;
    // the disabled branch keeps the alternative (K0, YDotSlice, XDotSlice) order.
#if 1
    const auto wei_gemmk0_gemmm_gemmk1_desc = transform_tensor_descriptor(
        wei_k0_k1_ydotslice_xdotslice_c_desc,
        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
                   make_pass_through_transform(C),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#else
    const auto wei_gemmk0_gemmm_gemmk1_desc = transform_tensor_descriptor(
        wei_k0_k1_ydotslice_xdotslice_c_desc,
        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
                   make_pass_through_transform(C),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#endif

    // ---------------------------------------------------------------------------------------
    // Output tensor
    // ---------------------------------------------------------------------------------------

    // Pad transforms of width zero on Ho and Wo; per the original note, this adds the padding
    // check to the descriptor.
    const auto out_n_hop_wop_k_desc = transform_tensor_descriptor(
        out_n_ho_wo_k_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_pad_transform(Ho, I0, I0),
                   make_pad_transform(Wo, I0, I0),
                   make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // [N, Hop, Wop, K] -> [N, YDot, HTilde, XDot, WTilde, K]
    const auto out_n_ydot_htilde_xdot_wtilde_k_desc = transform_tensor_descriptor(
        out_n_hop_wop_k_desc,
        make_tuple(make_pass_through_transform(N),
                   make_embed_transform(make_tuple(YDot, HTilde),
                                        make_tuple(-ConvDilationH / StrideDilationGcdH, I1)),
                   make_embed_transform(make_tuple(XDot, WTilde),
                                        make_tuple(-ConvDilationW / StrideDilationGcdW, I1)),
                   make_pass_through_transform(K)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // Slice every spatial dimension and split K:
    // -> [N, YDotSlice, HTildeSlice, XDotSlice, WTildeSlice, K0, K1]
    const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_desc =
        transform_tensor_descriptor(
            out_n_ydot_htilde_xdot_wtilde_k_desc,
            make_tuple(make_pass_through_transform(N),
                       make_slice_transform(YDot, I0, YDotSlice),
                       make_slice_transform(HTilde, HTildeSliceBegin, HTildeSlice),
                       make_slice_transform(XDot, I0, XDotSlice),
                       make_slice_transform(WTilde, WTildeSliceBegin, WTildeSlice),
                       make_unmerge_transform(make_tuple(K0, K1))),
            make_tuple(Sequence<0>{},
                       Sequence<1>{},
                       Sequence<2>{},
                       Sequence<3>{},
                       Sequence<4>{},
                       Sequence<5>{}),
            make_tuple(Sequence<0>{},
                       Sequence<1>{},
                       Sequence<2>{},
                       Sequence<3>{},
                       Sequence<4>{},
                       Sequence<5, 6>{}));

    // -> [GemmK0, GemmN, GemmK1], with the same choice of merge order as for the weight.
#if 1
    const auto out_gemmk0_gemmn_gemmk1_desc = transform_tensor_descriptor(
        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_desc,
        make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#else
    const auto out_gemmk0_gemmn_gemmk1_desc = transform_tensor_descriptor(
        out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_desc,
        make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)),
                   make_pass_through_transform(K1)),
        make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
#endif

    // ---------------------------------------------------------------------------------------
    // Input tensor
    // ---------------------------------------------------------------------------------------

    // Apply the left / right input padding to Hi and Wi.
    const auto in_n_hip_wip_c_desc = transform_tensor_descriptor(
        in_n_hi_wi_c_grid_desc,
        make_tuple(make_pass_through_transform(N),
                   make_pad_transform(Hi, InLeftPadH, InRightPadH),
                   make_pad_transform(Wi, InLeftPadW, InRightPadW),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));

    // [N, Hip, Wip, C] -> [N, YTilde, HTilde, XTilde, WTilde, C]
    const auto in_n_ytilde_htilde_xtilde_wtilde_c_desc = transform_tensor_descriptor(
        in_n_hip_wip_c_desc,
        make_tuple(make_pass_through_transform(N),
                   make_embed_transform(make_tuple(YTilde, HTilde),
                                        make_tuple(ConvDilationH, ConvStrideH)),
                   make_embed_transform(make_tuple(XTilde, WTilde),
                                        make_tuple(ConvDilationW, ConvStrideW)),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
        make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}));

    // Freeze YTilde / XTilde at (IYTilde, IXTilde) and slice HTilde / WTilde:
    // -> [N, HTildeSlice, WTildeSlice, C]
    const auto in_n_htildeslice_wtildeslice_c_desc = transform_tensor_descriptor(
        in_n_ytilde_htilde_xtilde_wtilde_c_desc,
        make_tuple(make_pass_through_transform(N),
                   make_freeze_transform(IYTilde),
                   make_slice_transform(HTilde, HTildeSliceBegin, HTildeSlice),
                   make_freeze_transform(IXTilde),
                   make_slice_transform(WTilde, WTildeSliceBegin, WTildeSlice),
                   make_pass_through_transform(C)),
        make_tuple(Sequence<0>{},
                   Sequence<1>{},
                   Sequence<2>{},
                   Sequence<3>{},
                   Sequence<4>{},
                   Sequence<5>{}),
        make_tuple(Sequence<0>{},
                   Sequence<>{},
                   Sequence<1>{},
                   Sequence<>{},
                   Sequence<2>{},
                   Sequence<3>{}));

    // -> [GemmM, GemmN] = [C, N * HTildeSlice * WTildeSlice]
    const auto in_gemmm_gemmn_desc = transform_tensor_descriptor(
        in_n_htildeslice_wtildeslice_c_desc,
        make_tuple(make_pass_through_transform(C),
                   make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))),
        make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}),
        make_tuple(Sequence<0>{}, Sequence<1>{}));

    return make_tuple(
        wei_gemmk0_gemmm_gemmk1_desc, out_gemmk0_gemmn_gemmk1_desc, in_gemmm_gemmn_desc);
}

} // namespace ck
),
make_freeze_transform
(
IXTilde
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
3
>
{},
Sequence
<
2
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<>
{},
Sequence
<>
{},
Sequence
<
4
>
{}));
#if 1
const
auto
wei_gemmk0_gemmm_gemmk1_grid_desc
=
transform_tensor_descriptor
(
wei_k0_k1_ydotslice_xdotslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K0
)),
make_pass_through_transform
(
C
),
make_pass_through_transform
(
K1
)),
make_tuple
(
Sequence
<
2
,
3
,
0
>
{},
Sequence
<
4
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
#else
const
auto
wei_gemmk0_gemmm_gemmk1_grid_desc
=
transform_tensor_descriptor
(
wei_k0_k1_ydotslice_xdotslice_c_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
YDotSlice
,
XDotSlice
)),
make_pass_through_transform
(
C
),
make_pass_through_transform
(
K1
)),
make_tuple
(
Sequence
<
0
,
2
,
3
>
{},
Sequence
<
4
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
#endif
// output tensor
// this add padding check
const
auto
out_n_hop_wop_k_grid_desc
=
transform_tensor_descriptor
(
out_n_ho_wo_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Ho
,
I0
,
I0
),
make_pad_transform
(
Wo
,
I0
,
I0
),
make_pass_through_transform
(
K
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
=
transform_tensor_descriptor
(
out_n_hop_wop_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
YDot
,
HTilde
),
make_tuple
(
-
ConvDilationH
/
GcdStrideDilationH
,
I1
)),
make_embed_transform
(
make_tuple
(
XDot
,
WTilde
),
make_tuple
(
-
ConvDilationW
/
GcdStrideDilationW
,
I1
)),
make_pass_through_transform
(
K
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc
=
transform_tensor_descriptor
(
out_n_ydot_htilde_xdot_wtilde_k_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_slice_transform
(
YDot
,
I0
,
YDotSlice
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_slice_transform
(
XDot
,
I0
,
XDotSlice
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_unmerge_transform
(
make_tuple
(
K0
,
K1
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
,
6
>
{}));
#if 1
const
auto
out_gemmk0_gemmn_gemmk1_grid_desc
=
transform_tensor_descriptor
(
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
YDotSlice
,
XDotSlice
,
K0
)),
make_merge_transform
(
make_tuple
(
N
,
HTildeSlice
,
WTildeSlice
)),
make_pass_through_transform
(
K1
)),
make_tuple
(
Sequence
<
1
,
3
,
5
>
{},
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
6
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
#else
const
auto
out_gemmk0_gemmn_gemmk1_grid_desc
=
transform_tensor_descriptor
(
out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc
,
make_tuple
(
make_merge_transform
(
make_tuple
(
K0
,
YDotSlice
,
XDotSlice
)),
make_merge_transform
(
make_tuple
(
N
,
HTildeSlice
,
WTildeSlice
)),
make_pass_through_transform
(
K1
)),
make_tuple
(
Sequence
<
5
,
1
,
3
>
{},
Sequence
<
0
,
2
,
4
>
{},
Sequence
<
6
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}));
#endif
// input tensor
const
auto
in_n_hip_wip_c_grid_desc
=
transform_tensor_descriptor
(
in_n_hi_wi_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_pad_transform
(
Hi
,
InLeftPadH
,
InRightPadH
),
make_pad_transform
(
Wi
,
InLeftPadW
,
InRightPadW
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
=
transform_tensor_descriptor
(
in_n_hip_wip_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_embed_transform
(
make_tuple
(
YTilde
,
HTilde
),
make_tuple
(
ConvDilationH
,
ConvStrideH
)),
make_embed_transform
(
make_tuple
(
XTilde
,
WTilde
),
make_tuple
(
ConvDilationW
,
ConvStrideW
)),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
2
>
{},
Sequence
<
3
,
4
>
{},
Sequence
<
5
>
{}));
const
auto
in_n_htildeslice_wtildeslice_c_grid_desc
=
transform_tensor_descriptor
(
in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
N
),
make_freeze_transform
(
IYTilde
),
make_slice_transform
(
HTilde
,
IHTildeSliceBegin
,
HTildeSlice
),
make_freeze_transform
(
IXTilde
),
make_slice_transform
(
WTilde
,
IWTildeSliceBegin
,
WTildeSlice
),
make_pass_through_transform
(
C
)),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{},
Sequence
<
4
>
{},
Sequence
<
5
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<>
{},
Sequence
<
1
>
{},
Sequence
<>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}));
const
auto
in_gemmm_gemmn_grid_desc
=
transform_tensor_descriptor
(
in_n_htildeslice_wtildeslice_c_grid_desc
,
make_tuple
(
make_pass_through_transform
(
C
),
make_merge_transform
(
make_tuple
(
N
,
HTildeSlice
,
WTildeSlice
))),
make_tuple
(
Sequence
<
3
>
{},
Sequence
<
0
,
1
,
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}));
return
make_tuple
(
wei_gemmk0_gemmm_gemmk1_grid_desc
,
out_gemmk0_gemmn_gemmk1_grid_desc
,
in_gemmm_gemmn_grid_desc
);
}
}
// namespace ck
#endif
Prev
1
…
10
11
12
13
14
15
16
17
18
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment