gaoqiong / composable_kernel_ROCM

Commit 4100d1d8, authored Aug 23, 2023 by Alan Turner

    Merge remote-tracking branch 'origin/develop' into migx-flash-attn

Parents: 48717006, c8a8385f
Changes: 609 files; this page shows 20 changed files with 1490 additions and 30 deletions (+1490 -30)
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp                         +39  -18
example/49_maxpool2d_bwd/CMakeLists.txt                           +9   -0
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp                   +66  -0
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp                 +228 -0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp                   +66  -0
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp                   +66  -0
example/50_put_element/CMakeLists.txt                             +3   -0
example/50_put_element/put_element_fp16.cpp                       +88  -0
example/51_avgpool3d_bwd/CMakeLists.txt                           +3   -0
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp                   +62  -0
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp                 +147 -0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp                   +62  -0
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp                   +62  -0
include/ck/ck.hpp                                                 +31  -2
include/ck/config.h.in                                            +102 -0
include/ck/host_utility/device_prop.hpp                           +7   -0
include/ck/host_utility/kernel_launch.hpp                         +69  -0
include/ck/host_utility/stream_utility.hpp                        +1   -1
include/ck/tensor_description/multi_index_transform.hpp           +9   -9
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp  +370 -0
example/48_pool3d_fwd/pool3d_fwd_fp16.cpp

@@ -27,31 +27,49 @@ static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
 static constexpr bool OutputIndex  = false;
 static constexpr bool PropagateNan = false;
 
+using DevicePoolFwdInstance =
+    ck::tensor_operation::device::DevicePool3dFwd_NDHWC_NDHWC<InDataType,
+                                                              OutDataType,
+                                                              IndexDataType,
+                                                              ComputeDataType,
+                                                              ReduceOpId,
+                                                              OutputIndex,
+                                                              64, // BlockSize
+                                                              64, // ReduceMThreadClusterSize
+                                                              1,  // ReduceKThreadClusterSize
+                                                              1,  // ReduceMThreadSliceSize
+                                                              1,  // ReduceKThreadSliceSize
+                                                              1>; // InSrcOutDstVectorSize
+
 int main()
 {
     bool do_verification = true;
     bool time_kernel     = false;
 
     // Pool shape
     ck::index_t N  = 2;
     ck::index_t C  = 32;
     ck::index_t Z  = 2;
     ck::index_t Y  = 2;
     ck::index_t X  = 2;
     ck::index_t Di = 30;
     ck::index_t Hi = 30;
     ck::index_t Wi = 30;
     ck::index_t window_stride_d = 2;
     ck::index_t window_stride_h = 2;
     ck::index_t window_stride_w = 2;
-    ck::index_t in_left_pad_d  = 1;
-    ck::index_t in_left_pad_h  = 1;
-    ck::index_t in_left_pad_w  = 1;
-    ck::index_t in_right_pad_d = 1;
-    ck::index_t in_right_pad_h = 1;
-    ck::index_t in_right_pad_w = 1;
+    ck::index_t window_dilation_d = 1;
+    ck::index_t window_dilation_h = 1;
+    ck::index_t window_dilation_w = 1;
+    ck::index_t in_left_pad_d     = 1;
+    ck::index_t in_left_pad_h     = 1;
+    ck::index_t in_left_pad_w     = 1;
+    ck::index_t in_right_pad_d    = 1;
+    ck::index_t in_right_pad_h    = 1;
+    ck::index_t in_right_pad_w    = 1;
 
-    bool pass = pool3d_test<InDataType,
+    bool pass = pool3d_test<DevicePoolFwdInstance,
+                            InDataType,
                             OutDataType,
                             ComputeDataType,
                             IndexDataType,
...

@@ -72,6 +90,9 @@ int main()
                             window_stride_d,
                             window_stride_h,
                             window_stride_w,
+                            window_dilation_d,
+                            window_dilation_h,
+                            window_dilation_w,
                             in_left_pad_d,
                             in_left_pad_h,
                             in_left_pad_w,
...
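For orientation, the new window_dilation_{d,h,w} arguments enter the output-extent arithmetic the same way as in the pooling helpers added elsewhere in this commit: a window of length X with dilation d spans an effective extent of (X - 1) * d + 1 input elements. For this example's depth dimension (Di = 30, window 2, dilation 1, stride 2, pads 1 + 1) that gives Do = (30 + 1 + 1 - 2) / 2 + 1 = 16.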
example/49_maxpool2d_bwd/CMakeLists.txt (new file, mode 100644)

if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
    add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp)
endif()
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
    add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp)
endif()
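These guards key off the same DTYPES cache variable that this commit threads into include/ck/config.h.in. As a hedged usage sketch (standard CMake syntax; the source path is assumed), configuring with

    cmake -D DTYPES=fp16 <path-to-source>

would build only example_maxpool2d_bwd_fp16, while leaving DTYPES undefined configures all three executables.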
example/49_maxpool2d_bwd/maxpool2d_bwd_bf16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"

using InDataType      = ck::bhalf_t;
using OutDataType     = ck::bhalf_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;
using DInDataType     = ck::bhalf_t;
using DOutDataType    = ck::bhalf_t;

static constexpr bool PropagateNan = false;

int main()
{
    bool do_verification = true;
    bool time_kernel     = false;

    // Pool shape
    ck::index_t N                 = 1;
    ck::index_t C                 = 1;
    ck::index_t Y                 = 3;
    ck::index_t X                 = 3;
    ck::index_t Hi                = 32;
    ck::index_t Wi                = 32;
    ck::index_t window_stride_h   = 1;
    ck::index_t window_stride_w   = 1;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_h     = 0;
    ck::index_t in_left_pad_w     = 0;
    ck::index_t in_right_pad_h    = 0;
    ck::index_t in_right_pad_w    = 0;

    bool pass = maxpool_bwd_test<InDataType,
                                 OutDataType,
                                 IndexDataType,
                                 ComputeDataType,
                                 DInDataType,
                                 DOutDataType,
                                 PropagateNan>(do_verification,
                                               time_kernel,
                                               N,
                                               C,
                                               Y,
                                               X,
                                               Hi,
                                               Wi,
                                               window_stride_h,
                                               window_stride_w,
                                               window_dilation_h,
                                               window_dilation_w,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_h,
                                               in_right_pad_w);

    return (pass ? 0 : 1);
}
example/49_maxpool2d_bwd/maxpool2d_bwd_common.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>

#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_index_pool_bwd_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_pool_fwd.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_maxpool_bwd.hpp"

template <typename InDataType,
          typename OutDataType,
          typename IndexDataType,
          typename ComputeDataType,
          typename DInDataType,
          typename DOutDataType,
          bool PropagateNan>
bool maxpool_bwd_test(bool do_verification,
                      bool time_kernel,
                      ck::index_t N,
                      ck::index_t C,
                      ck::index_t Y,
                      ck::index_t X,
                      ck::index_t Hi,
                      ck::index_t Wi,
                      ck::index_t window_stride_h,
                      ck::index_t window_stride_w,
                      ck::index_t window_dilation_h,
                      ck::index_t window_dilation_w,
                      ck::index_t in_left_pad_h,
                      ck::index_t in_left_pad_w,
                      ck::index_t in_right_pad_h,
                      ck::index_t in_right_pad_w)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DevicePoolFwdInstance =
        ck::tensor_operation::device::DevicePool2dFwd_NHWC_NHWC<InDataType,      // InDataType
                                                                OutDataType,     // OutDataType
                                                                IndexDataType,   // IndexDataType
                                                                ComputeDataType, // ComputeDataType
                                                                ck::ReduceTensorOp::MAX,
                                                                true,
                                                                64, // BlockSize
                                                                64, // ReduceMThreadClusterSize
                                                                1,  // ReduceKThreadClusterSize
                                                                4,  // ReduceMThreadSliceSize
                                                                1,  // ReduceKThreadSliceSize
                                                                1>; // InSrcOutDstVectorSize

    using DeviceMaxPoolBwdInstance =
        ck::tensor_operation::device::DeviceIndexPoolBwdImpl<DOutDataType,
                                                             IndexDataType,
                                                             DInDataType,
                                                             4>;

    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;

    const std::vector<ck::index_t> window_spatial_lengths{Y, X};
    const std::vector<ck::index_t> window_strides{window_stride_h, window_stride_w};
    const std::vector<ck::index_t> window_dilations{window_dilation_h, window_dilation_w};
    const std::vector<ck::index_t> input_left_pads{in_left_pad_h, in_left_pad_w};
    const std::vector<ck::index_t> input_right_pads{in_right_pad_h, in_right_pad_w};

    auto f_host_tensor_descriptor =
        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W) {
            using namespace ck::literals;
            // the reference op needs Tensors in NCHW order
            return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
        };

    // in
    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi));
    // out
    Tensor<OutDataType> out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<OutDataType> out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
    // indices
    Tensor<IndexDataType> indices_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo));
    Tensor<IndexDataType> indices_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo));
    // dout
    Tensor<DOutDataType> dout_n_c_ho_wo(f_host_tensor_descriptor(N, C, Ho, Wo));
    // din
    Tensor<DInDataType> din_n_c_hi_wi_host(f_host_tensor_descriptor(N, C, Hi, Wi));
    Tensor<DInDataType> din_n_c_hi_wi_device(f_host_tensor_descriptor(N, C, Hi, Wi));

    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
    std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
    std::cout << "indices_n_c_ho_wo: " << indices_n_c_ho_wo_host.mDesc << std::endl;
    std::cout << "dout_n_c_ho_wo: " << dout_n_c_ho_wo.mDesc << std::endl;
    std::cout << "din_n_c_hi_wi: " << din_n_c_hi_wi_host.mDesc << std::endl;

    in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
    dout_n_c_ho_wo.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{-1.0, 1.0});

    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
    DeviceMem out_device_buf(sizeof(OutDataType) *
                             out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
    DeviceMem indices_device_buf(sizeof(IndexDataType) *
                                 indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout_n_c_ho_wo.mDesc.GetElementSpaceSize());
    DeviceMem din_device_buf(sizeof(DInDataType) *
                             din_n_c_hi_wi_device.mDesc.GetElementSpaceSize());

    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
    dout_device_buf.ToDevice(dout_n_c_ho_wo.mData.data());

    auto pool_fwd              = DevicePoolFwdInstance{};
    auto pool_fwd_invoker_ptr  = pool_fwd.MakeInvokerPointer();
    auto pool_fwd_argument_ptr = pool_fwd.MakeArgumentPointer(
        static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        {N, C, Hi, Wi},
        window_spatial_lengths,
        {N, C, Ho, Wo},
        {C * Hi * Wi, 1, Wi * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
        {C * Ho * Wo, 1, Wo * C, C},
        window_strides,
        window_dilations,
        input_left_pads,
        input_right_pads,
        {2, 3});

    if(!pool_fwd.IsSupportedArgument(pool_fwd_argument_ptr.get()))
    {
        throw std::runtime_error(
            "wrong! pool_fwd with the specified compilation parameters does "
            "not support this problem");
    }

    float ave_time_fwd = pool_fwd_invoker_ptr->Run(pool_fwd_argument_ptr.get(),
                                                   StreamConfig{nullptr, time_kernel});

    auto pool_bwd              = DeviceMaxPoolBwdInstance{};
    auto pool_bwd_invoker_ptr  = pool_bwd.MakeInvokerPointer();
    auto pool_bwd_argument_ptr = pool_bwd.MakeArgumentPointer(
        static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
        dout_n_c_ho_wo.mDesc.GetElementSpaceSize(),
        din_n_c_hi_wi_device.mDesc.GetElementSpaceSize(),
        window_spatial_lengths,
        window_strides);

    if(!pool_bwd.IsSupportedArgument(pool_bwd_argument_ptr.get()))
    {
        throw std::runtime_error(
            "wrong! pool_bwd with the specified compilation parameters does "
            "not support this problem");
    }

    size_t pool_bwd_workspace_sz = pool_bwd.GetWorkSpaceSize(pool_bwd_argument_ptr.get());
    DeviceMem pool_bwd_workspace_device_buf(pool_bwd_workspace_sz);
    pool_bwd.SetWorkSpacePointer(pool_bwd_argument_ptr.get(),
                                 pool_bwd_workspace_device_buf.GetDeviceBuffer());

    float ave_time_bwd = pool_bwd_invoker_ptr->Run(pool_bwd_argument_ptr.get(),
                                                   StreamConfig{nullptr, time_kernel});

    std::cout << "Pool fwd perf: " << ave_time_fwd << " ms" << std::endl;
    std::cout << "Pool bwd perf: " << ave_time_bwd << " ms" << std::endl;

    bool pass = true;

    if(do_verification)
    {
        using ReferencePoolingFwdInstance =
            ck::tensor_operation::host::ReferencePoolingFwd<4,
                                                            2,
                                                            InDataType,
                                                            OutDataType,
                                                            ComputeDataType,
                                                            IndexDataType,
                                                            ck::ReduceTensorOp::MAX,
                                                            PropagateNan,
                                                            true>;

        auto ref_pooling_fwd          = ReferencePoolingFwdInstance{};
        auto ref_pooling_fwd_invoker  = ref_pooling_fwd.MakeInvoker();
        auto ref_pooling_fwd_argument = ref_pooling_fwd.MakeArgument(in_n_c_hi_wi,
                                                                     out_n_c_ho_wo_host,
                                                                     indices_n_c_ho_wo_host,
                                                                     window_spatial_lengths,
                                                                     window_strides,
                                                                     window_dilations,
                                                                     input_left_pads,
                                                                     input_right_pads);
        ref_pooling_fwd_invoker.Run(ref_pooling_fwd_argument);

        using ReferencePoolingBwdInstance =
            ck::tensor_operation::host::ReferenceMaxPoolBwd<DOutDataType,
                                                            IndexDataType,
                                                            ComputeDataType,
                                                            DInDataType,
                                                            PassThrough>;

        auto ref_pooling_bwd          = ReferencePoolingBwdInstance{};
        auto ref_pooling_bwd_invoker  = ref_pooling_bwd.MakeInvoker();
        auto ref_pooling_bwd_argument = ref_pooling_bwd.MakeArgument(
            dout_n_c_ho_wo, indices_n_c_ho_wo_host, din_n_c_hi_wi_host, PassThrough{});
        ref_pooling_bwd_invoker.Run(ref_pooling_bwd_argument);

        out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
        indices_device_buf.FromDevice(indices_n_c_ho_wo_device.mData.data());
        din_device_buf.FromDevice(din_n_c_hi_wi_device.mData.data());

        pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
        pass = pass && ck::utils::check_err(indices_n_c_ho_wo_device, indices_n_c_ho_wo_host);
        pass = pass && ck::utils::check_err(din_n_c_hi_wi_device, din_n_c_hi_wi_host);
    }

    return (pass);
};
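The Ys/Xs and Ho/Wo lines above are the standard dilated-pooling output-shape formula. A minimal standalone sketch of the same arithmetic, plugging in the values from this commit's bf16/fp16 examples (32x32 input, 3x3 window, stride 1, dilation 1, no padding):

#include <cassert>

int main()
{
    // Effective (dilated) window extent, as computed in maxpool_bwd_test above.
    const int Y = 3, window_dilation_h = 1;
    const int Ys = (Y - 1) * window_dilation_h + 1; // 3

    // Output height for Hi = 32 with no padding and stride 1.
    const int Hi = 32, in_left_pad_h = 0, in_right_pad_h = 0, window_stride_h = 1;
    const int Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1; // 30

    assert(Ys == 3 && Ho == 30);
    return 0;
}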
example/49_maxpool2d_bwd/maxpool2d_bwd_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"

using InDataType      = ck::half_t;
using OutDataType     = ck::half_t;
using IndexDataType   = int32_t;
using ComputeDataType = float;
using DInDataType     = ck::half_t;
using DOutDataType    = ck::half_t;

static constexpr bool PropagateNan = false;

int main()
{
    bool do_verification = true;
    bool time_kernel     = false;

    // Pool shape
    ck::index_t N                 = 1;
    ck::index_t C                 = 1;
    ck::index_t Y                 = 3;
    ck::index_t X                 = 3;
    ck::index_t Hi                = 32;
    ck::index_t Wi                = 32;
    ck::index_t window_stride_h   = 1;
    ck::index_t window_stride_w   = 1;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_h     = 0;
    ck::index_t in_left_pad_w     = 0;
    ck::index_t in_right_pad_h    = 0;
    ck::index_t in_right_pad_w    = 0;

    bool pass = maxpool_bwd_test<InDataType,
                                 OutDataType,
                                 IndexDataType,
                                 ComputeDataType,
                                 DInDataType,
                                 DOutDataType,
                                 PropagateNan>(do_verification,
                                               time_kernel,
                                               N,
                                               C,
                                               Y,
                                               X,
                                               Hi,
                                               Wi,
                                               window_stride_h,
                                               window_stride_w,
                                               window_dilation_h,
                                               window_dilation_w,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_h,
                                               in_right_pad_w);

    return (pass ? 0 : 1);
}
example/49_maxpool2d_bwd/maxpool2d_bwd_fp32.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "maxpool2d_bwd_common.hpp"

using InDataType      = float;
using OutDataType     = float;
using IndexDataType   = int32_t;
using ComputeDataType = float;
using DInDataType     = float;
using DOutDataType    = float;

static constexpr bool PropagateNan = false;

int main()
{
    bool do_verification = true;
    bool time_kernel     = false;

    // Pool shape
    ck::index_t N                 = 1;
    ck::index_t C                 = 1;
    ck::index_t Y                 = 2;
    ck::index_t X                 = 2;
    ck::index_t Hi                = 32;
    ck::index_t Wi                = 32;
    ck::index_t window_stride_h   = 2;
    ck::index_t window_stride_w   = 2;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_h     = 0;
    ck::index_t in_left_pad_w     = 0;
    ck::index_t in_right_pad_h    = 0;
    ck::index_t in_right_pad_w    = 0;

    bool pass = maxpool_bwd_test<InDataType,
                                 OutDataType,
                                 IndexDataType,
                                 ComputeDataType,
                                 DInDataType,
                                 DOutDataType,
                                 PropagateNan>(do_verification,
                                               time_kernel,
                                               N,
                                               C,
                                               Y,
                                               X,
                                               Hi,
                                               Wi,
                                               window_stride_h,
                                               window_stride_w,
                                               window_dilation_h,
                                               window_dilation_w,
                                               in_left_pad_h,
                                               in_left_pad_w,
                                               in_right_pad_h,
                                               in_right_pad_w);

    return (pass ? 0 : 1);
}
example/50_put_element/CMakeLists.txt (new file, mode 100644)

if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
    add_example_executable(example_put_element_fp16 put_element_fp16.cpp)
endif()
example/50_put_element/put_element_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_put_element_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

using XDataType     = ck::half_t;
using YDataType     = ck::half_t;
using IndexDataType = int32_t;

using YElementwiseOp = ck::tensor_operation::element_wise::PassThrough;

using DeviceInstance =
    ck::tensor_operation::device::DevicePutElementImpl<XDataType,     // XDataType
                                                       IndexDataType, // IndexDataType
                                                       YDataType,     // YDataType
                                                       YElementwiseOp,
                                                       ck::InMemoryDataOperationEnum::Set,
                                                       1>;

int main()
{
    bool do_verification = true;
    bool time_kernel     = false;

    int N = 1024;

    Tensor<XDataType> x(HostTensorDescriptor{N});
    Tensor<IndexDataType> indices(HostTensorDescriptor{N});
    Tensor<YDataType> y(HostTensorDescriptor{N});

    x.GenerateTensorValue(GeneratorTensor_3<XDataType>{-1.0, 1.0});

    for(int i = 0; i < N; ++i)
        indices(i) = i;

    DeviceMem x_device_buf(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem y_device_buf(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());
    DeviceMem indices_device_buf(sizeof(IndexDataType) * indices.mDesc.GetElementSpaceSize());

    x_device_buf.ToDevice(x.mData.data());
    indices_device_buf.ToDevice(indices.mData.data());

    auto put_instance     = DeviceInstance{};
    auto put_invoker_ptr  = put_instance.MakeInvokerPointer();
    auto put_argument_ptr = put_instance.MakeArgumentPointer(
        static_cast<XDataType*>(x_device_buf.GetDeviceBuffer()),
        static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
        static_cast<YDataType*>(y_device_buf.GetDeviceBuffer()),
        N,
        N,
        YElementwiseOp{});

    if(!put_instance.IsSupportedArgument(put_argument_ptr.get()))
    {
        throw std::runtime_error("argument is not supported!");
    }

    float ave_time =
        put_invoker_ptr->Run(put_argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    std::cout << "perf: " << ave_time << " ms" << std::endl;

    bool pass = true;

    if(do_verification)
    {
        Tensor<YDataType> y_host(HostTensorDescriptor{N});

        for(int i = 0; i < N; ++i)
        {
            IndexDataType idx = indices(i);
            y_host(idx)       = x(i);
        }

        y_device_buf.FromDevice(y.mData.data());

        pass = ck::utils::check_err(y, y_host);
    }

    return (pass ? 0 : 1);
}
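The host verification loop above is the operator's whole contract: a scatter, y[indices[i]] = x[i], with InMemoryDataOperationEnum::Set write semantics. A standalone sketch of that contract in plain C++ (the values are illustrative, not from the example):

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
    std::array<float, 4> x{0.5f, -0.25f, 1.0f, 0.75f};
    std::array<int, 4> indices{2, 0, 3, 1}; // a permutation: no duplicate targets
    std::array<float, 4> y{};

    // Scatter with "Set" semantics: each value is written, not accumulated.
    for(std::size_t i = 0; i < x.size(); ++i)
        y[static_cast<std::size_t>(indices[i])] = x[i];

    assert(y[2] == 0.5f && y[0] == -0.25f && y[3] == 1.0f && y[1] == 0.75f);
    return 0;
}

If indices contained duplicates, the surviving value would depend on write order (presumably unspecified on the device side); the example sidesteps this by filling indices(i) = i, so every write target is unique.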
example/51_avgpool3d_bwd/CMakeLists.txt (new file, mode 100644)

add_example_executable(example_avgpool3d_bwd_bf16 avgpool3d_bwd_bf16.cpp)
add_example_executable(example_avgpool3d_bwd_fp16 avgpool3d_bwd_fp16.cpp)
add_example_executable(example_avgpool3d_bwd_fp32 avgpool3d_bwd_fp32.cpp)
example/51_avgpool3d_bwd/avgpool3d_bwd_bf16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"

#include "avgpool3d_bwd_common.hpp"

using DOutDataType    = ck::bhalf_t;
using DInDataType     = ck::bhalf_t;
using ComputeDataType = float;

#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize

int main()
{
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
        true,
        false,
        N,
        C,
        Di,
        Hi,
        Wi,
        window_lengths,
        window_strides,
        window_dilations,
        dinput_left_pads,
        dinput_right_pads);
}
example/51_avgpool3d_bwd/avgpool3d_bwd_common.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_avgpool_bwd.hpp"

template <typename TensorLayout>
std::vector<ck::index_t> f_tensor_strides_ncdhw(ck::index_t N_,
                                                ck::index_t C_,
                                                ck::index_t D,
                                                ck::index_t H,
                                                ck::index_t W,
                                                TensorLayout layout)
{
    using namespace ck::literals;
    (void)N_;
    if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
        return {C_ * D * H * W, D * H * W, H * W, W, 1_uz};
    else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
        return {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_};
};

template <typename TensorLayout>
HostTensorDescriptor f_host_tensor_descriptor(std::size_t N_,
                                              std::size_t C_,
                                              std::size_t D,
                                              std::size_t H,
                                              std::size_t W,
                                              TensorLayout layout)
{
    using namespace ck::literals;

    if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCDHW>::value)
    {
        return HostTensorDescriptor({N_, C_, D, H, W},
                                    {C_ * D * H * W, D * H * W, H * W, W, 1_uz});
    }
    else if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NDHWC>::value)
    {
        return HostTensorDescriptor({N_, C_, D, H, W},
                                    {D * C_ * H * W, 1_uz, C_ * H * W, W * C_, C_});
    }
};

template <typename DevicePoolBwdInstance,
          typename DOutDataType,
          typename DInDataType,
          typename DOutLayout,
          typename DInLayout>
bool pool3d_bwd_test(bool do_verification,
                     bool time_kernel,
                     ck::index_t N,
                     ck::index_t C,
                     ck::index_t Di,
                     ck::index_t Hi,
                     ck::index_t Wi,
                     std::vector<ck::index_t> window_lengths,
                     std::vector<ck::index_t> window_strides,
                     std::vector<ck::index_t> window_dilations,
                     std::vector<ck::index_t> dinput_left_pads,
                     std::vector<ck::index_t> dinput_right_pads)
{
    auto OutSpatialLength = [&](auto InSpatialLength, int index) {
        ck::index_t left_pad   = dinput_left_pads[index];
        ck::index_t right_pad  = dinput_right_pads[index];
        ck::index_t window_len = window_lengths[index];
        ck::index_t stride     = window_strides[index];
        ck::index_t dilation   = window_dilations[index];
        ck::index_t eff        = (window_len - 1) * dilation + 1;
        return (InSpatialLength + left_pad + right_pad - eff) / stride + 1;
    };

    ck::index_t Do = OutSpatialLength(Di, 0);
    ck::index_t Ho = OutSpatialLength(Hi, 1);
    ck::index_t Wo = OutSpatialLength(Wi, 2);

    Tensor<DOutDataType> dout(f_host_tensor_descriptor(N, C, Do, Ho, Wo, DOutLayout{}));
    Tensor<DInDataType> din_dev(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));
    Tensor<DInDataType> din_host(f_host_tensor_descriptor(N, C, Di, Hi, Wi, DInLayout{}));

    std::cout << "dout: " << dout.mDesc << std::endl;
    std::cout << "din_host: " << din_host.mDesc << std::endl;

    dout.GenerateTensorValue(GeneratorTensor_3<DOutDataType>{0.0, 1.0});

    DeviceMem dout_device_buf(sizeof(DOutDataType) * dout.mDesc.GetElementSpaceSize());
    DeviceMem din_device_buf(sizeof(DInDataType) * din_dev.mDesc.GetElementSpaceSize());

    dout_device_buf.ToDevice(dout.mData.data());
    din_device_buf.SetZero();

    auto pool         = DevicePoolBwdInstance{};
    auto invoker_ptr  = pool.MakeInvokerPointer();
    auto argument_ptr = pool.MakeArgumentPointer(
        static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
        static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
        {N, C, Do, Ho, Wo},
        {N, C, Di, Hi, Wi},
        f_tensor_strides_ncdhw(N, C, Do, Ho, Wo, DOutLayout{}),
        f_tensor_strides_ncdhw(N, C, Di, Hi, Wi, DInLayout{}),
        window_lengths,
        window_strides,
        window_dilations,
        dinput_left_pads,
        dinput_right_pads);

    if(!pool.IsSupportedArgument(argument_ptr.get()))
    {
        throw std::runtime_error(
            "wrong! device_op with the specified compilation parameters does "
            "not support this problem");
    }

    float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    std::cout << "Perf: " << ave_time << std::endl;

    bool pass = true;

    if(do_verification)
    {
        auto ref_pool =
            ck::tensor_operation::host::ReferenceAvgPoolBwd<3, DInDataType, DOutDataType>();

        auto ref_invoker = ref_pool.MakeInvoker();

        auto ref_argument = ref_pool.MakeArgument(din_host,
                                                  dout,
                                                  window_lengths,
                                                  window_strides,
                                                  window_dilations,
                                                  dinput_left_pads,
                                                  dinput_right_pads);

        ref_invoker.Run(ref_argument);

        din_device_buf.FromDevice(din_dev.mData.data());

        pass = ck::utils::check_err(din_dev, din_host);
    }

    return pass;
}
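Plugging the avgpool examples' configuration (Di = Hi = Wi = 40, 5x5x5 window, stride 2, dilation 2, no padding) into OutSpatialLength above gives, per spatial dimension:

    eff = (5 - 1) * 2 + 1 = 9
    Do  = (40 + 0 + 0 - 9) / 2 + 1 = 16

so each 40-element spatial dimension reduces to 16.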
example/51_avgpool3d_bwd/avgpool3d_bwd_fp16.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"

#include "avgpool3d_bwd_common.hpp"

using DOutDataType    = ck::half_t;
using DInDataType     = ck::half_t;
using ComputeDataType = float;

#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize

int main()
{
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
        true,
        false,
        N,
        C,
        Di,
        Hi,
        Wi,
        window_lengths,
        window_strides,
        window_dilations,
        dinput_left_pads,
        dinput_right_pads);
}
example/51_avgpool3d_bwd/avgpool3d_bwd_fp32.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_avgpool3d_bwd_ndhwc_ndhwc.hpp"

#include "avgpool3d_bwd_common.hpp"

using DOutDataType    = float;
using DInDataType     = float;
using ComputeDataType = float;

#if 1
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout  = ck::tensor_layout::convolution::NDHWC;
#else
using DOutLayout = ck::tensor_layout::convolution::NCDHW;
using DInLayout  = ck::tensor_layout::convolution::NCDHW;
#endif

using DevicePoolBwdInstance =
    ck::tensor_operation::device::DeviceAvgPool3dBwd_NDHWC_NDHWC<DOutDataType,
                                                                 DInDataType,
                                                                 ComputeDataType,
                                                                 64, // BlockSize
                                                                 64, // ReduceMThreadClusterSize
                                                                 1,  // ReduceKThreadClusterSize
                                                                 1,  // ReduceMThreadSliceSize
                                                                 1,  // ReduceKThreadSliceSize
                                                                 1>; // InSrcOutDstVectorSize

int main()
{
    std::vector<ck::index_t> window_lengths    = {5, 5, 5};
    std::vector<ck::index_t> window_strides    = {2, 2, 2};
    std::vector<ck::index_t> window_dilations  = {2, 2, 2};
    std::vector<ck::index_t> dinput_left_pads  = {0, 0, 0};
    std::vector<ck::index_t> dinput_right_pads = {0, 0, 0};

    ck::index_t N  = 1;
    ck::index_t C  = 16;
    ck::index_t Di = 40;
    ck::index_t Hi = 40;
    ck::index_t Wi = 40;

    pool3d_bwd_test<DevicePoolBwdInstance, DOutDataType, DInDataType, DOutLayout, DInLayout>(
        true,
        false,
        N,
        C,
        Di,
        Hi,
        Wi,
        window_lengths,
        window_strides,
        window_dilations,
        dinput_left_pads,
        dinput_right_pads);
}
include/ck/ck.hpp

@@ -3,6 +3,8 @@
 #pragma once
 
+#include "ck/config.h"
+
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
...

@@ -27,6 +29,21 @@
 #define CK_WAVELET_MIN_BLOCK_PER_CU 2
 #endif
 
+// kernel attribute: amdgpu_waves_per_eu()
+#ifdef CK_USE_WAVES_PER_EU
+// for 1-wave kernels, control arguments of amdgpu_waves_per_eu() attribute
+#ifndef CK_MIN_WAVES_PER_EU
+#define CK_MIN_WAVES_PER_EU 0
+#endif
+#ifndef CK_MAX_WAVES_PER_EU
+#define CK_MAX_WAVES_PER_EU 0
+#endif
+#else
+#define CK_USE_WAVES_PER_EU 0
+#endif
+
 // buffer resource
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1
...

@@ -103,8 +120,15 @@
 // inline asm
 #define CK_USE_AMD_INLINE_ASM 1
 
-// inner product (DLOP)
-#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
+// inner product (V_MAC/V_FMAC)
+#define CK_USE_AMD_V_MAC_INLINE_ASM 1
+
+// V_DOT inline instructions, less efficient since they require adding
+// `s_nop`s to avoid hazard
+#define CK_USE_AMD_V_DOT_INLINE_ASM 0
+
+// inner product using V_DOT with DPP8 modifiers
+#define CK_USE_AMD_V_DOT_DPP8_INLINE_ASM 1
 
 // block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
 #define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
...

@@ -148,6 +172,10 @@
 #define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
 // experimental feature: add instances using pipeline v2
 #define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
+// experimental feature: optimize pipeline v2 by IGLP strategy (value=ID of strategy)
+#ifndef CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT
+#define CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT 0
+#endif
 
 // hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
 // hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
...

@@ -173,6 +201,7 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1
 
 // flag to enable (1) or disable (0) the debugging output in some kernels
 #define DEBUG_LOG 0
...
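The new CK_MIN_WAVES_PER_EU/CK_MAX_WAVES_PER_EU knobs map naturally onto clang's amdgpu_waves_per_eu kernel attribute, though this hunk does not show the consuming site. A hedged sketch of how a kernel could apply them (the macro and kernel names below are illustrative, not from this commit; with the defaults of 0 the attribute would presumably be skipped rather than emitted as waves_per_eu(0, 0)):

#if CK_USE_WAVES_PER_EU && (CK_MIN_WAVES_PER_EU > 0) && (CK_MAX_WAVES_PER_EU > 0)
#define EXAMPLE_WAVES_PER_EU_ATTR \
    __attribute__((amdgpu_waves_per_eu(CK_MIN_WAVES_PER_EU, CK_MAX_WAVES_PER_EU)))
#else
#define EXAMPLE_WAVES_PER_EU_ATTR
#endif

// Hypothetical 1-wave kernel opting in to the occupancy hint.
__global__ EXAMPLE_WAVES_PER_EU_ATTR void example_kernel(float* out)
{
    out[threadIdx.x] = 0.f;
}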
include/ck/config.h.in (new file, mode 100644)
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_CONFIG_H_IN
#define CK_CONFIG_H_IN
// clang-format off
//
// Data types supported in the current CK build
//
#ifndef DTYPES
#cmakedefine DTYPES "@DTYPES@"
#endif
// if DTYPES is not defined, enable all data types in header files
#ifndef CK_ENABLE_ALL_DTYPES
#cmakedefine CK_ENABLE_ALL_DTYPES @CK_ENABLE_ALL_DTYPES@
#if defined(CK_ENABLE_ALL_DTYPES)
#ifndef CK_ENABLE_INT8
#define CK_ENABLE_INT8 "ON"
#endif
#ifndef CK_ENABLE_FP8
#define CK_ENABLE_FP8 "ON"
#endif
#ifndef CK_ENABLE_FP16
#define CK_ENABLE_FP16 "ON"
#endif
#ifndef CK_ENABLE_BF16
#define CK_ENABLE_BF16 "ON"
#endif
#ifndef CK_ENABLE_FP32
#define CK_ENABLE_FP32 "ON"
#endif
#ifndef CK_ENABLE_FP64
#define CK_ENABLE_FP64 "ON"
#endif
#endif
#endif
// if DTYPES are selectively enabled
#ifndef CK_ENABLE_INT8
#cmakedefine CK_ENABLE_INT8 @CK_ENABLE_INT8@
#endif
#ifndef CK_ENABLE_FP8
#cmakedefine CK_ENABLE_FP8 @CK_ENABLE_FP8@
#endif
#ifndef CK_ENABLE_FP16
#cmakedefine CK_ENABLE_FP16 @CK_ENABLE_FP16@
#endif
#ifndef CK_ENABLE_BF16
#cmakedefine CK_ENABLE_BF16 @CK_ENABLE_BF16@
#endif
#ifndef CK_ENABLE_FP32
#cmakedefine CK_ENABLE_FP32 @CK_ENABLE_FP32@
#endif
#ifndef CK_ENABLE_FP64
#cmakedefine CK_ENABLE_FP64 @CK_ENABLE_FP64@
#endif
//
// Legacy DL kernel support in the current CK build
// by default, DL kernels are turned OFF
//
#ifndef CK_ENABLE_DL_KERNELS
#cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@
#endif
//
// Instance support in the current CK build
//
#ifndef CK_ENABLE_INSTANCES_ONLY
#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@
#endif
// clang-format on
#endif // CK_CONFIG_H_IN
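Code that includes ck/config.h (as ck.hpp now does) can guard data-type-specific paths on these macros; note that the fallback definitions use the value "ON", so #ifdef rather than #if is the reliable test. A minimal sketch (the function names are illustrative, not from the commit):

#include "ck/config.h"

#ifdef CK_ENABLE_FP16
// only declared when the build enables fp16 support
void add_device_gemm_fp16_instances();
#endif

#ifdef CK_ENABLE_BF16
void add_device_gemm_bf16_instances();
#endif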
include/ck/host_utility/device_prop.hpp

@@ -51,4 +51,11 @@ inline std::string get_device_name()
     return name;
 }
 
+inline bool is_xdl_supported()
+{
+    return ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+           ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+           ck::get_device_name() == "gfx942";
+}
+
 } // namespace ck
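A hedged sketch of a call site gating instance selection on XDL (MFMA) availability (the printed strings are illustrative; only is_xdl_supported and get_device_name come from this header):

#include <cstdio>
#include "ck/host_utility/device_prop.hpp"

int main()
{
    // gfx908/90a/940/941/942 expose XDL (MFMA) instructions; other targets
    // would need a different (e.g. DL) code path.
    if(ck::is_xdl_supported())
        std::printf("%s: XDL instances available\n", ck::get_device_name().c_str());
    else
        std::printf("%s: falling back to non-XDL instances\n", ck::get_device_name().c_str());
    return 0;
}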
include/ck/host_utility/kernel_launch.hpp

@@ -73,3 +73,72 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
     return 0;
 #endif
 }
 
+template <typename... Args, typename F, typename PreProcessFunc>
+float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
+                                             PreProcessFunc preprocess,
+                                             F kernel,
+                                             dim3 grid_dim,
+                                             dim3 block_dim,
+                                             std::size_t lds_byte,
+                                             Args... args)
+{
+#if CK_TIME_KERNEL
+    if(stream_config.time_kernel_)
+    {
+#if DEBUG_LOG
+        printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}\n",
+               __func__,
+               grid_dim.x,
+               grid_dim.y,
+               grid_dim.z,
+               block_dim.x,
+               block_dim.y,
+               block_dim.z);
+
+        printf("Warm up 1 time\n");
+#endif
+        // warm up
+        preprocess();
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+        const int nrepeat = 10;
+#if DEBUG_LOG
+        printf("Start running %d times...\n", nrepeat);
+#endif
+        hipEvent_t start, stop;
+
+        hip_check_error(hipEventCreate(&start));
+        hip_check_error(hipEventCreate(&stop));
+
+        hip_check_error(hipDeviceSynchronize());
+        hip_check_error(hipEventRecord(start, stream_config.stream_id_));
+
+        for(int i = 0; i < nrepeat; ++i)
+        {
+            preprocess();
+            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        }
+
+        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
+        hip_check_error(hipEventSynchronize(stop));
+
+        float total_time = 0;
+
+        hip_check_error(hipEventElapsedTime(&total_time, start, stop));
+
+        return total_time / nrepeat;
+    }
+    else
+    {
+        preprocess();
+        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+        return 0;
+    }
+#else
+    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+
+    return 0;
+#endif
+}
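The preprocess hook is what makes repeated timing of state-dependent kernels meaningful: it runs before the warm-up and before every one of the nrepeat timed launches, so each repetition starts from identical state. A hedged usage sketch with a hypothetical kernel (only launch_and_time_kernel_with_preprocess, StreamConfig, and hip_check_error come from this library):

#include <hip/hip_runtime.h>
#include "ck/host_utility/kernel_launch.hpp"

__global__ void accumulate_ones(float* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        out[i] += 1.0f; // result depends on the buffer's prior contents
}

float time_accumulate_ones(float* out, int n, const StreamConfig& stream_config)
{
    // Zero the accumulator before every launch so all repetitions do identical work.
    auto preprocess = [&] { hip_check_error(hipMemset(out, 0, n * sizeof(float))); };

    return launch_and_time_kernel_with_preprocess(
        stream_config, preprocess, accumulate_ones, dim3((n + 255) / 256), dim3(256), 0, out, n);
}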
include/ck/host_utility/stream_utility.hpp

@@ -8,7 +8,7 @@
 #include "ck/stream_config.hpp"
 #include "ck/host_utility/hip_check_error.hpp"
 
-static int getAvailableComputeUnitCount(const StreamConfig& stream_config)
+static inline int getAvailableComputeUnitCount(const StreamConfig& stream_config)
 {
     constexpr int MAX_MASK_DWORDS = 64;
...
include/ck/tensor_description/multi_index_transform.hpp

Both hunks appear to be line-rewrapping only: the old and new sides are textually identical at this granularity. The affected declarations, shown once per hunk:

@@ -1042,13 +1042,13 @@ struct Merge_v2_magic_division

    using UpLengths =
        decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{})));

    using LowLengthsMagicDivisorMultipiler = decltype(
        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengths>{},
                       Number<NDimLow>{}));

    using LowLengthsMagicDivisorShift = decltype(
        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengths>{},
                       Number<NDimLow>{}));

    LowLengths low_lengths_;
    LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_;
...

@@ -1201,9 +1201,9 @@ struct Merge_v2r2_magic_division

        lambda_merge_generate_MagicDivision_calculate_magic_multiplier<LowLengthsScan>{},
        Number<NDimLow>{}));

    using LowLengthsScanMagicDivisorShift = decltype(
        generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift<LowLengthsScan>{},
                       Number<NDimLow>{}));

    LowLengths low_lengths_;
    LowLengthsScan low_lengths_scan_;
...
include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_dpp8.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/amd_gemm_dpp.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl_dpp8.hpp"

namespace ck {

/**
 * DPP8 version of the blockwise GEMM algorithm. It uses the DPP8 instruction modifier to limit
 * the data loaded from LDS to registers.
 *
 * The algorithm groups threads into groups of size `dpp8::lane_group_size` and splits matrix C
 * between them in such a way that threads from the same group need the same chunk of either
 * matrix A (or B, respectively). Without DPP8, each thread would need to load the
 * whole chunk from LDS into its own register space.
 * The DPP8 modifiers allow each thread to load less data, exactly `1 / dpp8::lane_group_size`
 * of the chunk, and then share that data with the other threads of the same lane group.
 *
 * Assumptions coming from the use of DPP8:
 * 1. `BM10BN10ThreadClusterBM10Xs[1] == dpp8::lane_group_size` or
 *    `BM10BN10ThreadClusterBN10Xs[1] == dpp8::lane_group_size` -
 *    - it makes consecutive `dpp8::lane_group_size` threads use the same chunk of either
 *      matrix A or B;
 *    - based on these values we determine which matrix to share.
 * 2. `BM1PerThreadBM11 % dpp8::lane_group_size == 0` (if sharing A) or
 *    `BN1PerThreadBN11 % dpp8::lane_group_size == 0` (if sharing B) -
 *    - we have to make sure that the data to split is divisible by the number of
 *      threads in the group.
 *
 * General algorithm:
 *   C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1]
 * A and B are visible to the whole block; C is distributed among the threads.
 * Assume:
 *   1. A:
 *      1. ABlockDesc_BK0_BM_BK1 is known at compile-time
 *      2. ABlockBuffer is DynamicBuffer
 *   2. B:
 *      1. BBlockDesc_BK0_BN_BK1 is known at compile-time
 *      2. BBlockBuffer is DynamicBuffer
 *   3. C:
 *      1. CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time
 *      2. CThreadBuffer is StaticBuffer
 *   4. BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2
 */
template <index_t BlockSize,
          typename FloatA,
          typename FloatB,
          typename FloatC,
          typename ABlockDesc_BK0_BM_BK1,
          typename BBlockDesc_BK0_BN_BK1,
          index_t BM1PerThreadBM11,
          index_t BN1PerThreadBN11,
          index_t BK0PerThread,
          typename BM10BN10ThreadClusterBM10Xs, // Sequence<BM10BN10ThreadClusterBM100,
                                                //          BM10BN10ThreadClusterBM101, ...>
          typename BM10BN10ThreadClusterBN10Xs, // Sequence<BM10BN10ThreadClusterBN100,
                                                //          BM10BN10ThreadClusterBN101, ...>
          index_t AThreadCopyScalarPerVector_BM11,
          index_t BThreadCopyScalarPerVector_BN11,
          typename enable_if<ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
                                 BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
                             bool>::type = false>
struct BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0
{
    using AIndex = MultiIndex<4>;
    using BIndex = MultiIndex<4>;
    using CIndex = MultiIndex<4>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

    static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0);
    static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2);
    static constexpr index_t BM  = ABlockDesc_BK0_BM_BK1{}.GetLength(I1);
    static constexpr index_t BN  = BBlockDesc_BK0_BN_BK1{}.GetLength(I1);

    static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0];
    static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0];

    static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1];
    static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1];

    static constexpr index_t BM11 = BM1PerThreadBM11;
    static constexpr index_t BN11 = BN1PerThreadBN11;

    static constexpr index_t BM1 = BM100 * BM101 * BM11;
    static constexpr index_t BN1 = BN100 * BN101 * BN11;

    static constexpr index_t BM0 = BM / BM1;
    static constexpr index_t BN0 = BN / BN1;

    // We assume that either `BM101` or `BN101` is equal to `dpp8::lane_group_size`. It makes all
    // threads in a lane group need the same chunk of B or A matrices and we can share them using
    // DPP.
    static_assert(BM101 == dpp8::lane_group_size || BN101 == dpp8::lane_group_size);
    static constexpr bool ShareB = BM101 == dpp8::lane_group_size ? true : false;
    static constexpr bool ShareA = !ShareB;

    // If DPP shares A (B, respectively), lane group gets `BM1PerThreadBM11` (`BN1PerThreadBN11`,
    // respectively) elements, so we split them between threads in lane group so each thread loads
    // less data from LDS.
    static constexpr index_t BM1PerThread =
        ShareA ? BM1PerThreadBM11 / dpp8::lane_group_size : BM1PerThreadBM11;
    static constexpr index_t BN1PerThread =
        ShareB ? BN1PerThreadBN11 / dpp8::lane_group_size : BN1PerThreadBN11;

    __host__ __device__ static constexpr auto
    MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1)
    {
        const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor(
            a_block_desc_bk0_bm_bk1,
            make_tuple(make_pass_through_transform(Number<BK0>{}),
                       make_unmerge_transform(make_tuple(Number<BM0>{}, Number<BM1>{})),
                       make_pass_through_transform(Number<BK1>{})),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

        return a_block_bk0_bm0_bm1_bk1;
    }

    __host__ __device__ static constexpr auto
    MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1)
    {
        const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor(
            b_block_desc_bk0_bn_bk1,
            make_tuple(make_pass_through_transform(Number<BK0>{}),
                       make_unmerge_transform(make_tuple(Number<BN0>{}, Number<BN1>{})),
                       make_pass_through_transform(Number<BK1>{})),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}));

        return b_block_desc_bk0_bn0_bn1_bk1;
    }

    __host__ __device__ static constexpr auto
    MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN()
    {
        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
        // lower: [BM, BN]
        constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n =
            make_single_stage_tensor_adaptor(
                make_tuple(make_unmerge_transform(make_tuple(
                               Number<BM0>{}, Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
                           make_unmerge_transform(make_tuple(
                               Number<BN0>{}, Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{}));

        return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n;
    }

    __host__ __device__ static constexpr auto
    MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1()
    {
        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
        // lower: [BM0, BM1, BN0, BN1]
        constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 =
            make_single_stage_tensor_adaptor(
                make_tuple(make_pass_through_transform(Number<BM0>{}),
                           make_unmerge_transform(
                               make_tuple(Number<BM100>{}, Number<BM101>{}, Number<BM11>{})),
                           make_pass_through_transform(Number<BN0>{}),
                           make_unmerge_transform(
                               make_tuple(Number<BN100>{}, Number<BN101>{}, Number<BN11>{}))),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
                make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{}));

        return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1;
    }

    __host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1()
    {
        return Sequence<BM0, BM11, BN0, BN11>{};
    }

    static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ =
        MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{});

    static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ =
        MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{});

    public:
    __device__ BlockwiseGemmDlDpp8_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_loop_BM0_BN0()
        : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(
              get_thread_local_1d_id())},
          a_thread_copy_{CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()},
          b_thread_copy_{CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()}
    {
        static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() &&
                          BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!");

        static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) ==
                          BBlockDesc_BK0_BN_BK1{}.GetLength(I0),
                      "wrong! K dimension not consistent");

        static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 &&
                          BM10BN10ThreadClusterBN10Xs::Size() == 2,
                      "wrong!");
    }

    __device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id)
    {
        // lower: [BM0, BM1, BN0, BN1]
        // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
        constexpr auto adaptor0 =
            MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1();

        // lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11]
        // upper: [Tid, BM0, BM11, BN0, BN11]
        constexpr auto adaptor1 = make_single_stage_tensor_adaptor(
            make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)),
                       make_pass_through_transform(BM0),
                       make_pass_through_transform(BM11),
                       make_pass_through_transform(BN0),
                       make_pass_through_transform(BN11)),
            make_tuple(
                Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));

        constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1);

        return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0));
    }

    __device__ AIndex CalculateAThreadOriginOnBlock_BK0_BM0_BM1_BK1()
    {
        const auto offsetBM0 = c_thread_origin_data_idx_[I0];
        // If sharing matrix A, we need a separate BM1 offset for each thread in lane group.
        const auto offsetBM1 =
            ShareA ? c_thread_origin_data_idx_[I1] +
                         dpp8::get_thread_idx_in_lane_group() * BM1PerThread
                   : c_thread_origin_data_idx_[I1];
        return make_tuple(0, offsetBM0, offsetBM1, 0);
    }

    __device__ BIndex CalculateBThreadOriginOnBlock_BK0_BN0_BN1_BK1()
    {
        const auto offsetBN0 = c_thread_origin_data_idx_[I2];
        // If sharing matrix B, we need a separate BN1 offset for each thread in lane group.
        const auto offsetBN1 =
            ShareB ? c_thread_origin_data_idx_[I3] +
                         dpp8::get_thread_idx_in_lane_group() * BN1PerThread
                   : c_thread_origin_data_idx_[I3];
        return make_tuple(0, offsetBN0, offsetBN1, 0);
    }

    template <typename CThreadDesc_BM0_BM11_BN0_BN11,
              typename ABlockBuffer,
              typename BBlockBuffer,
              typename CThreadBuffer>
    __device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&,
                        const ABlockBuffer& a_block_buf,
                        const BBlockBuffer& b_block_buf,
                        CThreadBuffer& c_thread_buf) const
    {
        static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(),
                      "wrong! Desc should be known at compile-time");

        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());

        constexpr auto threadwise_contraction =
            ThreadwiseContractionDlDpp8_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1<
                FloatA,
                FloatB,
                FloatC,
                decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
                decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
                CThreadDesc_BM0_BM11_BN0_BN11,
                Sequence<BK0PerThread, BK1>,
                Sequence<1, BM1PerThreadBM11>,
                Sequence<1, BN1PerThreadBN11>,
                ShareA>{};

        static_for<0, BN0, 1>{}([&](auto bn0) {
            static_for<0, BM0, 1>{}([&](auto bm0) {
                a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
                                   make_tuple(I0, bm0, I0, I0),
                                   a_block_buf,
                                   a_thread_desc_bk0_bm0_bm1_bk1_,
                                   make_tuple(I0, I0, I0, I0),
                                   a_thread_buf);
                b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
                                   make_tuple(I0, bn0, I0, I0),
                                   b_block_buf,
                                   b_thread_desc_bk0_bn0_bn1_bk1_,
                                   make_tuple(I0, I0, I0, I0),
                                   b_thread_buf);

                threadwise_contraction.Run(a_thread_buf,
                                           make_tuple(I0, I0, I0, I0),
                                           b_thread_buf,
                                           make_tuple(I0, I0, I0, I0),
                                           c_thread_buf,
                                           make_tuple(bm0, I0, bn0, I0));

                static_for<BK0PerThread, BK0, BK0PerThread>{}([&](auto bk0) {
                    a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_,
                                       make_tuple(bk0, bm0, I0, I0),
                                       a_block_buf,
                                       a_thread_desc_bk0_bm0_bm1_bk1_,
                                       make_tuple(I0, I0, I0, I0),
                                       a_thread_buf);
                    b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_,
                                       make_tuple(bk0, bn0, I0, I0),
                                       b_block_buf,
                                       b_thread_desc_bk0_bn0_bn1_bk1_,
                                       make_tuple(I0, I0, I0, I0),
                                       b_thread_buf);

                    threadwise_contraction.Run(a_thread_buf,
                                               make_tuple(I0, I0, I0, I0),
                                               b_thread_buf,
                                               make_tuple(I0, I0, I0, I0),
                                               c_thread_buf,
                                               make_tuple(bm0, I0, bn0, I0));
                });
            });
        });
    }

    private:
    // A[BK0, BM0, BM1, BK1]
    static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<BK0PerThread>{}, Number<BM0>{}, Number<BM1PerThread>{}, Number<BK1>{}));

    // B[BK0, BN0, BN1, BK1]
    static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = make_naive_tensor_descriptor_packed(
        make_tuple(Number<BK0PerThread>{}, Number<BN0>{}, Number<BN1PerThread>{}, Number<BK1>{}));

    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
        FloatA,
        FloatA,
        decltype(a_block_desc_bk0_bm0_bm1_bk1_),
        decltype(a_thread_desc_bk0_bm0_bm1_bk1_),
        Sequence<BK0PerThread, 1, BM1PerThread, BK1>, // SliceLengths
        Sequence<0, 1, 2, 3>,                         // DimAccessOrder
        Sequence<1, 1, BM1PerThread, BK1>,            // SrcVectorTensorLengths
        Sequence<0, 1, 2, 3>>;                        // SrcVectorTensorContiguousDimOrder

    using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1<
        FloatB,
        FloatB,
        decltype(b_block_desc_bk0_bn0_bn1_bk1_),
        decltype(b_thread_desc_bk0_bn0_bn1_bk1_),
        Sequence<BK0PerThread, 1, BN1PerThread, BK1>, // SliceLengths
        Sequence<0, 1, 2, 3>,                         // DimAccessOrder
        Sequence<1, 1, BN1PerThread, BK1>,            // SrcVectorTensorLengths
        Sequence<0, 1, 2, 3>>;                        // SrcVectorTensorContiguousDimOrder

    CIndex c_thread_origin_data_idx_;

    AThreadCopy a_thread_copy_;
    BThreadCopy b_thread_copy_;
};

} // namespace ck
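To make the ShareA/ShareB split concrete: DPP8 shuffles operate within 8-lane groups, so dpp8::lane_group_size is taken as 8 in the standalone sketch below (treat that value, and the tile parameters, as illustrative assumptions rather than a real instance):

int main()
{
    constexpr int lane_group_size = 8; // assumed value of dpp8::lane_group_size

    // BM101 == lane_group_size: the 8 lanes of a group span the M direction, so
    // they all need the same chunk of B -> B is shared via DPP8 (ShareB).
    constexpr int BM101  = 8;
    constexpr bool ShareB = (BM101 == lane_group_size);
    constexpr bool ShareA = !ShareB;

    // The per-lane-group B chunk (BN1PerThreadBN11 elements) is split across the
    // group: each lane loads only 1/8 of it from LDS; the rest arrives via DPP8.
    constexpr int BN1PerThreadBN11 = 8; // must be divisible by lane_group_size
    constexpr int BN1PerThread =
        ShareB ? BN1PerThreadBN11 / lane_group_size : BN1PerThreadBN11;

    static_assert(ShareB && !ShareA, "B is the shared matrix in this configuration");
    static_assert(BN1PerThread == 1, "each lane loads 1/8 of the chunk");
    return 0;
}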