Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
5a9c4962
Commit
5a9c4962
authored
Apr 24, 2024
by
Adam Osewski
Browse files
Merge remote-tracking branch 'origin/develop' into aosewski/ggemm_multi_d2
parents
3970cf73
43879b89
Changes
703
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
197 additions
and
231 deletions
+197
-231
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
...le/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp16_fp8.cpp
+2
-2
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
+41
-48
example/17_convnd_bwd_data/CMakeLists.txt
example/17_convnd_bwd_data/CMakeLists.txt
+4
-11
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
+7
-2
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
+10
-5
example/19_binary_elementwise/elementwise_add_1d.cpp
example/19_binary_elementwise/elementwise_add_1d.cpp
+10
-5
example/19_binary_elementwise/elementwise_add_4d.cpp
example/19_binary_elementwise/elementwise_add_4d.cpp
+10
-5
example/20_grouped_conv_bwd_weight/CMakeLists.txt
example/20_grouped_conv_bwd_weight/CMakeLists.txt
+10
-24
example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
..._weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
+4
-1
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
...d_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
+4
-1
example/21_gemm_layernorm/CMakeLists.txt
example/21_gemm_layernorm/CMakeLists.txt
+4
-12
example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
...layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+10
-5
example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+10
-5
example/26_contraction/CMakeLists.txt
example/26_contraction/CMakeLists.txt
+16
-16
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+1
-4
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+17
-34
example/31_batched_gemm_gemm/CMakeLists.txt
example/31_batched_gemm_gemm/CMakeLists.txt
+6
-14
example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
+6
-8
example/34_batchnorm/batchnorm_infer_impl.hpp
example/34_batchnorm/batchnorm_infer_impl.hpp
+8
-3
example/35_splitK_gemm/CMakeLists.txt
example/35_splitK_gemm/CMakeLists.txt
+17
-26
No files found.
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_fp8.cpp
→
example/15_grouped_gemm/grouped_gemm_xdl_fixed_nk_
fp16_
fp8.cpp
View file @
5a9c4962
...
@@ -35,7 +35,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
...
@@ -35,7 +35,7 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using
ADataType
=
F16
;
using
ADataType
=
F16
;
using
BDataType
=
F8
;
using
BDataType
=
F8
;
using
AccDataType
=
F32
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F
32
;
using
CShuffleDataType
=
F
16
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
DsDataType
=
ck
::
Tuple
<>
;
using
EDataType
=
F16
;
using
EDataType
=
F16
;
...
@@ -56,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl_F
...
@@ -56,7 +56,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl_F
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmDefault
,
1
,
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
<
ALayout
,
BLayout
,
DsLayout
,
ELayout
,
ADataType
,
BDataType
,
AccDataType
,
CShuffleDataType
,
DsDataType
,
EDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmDefault
,
1
,
256
,
64
,
128
,
32
,
8
,
8
,
32
,
32
,
1
,
2
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
1
,
S
<
1
,
4
,
64
,
1
>
,
S
<
0
,
2
,
1
,
3
>
,
S
<
0
,
2
,
1
,
3
>
,
3
,
8
,
8
,
1
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
// clang-format on
// clang-format on
struct
ProblemSize
final
struct
ProblemSize
final
...
...
example/16_gemm_multi_d_multi_reduces/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
add_custom_target
(
example_gemm_reduce_xdl
)
set
(
target 0
)
add_custom_target
(
example_gemm_reduce_xdl_max
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
add_custom_target
(
example_gemm_reduce_xdl_mean_meansquare
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_gemm_add_add_mean_meansquare_xdl
)
add_custom_target
(
example_gemm_reduce_xdl
)
add_custom_target
(
example_gemm_reduce_xdl_max
)
add_example_executable
(
example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp
)
add_custom_target
(
example_gemm_reduce_xdl_mean_meansquare
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_fp16
)
add_custom_target
(
example_gemm_add_add_mean_meansquare_xdl
)
add_example_executable
(
example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp
)
add_example_dependencies
(
example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_fp16
)
add_example_executable
(
example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp16
)
add_example_dependencies
(
example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16
)
add_example_executable
(
example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp
)
add_example_executable
(
example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_int8
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp16
)
add_example_executable
(
example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp
)
add_example_executable
(
example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_add_addsquare_xdl_int8
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_int8
)
add_example_executable
(
example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp
)
add_example_executable
(
example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_fp32
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_add_addsquare_xdl_int8
)
add_example_executable
(
example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp
)
add_example_executable
(
example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp32
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_fp32
)
add_example_executable
(
example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp
)
add_example_executable
(
example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_bf16
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp32
)
add_example_executable
(
example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp
)
add_example_executable
(
example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_bf16
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_bf16
)
add_example_dependencies
(
example_gemm_reduce_xdl
add_example_executable
(
example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp
)
example_gemm_reduce_xdl_mean_meansquare
add_example_dependencies
(
example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_bf16
)
example_gemm_reduce_xdl_max
example_gemm_add_add_mean_meansquare_xdl
)
add_example_dependencies
(
example_gemm_reduce_xdl
example_gemm_reduce_xdl_mean_meansquare
if
(
USE_BITINT_EXTENSION_INT4
)
example_gemm_reduce_xdl_max
add_example_executable
(
example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp
)
example_gemm_add_add_mean_meansquare_xdl
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_int4
)
endif
()
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp
)
add_example_dependencies
(
example_gemm_reduce_xdl_max example_gemm_max_xdl_int4
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
example/17_convnd_bwd_data/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
add_example_executable
(
example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp
)
set
(
target 0
)
if
(
result EQUAL 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
target_link_libraries
(
example_convnd_bwd_data_xdl_fp16 PRIVATE utility
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
endif
()
add_example_executable
(
example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp
)
if
(
result EQUAL 0
)
target_link_libraries
(
example_convnd_bwd_data_xdl_fp16 PRIVATE utility
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
add_example_executable
(
example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp
)
add_example_executable
(
example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp
)
if
(
result EQUAL 0
)
if
(
result EQUAL 0
)
...
...
example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <cstdlib>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -27,7 +27,12 @@ using DeviceElementwiseAddInstance =
...
@@ -27,7 +27,12 @@ using DeviceElementwiseAddInstance =
ck
::
Tuple
<
CDataType
>
,
ck
::
Tuple
<
CDataType
>
,
Add
,
Add
,
2
,
2
,
64
,
64
,
64
,
8
,
8
,
8
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
8
,
8
>
,
ck
::
Sequence
<
8
,
8
>
,
ck
::
Sequence
<
8
>>
;
ck
::
Sequence
<
8
>>
;
...
...
example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <cstdlib>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
...
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
ck
::
Tuple
<
CDataType
>
,
ck
::
Tuple
<
CDataType
>
,
Add
,
Add
,
3
,
3
,
8
,
64
,
ck
::
Sequence
<
1
,
8
>
,
16
,
ck
::
Sequence
<
8
>>
;
16
,
2
,
2
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
1
,
2
>
,
ck
::
Sequence
<
2
>>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
void
host_broadcast3D_am_bmnk
(
HostTensorC
&
C
,
void
host_broadcast3D_am_bmnk
(
HostTensorC
&
C
,
...
...
example/19_binary_elementwise/elementwise_add_1d.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <cstdlib>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -25,9 +25,14 @@ using DeviceElementwiseAddInstance =
...
@@ -25,9 +25,14 @@ using DeviceElementwiseAddInstance =
ck
::
Tuple
<
CDataType
>
,
ck
::
Tuple
<
CDataType
>
,
Add
,
Add
,
1
,
1
,
8
,
64
,
ck
::
Sequence
<
8
,
8
>
,
16
,
ck
::
Sequence
<
8
>>
;
16
,
2
,
2
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
2
,
2
>
,
ck
::
Sequence
<
2
>>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
void
host_elementwise1D
(
void
host_elementwise1D
(
...
...
example/19_binary_elementwise/elementwise_add_4d.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <cstdlib>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
...
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
...
@@ -27,9 +27,14 @@ using DeviceElementwiseAddInstance =
ck
::
Tuple
<
CDataType
>
,
ck
::
Tuple
<
CDataType
>
,
Add
,
Add
,
4
,
4
,
8
,
64
,
ck
::
Sequence
<
8
,
8
>
,
2
,
ck
::
Sequence
<
8
>>
;
128
,
2
,
2
,
ck
::
Sequence
<
1
,
0
>
,
ck
::
Sequence
<
2
,
2
>
,
ck
::
Sequence
<
2
>>
;
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
HostTensorC
,
typename
Functor
>
void
host_elementwise4D
(
HostTensorC
&
C
,
void
host_elementwise4D
(
HostTensorC
&
C
,
...
...
example/20_grouped_conv_bwd_weight/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942
)
add_custom_target
(
example_grouped_conv_bwd_weight
)
list
(
APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp
)
set
(
target 0
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list_xdl AND target EQUAL 0
)
add_custom_target
(
example_grouped_conv_bwd_weight
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8 grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8 grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8
)
set
(
target 1
)
endif
()
if
(
gpu IN_LIST gpu_list_wmma AND target EQUAL 0
)
add_example_executable
(
example_grouped_conv_bwd_weight_wmma_fp16 grouped_conv_bwd_weight_wmma_fp16.cpp
)
add_custom_target
(
example_grouped_conv_bwd_weight
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_wmma_fp16
)
add_example_executable
(
example_grouped_conv_bwd_weight_wmma_fp16 grouped_conv_bwd_weight_wmma_fp16.cpp
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_wmma_fp16
)
set
(
target 1
)
endif
()
endforeach
()
add_custom_target
(
example_grouped_conv_bwd_weight_dl
)
add_example_executable
(
example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp
)
add_example_dependencies
(
example_grouped_conv_bwd_weight
_dl
example_grouped_conv_bwd_weight_dl_fp16
)
add_example_dependencies
(
example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_dl_fp16
)
example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16_comp_bf8_fp8.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "common.hpp"
...
@@ -78,6 +78,9 @@ using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWe
...
@@ -78,6 +78,9 @@ using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWe
InElementOp
,
InElementOp
,
WeiElementOp
,
WeiElementOp
,
OutElementOp
,
OutElementOp
,
0
,
0
,
0
,
ComputeTypeA
,
ComputeTypeA
,
ComputeTypeB
>
;
ComputeTypeB
>
;
...
...
example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc
View file @
5a9c4962
...
@@ -119,7 +119,10 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
...
@@ -119,7 +119,10 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
conv_param
.
input_right_pads_
,
conv_param
.
input_right_pads_
,
InElementOp
{},
InElementOp
{},
WeiElementOp
{},
WeiElementOp
{},
OutElementOp
{});
OutElementOp
{},
{},
{},
{});
ref_invoker
.
Run
(
ref_argument
);
ref_invoker
.
Run
(
ref_argument
);
...
...
example/21_gemm_layernorm/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
)
set
(
target 0
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
add_example_executable
(
example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_example_executable
(
example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_welford_fp16 gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
)
add_example_executable
(
example_gemm_bias_relu_add_layernorm_xdl_naive_fp16 gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_layernorm_xdl_naive_fp16 gemm_layernorm_xdl_naive_fp16.cpp
)
add_example_executable
(
example_gemm_xdl_layernorm_naive_single_kernel_fp16 gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
)
set
(
target 1
)
endif
()
endforeach
()
example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -103,9 +103,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
...
@@ -103,9 +103,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
ck
::
Tuple
<
LayerNormOutDataType
>
,
// y
ck
::
Tuple
<
LayerNormOutDataType
>
,
// y
NormalizeFunctor
,
NormalizeFunctor
,
2
,
2
,
8
,
// MPerthread
64
,
// BlockSize
ck
::
Sequence
<
8
,
1
,
1
,
8
,
8
>
,
// scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
16
,
// MPerBlock
ck
::
Sequence
<
8
>>
;
// scalarPerVector: y(layerNorm_out)
16
,
// NPerBlock
2
,
// MPerthread
2
,
// NPerthread
ck
::
Sequence
<
1
,
0
>
,
// ThreadClusterArrangeOrder
ck
::
Sequence
<
2
,
1
,
1
,
2
,
2
>
,
// scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck
::
Sequence
<
2
>>
;
// scalarPerVector: y(layerNorm_out)
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
({
len
},
{
stride
});
return
HostTensorDescriptor
({
len
},
{
stride
});
...
...
example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <iostream>
#include <numeric>
#include <numeric>
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -102,9 +102,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
...
@@ -102,9 +102,14 @@ using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwiseI
ck
::
Tuple
<
LayerNormOutDataType
>
,
// y
ck
::
Tuple
<
LayerNormOutDataType
>
,
// y
NormalizeFunctor
,
NormalizeFunctor
,
2
,
2
,
8
,
// MPerthread
64
,
// BlockSize
ck
::
Sequence
<
8
,
1
,
1
,
8
,
8
>
,
// scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
16
,
// MPerBlock
ck
::
Sequence
<
8
>>
;
// scalarPerVector: y(layerNorm_out)
16
,
// NPerBlock
2
,
// MPerthread
2
,
// NPerthread
ck
::
Sequence
<
1
,
0
>
,
// ThreadClusterArrangeOrder
ck
::
Sequence
<
2
,
1
,
1
,
2
,
2
>
,
// scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta
ck
::
Sequence
<
2
>>
;
// scalarPerVector: y(layerNorm_out)
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
auto
f_host_tensor_descriptor1d
=
[](
std
::
size_t
len
,
std
::
size_t
stride
)
{
return
HostTensorDescriptor
({
len
},
{
stride
});
return
HostTensorDescriptor
({
len
},
{
stride
});
...
...
example/26_contraction/CMakeLists.txt
View file @
5a9c4962
...
@@ -4,49 +4,49 @@ add_custom_target(example_contraction_bilinear)
...
@@ -4,49 +4,49 @@ add_custom_target(example_contraction_bilinear)
# FP32
# FP32
add_example_executable
(
example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32
)
add_example_executable
(
example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32_compute_bf16 contraction_bilinear_xdl_fp32_compute_bf16.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32_compute_bf16 contraction_bilinear_xdl_fp32_compute_bf16.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_bf16
)
add_example_executable
(
example_contraction_scale_xdl_fp32_compute_bf16 contraction_scale_xdl_fp32_compute_bf16.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp32_compute_bf16 contraction_scale_xdl_fp32_compute_bf16.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32_compute_bf16
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32_compute_fp16 contraction_bilinear_xdl_fp32_compute_fp16.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32_compute_fp16 contraction_bilinear_xdl_fp32_compute_fp16.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp32_compute_fp16
)
add_example_executable
(
example_contraction_scale_xdl_fp32_compute_fp16 contraction_scale_xdl_fp32_compute_fp16.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp32_compute_fp16 contraction_scale_xdl_fp32_compute_fp16.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp32_compute_fp16
)
# FP64
# FP64
add_example_executable
(
example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp64
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp64
)
add_example_executable
(
example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp64
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp64
)
add_example_executable
(
example_contraction_bilinear_xdl_fp64_compute_fp32 contraction_bilinear_xdl_fp64_compute_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp64_compute_fp32 contraction_bilinear_xdl_fp64_compute_fp32.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp64_compute_fp32
)
add_example_executable
(
example_contraction_scale_xdl_fp64_compute_fp32 contraction_scale_xdl_fp64_compute_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp64_compute_fp32 contraction_scale_xdl_fp64_compute_fp32.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp64_compute_fp32
)
# FP16
# FP16
add_example_executable
(
example_contraction_bilinear_xdl_fp16_compute_fp32 contraction_bilinear_xdl_fp16_compute_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp16_compute_fp32 contraction_bilinear_xdl_fp16_compute_fp32.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_fp16_compute_fp32
)
add_example_executable
(
example_contraction_scale_xdl_fp16_compute_fp32 contraction_scale_xdl_fp16_compute_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp16_compute_fp32 contraction_scale_xdl_fp16_compute_fp32.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_fp16_compute_fp32
)
# BF16
# BF16
add_example_executable
(
example_contraction_bilinear_xdl_bf16_compute_fp32 contraction_bilinear_xdl_bf16_compute_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_bf16_compute_fp32 contraction_bilinear_xdl_bf16_compute_fp32.cpp
)
add_dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32
)
add_
example_
dependencies
(
example_contraction_bilinear example_contraction_bilinear_xdl_bf16_compute_fp32
)
add_example_executable
(
example_contraction_scale_xdl_bf16_compute_fp32 contraction_scale_xdl_bf16_compute_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_bf16_compute_fp32 contraction_scale_xdl_bf16_compute_fp32.cpp
)
add_dependencies
(
example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32
)
add_
example_
dependencies
(
example_contraction_scale example_contraction_scale_xdl_bf16_compute_fp32
)
add_dependencies
(
example_contraction example_contraction_scale
)
add_
example_
dependencies
(
example_contraction example_contraction_scale
)
add_dependencies
(
example_contraction example_contraction_bilinear
)
add_
example_
dependencies
(
example_contraction example_contraction_bilinear
)
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
View file @
5a9c4962
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx11"
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
endif
()
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942
)
add_custom_target
(
example_grouped_conv_fwd_multiple_d
)
list
(
APPEND gpu_list2 gfx1100 gfx1101 gfx1102
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16
)
set
(
target 0
)
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
add_custom_target
(
example_grouped_conv_fwd_multiple_d
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp
16
grouped_conv_fwd_bias_relu_add_xdl_fp
16
.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_fp
32
grouped_conv_fwd_bias_relu_add_xdl_fp
32
.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp
16
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp
32
)
add_example_executable
(
example_grouped_conv_fwd_xdl_f
p
16 grouped_conv_fwd_xdl_f
p
16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_
bias_relu_add_
xdl_
b
f16 grouped_conv_fwd_
bias_relu_add_
xdl_
b
f16.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_f
p
16
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_
bias_relu_add_
xdl_
b
f16
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_
fp32
grouped_conv_fwd_bias_relu_add_xdl_
fp32
.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_
int8
grouped_conv_fwd_bias_relu_add_xdl_
int8
.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_
fp32
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_
int8
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp
)
add_example_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
set
(
target 1
)
endif
()
endforeach
()
set
(
target 0
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
gpu IN_LIST gpu_list2 AND target EQUAL 0
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp
)
set
(
target 1
)
endif
()
endforeach
()
example/31_batched_gemm_gemm/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp
)
set
(
target 0
)
add_example_executable
(
example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
if
(
USE_BITINT_EXTENSION_INT4
)
if
(
gpu IN_LIST gpu_list1 AND target EQUAL 0
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp
)
endif
(
USE_BITINT_EXTENSION_INT4
)
set
(
target 1
)
endif
()
endforeach
()
if
(
NOT GPU_TARGETS MATCHES
"gfx94"
AND NOT GPU_TARGETS MATCHES
"gfx1"
)
if
(
NOT GPU_TARGETS MATCHES
"gfx94"
AND NOT GPU_TARGETS MATCHES
"gfx1"
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp
)
add_example_executable
(
example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp
)
...
...
example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt
View file @
5a9c4962
if
(
GPU_TARGETS MATCHES
"gfx11"
)
add_example_executable
(
example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
)
add_example_executable
(
example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp
)
add_example_executable
(
example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp
)
add_example_executable
(
example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp
)
add_example_executable
(
example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_grouped_query_attention_forward_wmma_fp16 grouped_query_attention_forward_wmma_fp16.cpp
)
add_example_executable
(
example_grouped_query_attention_forward_wmma_fp16 grouped_query_attention_forward_wmma_fp16.cpp
)
endif
()
add_custom_target
(
example_gemm_scale_softmax_gemm
)
add_custom_target
(
example_gemm_scale_softmax_gemm
)
...
...
example/34_batchnorm/batchnorm_infer_impl.hpp
View file @
5a9c4962
// SPDX-License-Identifier: MIT
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-202
3
, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-202
4
, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
...
@@ -10,7 +10,7 @@
...
@@ -10,7 +10,7 @@
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims_
impl.hpp"
#include "batchnorm_common.hpp"
#include "batchnorm_common.hpp"
...
@@ -54,7 +54,12 @@ int bnorm_infer(
...
@@ -54,7 +54,12 @@ int bnorm_infer(
ck
::
Tuple
<
YDataType
>
,
// y
ck
::
Tuple
<
YDataType
>
,
// y
NormalizeInInfer
,
NormalizeInInfer
,
Rank
,
Rank
,
2
,
// MPerthread
64
,
// BlockSize
32
,
// MPerBlock
32
,
// NPerBlock
4
,
// MPerthread
4
,
// NPerthread
ck
::
Sequence
<
1
,
0
>
,
// ThreadClusterArrangeOrder
ck
::
Sequence
<
1
,
1
,
1
,
1
,
1
>
,
// x, mean, variance, scale, bias
ck
::
Sequence
<
1
,
1
,
1
,
1
,
1
>
,
// x, mean, variance, scale, bias
ck
::
Sequence
<
1
>>
;
// scalarPerVector: y
ck
::
Sequence
<
1
>>
;
// scalarPerVector: y
...
...
example/35_splitK_gemm/CMakeLists.txt
View file @
5a9c4962
list
(
APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942
)
add_custom_target
(
example_splitK_gemm_xdl
)
set
(
target 0
)
add_example_executable
(
example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp
)
foreach
(
gpu IN LISTS GPU_TARGETS
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32
)
if
(
gpu IN_LIST gpu_list AND target EQUAL 0
)
add_custom_target
(
example_splitK_gemm_xdl
)
add_example_executable
(
example_splitK_gemm_xdl_fp
32
splitK_gemm_xdl_fp
32
.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_fp
16
splitK_gemm_xdl_fp
16
.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp
32
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp
16
)
add_example_executable
(
example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_fp16
_fp8
splitK_gemm_xdl_fp16
_fp8
.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16
_fp8
)
add_example_executable
(
example_splitK_gemm_xdl_
fp16_fp8 splitK_gemm_xdl_fp16_fp8
.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_
lds_direct_load_fp16 splitK_gemm_xdl_lds_direct_load_fp16
.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16
_fp8
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_
lds_direct_load_
fp16
)
add_example_executable
(
example_splitK_gemm_xdl_
lds_direct_load_fp
16 splitK_gemm_xdl_
lds_direct_load_fp
16.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_
bf
16 splitK_gemm_xdl_
bf
16.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_
lds_direct_load_fp
16
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_
bf
16
)
add_example_executable
(
example_splitK_gemm_xdl_
bf16
splitK_gemm_xdl_
bf16
.cpp
)
add_example_executable
(
example_splitK_gemm_xdl_
int8
splitK_gemm_xdl_
int8
.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_
bf16
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_
int8
)
add_example_executable
(
example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp
)
if
(
USE_BITINT_EXTENSION_INT4
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_int8
)
add_example_executable
(
example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_int4
)
if
(
USE_BITINT_EXTENSION_INT4
)
endif
()
add_example_executable
(
example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp
)
add_example_dependencies
(
example_splitK_gemm_xdl example_splitK_gemm_xdl_int4
)
endif
()
set
(
target 1
)
endif
()
endforeach
()
Prev
1
2
3
4
5
6
7
8
…
36
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment