Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
f76c0072
Commit
f76c0072
authored
Mar 22, 2024
by
Jakub Piasecki
Browse files
Merge remote-tracking branch 'origin/develop' into jakpiase/ggemm_multid_two_stage
parents
f27c50a7
9c052804
Changes
41
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
495 additions
and
121 deletions
+495
-121
library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
...ance/gpu/permute_scale/device_permute_scale_instances.hpp
+164
-15
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
..._operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+5
-0
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp
...uped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp
+53
-0
library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
...ensor_operation_instance/gpu/permute_scale/CMakeLists.txt
+12
-6
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_1d_fp16_instances.cpp
.../permute_scale/device_permute_scale_1d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_1d_fp32_instances.cpp
.../permute_scale/device_permute_scale_1d_fp32_instances.cpp
+24
-0
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_2d_fp16_instances.cpp
.../permute_scale/device_permute_scale_2d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_2d_fp32_instances.cpp
.../permute_scale/device_permute_scale_2d_fp32_instances.cpp
+24
-0
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_3d_fp16_instances.cpp
.../permute_scale/device_permute_scale_3d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_3d_fp32_instances.cpp
.../permute_scale/device_permute_scale_3d_fp32_instances.cpp
+24
-0
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_4d_fp16_instances.cpp
.../permute_scale/device_permute_scale_4d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_4d_fp32_instances.cpp
.../permute_scale/device_permute_scale_4d_fp32_instances.cpp
+24
-0
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_5d_fp16_instances.cpp
.../permute_scale/device_permute_scale_5d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_5d_fp32_instances.cpp
.../permute_scale/device_permute_scale_5d_fp32_instances.cpp
+24
-0
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp16_instances.cpp
.../permute_scale/device_permute_scale_6d_fp16_instances.cpp
+5
-10
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_instances.cpp
.../permute_scale/device_permute_scale_6d_fp32_instances.cpp
+24
-0
profiler/include/profiler/profile_permute_scale_impl.hpp
profiler/include/profiler/profile_permute_scale_impl.hpp
+12
-34
profiler/src/profile_grouped_conv_fwd.cpp
profiler/src/profile_grouped_conv_fwd.cpp
+8
-1
profiler/src/profile_permute_scale.cpp
profiler/src/profile_permute_scale.cpp
+24
-5
script/profile_permute_scale.sh
script/profile_permute_scale.sh
+43
-0
No files found.
library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
View file @
f76c0072
This diff is collapsed.
Click to expand it.
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
View file @
f76c0072
...
@@ -35,4 +35,9 @@ if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES)
...
@@ -35,4 +35,9 @@ if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES)
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_instance.cpp
)
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_instance.cpp
)
endif
()
endif
()
if
(
DTYPES MATCHES
"bf8"
OR NOT DEFINED DTYPES
)
list
(
APPEND GROUPED_CONV3D_FWD
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp
)
endif
()
add_instance_library
(
device_grouped_conv3d_fwd_instance
${
GROUPED_CONV3D_FWD
}
)
add_instance_library
(
device_grouped_conv3d_fwd_instance
${
GROUPED_CONV3D_FWD
}
)
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// Fills `instances` for the 3-D grouped forward convolution with NDHWGC input,
// GKZYXC weight and NDHWGK output layouts, using the BF8/BF8/F8 type combination
// spelled out in the parameter type.
//
// Three default-constructed `device_grouped_conv_fwd_xdl_bf8_instances<...>` objects,
// differing only in their last template argument (ConvFwdDefault, ConvFwd1x1P0,
// ConvFwd1x1S1P0), are handed to add_device_operation_instances in that order.
// NOTE(review): add_device_operation_instances is declared in
// add_device_operation_instance.hpp; presumably it appends one operation per listed
// type -- confirm there.
void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                BF8,
                                                                BF8,
                                                                Empty_Tuple,
                                                                F8,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough,
                                                                BF8>>>& instances)
{
    // One alias per forward-convolution specialization; everything else is shared.
    using DefaultInstances = device_grouped_conv_fwd_xdl_bf8_instances<3,
                                                                       NDHWGC,
                                                                       GKZYXC,
                                                                       Empty_Tuple,
                                                                       NDHWGK,
                                                                       ConvFwdDefault>;
    using Filter1x1Pad0Instances = device_grouped_conv_fwd_xdl_bf8_instances<3,
                                                                             NDHWGC,
                                                                             GKZYXC,
                                                                             Empty_Tuple,
                                                                             NDHWGK,
                                                                             ConvFwd1x1P0>;
    using Filter1x1Stride1Pad0Instances =
        device_grouped_conv_fwd_xdl_bf8_instances<3,
                                                  NDHWGC,
                                                  GKZYXC,
                                                  Empty_Tuple,
                                                  NDHWGK,
                                                  ConvFwd1x1S1P0>;

    add_device_operation_instances(instances, DefaultInstances{});
    add_device_operation_instances(instances, Filter1x1Pad0Instances{});
    add_device_operation_instances(instances, Filter1x1Stride1Pad0Instances{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
View file @
f76c0072
add_instance_library
(
device_permute_scale_instance
add_instance_library
(
device_permute_scale_instance
device_permute_scale_1d_instances.cpp
device_permute_scale_1d_fp16_instances.cpp
device_permute_scale_2d_instances.cpp
device_permute_scale_2d_fp16_instances.cpp
device_permute_scale_3d_instances.cpp
device_permute_scale_3d_fp16_instances.cpp
device_permute_scale_4d_instances.cpp
device_permute_scale_4d_fp16_instances.cpp
device_permute_scale_5d_instances.cpp
device_permute_scale_5d_fp16_instances.cpp
device_permute_scale_6d_instances.cpp
)
device_permute_scale_6d_fp16_instances.cpp
device_permute_scale_1d_fp32_instances.cpp
device_permute_scale_2d_fp32_instances.cpp
device_permute_scale_3d_fp32_instances.cpp
device_permute_scale_4d_fp32_instances.cpp
device_permute_scale_5d_fp32_instances.cpp
device_permute_scale_6d_fp32_instances.cpp
)
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_1d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_1d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_1d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
1
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
1
>
{});
}
void
add_device_permute_scale_1d_f
32
_instances
(
void
add_device_permute_scale_1d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
1
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
1
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
1
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
1
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_1d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_1d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
1
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
1
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_2d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_2d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_2d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
2
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
2
>
{});
}
void
add_device_permute_scale_2d_f
32
_instances
(
void
add_device_permute_scale_2d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
2
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
2
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
2
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
2
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_2d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_2d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
2
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
2
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_3d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_3d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_3d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
3
>
{});
}
void
add_device_permute_scale_3d_f
32
_instances
(
void
add_device_permute_scale_3d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
3
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
3
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
3
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
3
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_3d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_3d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
3
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
3
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_4d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_4d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_4d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
4
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
4
>
{});
}
void
add_device_permute_scale_4d_f
32
_instances
(
void
add_device_permute_scale_4d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
4
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
4
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
4
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
4
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_4d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_4d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
4
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
4
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_5d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_5d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_5d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
5
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
5
>
{});
}
void
add_device_permute_scale_5d_f
32
_instances
(
void
add_device_permute_scale_5d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
5
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
5
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
5
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
5
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_5d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_5d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
5
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
5
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_instances.cpp
→
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_
fp16_
instances.cpp
View file @
f76c0072
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
...
@@ -9,18 +9,13 @@ namespace tensor_operation {
namespace
device
{
namespace
device
{
namespace
instance
{
namespace
instance
{
void
add_device_permute_scale_6d_f16_instances
(
using
Scale
=
element_wise
::
Scale
;
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Pass
,
UnaryOp
,
Scale
,
6
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f16_instances
<
6
>
{});
}
void
add_device_permute_scale_6d_f
32
_instances
(
void
add_device_permute_scale_6d_f
16
_instances
(
std
::
vector
<
std
::
unique_ptr
<
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F16
>
,
ck
::
Tuple
<
F16
>
,
Scale
,
6
>>>&
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Pass
,
UnaryOp
,
Scale
,
6
>>>&
instances
)
instances
)
{
{
add_device_operation_instances
(
instances
,
device_permute_scale_f
32
_instances
<
6
>
{});
add_device_operation_instances
(
instances
,
device_permute_scale_f
16
_instances
<
6
,
Scale
>
{});
}
}
}
// namespace instance
}
// namespace instance
...
...
library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_instances.cpp
0 → 100644
View file @
f76c0072
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
Scale
=
element_wise
::
Scale
;
void
add_device_permute_scale_6d_f32_instances
(
std
::
vector
<
std
::
unique_ptr
<
DeviceElementwise
<
ck
::
Tuple
<
F32
>
,
ck
::
Tuple
<
F32
>
,
Scale
,
6
>>>&
instances
)
{
add_device_operation_instances
(
instances
,
device_permute_scale_f32_instances
<
6
,
Scale
>
{});
}
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
profiler/include/profiler/profile_permute_scale_impl.hpp
View file @
f76c0072
...
@@ -8,9 +8,9 @@
...
@@ -8,9 +8,9 @@
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise
_scale
.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
scale
_impl.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_
dynamic_vector_dims
_impl.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
...
@@ -21,23 +21,12 @@
...
@@ -21,23 +21,12 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
namespace
ck
{
namespace
ck
{
template
<
typename
HostTensorA
,
template
<
typename
HostTensorA
,
typename
HostTensorB
,
typename
ElementOp
>
typename
HostTensorB
,
typename
AElementOp
,
typename
BElementOp
,
typename
ScaleElementOp
>
void
reference_permute_scale
(
HostTensorB
&
b_tensor
,
void
reference_permute_scale
(
HostTensorB
&
b_tensor
,
const
HostTensorA
&
a_tensor
,
const
HostTensorA
&
a_tensor
,
AElementOp
a_tensor_op
,
ElementOp
tensor_op
)
BElementOp
b_tensor_op
,
ScaleElementOp
scale_op
)
{
{
b_tensor
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
b_tensor
.
ForEach
([
&
](
auto
&
self
,
auto
idx
)
{
tensor_op
(
self
(
idx
),
a_tensor
(
idx
));
});
auto
tmp_val
=
a_tensor
(
idx
);
b_tensor_op
(
tmp_val
,
tmp_val
);
scale_op
(
tmp_val
,
tmp_val
);
a_tensor_op
(
self
(
idx
),
tmp_val
);
});
}
}
namespace
profiler
{
namespace
profiler
{
...
@@ -54,9 +43,7 @@ bool profile_permute_scale_impl(int do_verification,
...
@@ -54,9 +43,7 @@ bool profile_permute_scale_impl(int do_verification,
bool
pass
=
true
;
bool
pass
=
true
;
bool
instance_found
=
false
;
bool
instance_found
=
false
;
using
ElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ElementOp
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
using
UnaryOp
=
ck
::
tensor_operation
::
element_wise
::
UnarySquare
;
using
Scale
=
ck
::
tensor_operation
::
element_wise
::
Scale
;
float
scale
=
2.
f
;
float
scale
=
2.
f
;
Tensor
<
ADataType
>
a
(
lengths_vector
,
input_strides_vector
);
Tensor
<
ADataType
>
a
(
lengths_vector
,
input_strides_vector
);
...
@@ -80,12 +67,8 @@ bool profile_permute_scale_impl(int do_verification,
...
@@ -80,12 +67,8 @@ bool profile_permute_scale_impl(int do_verification,
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
const
void
*
,
1
>
input
=
{
a_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
std
::
array
<
void
*
,
1
>
output
=
{
b_device_buf
.
GetDeviceBuffer
()};
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
ck
::
Tuple
<
BDataType
>
,
DeviceElementwise
<
ck
::
Tuple
<
ADataType
>
,
ck
::
Tuple
<
BDataType
>
,
ElementOp
,
NumDim
>
;
ElementOp
,
UnaryOp
,
Scale
,
NumDim
>
;
// get device op instances
// get device op instances
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
const
auto
op_ptrs
=
ck
::
tensor_operation
::
device
::
instance
::
DeviceOperationInstanceFactory
<
...
@@ -100,7 +83,7 @@ bool profile_permute_scale_impl(int do_verification,
...
@@ -100,7 +83,7 @@ bool profile_permute_scale_impl(int do_verification,
if
(
do_verification
)
if
(
do_verification
)
{
{
reference_permute_scale
(
host_b
,
a
,
ElementOp
{
},
UnaryOp
{},
Scale
{
scale
});
reference_permute_scale
(
host_b
,
a
,
ElementOp
{
scale
});
}
}
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
std
::
copy
(
x
.
begin
(),
x
.
end
(),
y
.
begin
());
};
...
@@ -113,14 +96,8 @@ bool profile_permute_scale_impl(int do_verification,
...
@@ -113,14 +96,8 @@ bool profile_permute_scale_impl(int do_verification,
for
(
auto
&
op_ptr
:
op_ptrs
)
for
(
auto
&
op_ptr
:
op_ptrs
)
{
{
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
lengths
,
auto
argument_ptr
=
op_ptr
->
MakeArgumentPointer
(
{
input_strides
},
lengths
,
{
input_strides
},
{
output_strides
},
input
,
output
,
ElementOp
{
scale
});
{
output_strides
},
input
,
output
,
ElementOp
{},
UnaryOp
{},
Scale
{
scale
});
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
auto
invoker_ptr
=
op_ptr
->
MakeInvokerPointer
();
...
@@ -141,6 +118,7 @@ bool profile_permute_scale_impl(int do_verification,
...
@@ -141,6 +118,7 @@ bool profile_permute_scale_impl(int do_verification,
if
(
do_log
)
if
(
do_log
)
{
{
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"a : "
,
a
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"host_b: "
,
host_b
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b
.
mData
,
","
)
<<
std
::
endl
;
LogRangeAsType
<
float
>
(
std
::
cout
<<
"b: "
,
b
.
mData
,
","
)
<<
std
::
endl
;
}
}
}
}
...
...
profiler/src/profile_grouped_conv_fwd.cpp
View file @
f76c0072
...
@@ -24,6 +24,7 @@ enum struct ConvDataType
...
@@ -24,6 +24,7 @@ enum struct ConvDataType
BF16_BF16_BF16
,
// 2
BF16_BF16_BF16
,
// 2
INT8_INT8_INT8
,
// 3
INT8_INT8_INT8
,
// 3
F8_F8_F8
,
// 4
F8_F8_F8
,
// 4
BF8_BF8_F8
,
// 5
};
};
#define OP_NAME "grouped_conv_fwd"
#define OP_NAME "grouped_conv_fwd"
...
@@ -38,7 +39,8 @@ static void print_helper_msg()
...
@@ -38,7 +39,8 @@ static void print_helper_msg()
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 1: Input fp16, Weight fp16, Output fp16
\n
"
<<
" 2: Input bf16, Weight bf16, Output bf16
\n
"
<<
" 2: Input bf16, Weight bf16, Output bf16
\n
"
<<
" 3: Input int8, Weight int8, Output int8
\n
"
<<
" 3: Input int8, Weight int8, Output int8
\n
"
<<
" 4: Input fp8, Weight fp8, Output fp8)
\n
"
<<
" 4: Input fp8, Weight fp8, Output fp8
\n
"
<<
" 5: Input bf8, Weight bf8, Output fp8)
\n
"
<<
"arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
\n
"
<<
"arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]
\n
"
<<
" 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])
\n
"
<<
" 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])
\n
"
<<
"arg4: verification (0: no, 1: yes)
\n
"
<<
"arg4: verification (0: no, 1: yes)
\n
"
...
@@ -82,6 +84,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
...
@@ -82,6 +84,7 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
using
BF16
=
ck
::
bhalf_t
;
using
BF16
=
ck
::
bhalf_t
;
using
INT8
=
int8_t
;
using
INT8
=
int8_t
;
using
F8
=
ck
::
f8_t
;
using
F8
=
ck
::
f8_t
;
using
BF8
=
ck
::
bf8_t
;
//
//
using
GNWC
=
ck
::
tensor_layout
::
convolution
::
GNWC
;
using
GNWC
=
ck
::
tensor_layout
::
convolution
::
GNWC
;
...
@@ -257,6 +260,10 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
...
@@ -257,6 +260,10 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
F8
{},
F8
{});
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
F8
{},
F8
{},
F8
{});
}
}
else
if
(
data_type
==
ConvDataType
::
BF8_BF8_F8
)
{
return
profile
(
I3
,
NDHWGC
{},
GKZYXC
{},
NDHWGK
{},
BF8
{},
BF8
{},
F8
{});
}
}
}
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
std
::
cout
<<
"this data_type & layout is not implemented"
<<
std
::
endl
;
...
...
profiler/src/profile_permute_scale.cpp
View file @
f76c0072
...
@@ -37,6 +37,20 @@ static void print_helper_msg()
...
@@ -37,6 +37,20 @@ static void print_helper_msg()
// clang-format on
// clang-format on
}
}
// Computes packed strides for a tensor whose dimensions are stored in the order
// listed by `dims_order`.
//
// `dims_order` is walked from its last entry to its first: the dimension named by the
// last entry gets stride 1, and each earlier entry gets the product of the lengths of
// all dimensions named after it.
//
// lengths     Extent of each dimension; its size fixes the number of entries visited.
// dims_order  Dimension indices. Expected to be a permutation of
//             [0, lengths.size()); duplicate entries are NOT detected here.
// strides     Output. Must already hold one slot per dimension; only the slots named
//             by `dims_order` are written.
//
// Throws std::out_of_range (from std::vector::at) when `dims_order` has fewer entries
// than `lengths`, or when an entry is negative or otherwise not a valid index into
// `lengths` / `strides`.
void init_strides(const std::vector<ck::index_t>& lengths,
                  const std::vector<ck::index_t>& dims_order,
                  std::vector<ck::index_t>& strides)
{
    ck::index_t stride = 1;

    // Unsigned countdown: avoids narrowing size() into a signed index and still
    // terminates cleanly when `lengths` is empty.
    for(auto d = lengths.size(); d-- > 0;)
    {
        // The order is parsed from the command line by the caller, so use checked
        // access instead of operator[]; a negative value wraps to a huge index and
        // is rejected by at() as well.
        const auto dim = static_cast<std::vector<ck::index_t>::size_type>(dims_order.at(d));

        strides.at(dim) = stride;
        stride *= lengths.at(dim);
    }
}
}
// namespace
}
// namespace
int
profile_permute_scale
(
int
argc
,
char
*
argv
[])
int
profile_permute_scale
(
int
argc
,
char
*
argv
[])
...
@@ -58,16 +72,21 @@ int profile_permute_scale(int argc, char* argv[])
...
@@ -58,16 +72,21 @@ int profile_permute_scale(int argc, char* argv[])
const
int
num_dims
=
dims_argc
/
3
;
const
int
num_dims
=
dims_argc
/
3
;
std
::
vector
<
ck
::
index_t
>
lengths
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
lengths
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
input_
stri
de
s
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
input_
dims_or
de
r
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
output_
stri
de
s
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
output_
dims_or
de
r
(
num_dims
);
for
(
int
i
=
0
;
i
<
num_dims
;
i
++
)
for
(
int
i
=
0
;
i
<
num_dims
;
i
++
)
{
{
lengths
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
i
]);
lengths
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
i
]);
input_
stri
de
s
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
num_dims
+
i
]);
input_
dims_or
de
r
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
num_dims
+
i
]);
output_
stri
de
s
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
2
*
num_dims
+
i
]);
output_
dims_or
de
r
[
i
]
=
std
::
stoi
(
argv
[
control_argc
+
2
*
num_dims
+
i
]);
}
}
std
::
vector
<
ck
::
index_t
>
input_strides
(
num_dims
);
std
::
vector
<
ck
::
index_t
>
output_strides
(
num_dims
);
init_strides
(
lengths
,
input_dims_order
,
input_strides
);
init_strides
(
lengths
,
output_dims_order
,
output_strides
);
using
F32
=
float
;
using
F32
=
float
;
using
F16
=
ck
::
half_t
;
using
F16
=
ck
::
half_t
;
...
...
script/profile_permute_scale.sh
0 → 100755
View file @
f76c0072
#!/bin/bash
# Runs ckProfiler for a fixed list of 1D-6D permute-scale problems.
#
# Usage: profile_permute_scale.sh <op> <datatype> <verify> <init> <log> <time>
# The six arguments are forwarded unchanged, in that order, to every invocation.

## GPU visibility
export HIP_VISIBLE_DEVICES=0

# Path is relative to the directory the script is started from.
DRIVER="../build/bin/ckProfiler"
echo "$DRIVER"

# Without all six control arguments the per-case shape/order numbers below would be
# shifted into the wrong positions, so stop early instead.
if [ "$#" -lt 6 ]; then
    echo "usage: $0 <op> <datatype> <verify> <init> <log> <time>" >&2
    exit 1
fi

OP=$1
DATATYPE=$2
VERIFY=$3
INIT=$4
LOG=$5
TIME=$6

# Invokes the profiler with the shared control arguments followed by the
# per-case dims and ordering arguments passed to this function.
run() {
    "$DRIVER" "$OP" "$DATATYPE" "$VERIFY" "$INIT" "$LOG" "$TIME" "$@"
}

# 1D
######## dims  in_strides_order  out_strides_order
run 67108864 0 0

# 2D
######## dims  in_strides_order  out_strides_order
run 8192 8192 0 1 1 0
run 8192 8192 1 0 0 1

# 3D
######## dims  in_strides_order  out_strides_order
run 8 1024 8192 0 1 2 2 1 0
run 8 1024 8192 2 1 0 0 1 2

# 4D
######## dims  in_strides_order  out_strides_order
run 8 2 512 8192 0 1 2 3 3 2 1 0
run 8 2 512 8192 3 2 1 0 0 1 2 3

# 5D
######## dims  in_strides_order  out_strides_order
run 8 2 2 256 8192 0 1 2 3 4 4 3 2 1 0
run 8 2 2 256 8192 4 3 2 1 0 0 1 2 3 4

# 6D
######## dims  in_strides_order  out_strides_order
run 8 2 2 2 128 8192 0 1 2 3 4 5 5 4 3 2 1 0
run 8 2 2 2 128 8192 5 4 3 2 1 0 0 1 2 3 4 5
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment