gaoqiong / composable_kernel_ROCM · Commits

Commit 8f41bd8e, authored Apr 11, 2024 by Jun Liu

Merge branch 'develop' into amd-develop

Parents: 7f65ac05, d7f05fb9
Changes: 144 in total; showing 20 changed files with 1021 additions and 149 deletions (+1021 / −149).
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp (+58 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp (+58 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp (+58 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp (+58 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp (+58 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp (+57 −0)
library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt (+44 −38)
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt (+2 −0)
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_fp8_instance.cpp (+54 −0)
profiler/README.md (+13 −11)
profiler/include/profiler/profile_contraction_impl.hpp (+39 −41)
profiler/include/profiler/profile_contraction_utils.hpp (+25 −3)
profiler/src/profile_contraction_bilinear.cpp (+98 −56)
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance =
    device_contraction_mn_instance<F32, F32, F32, F32, Empty_Tuple, F32, F16,
                                   PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F32, F32, Empty_Tuple, F32,
                                                           PassThrough, PassThrough, Scale, F16>>>&
        instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
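For context, a registration function like the one above is consumed by building a list of type-erased device ops and querying each entry. The sketch below is illustrative only: `main`, the `ContractionScaleOp` alias, and the use of `GetTypeString()` (from CK's common base-operator interface) are assumptions layered on top of the diff, not code from this commit.

```cpp
// Minimal consumer sketch (assumed usage, not part of this commit): collect the
// instances registered by the new 6D f32/compute-f16 "mnn" file and list them.
#include <iostream>
#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using F32         = float;
using F16         = ck::half_t;
using Empty_Tuple = ck::Tuple<>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

// Type-erased base matching the template arguments used in the instance file above.
using ContractionScaleOp = ck::tensor_operation::device::DeviceContractionMultipleD<
    6, 6, 6, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, F16>;

namespace ck::tensor_operation::device::instance {
// Defined in the .cpp shown above; declared here for the sketch.
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
    std::vector<std::unique_ptr<ContractionScaleOp>>& instances);
} // namespace ck::tensor_operation::device::instance

int main()
{
    std::vector<std::unique_ptr<ContractionScaleOp>> ops;
    ck::tensor_operation::device::instance::
        add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
            ops);

    // Each entry is one tuned kernel configuration; GetTypeString() identifies it.
    for(const auto& op : ops)
        std::cout << op->GetTypeString() << '\n';
}
```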
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance =
    device_contraction_kk_instance<F32, F32, F32, F32, Empty_Tuple, F32, F32,
                                   PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F32, F32, Empty_Tuple, F32,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance =
    device_contraction_kn_instance<F32, F32, F32, F32, Empty_Tuple, F32, F32,
                                   PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F32, F32, Empty_Tuple, F32,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance =
    device_contraction_mk_instance<F32, F32, F32, F32, Empty_Tuple, F32, F32,
                                   PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F32, F32, Empty_Tuple, F32,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance =
    device_contraction_mn_instance<F32, F32, F32, F32, Empty_Tuple, F32, F32,
                                   PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F32, F32, Empty_Tuple, F32,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance =
    device_contraction_f64_kk_instance<F64, F64, F32, F64, Empty_Tuple, F64, F32,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance =
    device_contraction_f64_kn_instance<F64, F64, F32, F64, Empty_Tuple, F64, F32,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance =
    device_contraction_f64_mk_instance<F64, F64, F32, F64, Empty_Tuple, F64, F32,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance =
    device_contraction_f64_mn_instance<F64, F64, F32, F64, Empty_Tuple, F64, F32,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F32>>>&
        instances)
{
    add_device_operation_instances(
        instances,
        device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance =
    device_contraction_f64_kk_instance<F64, F64, F64, F64, Empty_Tuple, F64, F64,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F64>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance =
    device_contraction_f64_kn_instance<F64, F64, F64, F64, Empty_Tuple, F64, F64,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F64>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance =
    device_contraction_f64_mk_instance<F64, F64, F64, F64, Empty_Tuple, F64, F64,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F64>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
library/src/tensor_operation_instance/gpu/contraction_scale/6D/device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
// setting Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance =
    device_contraction_f64_mn_instance<F64, F64, F64, F64, Empty_Tuple, F64, F64,
                                       PassThrough, PassThrough, Scale, 6>;

void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<DeviceContractionMultipleD<6, 6, 6, F64, F64, Empty_Tuple, F64,
                                                           PassThrough, PassThrough, Scale, F64>>>&
        instances)
{
    add_device_operation_instances(
        instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
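The four layout suffixes (kkn/knn/mkn/mnn) register into the same type-erased base, since DeviceContractionMultipleD carries no layout parameter; a caller picks the adder that matches its strides. The sketch below only re-declares the four f64 adders from the files above; the `Layout` enum and `instances_for` helper are hypothetical glue, not part of this commit.

```cpp
#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using F64         = double;
using Empty_Tuple = ck::Tuple<>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Scale       = ck::tensor_operation::element_wise::Scale;

using Op = ck::tensor_operation::device::DeviceContractionMultipleD<
    6, 6, 6, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, F64>;

namespace ck::tensor_operation::device::instance {
// Defined in the four f64 instance files shown above.
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance(
    std::vector<std::unique_ptr<Op>>&);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance(
    std::vector<std::unique_ptr<Op>>&);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance(
    std::vector<std::unique_ptr<Op>>&);
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance(
    std::vector<std::unique_ptr<Op>>&);
} // namespace ck::tensor_operation::device::instance

enum class Layout { kkn, knn, mkn, mnn }; // hypothetical selector

std::vector<std::unique_ptr<Op>> instances_for(Layout layout)
{
    using namespace ck::tensor_operation::device::instance;
    std::vector<std::unique_ptr<Op>> ops;
    switch(layout)
    {
    case Layout::kkn:
        add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance(ops); break;
    case Layout::knn:
        add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance(ops); break;
    case Layout::mkn:
        add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance(ops); break;
    case Layout::mnn:
        add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance(ops); break;
    }
    return ops;
}
```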
library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt

The explicit m2_n2_k2 source lists are replaced by a foreach over DIMS (2 and 6), so each dtype group is registered once per rank via a shared PREFIX:

```cmake
# ONLY XDL_KERNELS
set(DEVICE_CONTRACTION_SCALE_INSTANCES)

list(APPEND DIMS 2 6)

foreach(idx IN LISTS DIMS)
    set(PREFIX ${idx}D/device_contraction_scale_m${idx}_n${idx}_k${idx})

    # FP32
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp)

    # FP64
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp)

    # FP16
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp)

    # BF16
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES
         ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp
         ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp)
endforeach()

add_instance_library(device_contraction_scale_instance ${DEVICE_CONTRACTION_SCALE_INSTANCES})
```
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt

Hunk @@ -44,6 +44,8 @@ endif(): inside the fp8/bf8 dtype guard, the new bf8_fp8 source file is appended alongside the existing fp8_bf8 one:

```cmake
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
    list(APPEND GROUPED_CONV3D_FWD
         xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_bf8_instance.cpp)
    list(APPEND GROUPED_CONV3D_FWD
         xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_fp8_instance.cpp)
endif()

add_instance_library(device_grouped_conv3d_fwd_instance ${GROUPED_CONV3D_FWD})
```
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_fp8_instance.cpp (new file, mode 100644)

```cpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NDHWGC,
                                                                GKZYXC,
                                                                Empty_Tuple,
                                                                NDHWGK,
                                                                BF8,
                                                                F8,
                                                                Empty_Tuple,
                                                                F8,
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough,
                                                                BF8,
                                                                F8>>>& instances)
{
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_bf8_f8_instances<3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,
                                                     ConvFwdDefault>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_bf8_f8_instances<3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,
                                                     ConvFwd1x1P0>{});
    add_device_operation_instances(
        instances,
        device_grouped_conv_fwd_xdl_bf8_f8_instances<3, NDHWGC, GKZYXC, Empty_Tuple, NDHWGK,
                                                     ConvFwd1x1S1P0>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
```
profiler/README.md

Hunk @@ -52,21 +52,23 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s: the contraction help text gains a new arg4 (number of dimensions for M, N and K), which shifts every later argument down by one, widens the dimension/stride argument ranges for the 6D case, and adds the new value to the example invocation:

```
#arg1: tensor operation (contraction_bilinear=CONTRACTION+Bilinear)
#arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg4: Number of dimension for M, N and K (one for all)
#arg5: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1])
#arg6: verification (0: no; 1: yes)
#arg7: initialization (0: no init; 1: integer value; 2: decimal value)
#arg8: print tensor value (0: no; 1: yes)
#arg9: time kernel (0: no, 1: yes)
#arg10: alpha
#arg11: beta
#arg12 to 17/29: M0, M1, N0, N1, K0, K1
#arg18/30 to 33/77: Strides for A, B, D and E (skip for default)
################ op datatype compute_datatype num_dim layout verify init log time alpha beta M0 M1 N0 N1 K0 K1
./bin/ckProfiler contraction_bilinear 0 0 2 1 0 0 0 1 1.0 1.0 128 128 128 128 128 128
```

Result (MI100)
...
profiler/include/profiler/profile_contraction_impl.hpp

The copyright notice is extended to 2023-2024, and the implementation is generalized from hard-coded 2D index handling (M0, M1 / N0, N1 / K0, K1) to an arbitrary per-category rank NumDimMNK.

Hunk @@ -22,6 +22,7 @@ adds one utility include after the reference-contraction header:

```cpp
#include "ck/library/utility/numeric.hpp"
```

Hunk @@ -34,7 +35,8 @@ makes the rank a template parameter of the profiling entry point:

```cpp
using F32 = float;
using F64 = double;

template <index_t NumDimMNK,
          typename ALayout,
          typename BLayout,
          typename CDELayout,
          typename DataType,
          // ...
```

Hunk @@ -104,18 +106,24 @@: the four-element length vectors (previously written out as, e.g., `{M[0], M[1], K[0], K[1]}`) are now built by concatenating the user-supplied dimension vectors, and the device op uses NumDimMNK instead of a local `constexpr ck::index_t NumDim = 2;`:

```cpp
    e_device_buf.SetZero();
    d_device_buf.ToDevice(d_m_n.mData.data());

    auto merge_dims = [](const std::vector<ck::index_t>& dims01,
                         const std::vector<ck::index_t>& dims23) {
        std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
        dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
        return dims_szt;
    };

    const std::vector<index_t> a_ms_ks_lengths = merge_dims(M, K);
    const std::vector<index_t> b_ns_ks_lengths = merge_dims(N, K);
    const std::vector<index_t> e_ms_ns_lengths = merge_dims(M, N);
    const std::vector<index_t> d_m_n_lengths   = merge_dims(M, N);

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};

    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<NumDimMNK,
                                                                              NumDimMNK,
                                                                              NumDimMNK,
                                                                              DataType,
                                                                              DataType,
                                                                              DTupleDataType,
                                                                              // ...
```

Hunk @@ -138,9 +146,9 @@: the reference contraction takes the same rank:

```cpp
    if(do_verification)
    {
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimMNK,
                                                                      NumDimMNK,
                                                                      NumDimMNK,
                                                                      DataType,
                                                                      DataType,
                                                                      DataType,
                                                                      // ...
```

Hunk @@ -159,33 +167,20 @@: the four nested m0/m1/n0/n1 verification loops are replaced by a rank-agnostic ForEach over the host result tensor:

```cpp
        ref_invoker.Run(ref_argument);

        e_m_n_host_result.ForEach([&](auto& self, auto idx) {
            if constexpr(is_same<CDElementOp, Bilinear>::value)
            {
                cde_element_op(self(idx), c_m_n_host_result(idx), d_m_n(idx));
            }
            else if constexpr(is_same<CDElementOp, Scale>::value)
            {
                cde_element_op(self(idx), c_m_n_host_result(idx));
            }
            else
            {
                static_assert("Unsupported CDElementOp in contraction profiler.");
            }
        });
    }

    std::string best_op_name;
```

Hunk @@ -242,9 +237,12 @@: the element counts, previously `M[0] * M[1]` and so on, are folded over the first or second NumDimMNK entries of the length vectors:

```cpp
    auto invoker_ptr = op_ptr->MakeInvokerPointer();

    auto nelems_m = ck::accumulate_n<ck::index_t>(
        a_ms_ks_lengths.begin(), NumDimMNK, 1, std::multiplies<>{});
    auto nelems_n = ck::accumulate_n<ck::index_t>(
        b_ns_ks_lengths.begin(), NumDimMNK, 1, std::multiplies<>{});
    auto nelems_k = ck::accumulate_n<ck::index_t>(
        a_ms_ks_lengths.begin() + NumDimMNK, NumDimMNK, 1, std::multiplies<>{});

    if(op_ptr->IsSupportedArgument(argument_ptr.get()))
    {
    // ...
```
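To make the element-count change concrete, here is a self-contained sketch of what it evaluates to for a 6D problem. The `accumulate_n` stand-in is a local assumption: per the new ck/library/utility/numeric.hpp include, `ck::accumulate_n` is taken to fold the first n elements the way std::accumulate does.

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Local stand-in for ck::accumulate_n (assumed semantics: fold the first n elements).
template <typename T, typename It, typename Op>
T accumulate_n(It first, std::size_t n, T init, Op op)
{
    return std::accumulate(first, first + n, init, op);
}

int main()
{
    // Hypothetical 6D case: a_ms_ks_lengths = {M0..M5, K0..K5}.
    const std::size_t NumDimMNK = 6;
    std::vector<int> a_ms_ks_lengths = {2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 4};

    // Product of the first six lengths (the M dims) and of the next six (the K dims).
    auto nelems_m = accumulate_n<int>(
        a_ms_ks_lengths.begin(), NumDimMNK, 1, std::multiplies<>{});
    auto nelems_k = accumulate_n<int>(
        a_ms_ks_lengths.begin() + NumDimMNK, NumDimMNK, 1, std::multiplies<>{});

    std::cout << nelems_m << ' ' << nelems_k << '\n'; // prints: 64 4096
}
```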
profiler/include/profiler/profile_contraction_utils.hpp

The copyright notice is extended to 2023-2024, the stray example comment "// M1, 1, M0 * M1 * K1, M0 * M1" becomes "// K0, K1, M0, M1", and the default-stride helpers are generalized from hard-coded four-dimensional formulas to loops over however many dimensions are passed in (hunk @@ -48,14 +48,36 @@):

```cpp
// Default strides for row-major: {Dim1 * Dim2 * Dim3, Dim2 * Dim3, Dim3, 1}
// Default strides for column-major: {Dim1, 1, Dim0 * Dim1 * Dim3, Dim0 * Dim1}
// K0, K1, M0, M1
inline void
assign_default_strides(Row, std::vector<ck::index_t>& strides, std::vector<ck::index_t> dims)
{
    ck::index_t stride = 1;
    for(ck::index_t s = strides.size() - 1; s >= 0; s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
}

inline void
assign_default_strides(Col, std::vector<ck::index_t>& strides, std::vector<ck::index_t> dims)
{
    // Assign second half of strides
    ck::index_t stride = 1;
    for(ck::index_t s = strides.size() / 2 - 1; s >= 0; s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
    // Assign first half of strides
    for(ck::index_t s = strides.size() - 1;
        s > static_cast<ck::index_t>(strides.size()) / 2 - 1;
        s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
}
```

The row-major body previously assigned `strides = {dims[1] * dims[2] * dims[3], dims[2] * dims[3], dims[3], 1};` and the column-major body `strides = {dims[1], 1, dims[0] * dims[1] * dims[3], dims[0] * dims[1]};`; the loops reproduce those values for four dimensions while also covering the new twelve-dimensional case.
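To check that claim, here is a small standalone verification (hypothetical dims, not from the commit) that the loops reproduce the old closed-form strides for four dimensions:

```cpp
#include <cassert>
#include <vector>

int main()
{
    // Hypothetical E[m0, m1, n0, n1] dims = {Dim0, Dim1, Dim2, Dim3}.
    std::vector<int> dims = {2, 3, 4, 5};
    std::vector<int> row(4), col(4);

    // Row-major loop from the header: last dimension fastest.
    int stride = 1;
    for(int s = 3; s >= 0; s--) { row[s] = stride; stride *= dims[s]; }
    assert((row == std::vector<int>{60, 20, 5, 1})); // {Dim1*Dim2*Dim3, Dim2*Dim3, Dim3, 1}

    // Column-major loops: fold dims[0..1] first, then continue into dims[2..3].
    stride = 1;
    for(int s = 1; s >= 0; s--) { col[s] = stride; stride *= dims[s]; }
    for(int s = 3; s >= 2; s--) { col[s] = stride; stride *= dims[s]; }
    assert((col == std::vector<int>{3, 1, 30, 6})); // {Dim1, 1, Dim0*Dim1*Dim3, Dim0*Dim1}
}
```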
profiler/src/profile_contraction_bilinear.cpp

The copyright notice is extended to 2023-2024. The helper message inserts the new dimension-count argument and renumbers everything after it (hunks @@ -19,7 +19,8 @@ and @@ -27,23 +28,23 @@):

```cpp
    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
              << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
              << "arg4: Number of dimension for M, N and K (one for all)\n"
              << "arg5: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              << "                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              // ...
              << "                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
              << "arg6: verification (0: no; 1: yes)\n"
              << "arg7: initialization (0: no init; 1: integer value; 2: decimal "
                 "value)\n"
              << "arg8: print tensor value (0: no; 1: yes)\n"
              << "arg9: time kernel (0: no, 1: yes)\n"
              << "arg10: alpha\n"
              << "arg11: beta\n"
              << "arg12 to 17/29: M0, M1, N0, N1, K0, K1\n"
              << "arg18/30 to 33/77: Strides for A, B, D and E (skip for default)\n"
              << std::endl;
}
```

The entry point now accepts both the 2D and 6D argument counts (previously argc had to be 17 without strides or 33 with them): without explicit strides that is 18 arguments for 2D and 30 for 6D, with strides 34 and 78:

```cpp
int profile_contraction_bilinear(int argc, char* argv[])
{
    const bool default_strides = argc == 18 || argc == 30;
    if(argc != 34 && argc != 78 && !default_strides)
    {
        print_helper_msg();
        exit(1);
    // ...
```

Parsing shifts every argument after arg3 down by one and sizes the dimension and stride vectors by the new runtime rank NumDimMNK (hunk @@ -51,32 +52,33 @@; previously dims_arg_num was 11 and each category collected exactly two indices):

```cpp
    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
    const auto compute_data_type  = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
    const ck::index_t NumDimMNK   = std::stoi(argv[4]);
    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[5]));
    const bool do_verification    = std::stoi(argv[6]);
    const ck::index_t init_method = std::stoi(argv[7]);
    const bool do_log             = std::stoi(argv[8]);
    const bool time_kernel        = std::stoi(argv[9]);
    const float alpha             = std::stof(argv[10]);
    const float beta              = std::stof(argv[11]);

    std::vector<ck::index_t> M;
    std::vector<ck::index_t> N;
    std::vector<ck::index_t> K;
    const ck::index_t dims_arg_num = 12;
    collect_index_params(argv, M, dims_arg_num, NumDimMNK);
    collect_index_params(argv, N, dims_arg_num + NumDimMNK, NumDimMNK);
    collect_index_params(argv, K, dims_arg_num + NumDimMNK * 2, NumDimMNK);

    std::vector<ck::index_t> StridesA(NumDimMNK * 2);
    std::vector<ck::index_t> StridesB(NumDimMNK * 2);
    std::vector<ck::index_t> StridesE(NumDimMNK * 2);
    std::vector<ck::index_t> StridesD(NumDimMNK * 2);
    if(!default_strides)
    {
        collect_index_params(argv, StridesA, dims_arg_num + NumDimMNK * 3, NumDimMNK * 2);
        collect_index_params(argv, StridesB, dims_arg_num + NumDimMNK * 5, NumDimMNK * 2);
        collect_index_params(argv, StridesE, dims_arg_num + NumDimMNK * 7, NumDimMNK * 2);
        collect_index_params(argv, StridesD, dims_arg_num + NumDimMNK * 9, NumDimMNK * 2);
    }

    using F16 = ck::half_t;
    // ...
```

Finally (hunk @@ -95,31 +97,71 @@), default-stride assignment reuses the merge_dims helper instead of hard-coded `{M[0], M[1], K[0], K[1]}`-style initializers, and the single profile_contraction_impl call is replaced by an explicit dispatch on the runtime rank inside the profiling lambda:

```cpp
    if(default_strides)
    {
        auto merge_dims = [](const std::vector<ck::index_t>& dims01,
                             const std::vector<ck::index_t>& dims23) {
            std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
            dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
            return dims_szt;
        };

        assign_default_strides(a_layout, StridesA, merge_dims(M, K));
        assign_default_strides(b_layout, StridesB, merge_dims(N, K));
        assign_default_strides(cde_layout, StridesE, merge_dims(M, N));
        assign_default_strides(cde_layout, StridesD, merge_dims(M, N));
    }

    if(NumDimMNK == 2)
    {
        bool pass = ck::profiler::profile_contraction_impl<2, ALayout, BLayout, CDELayout,
                                                           DataType, ComputeDataType,
                                                           ck::Tuple<DataType>, Bilinear>(
            do_verification, init_method, do_log, time_kernel, Bilinear{alpha, beta},
            M, N, K, StridesA, StridesB, StridesE, StridesD);
        return pass;
    }
    else if(NumDimMNK == 6)
    {
        bool pass = ck::profiler::profile_contraction_impl<6, ALayout, BLayout, CDELayout,
                                                           DataType, ComputeDataType,
                                                           ck::Tuple<DataType>, Bilinear>(
            do_verification, init_method, do_log, time_kernel, Bilinear{alpha, beta},
            M, N, K, StridesA, StridesB, StridesE, StridesD);
        return pass;
    }
    else
    {
        throw std::runtime_error("Not supported NumDimMNK");
        return false;
    }
};

auto run_profile_for_datatype = [&](auto type, auto compute_type) {
// ...
```
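The if/else on NumDimMNK exists because the rank is a template parameter of profile_contraction_impl, so every supported runtime value must be enumerated at compile time. A generic sketch of that pattern (hypothetical names, not code from this commit):

```cpp
#include <stdexcept>

// Stand-in for profile_contraction_impl<Rank, ...>: the rank must be known at
// compile time because it fixes the DeviceContractionMultipleD template arguments.
template <int Rank>
bool profile_rank()
{
    return Rank > 0; // placeholder for the real profiling work
}

// Map a runtime rank onto a compile-time parameter by enumerating supported values,
// exactly as profile_contraction_bilinear does for 2 and 6.
bool dispatch_rank(int num_dim_mnk)
{
    switch(num_dim_mnk)
    {
    case 2: return profile_rank<2>();
    case 6: return profile_rank<6>();
    default: throw std::runtime_error("Not supported NumDimMNK");
    }
}
```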