"tests/pipelines/animatediff/__init__.py" did not exist on "6ab2dd18a4d17d90c92409886ac22a02acf25d7d"
Commit 8f41bd8e authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'develop' into amd-develop

parents 7f65ac05 d7f05fb9
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance =
device_contraction_mn_instance<F32,
F32,
F32,
F32,
Empty_Tuple,
F32,
F16,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
Scale,
F16>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
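The comment in the instance file above describes the contraction these device operations implement; for the Scale instances (with an empty D tuple) it reduces to E = alpha * sum_k A * B. The following is a standalone host sketch of that computation, not part of CK: all dimension sizes and the alpha value are arbitrary illustration values, and the naive loop nest is only meant to make the index convention concrete.

// Standalone illustration (not CK code): naive host reference for the Scale
// contraction, written for the simple [m0, m1] x [n0, n1] x [k0, k1] case
// shown in the comment above. Sizes and alpha are arbitrary example values.
#include <cstdio>
#include <vector>

int main()
{
    const int M0 = 2, M1 = 3, N0 = 2, N1 = 2, K0 = 4, K1 = 2;
    const float alpha = 1.5f;

    // A[m0, m1, k0, k1], B[n0, n1, k0, k1], E[m0, m1, n0, n1], stored row-major.
    std::vector<float> A(M0 * M1 * K0 * K1, 1.0f);
    std::vector<float> B(N0 * N1 * K0 * K1, 2.0f);
    std::vector<float> E(M0 * M1 * N0 * N1, 0.0f);

    auto a_at = [&](int m0, int m1, int k0, int k1) {
        return A[((m0 * M1 + m1) * K0 + k0) * K1 + k1];
    };
    auto b_at = [&](int n0, int n1, int k0, int k1) {
        return B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
    };

    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.0f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += a_at(m0, m1, k0, k1) * b_at(n0, n1, k0, k1);
                    // Scale element-wise op: E = alpha * (A contracted with B over K).
                    E[((m0 * M1 + m1) * N0 + n0) * N1 + n1] = alpha * acc;
                }

    // With these fill values every element is alpha * (K0 * K1) * 1 * 2 = 1.5 * 8 * 2 = 24.
    std::printf("E[0] = %f\n", E[0]);
    return 0;
}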
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance =
device_contraction_kk_instance<F32,
F32,
F32,
F32,
Empty_Tuple,
F32,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance =
device_contraction_kn_instance<F32,
F32,
F32,
F32,
Empty_Tuple,
F32,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance =
device_contraction_mk_instance<F32,
F32,
F32,
F32,
Empty_Tuple,
F32,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance =
device_contraction_mn_instance<F32,
F32,
F32,
F32,
Empty_Tuple,
F32,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F32,
F32,
Empty_Tuple,
F32,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f32_f32_f32_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance =
device_contraction_f64_kk_instance<F64,
F64,
F32,
F64,
Empty_Tuple,
F64,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance =
device_contraction_f64_kn_instance<F64,
F64,
F32,
F64,
Empty_Tuple,
F64,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance =
device_contraction_f64_mk_instance<F64,
F64,
F32,
F64,
Empty_Tuple,
F64,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance =
device_contraction_f64_mn_instance<F64,
F64,
F32,
F64,
Empty_Tuple,
F64,
F32,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F32>>>& instances)
{
add_device_operation_instances(
instances,
device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance =
device_contraction_f64_kk_instance<F64,
F64,
F64,
F64,
Empty_Tuple,
F64,
F64,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F64>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_kkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// k/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance =
device_contraction_f64_kn_instance<F64,
F64,
F64,
F64,
Empty_Tuple,
F64,
F64,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F64>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_knn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/k/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance =
device_contraction_f64_mk_instance<F64,
F64,
F64,
F64,
Empty_Tuple,
F64,
F64,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F64>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mkn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
// This (ifndef) is a hack to use customized behavior for buffer load rather than using the default
// setting. Don't use this hack unless absolutely necessary!
// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/contraction/device_contraction_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1]
// m/n/n/n are the fast changing dimension for A/B/D/E
using device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance =
device_contraction_f64_mn_instance<F64,
F64,
F64,
F64,
Empty_Tuple,
F64,
F64,
PassThrough,
PassThrough,
Scale,
6>;
void add_device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance(
std::vector<std::unique_ptr<DeviceContractionMultipleD<6,
6,
6,
F64,
F64,
Empty_Tuple,
F64,
PassThrough,
PassThrough,
Scale,
F64>>>& instances)
{
add_device_operation_instances(
instances, device_contraction_scale_m6_n6_k6_xdl_c_shuffle_f64_f64_f64_mnn_instance{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
# ONLY XDL_KERNELS
set(DEVICE_CONTRACTION_SCALE_INSTANCES)

list(APPEND DIMS 2 6)

foreach(idx IN LISTS DIMS)
    set(PREFIX ${idx}D/device_contraction_scale_m${idx}_n${idx}_k${idx})

    # FP32
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_f16_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f32_f32_f32_compute_bf16_mnn_instance.cpp)

    # FP64
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp)
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f64_f64_f64_compute_f32_mnn_instance.cpp)

    # FP16
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_f16_f16_f16_compute_f32_mnn_instance.cpp)

    # BF16
    list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_kkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_knn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mkn_instance.cpp
                                                   ${PREFIX}_xdl_c_shuffle_bf16_bf16_bf16_compute_f32_mnn_instance.cpp)
endforeach()

add_instance_library(device_contraction_scale_instance ${DEVICE_CONTRACTION_SCALE_INSTANCES})
@@ -44,6 +44,8 @@ endif()
if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8") OR NOT DEFINED DTYPES)
    list(APPEND GROUPED_CONV3D_FWD
         xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_fp8_bf8_instance.cpp)
    list(APPEND GROUPED_CONV3D_FWD
         xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_fp8_instance.cpp)
endif()
add_instance_library(device_grouped_conv3d_fwd_instance ${GROUPED_CONV3D_FWD})
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instances(
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
BF8,
F8,
Empty_Tuple,
F8,
PassThrough,
PassThrough,
PassThrough,
BF8,
F8>>>& instances)
{
add_device_operation_instances(instances,
device_grouped_conv_fwd_xdl_bf8_f8_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwdDefault>{});
add_device_operation_instances(instances,
device_grouped_conv_fwd_xdl_bf8_f8_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwd1x1P0>{});
add_device_operation_instances(instances,
device_grouped_conv_fwd_xdl_bf8_f8_instances<3,
NDHWGC,
GKZYXC,
Empty_Tuple,
NDHWGK,
ConvFwd1x1S1P0>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -52,21 +52,23 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s
#arg1: tensor operation (contraction_bilinear=CONTRACTION+Bilinear)
#arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)
#arg4: number of dimensions for M, N and K (one for all)
#arg5: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     2: A[k0, k1, m0, m1] * B[k0, k1, n0, n1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1];
#                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1])
#arg6: verification (0: no; 1: yes)
#arg7: initialization (0: no init; 1: integer value; 2: decimal value)
#arg8: print tensor value (0: no; 1: yes)
#arg9: time kernel (0: no, 1: yes)
#arg10: alpha
#arg11: beta
#arg12 to 17/29: M0, M1, N0, N1, K0, K1
#arg18/30 to 33/77: Strides for A, B, D and E (skip for default)
################ op                   datatype compute_datatype num_dim layout verify init log time alpha beta  M0  M1  N0  N1  K0  K1
./bin/ckProfiler contraction_bilinear     0            0           2      1      0     0   0    1    1.0  1.0  128 128 128 128 128 128
```
Result (MI100)
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -22,6 +22,7 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_contraction.hpp"
#include "ck/library/utility/numeric.hpp"
#include "ck/host_utility/io.hpp"

@@ -34,7 +35,8 @@ using Scale = ck::tensor_operation::element_wise::Scale;
using F32 = float;
using F64 = double;

template <index_t NumDimMNK,
          typename ALayout,
          typename BLayout,
          typename CDELayout,
          typename DataType,

@@ -104,18 +106,24 @@ int profile_contraction_impl(ck::index_t do_verification,
    e_device_buf.SetZero();
    d_device_buf.ToDevice(d_m_n.mData.data());

    auto merge_dims = [](const std::vector<ck::index_t>& dims01,
                         const std::vector<ck::index_t>& dims23) {
        std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
        dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
        return dims_szt;
    };

    const std::vector<index_t> a_ms_ks_lengths = merge_dims(M, K);
    const std::vector<index_t> b_ns_ks_lengths = merge_dims(N, K);
    const std::vector<index_t> e_ms_ns_lengths = merge_dims(M, N);
    const std::vector<index_t> d_m_n_lengths   = merge_dims(M, N);

    const auto a_element_op = AElementOp{};
    const auto b_element_op = BElementOp{};

    using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD<NumDimMNK,
                                                                              NumDimMNK,
                                                                              NumDimMNK,
                                                                              DataType,
                                                                              DataType,
                                                                              DTupleDataType,

@@ -138,9 +146,9 @@ int profile_contraction_impl(ck::index_t do_verification,
    if(do_verification)
    {
        using ReferenceGemmInstance =
            ck::tensor_operation::host::ReferenceContraction_M2_N2_K2<NumDimMNK,
                                                                      NumDimMNK,
                                                                      NumDimMNK,
                                                                      DataType,
                                                                      DataType,
                                                                      DataType,

@@ -159,33 +167,20 @@ int profile_contraction_impl(ck::index_t do_verification,
        ref_invoker.Run(ref_argument);

        e_m_n_host_result.ForEach([&](auto& self, auto idx) {
            if constexpr(is_same<CDElementOp, Bilinear>::value)
            {
                cde_element_op(self(idx), c_m_n_host_result(idx), d_m_n(idx));
            }
            else if constexpr(is_same<CDElementOp, Scale>::value)
            {
                cde_element_op(self(idx), c_m_n_host_result(idx));
            }
            else
            {
                static_assert("Unsupported CDElementOp in contraction profiler.");
            }
        });
    }

    std::string best_op_name;

@@ -242,9 +237,12 @@ int profile_contraction_impl(ck::index_t do_verification,
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        auto nelems_m = ck::accumulate_n<ck::index_t>(
            a_ms_ks_lengths.begin(), NumDimMNK, 1, std::multiplies<>{});
        auto nelems_n = ck::accumulate_n<ck::index_t>(
            b_ns_ks_lengths.begin(), NumDimMNK, 1, std::multiplies<>{});
        auto nelems_k = ck::accumulate_n<ck::index_t>(
            a_ms_ks_lengths.begin() + NumDimMNK, NumDimMNK, 1, std::multiplies<>{});

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
...
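The hunk above replaces the hard-coded `M[0] * M[1]`-style products with reductions over the first `NumDimMNK` entries of the merged length vectors. The snippet below is a standalone sketch of the same idea using only `std::accumulate` from the standard library rather than the actual `ck::accumulate_n` helper (declared in the newly included `ck/library/utility/numeric.hpp`); the dimension values are arbitrary examples.

// Standalone sketch (not CK code): element counts from merged length vectors,
// mirroring how the updated profiler computes nelems_m/n/k for any NumDimMNK.
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    const int num_dim = 2; // plays the role of NumDimMNK
    // a_ms_ks_lengths = merge_dims(M, K): first num_dim entries are M dims, the rest are K dims.
    std::vector<long> a_ms_ks_lengths = {128, 128, 64, 32};
    std::vector<long> b_ns_ks_lengths = {256, 64, 64, 32};

    auto nelems_m = std::accumulate(
        a_ms_ks_lengths.begin(), a_ms_ks_lengths.begin() + num_dim, 1L, std::multiplies<>{});
    auto nelems_n = std::accumulate(
        b_ns_ks_lengths.begin(), b_ns_ks_lengths.begin() + num_dim, 1L, std::multiplies<>{});
    auto nelems_k = std::accumulate(
        a_ms_ks_lengths.begin() + num_dim, a_ms_ks_lengths.end(), 1L, std::multiplies<>{});

    // Expected: M = 16384, N = 16384, K = 2048 for the example lengths above.
    std::printf("M = %ld, N = %ld, K = %ld\n", nelems_m, nelems_n, nelems_k);
    return 0;
}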
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

@@ -48,14 +48,36 @@ inline void collect_index_params(char* argv[],
// Default strides for row-major: {Dim1 * Dim2 * Dim3, Dim2 * Dim3, Dim3, 1}
// Default strides for column-major: {Dim1, 1, Dim0 * Dim1 * Dim3, Dim0 * Dim1}
inline void
assign_default_strides(Row, std::vector<ck::index_t>& strides, std::vector<ck::index_t> dims)
{
    ck::index_t stride = 1;
    for(ck::index_t s = strides.size() - 1; s >= 0; s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
}

inline void
assign_default_strides(Col, std::vector<ck::index_t>& strides, std::vector<ck::index_t> dims)
{
    // Assign the first half of the strides (the first group of dimensions is fastest varying)
    ck::index_t stride = 1;
    for(ck::index_t s = strides.size() / 2 - 1; s >= 0; s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
    // Assign the second half of the strides
    for(ck::index_t s = strides.size() - 1; s > static_cast<ck::index_t>(strides.size()) / 2 - 1;
        s--)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
}
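The generalized stride assignment above replaces the hard-coded four-element initializers so the same helper also covers the twelve-element (6D) case. Below is a self-contained illustration of the two rules, not the CK header itself; the example dimensions are arbitrary and chosen so the result can be checked by hand against the comment's 4D formulas.

// Standalone illustration (not CK code) of the generalized default-stride rules.
// Row-major: the last dimension is fastest varying.
// Column-major (as defined above): the first half of the dimensions is fastest
// varying, followed by the second half.
#include <cstdio>
#include <vector>

static std::vector<long> row_major_strides(const std::vector<long>& dims)
{
    std::vector<long> strides(dims.size());
    long stride = 1;
    for(int s = static_cast<int>(dims.size()) - 1; s >= 0; --s)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
    return strides;
}

static std::vector<long> col_major_strides(const std::vector<long>& dims)
{
    std::vector<long> strides(dims.size());
    const int half = static_cast<int>(dims.size()) / 2;
    long stride = 1;
    for(int s = half - 1; s >= 0; --s) // first group of dims is fastest varying
    {
        strides[s] = stride;
        stride *= dims[s];
    }
    for(int s = static_cast<int>(dims.size()) - 1; s >= half; --s)
    {
        strides[s] = stride;
        stride *= dims[s];
    }
    return strides;
}

int main()
{
    // Example dims {M0, M1, K0, K1} = {2, 3, 4, 5}.
    const std::vector<long> dims = {2, 3, 4, 5};
    const auto row = row_major_strides(dims); // expected {60, 20, 5, 1}
    const auto col = col_major_strides(dims); // expected {3, 1, 30, 6}, matching {Dim1, 1, Dim0*Dim1*Dim3, Dim0*Dim1}
    std::printf("row: %ld %ld %ld %ld\n", row[0], row[1], row[2], row[3]);
    std::printf("col: %ld %ld %ld %ld\n", col[0], col[1], col[2], col[3]);
    return 0;
}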
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>

@@ -19,7 +19,8 @@ static void print_helper_msg()
    std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
              << "arg2: data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
              << "arg3: compute data type (0: fp32; 1: f64; 2: f16; 3: bf16)\n"
              << "arg4: number of dimensions for M, N and K (one for all)\n"
              << "arg5: matrix layout (0: A[m0, m1, k0, k1] * B[k0, k1, n0, n1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              << "                     1: A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
@@ -27,23 +28,23 @@ static void print_helper_msg()
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1];\n"
              << "                     3: A[k0, k1, m0, m1] * B[n0, n1, k0, k1] + "
                 "D[m0, m1, n0, n1] = E[m0, m1, n0, n1])\n"
              << "arg6: verification (0: no; 1: yes)\n"
              << "arg7: initialization (0: no init; 1: integer value; 2: decimal "
              << "value)\n"
              << "arg8: print tensor value (0: no; 1: yes)\n"
              << "arg9: time kernel (0: no, 1: yes)\n"
              << "arg10: alpha\n"
              << "arg11: beta\n"
              << "arg12 to 17/29: M0, M1, N0, N1, K0, K1\n"
              << "arg18/30 to 33/77: Strides for A, B, D and E (skip for default)\n"
              << std::endl;
}

int profile_contraction_bilinear(int argc, char* argv[])
{
    // Strides are optional: argc == 18 (2D) or argc == 30 (6D) means they were skipped.
    const bool default_strides = argc == 18 || argc == 30;
    if(argc != 34 && argc != 78 && !default_strides)
    {
        print_helper_msg();
        exit(1);
@@ -51,32 +52,33 @@ int profile_contraction_bilinear(int argc, char* argv[])
    const auto data_type          = static_cast<ContractionDataType>(std::stoi(argv[2]));
    const auto compute_data_type  = static_cast<ContractionComputeDataType>(std::stoi(argv[3]));
    const ck::index_t NumDimMNK   = std::stoi(argv[4]);
    const auto layout             = static_cast<ContractionMatrixLayout>(std::stoi(argv[5]));
    const bool do_verification    = std::stoi(argv[6]);
    const ck::index_t init_method = std::stoi(argv[7]);
    const bool do_log             = std::stoi(argv[8]);
    const bool time_kernel        = std::stoi(argv[9]);
    const float alpha             = std::stof(argv[10]);
    const float beta              = std::stof(argv[11]);

    std::vector<ck::index_t> M;
    std::vector<ck::index_t> N;
    std::vector<ck::index_t> K;
    const ck::index_t dims_arg_num = 12;
    collect_index_params(argv, M, dims_arg_num, NumDimMNK);
    collect_index_params(argv, N, dims_arg_num + NumDimMNK, NumDimMNK);
    collect_index_params(argv, K, dims_arg_num + NumDimMNK * 2, NumDimMNK);

    std::vector<ck::index_t> StridesA(NumDimMNK * 2);
    std::vector<ck::index_t> StridesB(NumDimMNK * 2);
    std::vector<ck::index_t> StridesE(NumDimMNK * 2);
    std::vector<ck::index_t> StridesD(NumDimMNK * 2);
    if(!default_strides)
    {
        collect_index_params(argv, StridesA, dims_arg_num + NumDimMNK * 3, NumDimMNK * 2);
        collect_index_params(argv, StridesB, dims_arg_num + NumDimMNK * 5, NumDimMNK * 2);
        collect_index_params(argv, StridesE, dims_arg_num + NumDimMNK * 7, NumDimMNK * 2);
        collect_index_params(argv, StridesD, dims_arg_num + NumDimMNK * 9, NumDimMNK * 2);
    }

    using F16 = ck::half_t;
@@ -95,31 +97,71 @@ int profile_contraction_bilinear(int argc, char* argv[])
        if(default_strides)
        {
            auto merge_dims = [](const std::vector<ck::index_t>& dims01,
                                 const std::vector<ck::index_t>& dims23) {
                std::vector<ck::index_t> dims_szt(dims01.begin(), dims01.end());
                dims_szt.insert(dims_szt.end(), dims23.begin(), dims23.end());
                return dims_szt;
            };

            assign_default_strides(a_layout, StridesA, merge_dims(M, K));
            assign_default_strides(b_layout, StridesB, merge_dims(N, K));
            assign_default_strides(cde_layout, StridesE, merge_dims(M, N));
            assign_default_strides(cde_layout, StridesD, merge_dims(M, N));
        }

        if(NumDimMNK == 2)
        {
            bool pass = ck::profiler::profile_contraction_impl<2,
                                                               ALayout,
                                                               BLayout,
                                                               CDELayout,
                                                               DataType,
                                                               ComputeDataType,
                                                               ck::Tuple<DataType>,
                                                               Bilinear>(do_verification,
                                                                         init_method,
                                                                         do_log,
                                                                         time_kernel,
                                                                         Bilinear{alpha, beta},
                                                                         M,
                                                                         N,
                                                                         K,
                                                                         StridesA,
                                                                         StridesB,
                                                                         StridesE,
                                                                         StridesD);
            return pass;
        }
        else if(NumDimMNK == 6)
        {
            bool pass = ck::profiler::profile_contraction_impl<6,
                                                               ALayout,
                                                               BLayout,
                                                               CDELayout,
                                                               DataType,
                                                               ComputeDataType,
                                                               ck::Tuple<DataType>,
                                                               Bilinear>(do_verification,
                                                                         init_method,
                                                                         do_log,
                                                                         time_kernel,
                                                                         Bilinear{alpha, beta},
                                                                         M,
                                                                         N,
                                                                         K,
                                                                         StridesA,
                                                                         StridesB,
                                                                         StridesE,
                                                                         StridesD);
            return pass;
        }
        else
        {
            throw std::runtime_error("Not supported NumDimMNK");
            return false;
        }
    };

    auto run_profile_for_datatype = [&](auto type, auto compute_type) {
...
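The "arg12 to 17/29" and "arg18/30 to 33/77" ranges in the new help text follow from the generalized argument layout: 11 leading arguments, then 3 * NumDimMNK dimension values, then optionally 4 * 2 * NumDimMNK strides. The standalone snippet below simply checks that arithmetic for the two supported cases; it is an illustration derived from the help text above, not profiler code.

// Standalone check of the ckProfiler contraction argument layout documented above:
// argv[1..11] : op, datatype, compute datatype, num_dim, layout, verify, init,
//               log, time, alpha, beta
// next 3*d    : M, N and K dimension values (d = NumDimMNK)
// next 4*2*d  : optional strides for A, B, E and D
#include <cstdio>

int main()
{
    for(int d : {2, 6})
    {
        const int first_dim_arg    = 12; // dims_arg_num in the profiler
        const int last_dim_arg     = first_dim_arg + 3 * d - 1;
        const int first_stride_arg = last_dim_arg + 1;
        const int last_stride_arg  = first_stride_arg + 4 * 2 * d - 1;
        const int argc_no_strides  = 1 + 11 + 3 * d; // program name + arguments
        const int argc_full        = argc_no_strides + 8 * d;
        // For d=2: dims at arg12..17, strides at arg18..33, argc 18 or 34.
        // For d=6: dims at arg12..29, strides at arg30..77, argc 30 or 78.
        std::printf("d=%d: dims at arg%d..arg%d, strides at arg%d..arg%d, "
                    "argc=%d (default strides) or %d\n",
                    d,
                    first_dim_arg,
                    last_dim_arg,
                    first_stride_arg,
                    last_stride_arg,
                    argc_no_strides,
                    argc_full);
    }
    return 0;
}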