Commit e5ebcc41 authored by Artur Wojcik's avatar Artur Wojcik
Browse files

Merge branch 'develop' into uif2-migraphx

parents 57cdd70b abac8b07
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space
// However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C);
using namespace ck::conv_tensor_rearrange_op;
using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
ImageLayout,
InDataType,
OutDataType,
ColumnToImage>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp"
#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using ImageLayout = ck::tensor_layout::convolution::NHWGC;
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 2;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space
// However, CK's API only accepts lengths and strides with order of GNCHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 5> image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 3> gemm_strides{Y * X * C, G * Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C);
using namespace ck::conv_tensor_rearrange_op;
using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange<NumDimSpatial,
ImageLayout,
InDataType,
OutDataType,
ImageToColumn>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
G,
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
image_strides,
gemm_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
}
add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_other_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
const int N = 16;
const int C = 8;
const int D = 8;
const int H = 8;
const int W = 8;
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
auto size = N * C * D * H * W;
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, H * W, W, 1, D * H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_dev_buf.GetDeviceBuffer()};
using DeviceElementwisePermuteInstance = ck::tensor_operation::device::
DeviceElementwise<ck::Tuple<ADataType>, ck::Tuple<BDataType>, PassThrough, 5>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceElementwisePermuteInstance>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best intance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
# Fwd scaleadd scaleadd relu
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp16 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_bf16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_bf16 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_int8.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_int8 PRIVATE composable_kernel::device_conv_operations)
# Fwd scaleadd AB
add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp32
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp32.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp32 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_ab_fp16
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_fp16 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_ab_bf16
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_bf16.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_bf16 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_grouped_convnd_fwd_scaleadd_ab_int8
grouped_convnd_fwd_scaleadd_ab/grouped_conv_fwd_scaleadd_ab_int8.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_ab_int8 PRIVATE composable_kernel::device_conv_operations)
# Fwd bilinear
add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
# Bwd data bilinear
add_executable(client_grouped_convnd_bwd_data_bilinear_residual_fp16
grouped_convnd_bwd_data_bilinear/grouped_conv_bwd_data_bilinear_residual_fp16.cpp)
target_link_libraries(client_grouped_convnd_bwd_data_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_bilinear.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_bwd_data_bilinear()
{
std::array<ck::index_t, NumDimSpatial + 3> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, NumDimSpatial + 3> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, NumDimSpatial + 3> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, NumDimSpatial + 3> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, NumDimSpatial + 3> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, NumDimSpatial + 3> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * G * N * Di * Hi * Wi * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * G * N * Do * Ho * Wo * K);
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NumDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<InLayout>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<InDataType>,
InDataType,
PassThrough,
PassThrough,
Bilinear>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{in.GetDeviceBuffer()},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{in_lengths},
{in_strides},
in_lengths,
in_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Bilinear{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Y * X +
3 * G * N * Di * Hi * Wi * C;
std::size_t num_bytes = 2 * sizeof(InDataType) * G * N * Di * Hi * Wi * C +
sizeof(WeiDataType) * G * K * Z * Y * X * C +
sizeof(OutDataType) * G * N * Do * Ho * Wo * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{in.GetDeviceBuffer()},
in.GetDeviceBuffer(),
out_lengths,
out_strides,
wei_lengths,
wei_strides,
{in_lengths},
{in_strides},
in_lengths,
in_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Bilinear{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
int main() { return execute_conv_bwd_data_bilinear(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t>;
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Bilinear = ck::tensor_operation::element_wise::Bilinear;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_fwd_bilinear()
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, 6> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, 6> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<OutDataType>,
OutDataType,
PassThrough,
PassThrough,
Bilinear>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{out.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths},
{out_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Bilinear{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 3 * N * Ho * Wo * G * K;
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(WeiDataType) * G * K * Y * X * C +
sizeof(OutDataType) * 2 * N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{out.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths},
{out_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
Bilinear{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
int main() { return execute_conv_fwd_bilinear(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_fwd_scaleadd_ab()
{
constexpr ck::index_t NumAs = 2;
constexpr ck::index_t NumBs = 2;
constexpr float scale = 1.5f;
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, 6> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, 6> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
using InputDtype = ck::tuple_element_t<0, InDataType>;
using InputBiasDtype = ck::tuple_element_t<1, InDataType>;
using WeightDtype = ck::tuple_element_t<0, WeiDataType>;
using WeightBiasDtype = ck::tuple_element_t<1, WeiDataType>;
SimpleDeviceMem in(sizeof(InputDtype) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem in_bias(sizeof(InputBiasDtype) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeightDtype) * G * K * Z * Y * X * C);
SimpleDeviceMem wei_bias(sizeof(WeightBiasDtype) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
ScaleAdd,
ScaleAdd,
PassThrough>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
std::array<const void*, NumAs> as = {in.GetDeviceBuffer(), in_bias.GetDeviceBuffer()};
std::array<const void*, NumBs> bs = {wei.GetDeviceBuffer(), wei_bias.GetDeviceBuffer()};
std::array<const void*, 0> ds{};
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
bs,
ds,
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
ScaleAdd{scale},
ScaleAdd{scale},
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = std::size_t(2) * G * N * K * C * Do * Ho * Wo * Z * Y * X +
N * Di * Hi * Wi * G * C + G * K * Z * Y * X * C;
std::size_t num_bytes = 2 * sizeof(InDataType) * N * Di * Hi * Wi * G * C +
2 * sizeof(WeiDataType) * G * K * Z * Y * X * C +
sizeof(OutDataType) * N * Do * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(as,
bs,
ds,
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{},
{},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
ScaleAdd{scale},
ScaleAdd{scale},
PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using WeiDataType = ck::Tuple<ck::bhalf_t, ck::bhalf_t>;
using OutDataType = ck::bhalf_t;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int main() { return execute_conv_fwd_scaleadd_ab(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::Tuple<ck::half_t, ck::half_t>;
using WeiDataType = ck::Tuple<ck::half_t, ck::half_t>;
using OutDataType = ck::half_t;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int main() { return execute_conv_fwd_scaleadd_ab(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::Tuple<float, float>;
using WeiDataType = ck::Tuple<float, float>;
using OutDataType = float;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int main() { return execute_conv_fwd_scaleadd_ab(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::Tuple<int8_t, int8_t>;
using WeiDataType = ck::Tuple<int8_t, int8_t>;
using OutDataType = int8_t;
#include "grouped_conv_fwd_scaleadd_ab.inc"
int main() { return execute_conv_fwd_scaleadd_ab(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
static constexpr ck::index_t NumDimSpatial = 3;
static constexpr ck::index_t G = 32;
static constexpr ck::index_t N = 64; // batch size
static constexpr ck::index_t K = 64; // output channel
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Z = 3; // filter D
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Di = 14; // input D
static constexpr ck::index_t Hi = 14; // input H
static constexpr ck::index_t Wi = 14; // input W
static constexpr ck::index_t Do = 14; // output D
static constexpr ck::index_t Ho = 14; // output H
static constexpr ck::index_t Wo = 14; // output W
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int execute_conv_fwd_scaleadd_scaleadd_relu()
{
// We have NHWGC/GKYXC/NHWGK (x, weight, y) in memory space.
// However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW.
// Hence, we need to adjust the order of strides.
std::array<ck::index_t, 6> in_lengths{G, N, C, Di, Hi, Wi};
std::array<ck::index_t, 6> in_strides{
C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C};
std::array<ck::index_t, 6> wei_lengths{G, K, C, Z, Y, X};
std::array<ck::index_t, 6> wei_strides{
K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C};
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Di * Hi * Wi * G * C);
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
NumDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout, BiasLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<std::tuple_element_t<0, DDataTypes>, std::tuple_element_t<1, DDataTypes>>,
OutDataType,
PassThrough,
PassThrough,
ScaleAddScaleAddRelu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
float best_tflops = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths, bias_lengths},
{out_strides, bias_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ScaleAddScaleAddRelu{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop =
std::size_t(2) * G * N * K * C * Ho * Wo * Y * X + 2 * N * Ho * Wo * G * K;
std::size_t num_bytes =
sizeof(InDataType) * N * Hi * Wi * G * C + sizeof(WeiDataType) * G * K * Y * X * C +
(sizeof(OutDataType) + sizeof(std::tuple_element_t<0, DDataTypes>) +
sizeof(std::tuple_element_t<1, DDataTypes>)) *
N * Ho * Wo * G * K;
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_tflops = tflops;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
<< " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr =
op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
wei.GetDeviceBuffer(),
{d0.GetDeviceBuffer(), d1.GetDeviceBuffer()},
out.GetDeviceBuffer(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
{out_lengths, bias_lengths},
{out_strides, bias_strides},
out_lengths,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads,
PassThrough{},
PassThrough{},
ScaleAddScaleAddRelu{2.f, 2.f});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::bhalf_t;
using WeiDataType = ck::bhalf_t;
using OutDataType = ck::bhalf_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::bhalf_t, ck::bhalf_t>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<ck::half_t, ck::half_t>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = float;
using WeiDataType = float;
using OutDataType = float;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <tuple>
#include "ck/utility/data_type.hpp"
#include "ck/utility/tuple.hpp"
using InDataType = int8_t;
using WeiDataType = int8_t;
using OutDataType = int8_t;
// Use std tuple instead of ck tuple to avoid clang
// implicit instantiation of undefined template error.
using DDataTypes = std::tuple<float, float>;
#include "grouped_conv_fwd_scaleadd_scaleadd_relu.inc"
int main() { return execute_conv_fwd_scaleadd_scaleadd_relu(); }
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
add_executable(client_wrapper_img2col wrapper_img2col.cpp)
target_link_libraries(client_wrapper_img2col PRIVATE composable_kernel::device_other_operations)
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR
GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR
GPU_TARGETS MATCHES "gfx942")
add_executable(client_wrapper_basic_gemm wrapper_basic_gemm.cpp)
target_link_libraries(client_wrapper_basic_gemm PRIVATE composable_kernel::device_other_operations)
add_executable(client_wrapper_optimized_gemm wrapper_optimized_gemm.cpp)
target_link_libraries(client_wrapper_optimized_gemm PRIVATE composable_kernel::device_other_operations)
endif()
# Composable Kernel wrapper GEMM tutorial
This tutorial demonstrates how to implement matrix multiplication using Composable Kernel (CK)
wrapper. We present the base version of GEMM without most of the available optimizations; however,
it's worth noting that CK has kernels with different optimizations.
To implement these optimizations, you can use the CK wrapper or directly use available instances in
CK. You can also refer to the
[optimized GEMM example](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_optimized_gemm.cpp),
that uses CK wrapper based on the
[`gridwise_gemm_xdlops_v2r3`](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp) implementation.
The kernel definition should look similar to:
```cpp
template <typename DataType,
typename GemmTraits,
ck::index_t scalar_per_vector,
typename BlockShape,
typename ThreadLayout>
__global__ void __CK_WRAPPER_LAUNCH_BOUNDS__ DeviceGemm(const void* p_a,
const void* p_b,
void* p_c,
const ck::index_t M,
const ck::index_t N,
const ck::index_t K,
const BlockShape tile_shape,
const ThreadLayout thread_layout)
```
We pass pointers to global memory and matrix dimensions via arguments. Additionally, we pass
selected lengths of processed data through each block (`tile_shape`) and thread layout
(`thread_layout`). For compilation time parameters, we define the data type,
[traits for the GEMM operation](https://github.com/ROCm/composable_kernel/blob/develop/include/ck/wrapper/traits/blockwise_gemm_xdl_traits.hpp)
and scalar per vector value during copy.
Step 1: Create layouts for global and LDS memory.
```cpp
// Specify layouts for global memory.
const auto a_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, K), ck::make_tuple(K, 1));
const auto b_global_layout =
ck::wrapper::make_layout(ck::make_tuple(N, K), ck::make_tuple(K, 1));
const auto c_global_layout =
ck::wrapper::make_layout(ck::make_tuple(M, N), ck::make_tuple(N, 1));
// Specify layouts for tiles.
constexpr auto a_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto b_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(NPerBlock, KPerBlock), ck::make_tuple(KPerBlock, ck::Number<1>{}));
constexpr auto c_tile_layout = ck::wrapper::make_layout(
ck::make_tuple(MPerBlock, NPerBlock), ck::make_tuple(NPerBlock, ck::Number<1>{}));
// Apply padding for global memory.
auto a_global_layout_padded = ck::wrapper::pad(a_global_layout, shape(a_tile_layout));
auto b_global_layout_padded = ck::wrapper::pad(b_global_layout, shape(b_tile_layout));
auto c_global_layout_padded = ck::wrapper::pad(c_global_layout, shape(c_tile_layout));
```
We pad layouts for global tensors in case M, N, and K are not divisible by `MPerBlock`, `NPerBlock`, or
`KPerBlock`.
Step 2: Create tensors for global and LDS memory.
```cpp
// Make tensors for global memory.
auto a_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_a), a_global_layout_padded);
auto b_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<const DataType*>(p_b), b_global_layout_padded);
auto c_global_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Global>(
static_cast<DataType*>(p_c), c_global_layout_padded);
// Allocate LDS memory.
__shared__ DataType lds_a[ck::wrapper::size(a_tile_layout)];
__shared__ DataType lds_b[ck::wrapper::size(b_tile_layout)];
// Make tensors for lds memory.
auto a_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_a), a_tile_layout);
auto b_lds_tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Lds>(
static_cast<DataType*>(lds_b), b_tile_layout);
```
We must specify parameters for copy and convert block indexes to tuple:
```cpp
// Specify block index as tuple.
const auto block_idxs = ck::make_tuple(static_cast<ck::index_t>(blockIdx.x),
static_cast<ck::index_t>(blockIdx.y),
ck::wrapper::slice());
// Specify access parameters for copy.
using DimAccessOrder = ck::Tuple<ck::Number<0>, ck::Number<1>>;
constexpr ck::index_t vector_dim = 1;
```
We create a local tile (per block) and local partitions (per thread) for the global memory (`C`). We also
define and clear an output register (`c_vgpr_reg`) for the accumulation.
```cpp
auto c_global_local_tile = ck::wrapper::make_local_tile(
c_global_tensor,
tile_shape,
block_idxs,
make_tuple(ck::Number<1>{}, ck::Number<1>{}, ck::wrapper::slice(KPerBlock)));
auto c_global_local_partition =
ck::wrapper::make_blockwise_gemm_xdl_c_local_partition<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>(c_global_local_tile);
// Create C vgpr to accumulate results.
auto c_vgpr_reg = ck::wrapper::make_blockwise_gemm_xdl_c_vgpr<DataType,
decltype(a_tile_layout),
decltype(b_tile_layout),
ck::wrapper::size(thread_layout),
GemmTraits>();
// Clear C vgpr.
ck::wrapper::clear(c_vgpr_reg);
```
We use two specific functions for `blockwise_gemm`: `make_blockwise_gemm_xdl_c_local_partition` and
`make_blockwise_gemm_xdl_c_vgpr`. This helps to choose the appropriate partition for the `C` output
and define tensors with specific layouts for `blockwise_gemm`. In the following step, we use only
generic functions for the CK wrapper.
Step 3: Create the compute loop.
```cpp
const ck::index_t num_loop = ck::math::integer_divide_ceil(K, KPerBlock);
ck::index_t i = 0;
do
{
// Get KPerBlock slice.
const auto k_slice = ck::wrapper::slice(i * KPerBlock, (i + 1) * KPerBlock);
auto a_global_tensor_k_slice = a_global_tensor(ck::wrapper::slice(), k_slice);
auto b_global_tensor_k_slice = b_global_tensor(ck::wrapper::slice(), k_slice);
// Create local tiles for A and B.
auto a_global_local_tile = ck::wrapper::make_local_tile(
a_global_tensor_k_slice,
tile_shape,
block_idxs,
make_tuple(ck::Number<1>{}, ck::wrapper::slice(N), ck::Number<1>{}));
auto b_global_local_tile = ck::wrapper::make_local_tile(
b_global_tensor_k_slice,
tile_shape,
block_idxs,
make_tuple(ck::wrapper::slice(M), ck::Number<1>{}, ck::Number<1>{}));
// Copy from global to LDS.
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
a_global_local_tile, a_lds_tensor, thread_layout);
ck::wrapper::blockwise_copy<DimAccessOrder, vector_dim, scalar_per_vector>(
b_global_local_tile, b_lds_tensor, thread_layout);
// Synchronize lds.
ck::block_sync_lds();
// Execute blockwise GEMM.
ck::wrapper::blockwise_gemm_xdl<DataType, ck::wrapper::size(thread_layout), GemmTraits>(
a_lds_tensor, b_lds_tensor, c_vgpr_reg);
++i;
} while(i < num_loop);
```
Loop iterate over `K / KPerBlock`. Each time a local tile is created for A and B tensors (tensor per block),
data is copied from global memory to LDS. The `blockwise_gemm` function performs the GEMM
operation on `a_lds_tensor` and `b_lds_tensor`, and stores results in `c_vgpr_reg`.
The end result from `c_vgpr_reg` is stored in the `C` local partition (tensor per thread):
```cpp
ck::wrapper::copy(c_vgpr_reg, c_global_local_partition);
```
If you want to dive deep into the details, you can find the entire example
[here](https://github.com/ROCm/composable_kernel/blob/develop/client_example/25_wrapper/wrapper_basic_gemm.cpp).
// SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/wrapper/layout.hpp"
using DataType = int;
template <typename Layout>
void Print1d(const Layout& layout)
{
std::cout << "Print1d" << std::endl;
for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
{
std::cout << layout(ck::make_tuple(w)) << " ";
}
std::cout << std::endl;
}
template <typename Layout>
void Print2d(const Layout& layout)
{
std::cout << "Print2d" << std::endl;
for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{
std::cout << layout(ck::make_tuple(h, w)) << " ";
}
std::cout << std::endl;
}
}
// Print in (x,y),z pattern
template <typename Layout>
void Print3dCustom(const Layout& layout)
{
std::cout << "Print3dCustom" << std::endl;
for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
{
for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
{
for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{
std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
}
int main()
{
// Layout traverse in row-major
std::cout << "Note: Layout traverse in column-major" << std::endl;
// Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
// (dims:4,8 strides:1,4)
const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
std::cout << "dims:4,8 strides:1,4" << std::endl;
Print2d(layout_4x8_s1x4);
using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>;
constexpr ck::index_t offset_1x1 = layout_4x8_s1x4.template operator()<Cord1x1Type>();
std::cout << "Constexpr calculated [1, 1] offset:" << offset_1x1 << std::endl;
// Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
// dims:4,(2,4) strides:2,(1,8)
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
Print2d(layout_4x2x4_s2x1x8);
// Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
// dims:(2,2),(2,4) strides:((1,4),(2,8)
const auto shape_2x2x2x4 = ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}),
ck::make_tuple(ck::Number<2>{}, ck::Number<4>{}));
const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
static const auto layout_2x2x2x4_s1x4x2x8 =
ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
Print2d(layout_2x2x2x4_s1x4x2x8);
Print3dCustom(layout_2x2x2x4_s1x4x2x8);
// Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (compile-time descriptor)
// dims:((2,2),2),4 strides:((1,4),2),8
// Transform to 2d
const auto shape_2x2x2x4_nested = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<2>{}, ck::Number<2>{}), ck::Number<2>{}),
ck::Number<4>{});
const auto strides_s1x4x2x8_nested = ck::make_tuple(
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
ck::Number<8>{});
static const auto layout_2x2x2x4_s1x4x2x8_nested =
ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
Print1d(layout_2x2x2x4_s1x4x2x8_nested);
Print2d(layout_2x2x2x4_s1x4x2x8_nested);
Print3dCustom(layout_2x2x2x4_s1x4x2x8_nested);
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment