Commit f6ceef78 authored by ThomasNing's avatar ThomasNing
merge with the develop branch

parents 536c5458 25935b57
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using Scale = element_wise::Scale;
void add_device_permute_scale_6d_f32_f8_instances(
std::vector<std::unique_ptr<DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F8>, Scale, 6>>>&
instances)
{
#ifdef CK_ENABLE_FP8
add_device_operation_instances(instances, device_permute_scale_f32_f8_instances<6, Scale>{});
#else
ignore = instances;
#endif
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
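For orientation, a minimal sketch (mine, not part of the commit) of how a caller typically consumes these registered instances, assuming the two headers at the top of this file are included and that F32/F8 map to float/ck::f8_t:

#include <memory>
#include <vector>

using ck::tensor_operation::device::DeviceElementwise;
using Scale = ck::tensor_operation::element_wise::Scale;

int main()
{
    std::vector<
        std::unique_ptr<DeviceElementwise<ck::Tuple<float>, ck::Tuple<ck::f8_t>, Scale, 6>>>
        instances;

    // Fills `instances`; effectively a no-op unless CK_ENABLE_FP8 is defined.
    ck::tensor_operation::device::instance::add_device_permute_scale_6d_f32_f8_instances(
        instances);

    // Each instance is then exercised via MakeArgumentPointer(...),
    // IsSupportedArgument(...) and MakeInvokerPointer(), as the profiler
    // code later in this diff does.
    return instances.empty() ? 1 : 0;
}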
@@ -10,15 +10,24 @@ namespace device {
namespace instance {
// clang-format off
// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>>&);
template void add_device_reduce_instance_blockwise< F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector<DeviceReducePtr<F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>>&);
// clang-format on
} // namespace instance
......
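The UnaryAbs + ReduceAMax instances above compute, per reduced slice, the maximum absolute value; a scalar model in plain C++ (illustrative, not from the commit), which is the quantity typically used to derive the scales that the f32-to-f8 permute_scale instances earlier in this commit consume:

#include <algorithm>
#include <cmath>
#include <vector>

// What a ReduceAMax instance with a UnaryAbs input op computes over one
// reduced slice: amax = max_i |x_i|.
float amax(const std::vector<float>& x)
{
    float m = 0.0f;
    for(float v : x)
        m = std::max(m, std::fabs(v)); // UnaryAbs, then max-reduction
    return m;
}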
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/host_utility/io.hpp"
@@ -20,6 +20,63 @@ ConvParam::ConvParam(ck::index_t n_dim,
                     const std::vector<ck::index_t>& dilations,
                     const std::vector<ck::index_t>& left_pads,
                     const std::vector<ck::index_t>& right_pads)
: num_dim_spatial_(static_cast<ck::long_index_t>(n_dim)),
G_(static_cast<ck::long_index_t>(group_count)),
N_(static_cast<ck::long_index_t>(n_batch)),
K_(static_cast<ck::long_index_t>(n_out_channels)),
C_(static_cast<ck::long_index_t>(n_in_channels)),
filter_spatial_lengths_(num_dim_spatial_),
input_spatial_lengths_(num_dim_spatial_),
output_spatial_lengths_(num_dim_spatial_),
conv_filter_strides_(num_dim_spatial_),
conv_filter_dilations_(num_dim_spatial_),
input_left_pads_(num_dim_spatial_),
input_right_pads_(num_dim_spatial_)
{
if(static_cast<ck::index_t>(filter_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_spatial_lengths_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(conv_filter_strides_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(conv_filter_dilations_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_left_pads_.size()) != num_dim_spatial_ ||
static_cast<ck::index_t>(input_right_pads_.size()) != num_dim_spatial_)
{
throw(
std::runtime_error("ConvParam::ConvParam: "
"parameter size is different from number of declared dimensions!"));
}
for(ck::index_t i = 0; i < num_dim_spatial_; ++i)
{
filter_spatial_lengths_[i] = static_cast<ck::long_index_t>(filters_len[i]);
input_spatial_lengths_[i] = static_cast<ck::long_index_t>(input_len[i]);
conv_filter_strides_[i] = static_cast<ck::long_index_t>(strides[i]);
conv_filter_dilations_[i] = static_cast<ck::long_index_t>(dilations[i]);
input_left_pads_[i] = static_cast<ck::long_index_t>(left_pads[i]);
input_right_pads_[i] = static_cast<ck::long_index_t>(right_pads[i]);
// XEff = (X - 1) * conv_dilation_w + 1;
// Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
const ck::long_index_t x_eff =
(filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
output_spatial_lengths_[i] =
(input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
conv_filter_strides_[i] +
1;
}
}
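As a quick check of the output-length formula in the comments above (values are illustrative, not from the commit): with Wi = 28, X = 3, dilation 1, stride 2, and pad 1 on both sides:

constexpr long x_eff = (3 - 1) * 1 + 1;              // XEff = 3
constexpr long wo    = (28 + 1 + 1 - x_eff) / 2 + 1; // Wo = 14
static_assert(wo == 14, "matches the ConvParam computation above");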
ConvParam::ConvParam(ck::long_index_t n_dim,
ck::long_index_t group_count,
ck::long_index_t n_batch,
ck::long_index_t n_out_channels,
ck::long_index_t n_in_channels,
const std::vector<ck::long_index_t>& filters_len,
const std::vector<ck::long_index_t>& input_len,
const std::vector<ck::long_index_t>& strides,
const std::vector<ck::long_index_t>& dilations,
const std::vector<ck::long_index_t>& left_pads,
const std::vector<ck::long_index_t>& right_pads)
    : num_dim_spatial_(n_dim),
      G_(group_count),
      N_(n_batch),
@@ -49,7 +106,8 @@ ConvParam::ConvParam(ck::index_t n_dim,
    {
        // XEff = (X - 1) * conv_dilation_w + 1;
        // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
-       const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
        const ck::long_index_t x_eff =
            (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1;
        output_spatial_lengths_[i] =
            (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) /
@@ -63,7 +121,7 @@ ConvParam::ConvParam()
{
}
-std::vector<ck::index_t> ConvParam::GetOutputSpatialLengths() const
std::vector<ck::long_index_t> ConvParam::GetOutputSpatialLengths() const
{
    return output_spatial_lengths_;
}
@@ -97,46 +155,46 @@ std::string get_conv_param_parser_helper_msg()
ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[])
{
-   const ck::index_t G = std::stoi(argv[arg_idx++]);
-   const ck::index_t N = std::stoi(argv[arg_idx++]);
-   const ck::index_t K = std::stoi(argv[arg_idx++]);
-   const ck::index_t C = std::stoi(argv[arg_idx++]);
    const ck::long_index_t G = std::stol(argv[arg_idx++]);
    const ck::long_index_t N = std::stol(argv[arg_idx++]);
    const ck::long_index_t K = std::stol(argv[arg_idx++]);
    const ck::long_index_t C = std::stol(argv[arg_idx++]);
-   std::vector<ck::index_t> filter_spatial_lengths(num_dim_spatial);
-   std::vector<ck::index_t> input_spatial_lengths(num_dim_spatial);
-   std::vector<ck::index_t> conv_filter_strides(num_dim_spatial);
-   std::vector<ck::index_t> conv_filter_dilations(num_dim_spatial);
-   std::vector<ck::index_t> input_left_pads(num_dim_spatial);
-   std::vector<ck::index_t> input_right_pads(num_dim_spatial);
    std::vector<ck::long_index_t> filter_spatial_lengths(num_dim_spatial);
    std::vector<ck::long_index_t> input_spatial_lengths(num_dim_spatial);
    std::vector<ck::long_index_t> conv_filter_strides(num_dim_spatial);
    std::vector<ck::long_index_t> conv_filter_dilations(num_dim_spatial);
    std::vector<ck::long_index_t> input_left_pads(num_dim_spatial);
    std::vector<ck::long_index_t> input_right_pads(num_dim_spatial);
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
        filter_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       input_spatial_lengths[i] = std::stoi(argv[arg_idx++]);
        input_spatial_lengths[i] = std::stol(argv[arg_idx++]);
    }
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       conv_filter_strides[i] = std::stoi(argv[arg_idx++]);
        conv_filter_strides[i] = std::stol(argv[arg_idx++]);
    }
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       conv_filter_dilations[i] = std::stoi(argv[arg_idx++]);
        conv_filter_dilations[i] = std::stol(argv[arg_idx++]);
    }
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       input_left_pads[i] = std::stoi(argv[arg_idx++]);
        input_left_pads[i] = std::stol(argv[arg_idx++]);
    }
    for(int i = 0; i < num_dim_spatial; ++i)
    {
-       input_right_pads[i] = std::stoi(argv[arg_idx++]);
        input_right_pads[i] = std::stol(argv[arg_idx++]);
    }
    return ck::utils::conv::ConvParam{num_dim_spatial,
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -82,6 +82,29 @@ bool profile_conv_bwd_data_impl(int do_verification,
    Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
    Tensor<OutDataType> output(out_g_n_k_wos_desc);
std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
for(ck::index_t d = 0; d < NDimSpatial; d++)
{
input_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
filter_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
output_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.GetOutputSpatialLengths()[d]);
conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
conv_filter_dilations_i32[d] =
static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
input_left_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
}
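The static_casts above narrow ck::long_index_t back to 32-bit ck::index_t and will silently truncate oversized values; a hedged sketch (not part of this commit) of a checked alternative such conversion loops could use:

#include <limits>
#include <stdexcept>

// Hypothetical helper: narrow a wide integer to a narrower one, throwing
// instead of silently wrapping when the value does not fit.
template <typename Narrow, typename Wide>
Narrow checked_narrow(Wide v)
{
    if(v < static_cast<Wide>(std::numeric_limits<Narrow>::min()) ||
       v > static_cast<Wide>(std::numeric_limits<Narrow>::max()))
    {
        throw std::runtime_error("checked_narrow: value out of range");
    }
    return static_cast<Narrow>(v);
}
// e.g. input_left_pads_i32[d] = checked_narrow<ck::index_t>(conv_param.input_left_pads_[d]);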
std::cout << "input: " << input_host_result.mDesc << std::endl; std::cout << "input: " << input_host_result.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl; std::cout << "output: " << output.mDesc << std::endl;
...@@ -161,16 +184,16 @@ bool profile_conv_bwd_data_impl(int do_verification, ...@@ -161,16 +184,16 @@ bool profile_conv_bwd_data_impl(int do_verification,
        op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                    static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                    static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                   conv_param.N_,
-                                   conv_param.K_,
-                                   conv_param.C_,
-                                   conv_param.input_spatial_lengths_,
-                                   conv_param.filter_spatial_lengths_,
-                                   conv_param.output_spatial_lengths_,
-                                   conv_param.conv_filter_strides_,
-                                   conv_param.conv_filter_dilations_,
-                                   conv_param.input_left_pads_,
-                                   conv_param.input_right_pads_,
                                    static_cast<ck::index_t>(conv_param.N_),
                                    static_cast<ck::index_t>(conv_param.K_),
                                    static_cast<ck::index_t>(conv_param.C_),
                                    input_spatial_lengths_i32,
                                    filter_spatial_lengths_i32,
                                    output_spatial_lengths_i32,
                                    conv_filter_strides_i32,
                                    conv_filter_dilations_i32,
                                    input_left_pads_i32,
                                    input_right_pads_i32,
                                    in_element_op,
                                    wei_element_op,
                                    out_element_op);
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -60,6 +60,29 @@ bool profile_conv_fwd_impl(int do_verification,
    Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
    Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
std::vector<ck::index_t> input_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> filter_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> output_spatial_lengths_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_strides_i32(NDimSpatial);
std::vector<ck::index_t> conv_filter_dilations_i32(NDimSpatial);
std::vector<ck::index_t> input_left_pads_i32(NDimSpatial);
std::vector<ck::index_t> input_right_pads_i32(NDimSpatial);
for(ck::index_t d = 0; d < NDimSpatial; d++)
{
input_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.input_spatial_lengths_[d]);
filter_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.filter_spatial_lengths_[d]);
output_spatial_lengths_i32[d] =
static_cast<ck::index_t>(conv_param.GetOutputSpatialLengths()[d]);
conv_filter_strides_i32[d] = static_cast<ck::index_t>(conv_param.conv_filter_strides_[d]);
conv_filter_dilations_i32[d] =
static_cast<ck::index_t>(conv_param.conv_filter_dilations_[d]);
input_left_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_left_pads_[d]);
input_right_pads_i32[d] = static_cast<ck::index_t>(conv_param.input_right_pads_[d]);
}
std::cout << "input: " << input.mDesc << std::endl; std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl;
...@@ -143,16 +166,16 @@ bool profile_conv_fwd_impl(int do_verification, ...@@ -143,16 +166,16 @@ bool profile_conv_fwd_impl(int do_verification,
        op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                    static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                    static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                   conv_param.N_,
-                                   conv_param.K_,
-                                   conv_param.C_,
-                                   conv_param.input_spatial_lengths_,
-                                   conv_param.filter_spatial_lengths_,
-                                   conv_param.GetOutputSpatialLengths(),
-                                   conv_param.conv_filter_strides_,
-                                   conv_param.conv_filter_dilations_,
-                                   conv_param.input_left_pads_,
-                                   conv_param.input_right_pads_,
                                    static_cast<ck::index_t>(conv_param.N_),
                                    static_cast<ck::index_t>(conv_param.K_),
                                    static_cast<ck::index_t>(conv_param.C_),
                                    input_spatial_lengths_i32,
                                    filter_spatial_lengths_i32,
                                    output_spatial_lengths_i32,
                                    conv_filter_strides_i32,
                                    conv_filter_dilations_i32,
                                    input_left_pads_i32,
                                    input_right_pads_i32,
                                    in_element_op,
                                    wei_element_op,
                                    out_element_op);
......
@@ -48,6 +48,7 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
                                         int StrideD0,
                                         int StrideD1,
                                         int StrideE,
                                         int KBatch,
                                         int n_warmup,
                                         int n_iter,
                                         uint64_t rotating = 0)
@@ -129,17 +130,17 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    d1_device_buf.ToDevice(d1_m_n.mData.data());

    using DeviceOp =
-       ck::tensor_operation::device::DeviceGemmMultipleD<ALayout,
        ck::tensor_operation::device::DeviceGemmMultipleDSplitK<ALayout,
                                                                BLayout,
                                                                ck::Tuple<D0Layout, D1Layout>,
                                                                ELayout,
                                                                ADataType,
                                                                BDataType,
                                                                ck::Tuple<D0DataType, D1DataType>,
                                                                EDataType,
                                                                AElementOp,
                                                                BElementOp,
                                                                CElementOp>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -182,104 +183,128 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    float best_ave_time   = 0;
    float best_tflops     = 0;
    float best_gb_per_sec = 0;
float best_kbatch = 0;
    // profile device GEMM instances
    for(auto& op_ptr : op_ptrs)
    {
        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

        if(KBatch > 0)
        {
            kbatch_list = {KBatch};
        }

        for(std::size_t i = 0; i < kbatch_list.size(); i++)
        {
            auto kbatch_curr = kbatch_list[i];

            auto argument_ptr = op_ptr->MakeArgumentPointer(
                static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                std::array<const void*, 2>{d0_device_buf.GetDeviceBuffer(),
                                           d1_device_buf.GetDeviceBuffer()},
                static_cast<EDataType*>(c_device_buf.GetDeviceBuffer()),
                M,
                N,
                K,
                StrideA,
                StrideB,
                std::array<ck::index_t, 2>{StrideD0, StrideD1},
                StrideE,
                kbatch_curr,
                a_element_op,
                b_element_op,
                c_element_op);

            auto invoker_ptr = op_ptr->MakeInvokerPointer();

            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                // re-init C to zero before profiling next kernel
                c_device_buf.SetZero();

                invoker_ptr->Run(argument_ptr.get(),
                                 StreamConfig{nullptr, false, 0, n_warmup, n_iter});

                if(do_verification)
                {
                    c_device_buf.FromDevice(e_m_n_device_result.mData.data());

                    pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);

                    if(do_log)
                    {
                        LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
                        LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
                        LogRangeAsType<float>(
                            std::cout << "c_host : ", e_m_n_host_result.mData, ",")
                            << std::endl;
                        LogRangeAsType<float>(
                            std::cout << "c_device: ", e_m_n_device_result.mData, ",")
                            << std::endl;
                    }
                }

                std::string op_name = op_ptr->GetTypeString();

                float ave_time = invoker_ptr->Run(argument_ptr.get(),
                                                  StreamConfig{nullptr,
                                                               time_kernel,
                                                               0,
                                                               n_warmup,
                                                               n_iter,
                                                               rotating_count > 1,
                                                               rotating_count});

                std::size_t flop      = std::size_t(2) * M * N * K;
                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
                                        sizeof(EDataType) * M * N;

                float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
                float gb_per_sec = num_btype / 1.E6 / ave_time;

                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
                          << kbatch_curr << std::endl;

#if defined CK_ENABLE_FP8
                // set softer tolerances for fp8
                if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
                             is_same_v<EDataType, f8_t>)
                {
                    std::string msg = "Error: Incorrect results!";
                    double rtol     = 1e-1;
                    double atol     = 1e-1;
                    pass            = pass & ck::utils::check_err(
                                                 e_m_n_device_result, e_m_n_host_result, msg, rtol, atol);
                }
                else
                {
#endif
                    pass = pass & ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
#if defined CK_ENABLE_FP8
                }
#endif

                if(tflops > best_tflops && ave_time > 1e-10)
                {
                    best_op_name    = op_name;
                    best_tflops     = tflops;
                    best_ave_time   = ave_time;
                    best_gb_per_sec = gb_per_sec;
                    best_kbatch     = kbatch_curr;
                }
            }
            else
            {
                std::cout << op_ptr->GetTypeString() << " does not support this problem"
                          << std::endl;
            }
        }
    }
    if constexpr(is_same<EDataType, float>::value)
@@ -318,9 +343,9 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
    }

    std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
              << " StrideB = " << StrideB << " StrideE = " << StrideE << " KBatch = " << best_kbatch
              << " : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec
              << " GB/s, " << best_op_name << std::endl;

    return pass;
}
......
@@ -152,7 +152,7 @@ bool profile_gemm_universal_impl(int do_verification,
    // profile device GEMM instances
    for(auto& op_ptr : op_ptrs)
    {
-       std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 19, 20, 32, 38};
        std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};

        if(KBatch > 0)
        {
@@ -249,7 +249,7 @@ bool profile_gemm_universal_impl(int do_verification,
                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", KBatch "
                          << kbatch_curr << std::endl;

-               if(tflops > best_tflops)
                if(tflops > best_tflops && ave_time > 1e-10)
                {
                    best_op_name = op_name;
                    best_tflops  = tflops;
......
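One plausible reading of the new ave_time > 1e-10 condition here and in profile_gemm_multiply_multiply_impl above (my interpretation; the commit does not state it): TFlops is computed as flop / 1e9 / ave_time, so a zero or near-zero time measurement would otherwise always capture the best-instance slot. A tiny illustration:

#include <cstdio>

int main()
{
    const float flop = 2.0f * 4096 * 4096 * 4096; // flop count for M = N = K = 4096
    for(float ave_time : {1.0f, 1e-12f})          // ms; the second mimics an unmeasured run
    {
        std::printf("tflops = %g\n", flop / 1.E9f / ave_time);
    }
    // The 1e-12 ms entry yields an absurd ~1.4e23 "TFlops", which the
    // ave_time > 1e-10 guard keeps out of the best-instance comparison.
    return 0;
}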
@@ -136,9 +136,10 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    float best_avg_time      = 0;
    float best_tflops        = 0;
    float best_gb_per_sec    = 0;
    ck::index_t best_split_k = 1;

    // profile device Conv instances
    bool all_pass = true;
@@ -167,99 +168,111 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
    range_copy(conv_param.input_left_pads_, begin(input_left_pads));
    range_copy(conv_param.input_right_pads_, begin(input_right_pads));

    std::vector<ck::index_t> split_k_list = {1, 2, 4, 8, 16, 32, 64, 128};

    if(split_k > 0)
    {
        split_k_list = {split_k};
    }

    for(auto& op_ptr : op_ptrs)
    {
        for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
        {
            auto argument_ptr = op_ptr->MakeArgumentPointer(
                static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
                input_lengths,
                input_strides,
                filter_lengths,
                weights_strides,
                output_lengths,
                output_strides,
                conv_filter_strides,
                conv_filter_dilations,
                input_left_pads,
                input_right_pads,
                in_element_op,
                wei_element_op,
                out_element_op,
                split_k_list[split_k_id]);

            const std::size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());

            DeviceMem workspace_dev(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());

            if(op_ptr->IsSupportedArgument(argument_ptr.get()))
            {
                // using atomic add, so need to reset input
                wei_device_buf.SetZero();

                std::string op_name = op_ptr->GetTypeString();

                auto invoker_ptr = op_ptr->MakeInvokerPointer();

                float avg_time =
                    invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

                std::size_t flop      = conv_param.GetFlops();
                std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();

                float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
                float gb_per_sec = num_btype / 1.E6 / avg_time;

                std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
                          << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
                          << split_k_list[split_k_id] << std::endl;

                if(tflops > best_tflops)
                {
                    best_op_name    = op_name;
                    best_tflops     = tflops;
                    best_avg_time   = avg_time;
                    best_gb_per_sec = gb_per_sec;
                    best_split_k    = split_k_list[split_k_id];
                }

                if(do_verification)
                {
                    wei_device_buf.FromDevice(weight_device_result.mData.data());

                    bool pass = ck::utils::check_err(weight_device_result, weight_host_result);

                    if(!pass)
                    {
                        std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl;
                    }

                    all_pass &= pass;

                    if(do_log)
                    {
                        LogRangeAsType<float>(std::cout << "output : ", output.mData, ",")
                            << std::endl;
                        LogRangeAsType<float>(
                            std::cout << "weight (device): ", weight_device_result.mData, ",")
                            << std::endl;
                        LogRangeAsType<float>(
                            std::cout << "weight (host): ", weight_host_result.mData, ",")
                            << std::endl;
                        LogRangeAsType<float>(std::cout << "input: ", input.mData, ",")
                            << std::endl;
                    }
                }
            }
            else
            {
                std::cout << op_ptr->GetTypeString() << " does not support this problem"
                          << std::endl;
            }
        }
    }
std::cout << "Best configuration parameters:" std::cout << "Best configuration parameters:"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time << "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << ", SplitK "
<< best_split_k << std::endl;
return all_pass; return all_pass;
} }
......
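The "using atomic add, so need to reset input" comment above is the key to the SetZero() call: with split-K, each of the split_k chunks reduces part of the K dimension and accumulates into the same weight buffer, so stale contents would corrupt the result. A simplified, hypothetical scalar model:

#include <cstdio>

int main()
{
    constexpr int K = 8, split_k = 4;
    float x[K] = {1, 2, 3, 4, 5, 6, 7, 8}; // values reduced over the K dimension

    float acc = 0.0f; // one weight element; zeroing corresponds to wei_device_buf.SetZero()
    for(int chunk = 0; chunk < split_k; ++chunk)
    {
        for(int k = chunk * (K / split_k); k < (chunk + 1) * (K / split_k); ++k)
        {
            acc += x[k]; // an atomicAdd on the device
        }
    }
    std::printf("%g\n", acc); // 36, independent of split_k
    return 0;
}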
@@ -33,7 +33,8 @@ template <ck::index_t NDimSpatial,
          typename WeiDataType,
          typename OutDataType,
          typename AComputeType = InDataType,
-         typename BComputeType = AComputeType>
          typename BComputeType = AComputeType,
          typename IndexType    = ck::index_t>
bool profile_grouped_conv_fwd_impl(int do_verification,
                                   int init_method,
                                   bool do_log,
@@ -57,16 +58,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
    const auto out_g_n_k_wos_desc =
        ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

-   std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
-   std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
-   std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
-   std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-   std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
-   std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
-   std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-   std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-   std::array<ck::index_t, NDimSpatial> input_left_pads{};
-   std::array<ck::index_t, NDimSpatial> input_right_pads{};
    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_lengths{};
    std::array<IndexType, NDimSpatial + 3> a_g_n_c_wis_strides{};
    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_lengths{};
    std::array<IndexType, NDimSpatial + 3> b_g_k_c_xs_strides{};
    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_lengths{};
    std::array<IndexType, NDimSpatial + 3> e_g_n_k_wos_strides{};
    std::array<IndexType, NDimSpatial> conv_filter_strides{};
    std::array<IndexType, NDimSpatial> conv_filter_dilations{};
    std::array<IndexType, NDimSpatial> input_left_pads{};
    std::array<IndexType, NDimSpatial> input_right_pads{};

    auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
......
...@@ -46,8 +46,10 @@ if(GPU_TARGETS MATCHES "gfx9") ...@@ -46,8 +46,10 @@ if(GPU_TARGETS MATCHES "gfx9")
list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp) list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
endif() endif()
list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp) list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp) if(GPU_TARGETS MATCHES "gfx94")
list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp) list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
endif()
list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp) list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp) list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
@@ -82,6 +84,11 @@ set(PROFILER_EXECUTABLE ckProfiler)
add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES})
target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
# flags to compress the library
if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
    message("Adding --offload-compress flag for ${PROFILER_EXECUTABLE}")
    target_compile_options(${PROFILER_EXECUTABLE} PRIVATE --offload-compress)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
...@@ -123,8 +130,10 @@ if(GPU_TARGETS MATCHES "gfx9") ...@@ -123,8 +130,10 @@ if(GPU_TARGETS MATCHES "gfx9")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance) if(GPU_TARGETS MATCHES "gfx94")
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
endif()
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
......
@@ -34,7 +34,7 @@ enum struct GemmDataType
int profile_gemm_multiply_multiply(int argc, char* argv[])
{
-   if(argc != 16 && argc != 19)
    if(argc != 16 && argc != 20)
    {
        printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
@@ -50,9 +50,10 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
        printf("arg7: time kernel (0=no, 1=yes)\n");
        printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
        printf("optional:\n");
-       printf("arg16: number of warm-up cycles (default 1)\n");
-       printf("arg17: number of iterations (default 10)\n");
-       printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
        printf("arg16: number of kbatch (default 1)\n");
        printf("arg17: number of warm-up cycles (default 1)\n");
        printf("arg18: number of iterations (default 10)\n");
        printf("arg19: memory for rotating buffer (default 0, size in MB)\n");
        exit(1);
    }
@@ -76,11 +77,13 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
    int n_warmup      = 1;
    int n_iter        = 10;
    uint64_t rotating = 0;
-   if(argc == 18)
-   {
-       n_warmup = std::stoi(argv[16]);
-       n_iter   = std::stoi(argv[17]);
-       rotating = std::stoull(argv[18]) * 1024 * 1024;
-   }
    int KBatch        = 1;
    if(argc == 20)
    {
        KBatch   = std::stoi(argv[16]);
        n_warmup = std::stoi(argv[17]);
        n_iter   = std::stoi(argv[18]);
        rotating = std::stoull(argv[19]) * 1024 * 1024;
    }
    using F32 = float;
@@ -146,6 +149,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
                                          (StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
                                          (StrideD1 < 0) ? DefaultStrideD1 : StrideD1,
                                          (StrideE < 0) ? DefaultStrideE : StrideE,
                                          KBatch,
                                          n_warmup,
                                          n_iter,
                                          rotating);
......
@@ -171,6 +171,10 @@ int profile_gemm_universal(int argc, char* argv[])
    {
        return profile(BF16{}, BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
    }
    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F8_F8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
@@ -81,7 +81,6 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);

    ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]);
-   split_k = std::max(1, split_k);

    using F32 = float;
    using F16 = ck::half_t;
......
@@ -29,6 +29,12 @@ enum struct ConvDataType
    BF8_F8_F8, // 7
};

enum struct IndexType
{
    INDEX_T,      // 0
    LONG_INDEX_T, // 1
};

#define OP_NAME "grouped_conv_fwd"
#define OP_DESC "Grouped Convolution Forward"
@@ -45,12 +51,13 @@ static void print_helper_msg()
       << " 5: Input bf8, Weight bf8, Output fp8\n"
       << " 6: Input fp8, Weight bf8, Output fp8\n"
       << " 7: Input bf8, Weight fp8, Output fp8)\n"
-      << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
       << "arg3: indexing data type (0: 32-bit, 1: 64-bit)\n"
       << "arg4: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
       << "      1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n"
-      << "arg4: verification (0: no, 1: yes)\n"
-      << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
-      << "arg6: print tensor value (0: no; 1: yes)\n"
-      << "arg7: time kernel (0: no, 1: yes)\n"
       << "arg5: verification (0: no, 1: yes)\n"
       << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n"
       << "arg7: print tensor value (0: no; 1: yes)\n"
       << "arg8: time kernel (0: no, 1: yes)\n"
       << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
    // clang-format on
}
@@ -60,7 +67,7 @@ static void print_helper_msg()
int profile_grouped_conv_fwd(int argc, char* argv[])
{
    // 8 for control, 1 for num_dim_spatial
-   if(argc < 9)
    if(argc < 10)
    {
        print_helper_msg();
        return 1;
@@ -68,20 +75,21 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
    const auto data_type = static_cast<ConvDataType>(std::stoi(argv[2]));
    const auto layout    = static_cast<ConvLayout>(std::stoi(argv[3]));
-   const bool do_verification = std::stoi(argv[4]);
-   const int init_method      = std::stoi(argv[5]);
-   const bool do_log          = std::stoi(argv[6]);
-   const bool time_kernel     = std::stoi(argv[7]);
-   const int num_dim_spatial  = std::stoi(argv[8]);
    const auto index_type      = static_cast<IndexType>(std::stoi(argv[4]));
    const bool do_verification = std::stoi(argv[5]);
    const int init_method      = std::stoi(argv[6]);
    const bool do_log          = std::stoi(argv[7]);
    const bool time_kernel     = std::stoi(argv[8]);
    const int num_dim_spatial  = std::stoi(argv[9]);

-   // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
-   if(argc != 8 + 1 + 4 + 6 * num_dim_spatial)
    // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial
    if(argc != 9 + 1 + 4 + 6 * num_dim_spatial)
    {
        print_helper_msg();
        return 1;
    }

-   const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv);
    const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv);

    using F32 = float;
    using F16 = ck::half_t;
@@ -138,18 +146,43 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
        using AComputeType = decltype(a_compute_type);
        using BComputeType = decltype(b_compute_type);

-       bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
-                                                               InLayout,
-                                                               WeiLayout,
-                                                               OutLayout,
-                                                               InDataType,
-                                                               WeiDataType,
-                                                               OutDataType,
-                                                               AComputeType,
-                                                               BComputeType>(
-           do_verification, init_method, do_log, time_kernel, params);
-       return pass ? 0 : 1;
        if(index_type == IndexType::INDEX_T)
        {
            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
                                                                    InLayout,
                                                                    WeiLayout,
                                                                    OutLayout,
                                                                    InDataType,
                                                                    WeiDataType,
                                                                    OutDataType,
                                                                    AComputeType,
                                                                    BComputeType,
                                                                    ck::index_t>(
                do_verification, init_method, do_log, time_kernel, params);
            return pass ? 0 : 1;
        }
        else if(index_type == IndexType::LONG_INDEX_T)
        {
            bool pass = ck::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
                                                                    InLayout,
                                                                    WeiLayout,
                                                                    OutLayout,
                                                                    InDataType,
                                                                    WeiDataType,
                                                                    OutDataType,
                                                                    AComputeType,
                                                                    BComputeType,
                                                                    ck::long_index_t>(
                do_verification, init_method, do_log, time_kernel, params);
            return pass ? 0 : 1;
        }
        else
        {
            std::cout << "this indexing data type is not implemented" << std::endl;
            return 1;
        }
    };

    // GNHWC_GKYXC_GNHWK
......
@@ -85,9 +85,11 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
    const auto StrideCs = argToIntArray(argv[13]);
    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;

    using F32  = float;
    using F16  = ck::half_t;
-   using F8   = ck::f8_t;
#if defined(CK_ENABLE_FP8)
    using F8   = ck::f8_t;
#endif
    using BF16 = ck::bhalf_t;
    using I8   = int8_t;
......
@@ -62,17 +62,13 @@ def parse_instances(str_instances: List[str]) -> List[CKGemmOperation]:
            i_current = i_next + 1
        if i_next == -1:
            break
        # pad with `None`s for the fields which are not defined in the instance
        template_args.insert(2, tuple())  # ds layout
        template_args.insert(6, tuple())  # ds dtype
        new_instance = CKGemmOperation(
            *template_args,  # type: ignore[arg-type]
-           *((None,) * (len(fields(CKGemmOperation)) - len(template_args))),
        )
-       # the last 2 template parameters are optional
-       # if they are absent, substitute them with default values from Universal Gemm C++ template declaration
-       if new_instance.a_compute_dtype is None:
-           new_instance.a_compute_dtype = new_instance.c_element_dtype
-       if new_instance.b_compute_dtype is None:
-           new_instance.b_compute_dtype = new_instance.c_element_dtype
        op_instances.append(new_instance)
    return op_instances
@@ -208,6 +204,8 @@ def gen_ops_preselected() -> List[CKGemmOperation]:
        a_layout="Row",
        b_layout="Col",
        c_layout="Row",
        ds_element_dtypes=tuple(),
        ds_layouts=tuple(),
        a_element_dtype="F16",
        b_element_dtype="F16",
        c_element_dtype="F16",
......
@@ -10,10 +10,12 @@ class CKGemmOperation:
    a_layout: str
    b_layout: str
    ds_layouts: Tuple[str]  # addmm specific
    c_layout: str
    a_element_dtype: str
    b_element_dtype: str
    ds_element_dtypes: Tuple[str]  # addmm specific
    c_element_dtype: str
    acc_dtype: str
@@ -64,16 +66,15 @@ class CKGemmOperation:
        Tuple[int, int, int, int]
    )
    c_shuffle_block_transfer_scalar_per_vector_n_per_block: int
    block_gemm_pipeline_scheduler: str
-   block_gemm_pipeline_version: Optional[str]
    block_gemm_pipeline_version: str
-   a_compute_dtype: Optional[str]
-   b_compute_dtype: Optional[str]
    a_compute_dtype: Optional[str] = None
    b_compute_dtype: Optional[str] = None

    def name(self):
        # cpp alias for template instance
-       return f"ck_devicegemm_xdl_shuffle_v3_{self.key_name()}"
        return f"ck_devicegemm_multid_xdl_shuffle_v3_{self.key_name()}"

    def key_name(self):
        # TBD; must be unique per instance. Intended to use as dict key
......
# SPDX-License-Identifier: MIT
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
# Convert miopen driver command to ck Profiler
# Example: python3 ../script/convert_miopen_driver_to_profiler.py
# /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
# -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 32 -F 1 -t 1
import argparse
import subprocess
def init_const_args(args):
args.ck_profiler_cmd = '../build/bin/ckProfiler'
# use decimal values
args.init_method = 2
# don't print tensor values
args.log_value = 0
def run_ck_profiler_cmd(cmd):
print("ckProfiler command:")
print(cmd)
subprocess.run(cmd)
def parse_data_type(args):
if args.data_type == "fp32":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 0
if args.data_type == "fp16":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 1
if args.data_type == "int8":
if args.ck_profier_op == "grouped_conv_bwd_weight":
args.data_type = 4
if args.ck_profier_op == "grouped_conv_bwd_data":
print('Not supported data type for grouped_conv_bwd_data')
exit(1)
if args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 3
if args.data_type == "bfp16":
if args.ck_profier_op == "grouped_conv_bwd_weight" or \
args.ck_profier_op == "grouped_conv_bwd_data" or \
args.ck_profier_op == "grouped_conv_fwd":
args.data_type = 2
def add_conv_params_to_cmd(args, cmd):
    # Append, in ckProfiler order: filter sizes, input sizes, strides,
    # dilations, left padding, right padding (same value used for both sides).
    if args.spatial_dim == 1:
        cmd += [str(args.fil_w), str(args.in_w)]
        cmd += [str(args.conv_stride_w), str(args.dilation_w)]
        cmd += [str(args.pad_w), str(args.pad_w)]
    elif args.spatial_dim == 2:
        cmd += [str(args.fil_h), str(args.fil_w)]
        cmd += [str(args.in_h), str(args.in_w)]
        cmd += [str(args.conv_stride_h), str(args.conv_stride_w)]
        cmd += [str(args.dilation_h), str(args.dilation_w)]
        cmd += [str(args.pad_h), str(args.pad_w)]
        cmd += [str(args.pad_h), str(args.pad_w)]
    elif args.spatial_dim == 3:
        cmd += [str(args.fil_d), str(args.fil_h), str(args.fil_w)]
        cmd += [str(args.in_d), str(args.in_h), str(args.in_w)]
        cmd += [str(args.conv_stride_d), str(args.conv_stride_h)]
        cmd += [str(args.conv_stride_w)]
        cmd += [str(args.dilation_d),
                str(args.dilation_h),
                str(args.dilation_w)]
        cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
        cmd += [str(args.pad_d), str(args.pad_h), str(args.pad_w)]
    else:
        print('Unsupported spatial_dim (supported: 1, 2, 3)')
        exit(1)
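As a concrete check, for the 2-D example in the header comment (-y 3 -x 3, -H 28 -W 28, -u 2 -v 2, -l 1 -j 1, -p 1 -q 1) this helper appends the following tokens, in order:

# filter 3x3, input 28x28, strides 2 2, dilations 1 1,
# left pads 1 1, right pads 1 1
['3', '3', '28', '28', '2', '2', '1', '1', '1', '1', '1', '1']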
def run_ck_grouped_conv_fwd(args):
    args.ck_profiler_op = "grouped_conv_fwd"
    parse_data_type(args)
    # MIOpen's default layout, NHWGC
    args.layout = 1
    # use int32 indexing by default
    args.index_type = 0
    cmd = [str(args.ck_profiler_cmd), str(args.ck_profiler_op)]
    cmd += [str(args.data_type), str(args.layout), str(args.index_type)]
    cmd += [str(args.verify), str(args.init_method)]
    cmd += [str(args.log_value), str(args.time)]
    cmd += [str(args.spatial_dim), str(args.group_count)]
    cmd += [str(args.batchsize), str(args.out_channels)]
    cmd += [str(args.in_channels)]
    add_conv_params_to_cmd(args, cmd)
    run_ck_profiler_cmd(cmd)
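Tracing the header example through this function (conv → fp32, -F 1, -t 1, -g 32, -n 32, -c 64, -k 64, remaining flags at their defaults, channels already divided by group_count in run_ck_profiler below), the assembled command would be roughly:

['../build/bin/ckProfiler', 'grouped_conv_fwd',
 '0', '1', '0',       # data_type=fp32, layout=NHWGC, index_type=int32
 '1', '2', '0', '1',  # verify, init_method, log_value, time
 '2', '32',           # spatial_dim, group_count
 '32', '2', '2',      # batchsize, out_channels per group, in_channels per group
 '3', '3', '28', '28', '2', '2', '1', '1', '1', '1', '1', '1']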
def run_ck_grouped_conv_bwd_data(args):
    args.ck_profiler_op = "grouped_conv_bwd_data"
    parse_data_type(args)
    # MIOpen's default layout, NHWGC
    args.layout = 1
    cmd = [str(args.ck_profiler_cmd), str(args.ck_profiler_op)]
    cmd += [str(args.data_type), str(args.layout)]
    cmd += [str(args.verify), str(args.init_method)]
    cmd += [str(args.log_value), str(args.time)]
    cmd += [str(args.spatial_dim), str(args.group_count)]
    cmd += [str(args.batchsize), str(args.out_channels)]
    cmd += [str(args.in_channels)]
    add_conv_params_to_cmd(args, cmd)
    run_ck_profiler_cmd(cmd)
def run_ck_grouped_conv_bwd_weight(args):
    args.ck_profiler_op = "grouped_conv_bwd_weight"
    parse_data_type(args)
    # MIOpen's default layout, NHWGC
    args.layout = 2
    # -1 tests every split-K value from the list {1, 2, 4, 8, 32, 64, 128}
    args.split_k_value = -1
    cmd = [str(args.ck_profiler_cmd), str(args.ck_profiler_op)]
    cmd += [str(args.data_type), str(args.layout)]
    cmd += [str(args.verify), str(args.init_method)]
    cmd += [str(args.log_value), str(args.time)]
    cmd += [str(args.spatial_dim), str(args.group_count)]
    cmd += [str(args.batchsize), str(args.out_channels)]
    cmd += [str(args.in_channels)]
    add_conv_params_to_cmd(args, cmd)
    cmd += [str(args.split_k_value)]
    run_ck_profiler_cmd(cmd)
# Get the MIOpen driver name and remove it from the unknown args
def process_miopen_driver_name(args, unknown):
    if "convint8" in unknown:
        args.data_type = 'int8'
        unknown.remove("convint8")
    elif "convbfp16" in unknown:
        args.data_type = 'bfp16'
        unknown.remove("convbfp16")
    elif "convfp16" in unknown:
        args.data_type = 'fp16'
        unknown.remove("convfp16")
    elif "conv" in unknown:
        args.data_type = 'fp32'
        unknown.remove("conv")
    else:
        print('Unsupported driver (supported: conv, convfp16, convint8,'
              ' convbfp16).')
        exit(1)
def run_ck_profiler(args):
    # MIOpen takes the channel count across all groups; the CK profiler
    # takes the channel count per group.
    args.in_channels = args.in_channels // args.group_count
    args.out_channels = args.out_channels // args.group_count
    if args.forw in (0, 1, 3, 5):
        run_ck_grouped_conv_fwd(args)
    if args.forw in (0, 2, 3, 6):
        run_ck_grouped_conv_bwd_data(args)
    if args.forw in (0, 4, 5, 6):
        run_ck_grouped_conv_bwd_weight(args)
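Restating the -F dispatch above as a table (illustrative only; MIOpen's bwd maps to ckProfiler's grouped_conv_bwd_data and wrw to grouped_conv_bwd_weight):

# forw -> kernels profiled:
#   0: fwd + bwd_data + bwd_weight    3: fwd + bwd_data
#   1: fwd                            5: fwd + bwd_weight
#   2: bwd_data                       6: bwd_data + bwd_weight
#   4: bwd_weight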
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="converter",
        description="Convert an MIOpen driver command to a ckProfiler command"
        "\nExample: python3 "
        "../script/convert_miopen_driver_to_profiler.py "
        "/opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 "
        "-k 64 -y 3 -x 3 -p 1 -q 1 -u 1 -v 1 -l 1 -j 1 -m conv -g "
        "32 -F 1 -t 1",
    )
    parser.add_argument(
        "-in_layout",
        "-I",
        default=-1,
        type=int,
        required=False,
        help="Input Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)"
    )
    parser.add_argument(
        "-forw",
        "-F",
        default=0,
        type=int,
        required=False,
        help="Flag selecting which of the fwd, bwd, wrw convolutions to run"
        "\n0 fwd+bwd+wrw (default)"
        "\n1 fwd only"
        "\n2 bwd only"
        "\n4 wrw only"
        "\n3 fwd+bwd"
        "\n5 fwd+wrw"
        "\n6 bwd+wrw"
    )
    parser.add_argument(
        "-spatial_dim",
        "-_",
        default=2,
        type=int,
        required=False,
        help="convolution spatial dimension (Default=2)"
    )
    parser.add_argument(
        "-batchsize",
        "-n",
        default=100,
        type=int,
        required=False,
        help="Mini-batch size (Default=100)"
    )
    parser.add_argument(
        "-in_channels",
        "-c",
        default=3,
        type=int,
        required=False,
        help="Number of Input Channels (Default=3)"
    )
    parser.add_argument(
        "-in_d",
        "-!",
        default=32,
        type=int,
        required=False,
        help="Input Depth (Default=32)"
    )
    parser.add_argument(
        "-in_h",
        "-H",
        default=32,
        type=int,
        required=False,
        help="Input Height (Default=32)"
    )
    parser.add_argument(
        "-in_w",
        "-W",
        default=32,
        type=int,
        required=False,
        help="Input Width (Default=32)"
    )
    parser.add_argument(
        "-out_channels",
        "-k",
        default=32,
        type=int,
        required=False,
        help="Number of Output Channels (Default=32)"
    )
    parser.add_argument(
        "-fil_d",
        "-@",
        default=3,
        type=int,
        required=False,
        help="Filter Depth (Default=3)"
    )
    parser.add_argument(
        "-fil_h",
        "-y",
        default=3,
        type=int,
        required=False,
        help="Filter Height (Default=3)"
    )
    parser.add_argument(
        "-fil_w",
        "-x",
        default=3,
        type=int,
        required=False,
        help="Filter Width (Default=3)"
    )
    parser.add_argument(
        "-conv_stride_d",
        "-#",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Depth (Default=1)"
    )
    parser.add_argument(
        "-conv_stride_h",
        "-u",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Height (Default=1)"
    )
    parser.add_argument(
        "-conv_stride_w",
        "-v",
        default=1,
        type=int,
        required=False,
        help="Convolution Stride for Width (Default=1)"
    )
    parser.add_argument(
        "-pad_d",
        "-$",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Depth (Default=1)"
    )
    parser.add_argument(
        "-pad_h",
        "-p",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Height (Default=1)"
    )
    parser.add_argument(
        "-pad_w",
        "-q",
        default=1,
        type=int,
        required=False,
        help="Zero Padding for Width (Default=1)"
    )
    parser.add_argument(
        "-verify",
        "-V",
        default=1,
        type=int,
        required=False,
        help="Verify Each Layer (Default=1)"
    )
    parser.add_argument(
        "-time",
        "-t",
        default=0,
        type=int,
        required=False,
        help="Time Each Layer (Default=0)"
    )
    parser.add_argument(
        "-dilation_d",
        "-^",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Depth (Default=1)"
    )
    parser.add_argument(
        "-dilation_h",
        "-l",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Height (Default=1)"
    )
    parser.add_argument(
        "-dilation_w",
        "-j",
        default=1,
        type=int,
        required=False,
        help="Dilation of Filter Width (Default=1)"
    )
    parser.add_argument(
        "-group_count",
        "-g",
        default=1,
        type=int,
        required=False,
        help="Number of Groups (Default=1)"
    )
    args, unknown = parser.parse_known_args()
    init_const_args(args)
    process_miopen_driver_name(args, unknown)
    print("Ignored args:")
    print(unknown)
    run_ck_profiler(args)
...@@ -122,7 +122,7 @@ def parse_logfile(logfile): ...@@ -122,7 +122,7 @@ def parse_logfile(logfile):
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1)) test_list=list(range(1,len(tests)+1))
#parse conv_fwd and conv_bwd performance tests: #parse conv_fwd and conv_bwd performance tests:
elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile: elif 'conv_fwd' in logfile or 'conv_bwd' in logfile:
for line in open(logfile): for line in open(logfile):
if 'tflops:' in line: if 'tflops:' in line:
lst=line.split() lst=line.split()
...@@ -143,6 +143,12 @@ def parse_logfile(logfile): ...@@ -143,6 +143,12 @@ def parse_logfile(logfile):
if 'Best Perf' in line: if 'Best Perf' in line:
lst=line.split() lst=line.split()
res.append(lst[36]) res.append(lst[36])
elif 'perf_fmha' in logfile:
for line in open(logfile):
if 'TFlops' in line:
lst=line.split()
line_dict=dict(zip(lst[1:],lst))
res.append(line_dict['TFlops,'])
return res return res
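The fmha branch above leans on a slightly opaque idiom: dict(zip(lst[1:], lst)) maps every token of the split line to the token preceding it, so indexing with the label 'TFlops,' retrieves the number printed just before that label. A small self-contained illustration (hypothetical log tokens):

# suppose a log line "... 95.3 TFlops, 410 GB/s" splits into:
lst = ['95.3', 'TFlops,', '410', 'GB/s']
line_dict = dict(zip(lst[1:], lst))  # {'TFlops,': '95.3', '410': 'TFlops,', 'GB/s': '410'}
print(line_dict['TFlops,'])          # prints 95.3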
...@@ -268,14 +274,26 @@ def main(): ...@@ -268,14 +274,26 @@ def main():
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_grouped_gemm_tflops" table_name="ck_grouped_gemm_tflops"
if 'conv_fwd' in filename: if 'perf_conv_fwd' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_conv_fwd_tflops" table_name="ck_conv_fwd_tflops"
if 'conv_bwd_data' in filename: if 'perf_conv_bwd_data' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_conv_bwd_data_tflops" table_name="ck_conv_bwd_data_tflops"
if 'grouped_conv_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_fwd_tflops"
if 'grouped_conv_bwd_data' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_data_tflops"
if 'grouped_conv_bwd_weight' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_grouped_conv_bwd_weight_tflops"
if 'gemm_bilinear' in filename: if 'gemm_bilinear' in filename:
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
...@@ -304,6 +322,14 @@ def main(): ...@@ -304,6 +322,14 @@ def main():
for i in range(1,len(results)+1): for i in range(1,len(results)+1):
testlist.append("Test%i"%i) testlist.append("Test%i"%i)
table_name="ck_mixed_gemm_tflops" table_name="ck_mixed_gemm_tflops"
if 'fmha_fwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_fmha_fwd_tflops"
if 'fmha_bwd' in filename:
for i in range(1,len(results)+1):
testlist.append("Test%i"%i)
table_name="ck_fmha_bwd_tflops"
tflops_base = get_baseline(table_name,conn) tflops_base = get_baseline(table_name,conn)
store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn) store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn)
......
...@@ -13,3 +13,20 @@ ...@@ -13,3 +13,20 @@
python3 process_perf_data.py perf_gemm.log python3 process_perf_data.py perf_gemm.log
python3 process_perf_data.py perf_resnet50_N256.log python3 process_perf_data.py perf_resnet50_N256.log
python3 process_perf_data.py perf_resnet50_N4.log python3 process_perf_data.py perf_resnet50_N4.log
file=./perf_fmha_fwd_gfx942.log
if [ -e "$file" ]; then
    python3 process_perf_data.py "$file"
fi
file=./perf_fmha_bwd_gfx942.log
if [ -e "$file" ]; then
    python3 process_perf_data.py "$file"
fi
file=./perf_fmha_fwd_gfx90a.log
if [ -e "$file" ]; then
    python3 process_perf_data.py "$file"
fi
file=./perf_fmha_bwd_gfx90a.log
if [ -e "$file" ]; then
    python3 process_perf_data.py "$file"
fi