Commit 41b920e2 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'amd-develop' into amd-master

parents 874a78f9 5d718e6b
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_2d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 2>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<2>{});
}
void add_device_permute_scale_2d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 2>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<2>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_3d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 3>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<3>{});
}
void add_device_permute_scale_3d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 3>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<3>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_4d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<4>{});
}
void add_device_permute_scale_4d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<4>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_5d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 5>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<5>{});
}
void add_device_permute_scale_5d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 5>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<5>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_permute_scale_6d_f16_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 6>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f16_instances<6>{});
}
void add_device_permute_scale_6d_f32_instances(
std::vector<std::unique_ptr<
DeviceElementwise<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 6>>>& instances)
{
add_device_operation_instances(instances, device_permute_scale_f32_instances<6>{});
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
......@@ -37,9 +37,9 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s
################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads
./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1
```
Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
```
```bash
in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
......@@ -104,6 +104,7 @@ arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2}
arg.e_grid_desc_m_n_{ 4096, 4096}
....
Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
```
## Profile grouped convolution backward data kernels
```bash
# arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data)
......@@ -129,10 +130,11 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler grouped_conv_bwd_data 1 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1
```
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```
```bash
out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192}
in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192}
......@@ -173,10 +175,11 @@ GB/s: 127.947
################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx SplitK
./bin/ckProfiler grouped_conv_bwd_weight 1 1 0 1 0 1 2 32 256 256 512 3 3 28 28 1 1 1 1 1 0 0 0 1
```
```
Result (MI100, FP16, GNHWC_GKYXC_GNHWK)
```
```bash
input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024}
weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024}
output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512}
......@@ -190,8 +193,9 @@ GB/s: 69.2301
Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
## Profile image to column/column to image kernels
```bash
# arg1: tensor operation (" OP_NAME ": " OP_DESC ")
# arg1: tensor operation ( conv_tensor_rearrange : Conv Tensor Rearrange )
# arg2: data type (0: Input fp32, Weight fp32, Output fp32
# 1: Input fp16, Weight fp16, Output fp16
# 2: Input bf16, Weight bf16, Output bf16
......@@ -216,10 +220,11 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate
################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx
./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0
```
```
Result (MI210, FP32, NHWC)
```
```bash
input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512}
output: dim 2, lengths {173056, 4608}, strides {4608, 1}
....
......@@ -229,3 +234,30 @@ avg_time: 3.12326
GB/s: 2042.59
```
Note: Column to image kernel adds to the output memory, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time.
## Profile Permute scale kernels
```bash
# arg1: tensor operation ( permute_scale : Permute Scale )
# arg2: data type (0: Input fp32, Output fp32
# 1: Input fp16, Output fp16
# arg4: verification (0: no, 1: yes)
# arg5: initialization (0: no init, 1: integer value, 2: decimal value)
# arg6: print tensor value (0: no; 1: yes)
# arg7: time kernel (0: no, 1: yes)
# from arg8: tensor lengths
# input strides
# output strides
################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2
./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096
```
Result (MI100, FP32)
```bash
A: dim 3, lengths {64, 64, 64}, strides {4096, 64, 1}
B: dim 3, lengths {64, 64, 64}, strides {1, 64, 4096}
....
Best perf = 0.0146878 ms, 142.782 GB/s, DeviceElementwiseNormalizationImpl<3, 2>
```
......@@ -233,7 +233,7 @@ bool profile_elementwise_layernorm_impl(int do_verification,
y_dev.FromDevice(y.mData.data());
bool pass =
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);
ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 5e-3, 5e-3);
if(do_log)
{
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::Add>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_relu_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddRelu = ck::tensor_operation::element_wise::AddRelu;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddRelu;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::AddRelu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_silu.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename EDataType,
typename ALayout,
typename BLayout,
typename D0Layout,
typename ELayout>
bool profile_gemm_add_silu_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
int K,
int StrideA,
int StrideB,
int StrideD0,
int StrideE)
{
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<D0DataType> d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d0_m_n.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d0_m_n.GenerateTensorValue(GeneratorTensor_3<D0DataType>{0.0, 1.0});
}
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using AddRelu = ck::tensor_operation::element_wise::AddSilu;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AddRelu;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
ALayout,
BLayout,
ck::Tuple<D0Layout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<D0DataType>,
EDataType,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::AddSilu>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// run reference
if(do_verification)
{
Tensor<AccDataType> c_m_n({M, N});
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m = 0; m < M; ++m)
{
for(int n = 0; n < N; ++n)
{
cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n));
}
}
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_m_n_device_buf.ToDevice(d0_m_n.mData.data());
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
bool pass = true;
// profile device operation instances
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(
a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d0_m_n_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
// re-init E to zero before profiling a kernel
e_device_buf.SetZero();
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return pass;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/fill.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
namespace ck {
namespace profiler {
template <typename ADataType,
typename BDataType,
typename CDataType,
typename AccDataType,
typename ALayout,
typename BLayout,
typename CLayout>
bool profile_grouped_gemm_fixed_nk_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const std::vector<int>& Ms,
const std::vector<int>& Ns,
const std::vector<int>& Ks,
const std::vector<int>& StrideAs,
const std::vector<int>& StrideBs,
const std::vector<int>& StrideCs,
int kbatch = 1,
int n_warmup = 1,
int n_iter = 10)
{
bool pass = true;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
using namespace ck::literals;
if(is_same<decltype(layout), tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
std::size_t group_count = Ms.size();
if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() &&
group_count == StrideBs.size() && group_count == StrideCs.size()))
{
throw std::runtime_error("wrong! inconsistent M/N/Ks, StrideA/B/Cs size\n");
}
std::vector<Tensor<ADataType>> a_m_k;
std::vector<Tensor<BDataType>> b_k_n;
std::vector<Tensor<CDataType>> c_m_n_host_results;
std::vector<Tensor<CDataType>> c_m_n_device_results;
for(std::size_t i = 0; i < group_count; i++)
{
a_m_k.push_back(
Tensor<ADataType>(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{})));
b_k_n.push_back(
Tensor<BDataType>(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{})));
c_m_n_device_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
c_m_n_host_results.push_back(
Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));
#if DEBUG_LOG
std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].mDesc << std::endl;
#endif // DEBUG_LOG
std::size_t num_thread = 1;
switch(init_method)
{
case 0: break;
case 1:
a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
break;
default:
a_m_k[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
b_k_n[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
}
}
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto c_element_op = CElementOp{};
using DeviceMemPtr = std::unique_ptr<DeviceMem>;
std::vector<DeviceMemPtr> a_device_buf, b_device_buf, c_device_buf;
a_device_buf.reserve(group_count);
b_device_buf.reserve(group_count);
c_device_buf.reserve(group_count);
std::vector<const void*> p_a, p_b;
std::vector<void*> p_c;
p_a.reserve(group_count);
p_b.reserve(group_count);
p_c.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(std::size_t i = 0; i < group_count; i++)
{
a_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize()));
b_device_buf.emplace_back(
std::make_unique<DeviceMem>(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize()));
c_device_buf.emplace_back(std::make_unique<DeviceMem>(
sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize()));
a_device_buf[i]->ToDevice(a_m_k[i].mData.data());
b_device_buf[i]->ToDevice(b_k_n[i].mData.data());
gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}});
p_a.push_back(a_device_buf[i]->GetDeviceBuffer());
p_b.push_back(b_device_buf[i]->GetDeviceBuffer());
p_c.push_back(c_device_buf[i]->GetDeviceBuffer());
grouped_gemm_kernel_args_.push_back({a_device_buf[i]->GetDeviceBuffer(),
b_device_buf[i]->GetDeviceBuffer(),
{},
c_device_buf[i]->GetDeviceBuffer(),
Ms[i],
Ns[i],
Ks[i],
StrideAs[i],
StrideBs[i],
{},
StrideCs[i]});
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
BLayout,
ck::Tuple<>,
CLayout,
ADataType,
BDataType,
ck::Tuple<>,
CDataType,
AElementOp,
BElementOp,
CElementOp>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
if(op_ptrs.size() <= 0)
{
throw std::runtime_error("wrong! no device GEMM instance found");
}
std::string best_gemm_name;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
float best_kbatch = 0;
auto p_ds = std::vector<std::array<const void*, 0>>{};
if(do_verification)
{
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(a_m_k[i],
b_k_n[i],
c_m_n_host_results[i],
a_element_op,
b_element_op,
c_element_op);
ref_invoker.Run(ref_argument);
}
}
// profile device GEMM instances
for(auto& gemm_ptr : op_ptrs)
{
auto argument_ptr =
gemm_ptr->MakeArgumentPointer(p_a,
p_b,
p_ds,
p_c,
gemm_descs,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{});
auto invoker_ptr = gemm_ptr->MakeInvokerPointer();
DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
DeviceMem grouped_gemm_kernel_args_dev(
gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
gemm_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
hipMemcpyHostToDevice));
gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());
gemm_ptr->SetDeviceKernelArgs(argument_ptr.get(),
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
std::string gemm_name = gemm_ptr->GetTypeString();
std::vector<int> kbatch_list = {1, 2, 4, 8, 12, 16, 20, 24, 32, 48, 64};
if(kbatch > 0)
{
kbatch_list = {kbatch};
}
for(std::size_t j = 0; j < kbatch_list.size(); j++)
{
auto kbatch_curr = kbatch_list[j];
gemm_ptr->SetKBatch(argument_ptr.get(), kbatch_curr);
if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{
for(std::size_t i = 0; i < gemm_descs.size(); i++)
c_device_buf[i]->SetZero();
invoker_ptr->Run(argument_ptr.get(),
StreamConfig{nullptr, false, 0, n_warmup, n_iter});
if(do_verification)
{
bool instance_pass = true;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
if(std::is_same_v<CDataType, ck::half_t> && kbatch_curr > 1)
{
instance_pass =
instance_pass && ck::utils::check_err(c_m_n_device_results[i],
c_m_n_host_results[i],
"Error: Incorrect results!",
0.06);
}
else
{
instance_pass =
instance_pass && ck::utils::check_err(c_m_n_device_results[i],
c_m_n_host_results[i]);
}
if(do_log)
{
LogRangeAsType<float>(std::cout << "a : ", a_m_k[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(std::cout << "b: ", b_k_n[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "c_device: ", c_m_n_device_results[i].mData, ",")
<< std::endl;
LogRangeAsType<float>(
std::cout << "c_host : ", c_m_n_host_results[i].mData, ",")
<< std::endl;
}
}
std::cout << "Instance: " << gemm_name << " verification "
<< (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
pass = pass && instance_pass;
}
float ave_time = invoker_ptr->Run(
argument_ptr.get(), StreamConfig{nullptr, time_kernel, 0, n_warmup, n_iter});
if(time_kernel)
{
std::size_t flop = 0, num_btype = 0;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
sizeof(BDataType) * Ks[i] * Ns[i] +
sizeof(CDataType) * Ms[i] * Ns[i];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << ", KBatch "
<< kbatch_curr << std::endl;
if(tflops > best_tflops)
{
best_gemm_name = gemm_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
best_kbatch = kbatch_curr;
}
}
}
else
{
std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
<< std::endl;
}
}
}
if(time_kernel)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_gemm_name << ", KBatch = " << best_kbatch
<< std::endl;
}
return pass;
}
} // namespace profiler
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
......@@ -21,68 +21,47 @@
#include "ck/library/utility/literals.hpp"
namespace ck {
template <typename HostTensorA, typename HostTensorB, typename FunctorA, typename FunctorB>
void host_elementwise4D(HostTensorB& B_nhwc,
const HostTensorA& A_nchw,
FunctorA functor_a,
FunctorB functor_b,
float scale)
template <typename HostTensorA,
typename HostTensorB,
typename AElementOp,
typename BElementOp,
typename ScaleElementOp>
void reference_permute_scale(HostTensorB& b_tensor,
const HostTensorA& a_tensor,
AElementOp a_tensor_op,
BElementOp b_tensor_op,
ScaleElementOp scale_op)
{
std::size_t N = A_nchw.mDesc.GetLengths()[0];
std::size_t C = A_nchw.mDesc.GetLengths()[1];
std::size_t H = A_nchw.mDesc.GetLengths()[2];
std::size_t W = A_nchw.mDesc.GetLengths()[3];
for(std::size_t w = 0; w < W; ++w)
for(std::size_t h = 0; h < H; ++h)
for(std::size_t c = 0; c < C; ++c)
for(std::size_t n = 0; n < N; ++n)
{
using tmp_type = ck::remove_reference_t<decltype(B_nhwc(0, 0))>;
tmp_type tmp_val = 0;
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
functor_b(tmp_val, a_val);
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
scale * tmp_val);
}
b_tensor.ForEach([&](auto& self, auto idx) {
auto tmp_val = a_tensor(idx);
b_tensor_op(tmp_val, tmp_val);
scale_op(tmp_val, tmp_val);
a_tensor_op(self(idx), tmp_val);
});
}
namespace profiler {
template <typename ADataType, typename BDataType, index_t NumDim>
bool test_permute_scale_impl(int do_verification,
bool profile_permute_scale_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
std::vector<index_t> lengths)
std::vector<index_t> lengths_vector,
std::vector<index_t> input_strides_vector,
std::vector<index_t> output_strides_vector)
{
bool pass = true;
bool instance_found = false;
using ElementOp = ck::tensor_operation::element_wise::PassThrough;
using UnaryOp = ck::tensor_operation::element_wise::UnarySquare;
using Scale = ck::tensor_operation::element_wise::Scale;
float scale = 2.f;
index_t N = lengths[0];
index_t C = lengths[1];
index_t H = lengths[2];
index_t W = lengths[3];
std::vector<ck::index_t> nchw = {N, C, H, W};
std::vector<ck::index_t> nhwc = {N, H, W, C};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
Tensor<BDataType> host_b(nhwc);
std::array<ck::index_t, 4> ab_lengths;
std::array<ck::index_t, 4> a_strides = {1,
static_cast<int>(nchw[0]),
static_cast<int>(nchw[0] * nchw[1]),
static_cast<int>(nchw[0] * nchw[1] * nchw[2])};
std::array<ck::index_t, 4> b_strides = {1,
static_cast<int>(nhwc[0] * nhwc[1] * nhwc[2]),
static_cast<int>(nhwc[0]),
static_cast<int>(nhwc[0] * nhwc[1])};
ck::ranges::copy(nchw, ab_lengths.begin());
Tensor<ADataType> a(lengths_vector, input_strides_vector);
Tensor<BDataType> b(lengths_vector, output_strides_vector);
Tensor<BDataType> host_b(lengths_vector, output_strides_vector);
std::cout << "A: " << a.mDesc << std::endl;
std::cout << "B: " << b.mDesc << std::endl;
......@@ -91,19 +70,7 @@ bool test_permute_scale_impl(int do_verification,
{
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
auto i = 0;
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
for(std::size_t n = 0; n < a.mDesc.GetLengths()[0]; ++n)
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i = dis(gen);
}
default: a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
......@@ -133,14 +100,22 @@ bool test_permute_scale_impl(int do_verification,
if(do_verification)
{
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
reference_permute_scale(host_b, a, ElementOp{}, UnaryOp{}, Scale{scale});
}
auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
std::array<ck::index_t, NumDim> lengths{};
std::array<ck::index_t, NumDim> input_strides{};
std::array<ck::index_t, NumDim> output_strides{};
copy(lengths_vector, lengths);
copy(input_strides_vector, input_strides);
copy(output_strides_vector, output_strides);
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
{a_strides},
{b_strides},
auto argument_ptr = op_ptr->MakeArgumentPointer(lengths,
{input_strides},
{output_strides},
input,
output,
ElementOp{},
......@@ -151,8 +126,9 @@ bool test_permute_scale_impl(int do_verification,
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
b_device_buf.SetZero();
instance_found = true;
b_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
if(do_verification)
......@@ -174,10 +150,10 @@ bool test_permute_scale_impl(int do_verification,
float ave_time =
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3];
std::size_t flop = std::size_t(2) * a.mDesc.GetElementSpaceSize() / sizeof(ADataType);
std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) +
sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]);
std::size_t num_btype = sizeof(ADataType) * a.mDesc.GetElementSpaceSize() +
sizeof(BDataType) * b.mDesc.GetElementSpaceSize();
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
......@@ -201,12 +177,12 @@ bool test_permute_scale_impl(int do_verification,
}
if(time_kernel)
{
LogRange(std::cout << "length = ", lengths, ",") << ", ";
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
std::cout << "Best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return true;
return pass && instance_found;
}
} // namespace profiler
} // namespace ck
......@@ -32,6 +32,7 @@ set(PROFILER_SOURCES
profile_grouped_conv_bwd_data.cpp
profile_conv_tensor_rearrange.cpp
profile_transpose.cpp
profile_permute_scale.cpp
)
if(DL_KERNELS)
......@@ -43,11 +44,15 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
endif()
......@@ -96,6 +101,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_d
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
......@@ -109,7 +115,10 @@ if(DL_KERNELS)
endif()
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
......@@ -118,6 +127,7 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
endif()
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_add_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_add"
#define OP_DESC "GEMM+Add"
using INT8 = int8_t;
using BF16 = ck::bhalf_t;
int profile_gemm_add(int argc, char* argv[])
{
enum struct MatrixLayout
{
MK_KN_MN_MN, // 0
MK_NK_MN_MN, // 1
KM_KN_MN_MN, // 2
KM_NK_MN_MN, // 3
};
enum struct MatrixDataType
{
F16_INT8_F16_F16, // 0
BF16_INT8_BF16_BF16, // 1
};
if(argc != 15)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: f16&i8 1: bf16&i8)\n");
printf("arg3: matrix layout (0: E[m, n] = ReLU(A[m, k] * B[k, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = ReLU(A[m, k] * B[n, k] + D0[m, n]);\n");
printf(" 2: E[m, n] = ReLU(A[k, m] * B[k, n] + D0[m, n]);\n");
printf(" 3: E[m, n] = ReLU(A[k, m] * B[n, k] + D0[m, n]))\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<MatrixDataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideD0 = std::stoi(argv[13]);
const int StrideE = std::stoi(argv[14]);
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
// using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto e_type,
auto a_layout,
auto b_layout,
auto d0_layout,
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using EDataType = decltype(e_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using D0Layout = decltype(d0_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_add_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
EDataType,
ALayout,
BLayout,
D0Layout,
ELayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideE < 0) ? DefaultStrideE : StrideE);
return pass ? 0 : 1;
};
if(data_type == MatrixDataType::F16_INT8_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(F16{}, INT8{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::BF16_INT8_BF16_BF16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(BF16{}, INT8{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add);
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
......@@ -12,6 +12,9 @@
#define OP_NAME "gemm_add_fastgelu"
#define OP_DESC "GEMM+Add+FastGeLU"
using INT8 = int8_t;
using BF16 = ck::bhalf_t;
int profile_gemm_add_fastgelu(int argc, char* argv[])
{
enum struct MatrixLayout
......@@ -28,13 +31,15 @@ int profile_gemm_add_fastgelu(int argc, char* argv[])
F16_F16_F16_F16, // 1
BF16_BF16_BF16_BF16, // 2
INT8_INT8_INT8_INT8, // 3
F16_INT8_F16_F16, // 4
BF16_INT8_BF16_BF16, // 5
};
if(argc != 15)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f16&i8 5: bf16&i8)\n");
printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n]);\n");
printf(" 2: E[m, n] = FastGeLU(A[k, m] * B[k, n] + D0[m, n]);\n");
......@@ -135,6 +140,14 @@ int profile_gemm_add_fastgelu(int argc, char* argv[])
{
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_INT8_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(F16{}, INT8{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::BF16_INT8_BF16_BF16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(BF16{}, INT8{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_add_relu_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_add_relu"
#define OP_DESC "GEMM+Add+ReLU"
using INT8 = int8_t;
using BF16 = ck::bhalf_t;
int profile_gemm_add_relu(int argc, char* argv[])
{
enum struct MatrixLayout
{
MK_KN_MN_MN, // 0
MK_NK_MN_MN, // 1
KM_KN_MN_MN, // 2
KM_NK_MN_MN, // 3
};
enum struct MatrixDataType
{
F16_INT8_F16_F16, // 0
BF16_INT8_BF16_BF16, // 1
};
if(argc != 15)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: f16&i8 1: bf16&i8)\n");
printf("arg3: matrix layout (0: E[m, n] = ReLU(A[m, k] * B[k, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = ReLU(A[m, k] * B[n, k] + D0[m, n]);\n");
printf(" 2: E[m, n] = ReLU(A[k, m] * B[k, n] + D0[m, n]);\n");
printf(" 3: E[m, n] = ReLU(A[k, m] * B[n, k] + D0[m, n]))\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<MatrixDataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideD0 = std::stoi(argv[13]);
const int StrideE = std::stoi(argv[14]);
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
// using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto e_type,
auto a_layout,
auto b_layout,
auto d0_layout,
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using EDataType = decltype(e_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using D0Layout = decltype(d0_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_add_relu_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
EDataType,
ALayout,
BLayout,
D0Layout,
ELayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideE < 0) ? DefaultStrideE : StrideE);
return pass ? 0 : 1;
};
if(data_type == MatrixDataType::F16_INT8_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(F16{}, INT8{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::BF16_INT8_BF16_BF16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(BF16{}, INT8{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_relu);
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_gemm_add_silu_impl.hpp"
#include "profiler_operation_registry.hpp"
#define OP_NAME "gemm_add_silu"
#define OP_DESC "GEMM+Add+SiLU"
using INT8 = int8_t;
using BF16 = ck::bhalf_t;
int profile_gemm_add_silu(int argc, char* argv[])
{
enum struct MatrixLayout
{
MK_KN_MN_MN, // 0
MK_NK_MN_MN, // 1
KM_KN_MN_MN, // 2
KM_NK_MN_MN, // 3
};
enum struct MatrixDataType
{
F16_INT8_F16_F16, // 0
BF16_INT8_BF16_BF16, // 1
};
if(argc != 15)
{
// clang-format off
printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
printf("arg2: data type (0: f16&i8 1: bf16&i8)\n");
printf("arg3: matrix layout (0: E[m, n] = ReLU(A[m, k] * B[k, n] + D0[m, n]);\n");
printf(" 1: E[m, n] = ReLU(A[m, k] * B[n, k] + D0[m, n]);\n");
printf(" 2: E[m, n] = ReLU(A[k, m] * B[k, n] + D0[m, n]);\n");
printf(" 3: E[m, n] = ReLU(A[k, m] * B[n, k] + D0[m, n]))\n");
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD0, StrideE\n");
// clang-format on
exit(1);
}
const auto data_type = static_cast<MatrixDataType>(std::stoi(argv[2]));
const auto layout = static_cast<MatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const int M = std::stoi(argv[8]);
const int N = std::stoi(argv[9]);
const int K = std::stoi(argv[10]);
const int StrideA = std::stoi(argv[11]);
const int StrideB = std::stoi(argv[12]);
const int StrideD0 = std::stoi(argv[13]);
const int StrideE = std::stoi(argv[14]);
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
// using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto e_type,
auto a_layout,
auto b_layout,
auto d0_layout,
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using EDataType = decltype(e_type);
using ALayout = decltype(a_layout);
using BLayout = decltype(b_layout);
using D0Layout = decltype(d0_layout);
using ELayout = decltype(e_layout);
const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
const int DefaultStrideD0 = ck::is_same_v<D0Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
bool pass = ck::profiler::profile_gemm_add_silu_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
EDataType,
ALayout,
BLayout,
D0Layout,
ELayout>(
do_verification,
init_method,
do_log,
time_kernel,
M,
N,
K,
(StrideA < 0) ? DefaultStrideA : StrideA,
(StrideB < 0) ? DefaultStrideB : StrideB,
(StrideD0 < 0) ? DefaultStrideD0 : StrideD0,
(StrideE < 0) ? DefaultStrideE : StrideE);
return pass ? 0 : 1;
};
if(data_type == MatrixDataType::F16_INT8_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(F16{}, INT8{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::BF16_INT8_BF16_BF16 && layout == MatrixLayout::MK_KN_MN_MN)
{
return profile(BF16{}, INT8{}, F32{}, BF16{}, BF16{}, Row{}, Row{}, Row{}, Row{});
}
else
{
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_silu);
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_grouped_gemm_fixed_nk_impl.hpp"
#include "profiler_operation_registry.hpp"
enum struct GemmMatrixLayout
{
MK_KN_MN, // 0
MK_NK_MN, // 1
};
enum struct GemmDataType
{
BF16_I8_BF16, // 0
F16_F16_F16, // 1
F16_F8_F16, // 2
F16_I8_F16, // 3
};
#define OP_NAME "grouped_gemm_fixed_nk"
#define OP_DESC "Grouped GEMM Fixed NK"
namespace {
std::vector<int> argToIntArray(char* input)
{
std::vector<int> out;
std::istringstream in(input);
std::string item;
while(std::getline(in, item, ','))
{
out.push_back(std::stoi(item));
}
return out;
}
int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
{
if(argc < 14)
{
std::cout
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: bf16@int8; 1: fp16; 2: fp16@fp8; 3: fp16@int8)\n"
<< "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"
<< " 1: A[m, k] * B[n, k] = C[m, n];\n"
<< "arg4: verification (0: no; 1: yes)\n"
<< "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0=n0, 1=yes)\n"
<< "arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 "
"64,64 64,64 128,128)\n"
<< "arg15: kbatch value (default 1)\n"
<< "optional:\n"
<< "arg16: number of warm-up cycles (default 1)\n"
<< "arg17: number of iterations (default 10)\n"
<< std::endl;
exit(1);
}
const auto data_type = static_cast<GemmDataType>(std::stoi(argv[2]));
const auto layout = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
const bool do_verification = std::stoi(argv[4]);
const int init_method = std::stoi(argv[5]);
const bool do_log = std::stoi(argv[6]);
const bool time_kernel = std::stoi(argv[7]);
const auto Ms = argToIntArray(argv[8]);
const auto Ns = argToIntArray(argv[9]);
const auto Ks = argToIntArray(argv[10]);
const auto StrideAs = argToIntArray(argv[11]);
const auto StrideBs = argToIntArray(argv[12]);
const auto StrideCs = argToIntArray(argv[13]);
const int kbatch = argc == 15 ? std::stoi(argv[14]) : 1;
using F32 = float;
using F16 = ck::half_t;
using F8 = ck::f8_t;
using BF16 = ck::bhalf_t;
using I8 = int8_t;
int n_warmup = 1;
int n_iter = 10;
if(argc == 17)
{
n_warmup = std::stoi(argv[16]);
n_iter = std::stoi(argv[17]);
}
#if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
I8,
BF16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
I8,
BF16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16)
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F16,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F16,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
F8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
#endif
#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
I8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
kbatch,
n_warmup,
n_iter);
}
else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
{
ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
I8,
F16,
F32,
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor,
ck::tensor_layout::gemm::RowMajor>(
do_verification,
init_method,
do_log,
time_kernel,
Ms,
Ns,
Ks,
StrideAs,
StrideBs,
StrideCs,
1,
n_warmup,
n_iter);
}
#endif
else
{
throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
}
return 0;
}
} // anonymous namespace
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm_fixed_nk);
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "profiler/profile_permute_scale_impl.hpp"
#include "profiler_operation_registry.hpp"
namespace {
enum struct DataType
{
F32_F32, // 0
F16_F16 // 1
};
#define OP_NAME "permute_scale"
#define OP_DESC "Permute Scale"
static void print_helper_msg()
{
std::cout
// clang-format off
<< "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Output fp32\n"
<< " 1: Input fp16, Output fp16\n"
<< "arg4: verification (0: no, 1: yes)\n"
<< "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n"
<< "arg6: print tensor value (0: no; 1: yes)\n"
<< "arg7: time kernel (0: no, 1: yes)\n"
<< "from arg8: tensor lengths\n"
<< " input strides\n"
<< " output strides\n" << std::endl;
// clang-format on
}
} // namespace
int profile_permute_scale(int argc, char* argv[])
{
constexpr int control_argc = 7;
const int dims_argc = argc - control_argc;
// Number of lenghs, input strides and outputs strides must be equal
if(argc < control_argc && dims_argc % 3 != 0)
{
print_helper_msg();
return 1;
}
const auto data_type = static_cast<DataType>(std::stoi(argv[2]));
const bool do_verification = std::stoi(argv[3]);
const int init_method = std::stoi(argv[4]);
const bool do_log = std::stoi(argv[5]);
const bool time_kernel = std::stoi(argv[6]);
const int num_dims = dims_argc / 3;
std::vector<ck::index_t> lengths(num_dims);
std::vector<ck::index_t> input_strides(num_dims);
std::vector<ck::index_t> output_strides(num_dims);
for(int i = 0; i < num_dims; i++)
{
lengths[i] = std::stoi(argv[control_argc + i]);
input_strides[i] = std::stoi(argv[control_argc + num_dims + i]);
output_strides[i] = std::stoi(argv[control_argc + 2 * num_dims + i]);
}
using F32 = float;
using F16 = ck::half_t;
constexpr auto I1 = ck::Number<1>{};
constexpr auto I2 = ck::Number<2>{};
constexpr auto I3 = ck::Number<3>{};
constexpr auto I4 = ck::Number<4>{};
constexpr auto I5 = ck::Number<5>{};
constexpr auto I6 = ck::Number<6>{};
auto profile = [&](auto num_dim_tmp, auto in_type, auto out_type) {
constexpr ck::index_t NDim = num_dim_tmp.value;
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
bool pass =
ck::profiler::profile_permute_scale_impl<InDataType, OutDataType, NDim>(do_verification,
init_method,
do_log,
time_kernel,
lengths,
input_strides,
output_strides);
return pass ? 0 : 1;
};
if(num_dims == 1)
{
if(data_type == DataType::F32_F32)
{
return profile(I1, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I1, F16{}, F16{});
}
}
else if(num_dims == 2)
{
if(data_type == DataType::F32_F32)
{
return profile(I2, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I2, F16{}, F16{});
}
}
else if(num_dims == 3)
{
if(data_type == DataType::F32_F32)
{
return profile(I3, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I3, F16{}, F16{});
}
}
else if(num_dims == 4)
{
if(data_type == DataType::F32_F32)
{
return profile(I4, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I4, F16{}, F16{});
}
}
else if(num_dims == 5)
{
if(data_type == DataType::F32_F32)
{
return profile(I5, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I5, F16{}, F16{});
}
}
else if(num_dims == 6)
{
if(data_type == DataType::F32_F32)
{
return profile(I6, F32{}, F32{});
}
else if(data_type == DataType::F16_F16)
{
return profile(I6, F16{}, F16{});
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;
return 1;
}
REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_permute_scale);
#!/usr/bin/env python3
import os, io, argparse, datetime, re
import numpy as np
import sqlalchemy
from sqlalchemy.types import NVARCHAR, Float, Integer
import pymysql
import pandas as pd
from sshtunnel import SSHTunnelForwarder
def print_to_string(*args, **kwargs):
output = io.StringIO()
print(*args, file=output, **kwargs)
contents = output.getvalue()
output.close()
return contents
def parse_args():
parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs')
parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files')
args = parser.parse_args()
files = []
if os.path.isdir(args.filename):
all_files = os.listdir(args.filename)
for name in all_files:
if not 'log' in name:
continue
files.append(os.path.join(args.filename, name))
else:
files = [args.filename]
args.files = files
return args
def main():
args = parse_args()
tests = []
kernels=[]
tflops=[]
dtype=[]
alayout=[]
blayout=[]
M=[]
N=[]
K=[]
StrideA=[]
StrideB=[]
StrideC=[]
#parse results, get the Tflops value for "Best Perf" kernels
glue=""
for filename in args.files:
for line in open(filename):
if 'Branch name' in line:
lst=line.split()
branch_name=lst[2]
if 'On branch' in line:
lst=line.split()
branch_name=lst[2]
if 'Node name' in line:
lst=line.split()
node_id=lst[2]
if 'GPU_arch' in line:
lst=line.split()
gpu_arch=lst[2]
if 'HIP version' in line:
lst=line.split()
hip_vers=lst[2]
if 'Compute Unit' in line:
lst=line.split()
compute_units=lst[2]
if 'InstalledDir' in line:
lst=line.split()
rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')]
print("Branch name:",branch_name)
print("Node name:",node_id)
print("GPU_arch:",gpu_arch)
print("Compute units:",compute_units)
print("ROCM_version:",rocm_vers)
print("HIP_version:",hip_vers)
#parse gemm performance tests:
if 'gemm' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
if len(lst)>=37: #the line is complete
tests.append(glue.join(lst[5:30]))
kernels.append(glue.join(lst[37:]))
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
elif len(lst)<37 and len(lst)>=33: #the tflops are available
tests.append(glue.join(lst[5:30]))
kernels.append("N/A")
tflops.append(lst[33])
dtype.append(lst[5])
alayout.append(lst[8])
blayout.append(lst[11])
M.append(lst[14])
N.append(lst[17])
K.append(lst[20])
StrideA.append(lst[23])
StrideB.append(lst[26])
StrideC.append(lst[29])
print("warning: incomplete line:",lst)
elif len(lst)<33: #even the tflops are not available
print("Error in ckProfiler output!")
print("warning: incomplete line=",lst)
#sort results
#sorted_tests = sorted(tests)
#print("sorted tests:",sorted_tests)
sorted_tflops = [x for _,x in sorted(zip(tests,tflops))]
#sorted_kernels = [x for _,x in sorted(zip(tests,kernels))]
test_list=list(range(1,len(tests)+1))
#parse resnet50 performance tests:
if 'resnet50' in filename:
for filename in args.files:
for line in open(filename):
if 'Best Perf' in line:
lst=line.split()
tflops.append(lst[4])
print("Number of tests:",len(tflops))
sql_hostname = '127.0.0.1'
sql_username = os.environ["dbuser"]
sql_password = os.environ["dbpassword"]
sql_main_database = 'miopen_perf'
sql_port = 3306
ssh_host = os.environ["dbsship"]
ssh_user = os.environ["dbsshuser"]
ssh_port = int(os.environ["dbsshport"])
ssh_pass = os.environ["dbsshpassword"]
with SSHTunnelForwarder(
(ssh_host, ssh_port),
ssh_username=ssh_user,
ssh_password=ssh_pass,
remote_bind_address=(sql_hostname, sql_port)) as tunnel:
sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'.
format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database))
conn = sqlEngine.connect()
#save gemm performance tests:
if 'gemm' in filename:
#write the ck_gemm_test_params table
#only needed once the test set changes
'''
sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))]
sorted_alayout = [x for _,x in sorted(zip(tests,alayout))]
sorted_blayout = [x for _,x in sorted(zip(tests,blayout))]
sorted_M = [x for _,x in sorted(zip(tests,M))]
sorted_N = [x for _,x in sorted(zip(tests,N))]
sorted_K = [x for _,x in sorted(zip(tests,K))]
sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))]
sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))]
sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))]
ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout,
sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB,
sorted_StrideC]
df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type',
'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC'])
print(df)
dtypes = {
'Test_number': Integer(),
'Data_type': NVARCHAR(length=5),
'Alayout': NVARCHAR(length=12),
'Blayout': NVARCHAR(length=12),
'M': Integer(),
'N': Integer(),
'K': Integer(),
'StrideA': Integer(),
'StrideB': Integer(),
'StrideC': Integer()
}
df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes)
'''
#read baseline results for the latest develop branch
query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );'''
tflops_base = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,len(tests)+1):
testlist.append("Test%i"%i)
ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist)
flops=pd.concat([flops,df_add],axis=1)
print("new tflops for gemm tests:",flops)
flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False)
#save resnet50 performance tests:
if 'resnet50' in filename:
#read baseline results for the latest develop branch
query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );'''
tflops_base_N256 = pd.read_sql_query(query, conn)
query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );'''
tflops_base_N4 = pd.read_sql_query(query, conn)
#write new results to the db
testlist=[]
for i in range(1,50):
testlist.append("Layer%i"%i)
ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())]
flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime'])
df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=256 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False)
df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist)
flops=pd.concat([flops0,df_add],axis=1)
print("new tflops for N=4 resnet50 test:",flops)
flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False)
conn.close()
#compare the results to the baseline if baseline exists
regression=0
if 'gemm' in filename:
if not tflops_base.empty:
base=tflops_base[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(sorted_tflops[i]):
print("test # ",i,"shows regression by {:.3f}%".format(
(float(sorted_tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline")
if 'resnet50' in filename:
if not tflops_base_N256.empty:
base=tflops_base_N256[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=256")
if not tflops_base_N4.empty:
base=tflops_base_N4[testlist].to_numpy(dtype='float')
base_list=base[0]
ave_perf=0
for i in range(len(base_list)):
# success criterion:
if base_list[i]>1.01*float(tflops[i+49]):
print("layer # ",i,"shows regression by {:.3f}%".format(
(float(tflops[i+49])-base_list[i])/base_list[i]*100))
regression=1
ave_perf=ave_perf+float(tflops[i+49])/base_list[i]
if regression==0:
print("no regressions found")
ave_perf=ave_perf/len(base_list)
print("average performance relative to baseline:",ave_perf)
else:
print("could not find a baseline for N=4")
#return 0 if performance criteria met, otherwise return 1
return regression
if __name__ == '__main__':
main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment