Commit 3289a5c9 authored by Andriy Roshchenko

Narrowing the scope of the PR to OCP FP8 enablement only

parent dbfb222d
@@ -46,4 +46,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -66,4 +66,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -65,4 +65,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
@@ -57,4 +57,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_streamk_example(argc, argv) ? 0 : -1); }
int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }
@@ -50,4 +50,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
#include "run_gemm_example.inc"
int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
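Note: the five hunks above make the same mechanical change. Both the removed and the restored form map a boolean success flag to a process exit status of 0 on success; they differ only in the failure value (-1, which a POSIX shell observes as 255, versus 1). A minimal sketch of the equivalence, using a hypothetical stub in place of run_gemm_example:

    #include <cstdio>

    // Hypothetical stand-in for run_gemm_example(argc, argv).
    static bool run_example_stub() { return false; }

    int main()
    {
        const bool ok = run_example_stub();
        const int removed_form  = ok ? 0 : -1; // form removed by this commit
        const int restored_form = !ok;         // form restored by this commit
        // Both are 0 on success; on failure the shell sees 255 vs. 1.
        std::printf("removed=%d restored=%d\n", removed_form, restored_form);
        return restored_form;
    }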
@@ -166,14 +166,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n);
break;
case 6:
a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
case 7:
a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-0.1f, 0.1f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-0.1f, 0.1f}(b_k_n);
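Note: cases 6 and 7 above drop the PI-valued debug generators (GeneratorTensor_PI and the PI_A/PI_B variants). As a hedged sketch of the shape such generators take, assuming the usual functor-per-index interface in which GenerateTensorValue invokes operator() once per element (this is not the CK implementation):

    // Hypothetical constant generator in the style of GeneratorTensor_1:
    // GenerateTensorValue calls operator() with the element's indices and
    // stores the returned value.
    template <typename T>
    struct ConstantGenerator
    {
        T value;

        template <typename... Is>
        T operator()(Is...) const
        {
            return value; // same value at every index, like GeneratorTensor_1{1}
        }
    };

    int main()
    {
        const ConstantGenerator<float> one{1.0f};
        return one(0, 10) == 1.0f ? 0 : 1; // value is independent of the index
    }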
@@ -256,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return false;
return true;
}
ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
@@ -289,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return false;
return true;
}
std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
@@ -322,26 +314,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config."
<< std::endl;
return false;
return true;
}
if(config.time_kernel)
{
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
std::size_t flop = 2_uz * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
<< " GB/s, " << gemm.GetTypeString() << std::endl;
}
else
{
std::cout << "FINISHED: " << gemm.GetTypeString() << std::endl;
}
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
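Note: the timing hunk above keeps the standard GEMM accounting: 2 * M * N * K floating-point operations (one multiply and one add per K-step) and sizeof(A)*M*K + sizeof(B)*K*N + sizeof(C)*M*N bytes of traffic. Dividing flop by 1e9 and by a time in milliseconds yields TFLOPS directly, and dividing bytes by 1e6 and milliseconds yields GB/s. A self-contained sketch with hypothetical sizes and element widths:

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // Hypothetical problem size and measured kernel time.
        const std::size_t M = 3840, N = 4096, K = 4096;
        const float ave_time = 1.25f; // milliseconds

        const std::size_t flop      = 2 * M * N * K;                     // one FMA = 2 flop
        const std::size_t num_btype = 2 * M * K + 2 * K * N + 4 * M * N; // e.g. fp16 A/B, fp32 C

        const float tflops     = static_cast<float>(flop) / 1.E9 / ave_time; // flop / 1e9 / ms = TFLOPS
        const float gb_per_sec = num_btype / 1.E6 / ave_time;                // bytes / 1e6 / ms = GB/s

        std::printf("Perf: %f ms, %f TFlops, %f GB/s\n", ave_time, tflops, gb_per_sec);
        return 0;
    }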
@@ -368,29 +353,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
#endif
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
if(config.init_method == 6 || config.init_method == 7)
{
std::cout << std::fixed << std::setprecision(16);
AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
std::cout << "device result: " << d << std::endl;
std::cout << "host result: " << h << std::endl;
std::cout << "expected result: " << M_PI << std::endl;
std::cout << "device - host: " << std::abs(d - h) << std::endl;
std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
std::cout << "atol: " << get_atol<CDataType>() << std::endl;
std::cout << std::endl << std::endl;
}
}
if((config.do_verification == 2) || (config.do_verification == 3))
@@ -416,18 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
if(gpu_pass)
std::cout << "Verification on GPU: PASS" << std::endl;
pass = pass && gpu_pass;
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
return pass;
return pass == true;
}
bool run_gemm_example(int argc, char* argv[])
@@ -435,7 +399,7 @@ bool run_gemm_example(int argc, char* argv[])
ProblemSize problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
bool run_gemm_streamk_example(int argc, char* argv[])
@@ -443,5 +407,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
ProblemSizeStreamK problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
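Note: the restored form in the two wrappers above changes what a failed parse_cmd_args means for the exit status. With parse && run, a parse failure makes the example report failure; with !parse || run, a parse failure (for example, after printing usage) reports success, and run_gemm is only reached when parsing succeeds. A sketch of the truth table, with the two calls reduced to booleans:

    #include <cstdio>

    int main()
    {
        for(int parse_ok = 0; parse_ok <= 1; ++parse_ok)
            for(int run_ok = 0; run_ok <= 1; ++run_ok)
            {
                const bool removed  = parse_ok && run_ok;  // old: parse failure => failure
                const bool restored = !parse_ok || run_ok; // new: parse failure => success
                std::printf("parse=%d run=%d removed=%d restored=%d\n",
                            parse_ok, run_ok, int(removed), int(restored));
            }
        // In both forms short-circuiting skips run_gemm when parsing fails.
        return 0;
    }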
@@ -162,12 +162,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2, 2});
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 3:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -237,13 +237,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return false;
return true;
}
bool pass = true;
if((config.do_verification == 1) || (config.do_verification == 3))
{
std::cout << "Compute reference GEMM on CPU... ";
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
@@ -251,11 +250,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
std::cout << "DONE!" << std::endl;
std::cout << "Compute GEMM on device... \n";
ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
std::cout << "DONE!" << std::endl;
#ifdef BUILD_INT4_EXAMPLE
Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -267,19 +263,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
#endif
}
if(config.time_kernel)
{
std::cout << "Time GEMM on device... \n";
ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
std::size_t flop = 2_uz * M * N * K;
@@ -301,5 +294,5 @@ bool run_gemm_universal_streamk_example(int argc, char* argv[])
ProblemSizeStreamK_universal problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -146,11 +146,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-3, 3});
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 2:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
break;
case 3:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
@@ -216,13 +224,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
{
std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
return false;
return true;
}
bool pass = true;
if((config.do_verification == 1) || (config.do_verification == 3))
{
std::cout << "Compute reference GEMM on CPU... ";
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
@@ -230,11 +237,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});
ref_invoker.Run(ref_argument);
std::cout << "DONE!" << std::endl;
std::cout << "Compute GEMM on device... \n";
ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
std::cout << "DONE!" << std::endl;
#ifdef BUILD_INT4_EXAMPLE
Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -246,19 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
#else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
pass = ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
#endif
}
if(config.time_kernel)
{
std::cout << "Time GEMM on device... \n";
ave_time =
invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
@@ -281,5 +282,5 @@ bool run_gemm_splitk_example(int argc, char* argv[])
ProblemSizeSplitK problem_size;
ExecutionConfig config;
return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -57,7 +57,7 @@ struct ProblemSize final
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 2;
int init_method = 1;
bool time_kernel = false;
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
@@ -7,7 +7,7 @@ using ADataType = BF16;
using BDataType = BF16;
using AccDataType = F32;
using CShuffleDataType = F32;
using CDataType = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
using CDataType = F32; // C matrix doesn't exsit in GPU memory, this is used for host verification
using D0DataType = BF16;
using D1DataType = BF16;
using DsDataType = ck::Tuple<D0DataType, D1DataType>;
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <typename DataType>
inline __host__ __device__ constexpr double get_rtol()
{
if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, double>)
{
return 1e-6;
}
else if constexpr(std::is_same_v<DataType, ck::half_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
{
return 5e-2;
}
else if constexpr(std::is_same_v<DataType, int32_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, int8_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
{
return 2e-1;
}
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
{
return 2e-1;
}
else
{
return 1e-3;
}
}
template <typename DataType>
inline __host__ __device__ constexpr double get_atol()
{
if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, double>)
{
return 1e-6;
}
else if constexpr(std::is_same_v<DataType, ck::half_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
{
return 5e-2;
}
else if constexpr(std::is_same_v<DataType, int32_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, int8_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
{
return 2e-1;
}
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
{
return 2e-1;
}
else
{
return 1e-3;
}
}
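Note: the per-type tolerances above feed ck::utils::check_err. Assuming the usual mixed-tolerance criterion |device - reference| <= atol + rtol * |reference| per element, a hedged sketch of such a check (not the actual ck::utils implementation, which also reports the message and error statistics):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Elementwise comparison in the spirit of check_err (sketch only).
    bool almost_equal(const std::vector<double>& out,
                      const std::vector<double>& ref,
                      double rtol,
                      double atol)
    {
        if(out.size() != ref.size())
            return false;
        for(std::size_t i = 0; i < out.size(); ++i)
            if(std::abs(out[i] - ref[i]) > atol + rtol * std::abs(ref[i]))
                return false;
        return true;
    }

    int main()
    {
        const std::vector<double> device{1.0005, 2.0};
        const std::vector<double> host{1.0, 2.0};
        // With the float tolerances above (rtol = atol = 1e-3) this passes.
        std::printf("%s\n", almost_equal(device, host, 1e-3, 1e-3) ? "PASS" : "FAIL");
        return 0;
    }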
bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
{
#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)
@@ -235,11 +150,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
#else
return ck::utils::check_err(e_m_n_device_result,
e_m_n_host_result,
"Error: Incorrect results!",
get_rtol<EDataType>(),
get_atol<EDataType>());
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
#endif
}
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
}
}
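Note: the restored default case fills A and B with GeneratorTensor_Sequential<T, Dim>. Assuming its semantics are that each element takes the value of its coordinate along dimension Dim (an assumption about the CK generator; it yields deterministic, easily inspected inputs), a hedged sketch:

    #include <array>
    #include <cstdio>

    // Hypothetical sequential generator: an element's value is its index
    // along dimension Dim (assumed semantics, not the CK source).
    template <int Dim>
    struct SequentialGen
    {
        template <typename... Is>
        int operator()(Is... is) const
        {
            const std::array<int, sizeof...(Is)> idx{static_cast<int>(is)...};
            return idx[Dim];
        }
    };

    int main()
    {
        const SequentialGen<0> rows{}; // varies with the row index
        const SequentialGen<1> cols{}; // varies with the column index
        std::printf("%d %d\n", rows(2, 5), cols(2, 5)); // prints "2 5"
        return 0;
    }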
@@ -75,7 +75,7 @@ struct ProblemSize final
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 2;
int init_method = 1;
int k_batch = 1;
bool time_kernel = false;
};
@@ -154,12 +154,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
case 2:
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
}
}
@@ -266,7 +266,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
BElementOp,
CDEElementOp>;
std::cout << "Running verification on CPU." << std::endl;
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data(),
@@ -286,9 +285,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
}
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
}
return pass;
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -123,12 +123,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
case 2:
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
}
}
@@ -187,7 +187,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
bool pass = true;
if(config.do_verification)
{
std::cout << "Running verification on CPU." << std::endl;
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
EDataType,
@@ -219,8 +218,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
#endif
}
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
}
if(config.time_kernel)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -72,7 +72,7 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
struct ExecutionConfig final
{
bool do_verification = true;
int init_method = 2;
int init_method = 1;
bool time_kernel = false;
};
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
template <ck::index_t NDimSpatial>
bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
@@ -37,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-1, 1});
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 0.2});
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.1, 0.1});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
@@ -128,12 +128,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
wei_device_buf.FromDevice(wei_device_result.mData.data());
return ck::utils::check_err(
wei_device_result.mData,
wei_host_result.mData,
"Error: Incorrect results!",
1e-3,
1e-3); // the errors must be consistent with the less precise type of In/Out DataTypes
return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
}
c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});
@@ -3,88 +3,6 @@
#pragma once
template <typename DataType>
inline __host__ __device__ constexpr double get_rtol()
{
if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, double>)
{
return 1e-6;
}
else if constexpr(std::is_same_v<DataType, ck::half_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
{
return 5e-2;
}
else if constexpr(std::is_same_v<DataType, int32_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, int8_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
{
return 2e-1;
}
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
{
return 2e-1;
}
else
{
return 1e-3;
}
}
template <typename DataType>
inline __host__ __device__ constexpr double get_atol()
{
if constexpr(std::is_same_v<DataType, float>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, double>)
{
return 1e-6;
}
else if constexpr(std::is_same_v<DataType, ck::half_t>)
{
return 1e-3;
}
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
{
return 5e-2;
}
else if constexpr(std::is_same_v<DataType, int32_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, int8_t>)
{
return 1e-1;
}
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
{
return 2e-1;
}
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
{
return 2e-1;
}
else
{
return 1e-3;
}
}
struct ProblemSize final
{
ck::index_t M = 3840;
@@ -100,10 +18,9 @@ struct ProblemSize final
struct ExecutionConfig final
{
// 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
int do_verification = 1;
int init_method = 7;
bool time_kernel = false;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
};
bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
@@ -151,17 +68,9 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
case 6:
a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
break;
case 7:
a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
@@ -217,7 +126,7 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
invoker.Run(argument, StreamConfig{nullptr, false});
bool pass = true;
if((config.do_verification == 1) || (config.do_verification == 3))
if(config.do_verification)
{
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
@@ -236,7 +145,6 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
std::cout << "Running verification on CPU." << std::endl;
ref_invoker.Run(ref_argument);
if(std::is_same<CDataType, ck::half_t>::value)
@@ -246,82 +154,10 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
}
else
{
pass &= ck::utils::check_err(c_m_n_device_result,
c_m_n_host_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
}
if(pass)
std::cout << "Verification on CPU: PASS" << std::endl;
if(config.init_method == 6 || config.init_method == 7)
{
std::cout << std::fixed << std::setprecision(16);
AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
std::cout << "device result: " << d << std::endl;
std::cout << "host result: " << h << std::endl;
std::cout << "expected result: " << M_PI << std::endl;
std::cout << "device - host: " << std::abs(d - h) << std::endl;
std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
std::cout << "atol: " << get_atol<CDataType>() << std::endl;
std::cout << std::endl << std::endl;
pass &= ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
}
if((config.do_verification == 2) || (config.do_verification == 3))
{
Tensor<CDataType> c_m_n_device_ref_result(
f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
// GPU verification
using ReferenceComputeType = float;
using ReferenceGemmInstanceGPU =
ck::tensor_operation::device::ReferenceGemm<ALayout,
BLayout,
CLayout,
ADataType,
BDataType,
CDataType,
AccDataType,
AElementOp,
BElementOp,
CElementOp,
ReferenceComputeType,
ReferenceComputeType>;
auto ref_gemm_gpu = ReferenceGemmInstanceGPU{};
auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
M,
N,
K,
a_element_op,
b_element_op,
c_element_op);
std::cout << "Running verification on GPU." << std::endl;
ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
c_m_n_device_ref_result,
"Error: Incorrect results!",
get_rtol<CDataType>(),
get_atol<CDataType>());
if(gpu_pass)
std::cout << "Verification on GPU: PASS" << std::endl;
pass &= gpu_pass;
}
if(config.time_kernel)
{
float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 1});
@@ -372,7 +208,7 @@ bool run_splitK_gemm_example(int argc, char* argv[])
}
else
{
printf("arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)\n");
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4: KBatch\n");
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>