"...git@developer.sourcefind.cn:modelzoo/xuanyuan_pytorch.git" did not exist on "b77a7567d126206d33b60ee6a71870bc0000d7db"
Commit 3289a5c9 authored by Andriy Roshchenko

Narrowing the scope of PR to OCP FP8 enablement only

parent dbfb222d
@@ -46,4 +46,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }

@@ -66,4 +66,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }

@@ -65,4 +65,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }

@@ -57,4 +57,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_streamk_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); }

@@ -50,4 +50,4 @@ using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALa
 #include "run_gemm_example.inc"
-int main(int argc, char* argv[]) { return (run_gemm_example(argc, argv) ? 0 : -1); }
+int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
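
Note on the return-value change repeated across these five example drivers: both forms map a successful run to exit code 0 and differ only in the failure value. A minimal sketch (run_example() is a hypothetical stand-in for run_gemm_example(argc, argv)):

    // Hypothetical stand-in for run_gemm_example(argc, argv).
    static bool run_example() { return true; }

    int main()
    {
        const bool ok = run_example();
        // old form: return ok ? 0 : -1;  -> failure exits as -1 (reported as 255 by most shells)
        return !ok; // new form: failure exits as 1, success still exits as 0
    }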
@@ -166,14 +166,6 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         ck::utils::FillUniformDistributionIntegerValue<ADataType>{-2.f, 2.f}(a_m_k);
         ck::utils::FillUniformDistributionIntegerValue<BDataType>{-2.f, 2.f}(b_k_n);
         break;
-    case 6:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
-        break;
-    case 7:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
-        break;
    default:
         ck::utils::FillUniformDistribution<ADataType>{-0.1f, 0.1f}(a_m_k);
         ck::utils::FillUniformDistribution<BDataType>{-0.1f, 0.1f}(b_k_n);
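
The removed cases 6 and 7 were pi-based initializations used for FP8 accuracy probing: with B filled with ones and A arranged so that each row-column dot product accumulates to pi, the debug block removed further down could compare every output element against M_PI. A purely illustrative sketch of the idea (FillPiOverK is hypothetical; the real GeneratorTensor_PI* functors live in CK's host tensor generator utilities):

    #include <cmath>   // M_PI (POSIX), also relied on by the removed debug block
    #include <cstddef>

    // Hypothetical: fill A with pi/K so that C = A * B with B == 1 accumulates
    // to pi in every element, giving a known target for low-precision checks.
    template <typename T>
    struct FillPiOverK
    {
        std::size_t K;
        template <typename... Idx>
        T operator()(Idx...) const { return static_cast<T>(M_PI / static_cast<double>(K)); }
    };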
@@ -256,7 +248,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     {
         std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
     }
     ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
@@ -289,7 +281,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     {
         std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
     }
     std::size_t workspace_size = gemm.GetWorkSpaceSize(&argument);
@@ -322,26 +314,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         std::cerr << gemm.GetTypeString() << ": the instance does not support the problem config."
                   << std::endl;
-        return false;
+        return true;
     }
-    if(config.time_kernel)
-    {
-        std::size_t flop = 2_uz * M * N * K;
-        std::size_t num_btype =
-            sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-        float gb_per_sec = num_btype / 1.E6 / ave_time;
-        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
-                  << " GB/s, " << gemm.GetTypeString() << std::endl;
-    }
-    else
-    {
-        std::cout << "FINISHED: " << gemm.GetTypeString() << std::endl;
-    }
+    std::size_t flop = 2_uz * M * N * K;
+    std::size_t num_btype =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
     bool pass = true;
@@ -368,29 +353,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
 #endif
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
-
-        if(config.init_method == 6 || config.init_method == 7)
-        {
-            std::cout << std::fixed << std::setprecision(16);
-            AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
-            AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
-            std::cout << "device result: " << d << std::endl;
-            std::cout << "host result: " << h << std::endl;
-            std::cout << "expected result: " << M_PI << std::endl;
-            std::cout << "device - host: " << std::abs(d - h) << std::endl;
-            std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
-            std::cout << "atol: " << get_atol<CDataType>() << std::endl;
-            std::cout << std::endl << std::endl;
-        }
     }

     if((config.do_verification == 2) || (config.do_verification == 3))
@@ -416,18 +384,14 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_device_ref_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
-        if(gpu_pass)
-            std::cout << "Verification on GPU: PASS" << std::endl;
-
-        pass = pass && gpu_pass;
     }

-    return pass;
+    return pass == true;
 }

 bool run_gemm_example(int argc, char* argv[])
@@ -435,7 +399,7 @@ bool run_gemm_example(int argc, char* argv[])
     ProblemSize problem_size;
     ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }

 bool run_gemm_streamk_example(int argc, char* argv[])
@@ -443,5 +407,5 @@ bool run_gemm_streamk_example(int argc, char* argv[])
     ProblemSizeStreamK problem_size;
     ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
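
The two kinds of changes in this file implement one convention: a problem the instance cannot run, or a command line that fails to parse, now counts as a skip (overall success) rather than a failure; only a wrong numerical result fails the example. A condensed sketch of the resulting control flow:

    // Sketch: "could not run" is treated as a skip, not a failure.
    static bool run_example_like(bool parsed, bool supported, bool correct)
    {
        if(!parsed)    return true; // bad command line -> skip, exit code 0
        if(!supported) return true; // instance rejects the problem -> skip, exit code 0
        return correct;             // only an incorrect result fails the example
    }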
@@ -162,12 +162,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
         break;
     case 2:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-2, 2});
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
         break;
     case 3:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
         break;
     default:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
@@ -237,13 +237,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     {
         std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
     }

     bool pass = true;
     if((config.do_verification == 1) || (config.do_verification == 3))
     {
-        std::cout << "Compute reference GEMM on CPU... ";
         auto ref_gemm = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
@@ -251,11 +250,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
             a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

         ref_invoker.Run(ref_argument);
-        std::cout << "DONE!" << std::endl;

-        std::cout << "Compute GEMM on device... \n";
         ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
-        std::cout << "DONE!" << std::endl;

 #ifdef BUILD_INT4_EXAMPLE
         Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -267,19 +263,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
 #endif
     }

     if(config.time_kernel)
     {
-        std::cout << "Time GEMM on device... \n";
         ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});

         std::size_t flop = 2_uz * M * N * K;
@@ -301,5 +294,5 @@ bool run_gemm_universal_streamk_example(int argc, char* argv[])
     ProblemSizeStreamK_universal problem_size;
     ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
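
Replacing pass = check_err(...) with pass &= check_err(...) in the hunks above makes failures sticky when several checks run in one invocation; a later passing check can no longer overwrite an earlier failure. In miniature (check_a/check_b are stand-ins for the ck::utils::check_err calls):

    static bool check_a() { return true; }  // stand-ins for ck::utils::check_err calls
    static bool check_b() { return false; }

    static bool verify_all()
    {
        bool pass = true;
        pass &= check_a(); // &= folds each result into the running status
        pass &= check_b(); // a single false keeps pass false from here on
        return pass;       // false: the earlier failure is preserved
    }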
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
@@ -146,11 +146,19 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
         b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
         break;
     case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2, 2});
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-3, 3});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 2:
+        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
+        break;
+    case 3:
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
         break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
     }
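
The added cases 2 and 3 pair a constant-1 tensor with small random integers, a useful regime for the OCP FP8 work this PR targets: small integers and their short dot products are exactly representable in e4m3/e5m2, so device and host references can agree exactly and any mismatch indicates a real bug rather than rounding noise. A self-contained illustration:

    #include <cassert>

    int main()
    {
        float a[4] = {1.f, -2.f, 0.f, 2.f}; // small integers: exact in FP8
        float b[4] = {1.f, 1.f, 1.f, 1.f};  // constant-1 operand, as in GeneratorTensor_1{1}
        float acc  = 0.f;
        for(int k = 0; k < 4; ++k)
            acc += a[k] * b[k]; // every partial sum is an integer, hence exact
        assert(acc == 1.f);
        return 0;
    }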
@@ -216,13 +224,12 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
     {
         std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl;
-        return false;
+        return true;
     }

     bool pass = true;
     if((config.do_verification == 1) || (config.do_verification == 3))
     {
-        std::cout << "Compute reference GEMM on CPU... ";
         auto ref_gemm = ReferenceGemmInstance{};
         auto ref_invoker = ref_gemm.MakeInvoker();
@@ -230,11 +237,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
             a_m_k, b_k_n, c_m_n_host_result, PassThrough{}, PassThrough{}, PassThrough{});

         ref_invoker.Run(ref_argument);
-        std::cout << "DONE!" << std::endl;

-        std::cout << "Compute GEMM on device... \n";
         ave_time = invoker.Run(argument, StreamConfig{nullptr, false, 1});
-        std::cout << "DONE!" << std::endl;

 #ifdef BUILD_INT4_EXAMPLE
         Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
@@ -246,19 +250,16 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config)
 #else
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-        pass = ck::utils::check_err(c_m_n_device_result,
+        pass &= ck::utils::check_err(c_m_n_device_result,
                                     c_m_n_host_result,
                                     "Error: Incorrect results!",
                                     get_rtol<CDataType>(),
                                     get_atol<CDataType>());
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
 #endif
     }

     if(config.time_kernel)
     {
-        std::cout << "Time GEMM on device... \n";
         ave_time =
             invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 0, 5, 10, true, 4});
@@ -281,5 +282,5 @@ bool run_gemm_splitk_example(int argc, char* argv[])
     ProblemSizeSplitK problem_size;
     ExecutionConfig config;
-    return parse_cmd_args(argc, argv, problem_size, config) && run_gemm(problem_size, config);
+    return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config);
 }
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -57,7 +57,7 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
     bool do_verification = true;
-    int init_method = 2;
+    int init_method = 1;
     bool time_kernel = false;
 };
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include "common.hpp"

@@ -7,7 +7,7 @@ using ADataType = BF16;
 using BDataType = BF16;
 using AccDataType = F32;
 using CShuffleDataType = F32;
-using CDataType = F32; // C matrix doesn't exist in GPU memory, this is used for host verification
+using CDataType = F32; // C matrix doesn't exsit in GPU memory, this is used for host verification
 using D0DataType = BF16;
 using D1DataType = BF16;
 using DsDataType = ck::Tuple<D0DataType, D1DataType>;
...
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}

 bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config)
 {
 #if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4)

@@ -235,11 +150,7 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
     return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
 #else
-    return ck::utils::check_err(e_m_n_device_result,
-                                e_m_n_host_result,
-                                "Error: Incorrect results!",
-                                get_rtol<EDataType>(),
-                                get_atol<EDataType>());
+    return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
 #endif
 }
...
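
For context, the get_rtol/get_atol helpers deleted above pick per-type tolerances (1e-3 for float and half, 5e-2 for bhalf, 2e-1 for f8/bf8) that the examples pass to ck::utils::check_err; with them removed, this call site falls back to check_err's default tolerances. A standalone analog of the scheme, assuming the usual combined relative/absolute acceptance test (the exact criterion inside check_err may differ):

    #include <cmath>
    #include <cstdio>

    // Accept device output if |device - host| <= atol + rtol * |host|.
    static bool close_enough(double device, double host, double rtol, double atol)
    {
        return std::abs(device - host) <= atol + rtol * std::abs(host);
    }

    int main()
    {
        // An FP8-sized error passes the FP8 bound (2e-1) but fails the float bound (1e-3).
        std::printf("%d\n", close_enough(3.25, 3.14159, 2e-1, 2e-1)); // 1
        std::printf("%d\n", close_enough(3.25, 3.14159, 1e-3, 1e-3)); // 0
        return 0;
    }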
@@ -157,8 +157,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
-        b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
+        a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }
 }
...
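
The default initialization moves from constant-1 tensors to GeneratorTensor_Sequential here and in several hunks below. Filling each element with its index along one dimension makes layout and transpose mistakes visible, whereas an all-ones fill produces the same GEMM result regardless of element order. An illustrative sketch of such a fill (not CK's implementation; the template index is assumed to select the dimension whose coordinate becomes the value):

    #include <cstddef>

    // Hypothetical: element value = its coordinate along dimension Dim.
    template <typename T, std::size_t Dim>
    struct SequentialFill
    {
        template <typename... Idx>
        T operator()(Idx... idx) const
        {
            const std::size_t coord[] = {static_cast<std::size_t>(idx)...};
            return static_cast<T>(coord[Dim]);
        }
    };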
@@ -75,7 +75,7 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
     bool do_verification = true;
-    int init_method = 2;
+    int init_method = 1;
     int k_batch = 1;
     bool time_kernel = false;
 };

@@ -154,12 +154,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         break;
     case 2:
-        a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+        a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0});
-        b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0});
+        a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }
 }

@@ -266,7 +266,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
                                                           BElementOp,
                                                           CDEElementOp>;

-    std::cout << "Running verification on CPU." << std::endl;
     for(std::size_t i = 0; i < gemm_descs.size(); i++)
     {
         c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data(),

@@ -286,9 +285,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
     }
-
-    if(pass)
-        std::cout << "Verification on CPU: PASS" << std::endl;
 }

 return pass;
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -123,12 +123,12 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         b_tensors[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
         break;
     case 2:
-        a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{-1.0, 1.0});
+        a_tensors[i].GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_tensors[i].GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_tensors[i].GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_tensors[i].GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }
 }

@@ -187,7 +187,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
     bool pass = true;
     if(config.do_verification)
     {
-        std::cout << "Running verification on CPU." << std::endl;
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
                                                                                 EDataType,

@@ -219,8 +218,6 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
 #endif
     }
-
-    if(pass)
-        std::cout << "Verification on CPU: PASS" << std::endl;
 }

 if(config.time_kernel)
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -72,7 +72,7 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
 struct ExecutionConfig final
 {
     bool do_verification = true;
-    int init_method = 2;
+    int init_method = 1;
     bool time_kernel = false;
 };
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 template <ck::index_t NDimSpatial>
 bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,

@@ -37,8 +37,8 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
         out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
         break;
     default:
-        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
-        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-1, 1});
+        in.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 0.2});
+        out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.1, 0.1});
     }

     DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());

@@ -128,12 +128,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config,
         wei_device_buf.FromDevice(wei_device_result.mData.data());

-        return ck::utils::check_err(
-            wei_device_result.mData,
-            wei_host_result.mData,
-            "Error: Incorrect results!",
-            1e-3,
-            1e-3); // the errors must be consistent with the less precise type of In/Out DataTypes
+        return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData);
     }

     float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel});
...
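
The deleted tolerance arguments above carried a warning worth preserving: the bound must track the less precise of the In/Out data types. A fixed rtol of 1e-3 is already tighter than bf16's representable spacing, so applying it to a bf16 pipeline would flag pure rounding as failure:

    #include <cstdio>

    int main()
    {
        // bf16 keeps 8 significand bits, so its unit roundoff is about 2^-8.
        const double bf16_roundoff = 1.0 / 256.0; // ~0.0039, larger than a fixed 1e-3 bound
        std::printf("bf16 roundoff ~ %g vs fixed rtol 1e-3\n", bf16_roundoff);
        return 0;
    }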
@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }

     c0_n_bias.GenerateTensorValue(GeneratorTensor_2<C0DataType>{-5, 5});
...
@@ -3,88 +3,6 @@
 #pragma once

-template <typename DataType>
-inline __host__ __device__ constexpr double get_rtol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}
-
-template <typename DataType>
-inline __host__ __device__ constexpr double get_atol()
-{
-    if constexpr(std::is_same_v<DataType, float>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, double>)
-    {
-        return 1e-6;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::half_t>)
-    {
-        return 1e-3;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
-    {
-        return 5e-2;
-    }
-    else if constexpr(std::is_same_v<DataType, int32_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, int8_t>)
-    {
-        return 1e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::f8_t>)
-    {
-        return 2e-1;
-    }
-    else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
-    {
-        return 2e-1;
-    }
-    else
-    {
-        return 1e-3;
-    }
-}

 struct ProblemSize final
 {
     ck::index_t M = 3840;

@@ -100,10 +18,9 @@ struct ProblemSize final
 struct ExecutionConfig final
 {
-    // 0 - no verification, 1 - CPU, 2 - GPU, 3 - CPU + GPU
-    int do_verification = 1;
-    int init_method = 7;
-    bool time_kernel = false;
+    bool do_verification = true;
+    int init_method = 1;
+    bool time_kernel = false;
 };

 bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
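
The restored ExecutionConfig collapses the PR's four-mode verification switch back to a plain bool and drops the pi-probing default of init_method = 7. Side by side, as read directly from the hunk above:

    struct ExecutionConfigOld // removed: selected among verification back ends
    {
        int do_verification = 1; // 0 = no, 1 = CPU, 2 = GPU, 3 = CPU + GPU
        int init_method     = 7; // pi-based FP8 probe (cases 6/7) by default
        bool time_kernel    = false;
    };

    struct ExecutionConfigNew // restored: verification is a simple on/off switch
    {
        bool do_verification = true;
        int init_method      = 1;
        bool time_kernel     = false;
    };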
@@ -151,17 +68,9 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
         break;
-    case 6:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
-        break;
-    case 7:
-        a_m_k.GenerateTensorValue(GeneratorTensor_PI_A<ADataType>{});
-        b_k_n.GenerateTensorValue(GeneratorTensor_PI_B<BDataType>{});
-        break;
     default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
-        b_k_n.GenerateTensorValue(GeneratorTensor_1<BDataType>{1});
+        a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<ADataType, 0>{});
+        b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
     }

     DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
@@ -217,7 +126,7 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
     invoker.Run(argument, StreamConfig{nullptr, false});

     bool pass = true;
-    if((config.do_verification == 1) || (config.do_verification == 3))
+    if(config.do_verification)
     {
         c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,

@@ -236,7 +145,6 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
         auto ref_argument = ref_gemm.MakeArgument(
             a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);

-        std::cout << "Running verification on CPU." << std::endl;
         ref_invoker.Run(ref_argument);

         if(std::is_same<CDataType, ck::half_t>::value)
@@ -246,82 +154,10 @@ bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& con
         }
         else
         {
-            pass &= ck::utils::check_err(c_m_n_device_result,
-                                         c_m_n_host_result,
-                                         "Error: Incorrect results!",
-                                         get_rtol<CDataType>(),
-                                         get_atol<CDataType>());
+            pass &= ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
         }
-
-        if(pass)
-            std::cout << "Verification on CPU: PASS" << std::endl;
-
-        if(config.init_method == 6 || config.init_method == 7)
-        {
-            std::cout << std::fixed << std::setprecision(16);
-            AccDataType d = ck::type_convert<AccDataType>(c_m_n_device_result(0, 10));
-            AccDataType h = ck::type_convert<AccDataType>(c_m_n_host_result(10, 0));
-            std::cout << "device result: " << d << std::endl;
-            std::cout << "host result: " << h << std::endl;
-            std::cout << "expected result: " << M_PI << std::endl;
-            std::cout << "device - host: " << std::abs(d - h) << std::endl;
-            std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
-            std::cout << "atol: " << get_atol<CDataType>() << std::endl;
-            std::cout << std::endl << std::endl;
-        }
     }

-    if((config.do_verification == 2) || (config.do_verification == 3))
-    {
-        Tensor<CDataType> c_m_n_device_ref_result(
-            f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
-        DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) *
-                                       c_m_n_device_ref_result.mDesc.GetElementSpaceSize());
-
-        // GPU verification
-        using ReferenceComputeType = float;
-        using ReferenceGemmInstanceGPU =
-            ck::tensor_operation::device::ReferenceGemm<ALayout,
-                                                        BLayout,
-                                                        CLayout,
-                                                        ADataType,
-                                                        BDataType,
-                                                        CDataType,
-                                                        AccDataType,
-                                                        AElementOp,
-                                                        BElementOp,
-                                                        CElementOp,
-                                                        ReferenceComputeType,
-                                                        ReferenceComputeType>;
-
-        auto ref_gemm_gpu = ReferenceGemmInstanceGPU{};
-        auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker();
-
-        auto ref_argument_gpu = ref_gemm_gpu.MakeArgument(
-            static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-            static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-            static_cast<CDataType*>(c_m_n_device_ref_buf.GetDeviceBuffer()),
-            M,
-            N,
-            K,
-            a_element_op,
-            b_element_op,
-            c_element_op);
-
-        std::cout << "Running verification on GPU." << std::endl;
-        ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{});
-
-        c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data());
-        c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-        auto gpu_pass = ck::utils::check_err(c_m_n_device_result,
-                                             c_m_n_device_ref_result,
-                                             "Error: Incorrect results!",
-                                             get_rtol<CDataType>(),
-                                             get_atol<CDataType>());
-        if(gpu_pass)
-            std::cout << "Verification on GPU: PASS" << std::endl;
-
-        pass &= gpu_pass;
-    }

     if(config.time_kernel)
     {
         float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel, 1});
@@ -372,7 +208,7 @@ bool run_splitK_gemm_example(int argc, char* argv[])
     }
     else
     {
-        printf("arg1: verification (0=no, 1=CPU, 2=GPU, 3=CPU and GPU)\n");
+        printf("arg1: verification (0=no, 1=yes)\n");
         printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf("arg4: KBatch\n");
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>

@@ -16,7 +16,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

 template <ck::index_t... Is>
...
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #include <iostream>
 #include <numeric>

@@ -16,7 +16,6 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
 #include "ck/library/utility/literals.hpp"

 template <ck::index_t... Is>
...