Commit 3289a5c9 authored by Andriy Roshchenko
Browse files

Narrowing the scope of PR to OCP FP8 enablement only

parent dbfb222d
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
......
...@@ -24,7 +24,6 @@ ...@@ -24,7 +24,6 @@
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp"
#include "ck/library/utility/literals.hpp" #include "ck/library/utility/literals.hpp"
template <ck::index_t... Is> template <ck::index_t... Is>
......
...@@ -40,7 +40,7 @@ using BF8 = ck::bf8_t; ...@@ -40,7 +40,7 @@ using BF8 = ck::bf8_t;
struct ExecutionConfig final struct ExecutionConfig final
{ {
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
}; };
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp"
#include "common.hpp" #include "common.hpp"
...@@ -11,7 +11,6 @@ using AccDataType = FP32; ...@@ -11,7 +11,6 @@ using AccDataType = FP32;
using CShuffleDataType = FP16; using CShuffleDataType = FP16;
using DsDataType = ck::Tuple<>; using DsDataType = ck::Tuple<>;
using InDataType = FP16; using InDataType = FP16;
using VerifyDataType = FP16; // is used for selection of check tolerances
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::GNHWK;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
#include "common.hpp" #include "common.hpp"
...@@ -10,7 +10,6 @@ using AccDataType = FP32; ...@@ -10,7 +10,6 @@ using AccDataType = FP32;
using CShuffleDataType = FP16; using CShuffleDataType = FP16;
using DsDataType = ck::Tuple<>; using DsDataType = ck::Tuple<>;
using InDataType = FP16; using InDataType = FP16;
using VerifyDataType = FP16; // is used for selection of check tolerances
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::GNHWK;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp" #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp"
#include "common.hpp" #include "common.hpp"
...@@ -12,7 +12,6 @@ using DsDataType = ck::Tuple<>; ...@@ -12,7 +12,6 @@ using DsDataType = ck::Tuple<>;
using InDataType = FP16; using InDataType = FP16;
using AComputeType = BF8; using AComputeType = BF8;
using BComputeType = FP8; using BComputeType = FP8;
using VerifyDataType = BF8; // is used for selection of check tolerances
using OutLayout = ck::tensor_layout::convolution::GNHWK; using OutLayout = ck::tensor_layout::convolution::GNHWK;
using WeiLayout = ck::tensor_layout::convolution::GKYXC; using WeiLayout = ck::tensor_layout::convolution::GKYXC;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
/// @brief Compile-time lookup of the relative tolerance associated with DataType.
///
/// The value is handed to ck::utils::check_err when device results are compared
/// against the host reference.
///
/// @tparam DataType Element type whose tolerance is requested.
/// @return 1e-3 for float and ck::half_t, 1e-6 for double, 5e-2 for ck::bhalf_t,
///         1e-1 for int32_t and int8_t, 1 for ck::f8_t and ck::bf8_t, and 1e-3
///         for any type not listed.
template <typename DataType>
inline __host__ __device__ constexpr double get_rtol()
{
    // One predicate per tolerance class; types sharing a value are grouped.
    constexpr bool is_float  = std::is_same_v<DataType, float>;
    constexpr bool is_double = std::is_same_v<DataType, double>;
    constexpr bool is_half   = std::is_same_v<DataType, ck::half_t>;
    constexpr bool is_bhalf  = std::is_same_v<DataType, ck::bhalf_t>;
    constexpr bool is_int =
        std::is_same_v<DataType, int32_t> || std::is_same_v<DataType, int8_t>;
    constexpr bool is_f8 =
        std::is_same_v<DataType, ck::f8_t> || std::is_same_v<DataType, ck::bf8_t>;

    // Predicates are tested in a fixed precedence order; the final value is the
    // fallback for types that match none of them.
    return is_float    ? 1e-3
           : is_double ? 1e-6
           : is_half   ? 1e-3
           : is_bhalf  ? 5e-2
           : is_int    ? 1e-1
           : is_f8     ? 1.0
                       : 1e-3;
}
/// @brief Compile-time lookup of the absolute tolerance associated with DataType.
///
/// The value is handed to ck::utils::check_err when device results are compared
/// against the host reference.
///
/// @tparam DataType Element type whose tolerance is requested.
/// @return 1e-3 for float and ck::half_t, 1e-6 for double, 5e-2 for ck::bhalf_t,
///         1e-1 for int32_t and int8_t, 1 for ck::f8_t and ck::bf8_t, and 1e-3
///         for any type not listed.
template <typename DataType>
inline __host__ __device__ constexpr double get_atol()
{
    // One predicate per tolerance class; types sharing a value are grouped.
    constexpr bool is_float  = std::is_same_v<DataType, float>;
    constexpr bool is_double = std::is_same_v<DataType, double>;
    constexpr bool is_half   = std::is_same_v<DataType, ck::half_t>;
    constexpr bool is_bhalf  = std::is_same_v<DataType, ck::bhalf_t>;
    constexpr bool is_int =
        std::is_same_v<DataType, int32_t> || std::is_same_v<DataType, int8_t>;
    constexpr bool is_f8 =
        std::is_same_v<DataType, ck::f8_t> || std::is_same_v<DataType, ck::bf8_t>;

    // Predicates are tested in a fixed precedence order; the final value is the
    // fallback for types that match none of them.
    return is_float    ? 1e-3
           : is_double ? 1e-6
           : is_half   ? 1e-3
           : is_bhalf  ? 5e-2
           : is_int    ? 1e-1
           : is_f8     ? 1.0
                       : 1e-3;
}
bool run_conv_bwd_data(const ExecutionConfig& config, bool run_conv_bwd_data(const ExecutionConfig& config,
const ck::utils::conv::ConvParam& conv_params, const ck::utils::conv::ConvParam& conv_params,
...@@ -108,7 +27,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config, ...@@ -108,7 +27,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config,
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
break; break;
default: default:
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5}); out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
} }
...@@ -219,11 +138,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config, ...@@ -219,11 +138,7 @@ bool run_conv_bwd_data(const ExecutionConfig& config,
in_device_buf.FromDevice(in_device.mData.data()); in_device_buf.FromDevice(in_device.mData.data());
return ck::utils::check_err(in_device.mData, return ck::utils::check_err(in_device.mData, in_host.mData);
in_host.mData,
"Error: Incorrect results!",
get_rtol<VerifyDataType>(),
get_atol<VerifyDataType>());
} }
return true; return true;
......
...@@ -194,9 +194,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -194,9 +194,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b1_tensors[i].GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5}); b1_tensors[i].GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
break; break;
default: default:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_1<A0DataType>{1}); a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A0DataType, 0>{});
b0_tensors[i].GenerateTensorValue(GeneratorTensor_1<B0DataType>{1}); b0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
b1_tensors[i].GenerateTensorValue(GeneratorTensor_1<B1DataType>{1}); b1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B1DataType, 1>{});
} }
d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5}); d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
......
...@@ -184,9 +184,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -184,9 +184,9 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
b_tensors[i].GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5}); b_tensors[i].GenerateTensorValue(GeneratorTensor_3<B0DataType>{-0.5, 0.5});
break; break;
default: default:
a0_tensors[i].GenerateTensorValue(GeneratorTensor_1<A0DataType>{1}); a0_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A0DataType, 0>{});
a1_tensors[i].GenerateTensorValue(GeneratorTensor_1<A1DataType>{1}); a1_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<A1DataType, 0>{});
b_tensors[i].GenerateTensorValue(GeneratorTensor_1<B0DataType>{-1}); b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<B0DataType, 1>{});
} }
d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5}); d0_tensors[i].GenerateTensorValue(GeneratorTensor_3<D0DataType>{-0.5, 0.5});
......
...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
......
...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
......
...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
......
...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
......
...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[]) ...@@ -8,7 +8,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
print_helper_msg(); print_helper_msg();
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
ck::utils::conv::ConvParam conv_param{ ck::utils::conv::ConvParam conv_param{
......
...@@ -86,7 +86,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu ...@@ -86,7 +86,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShu
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = true; bool do_verification = true;
int init_method = 2; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
// GEMM shape // GEMM shape
......
...@@ -78,7 +78,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_ ...@@ -78,7 +78,7 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_ABScale_
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
bool do_verification = true; bool do_verification = true;
int init_method = 5; int init_method = 1;
bool time_kernel = false; bool time_kernel = false;
// GEMM shape // GEMM shape
...@@ -186,20 +186,6 @@ int main(int argc, char* argv[]) ...@@ -186,20 +186,6 @@ int main(int argc, char* argv[])
b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{}); b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{});
a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0}); a1_m_k.GenerateTensorValue(GeneratorTensor_3<A1DataType>{0, 1.0});
b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0}); b1_k_n.GenerateTensorValue(GeneratorTensor_3<B1DataType>{0, 1.0});
break;
case 6:
a0_m_k.GenerateTensorValue(GeneratorTensor_PI<A0DataType>{});
b0_k_n.GenerateTensorValue(GeneratorTensor_1<B0DataType>{0.5f});
a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{0.5});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{4});
break;
case 7:
a0_m_k.GenerateTensorValue(GeneratorTensor_PI_A<A0DataType>{});
b0_k_n.GenerateTensorValue(GeneratorTensor_PI_B<B0DataType>{});
a1_m_k.GenerateTensorValue(GeneratorTensor_1<A1DataType>{2});
b1_k_n.GenerateTensorValue(GeneratorTensor_1<B1DataType>{0.5});
break; break;
default: default:
a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5}); a0_m_k.GenerateTensorValue(GeneratorTensor_3<A0DataType>{-0.5, 0.5});
...@@ -254,28 +240,23 @@ int main(int argc, char* argv[]) ...@@ -254,28 +240,23 @@ int main(int argc, char* argv[])
"not support this GEMM problem"); "not support this GEMM problem");
} }
std::cout << "Compute GEMM on device... \n";
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50}); float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 20, 50});
std::cout << "DONE!" << std::endl;
if(time_kernel)
{
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time; std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(EDataType) * M * N;
float gb_per_sec = num_btype / 1.E6 / ave_time; float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec float gb_per_sec = num_btype / 1.E6 / ave_time;
<< " GB/s" << std::endl;
} std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(do_verification) if(do_verification)
{ {
std::cout << "Running verification on CPU." << std::endl;
Tensor<AccDataType> c_m_n({M, N}); Tensor<AccDataType> c_m_n({M, N});
Tensor<float> a_m_k({M, K}); Tensor<float> a_m_k({M, K});
Tensor<float> b_k_n({K, N}); Tensor<float> b_k_n({K, N});
...@@ -325,28 +306,10 @@ int main(int argc, char* argv[]) ...@@ -325,28 +306,10 @@ int main(int argc, char* argv[])
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); e_device_buf.FromDevice(e_m_n_device_result.mData.data());
if(init_method == 6 || init_method == 7) return ck::utils::check_err(
{ e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-2, 5e-2)
std::cout << std::fixed << std::setprecision(16); ? 0
: 1;
float d = ck::type_convert<float>(e_m_n_device_result(0, 10));
float h = ck::type_convert<float>(e_m_n_host_result(10, 0));
std::cout << "device result: " << d << std::endl;
std::cout << "host result: " << h << std::endl;
std::cout << "expected result: " << M_PI << std::endl;
std::cout << "device - host: " << std::abs(d - h) << std::endl;
std::cout << "device - expected: " << std::abs(d - M_PI) << std::endl;
std::cout << "atol: " << 5e-2 << std::endl;
}
if(ck::utils::check_err(
e_m_n_device_result, e_m_n_host_result, "Error: Incorrect results!", 5e-2, 5e-2))
{
std::cout << "Verification on CPU: PASS" << std::endl;
return 0;
}
else
return 1;
} }
return 0; return 0;
......
...@@ -847,7 +847,7 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, ...@@ -847,7 +847,7 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>( vector_t tmp{amd_buffer_load_impl<scalar_t, vector_size, coherence>(
src_wave_buffer_resource, src_thread_addr_offset, 0)}; src_wave_buffer_resource, src_thread_addr_offset, 0)};
return src_thread_element_valid ? tmp : vector_t{0}; return src_thread_element_valid ? tmp : vector_t(0);
#endif #endif
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment