Commit 1945c26b authored by Adam Osewski

Reduce the amount of logged output. Add constant initialization.

parent 88436bd9
@@ -19,6 +19,7 @@
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/literals.hpp"
+#include "ck/library/utility/fill.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"

 namespace ck {
@@ -43,7 +44,6 @@ bool profile_grouped_gemm_impl(int do_verification,
                                const std::vector<int>& StrideCs,
                                int kbatch = 1)
 {
-
     bool pass = true;

     auto f_host_tensor_descriptor =
@@ -81,15 +81,18 @@ bool profile_grouped_gemm_impl(int do_verification,
         c_m_n_device_results.push_back(
             Tensor<CDataType>(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})));

+#if DEBUG_LOG
         std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i
                   << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
                   << "]:" << c_m_n_device_results[i].mDesc << std::endl;
+#endif // DEBUG_LOG

         std::size_t num_thread = 1;
         switch(init_method)
         {
-        case 0: break;
+        case 0:
+            utils::FillConstant<ADataType>{1.0}(a_m_k[i]);
+            utils::FillConstant<BDataType>{1.0}(b_k_n[i]);
+            break;
         case 1:
             a_m_k[i].GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
             b_k_n[i].GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
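Why constant initialization is useful here: with A and B filled entirely with 1.0, every element of C = A*B equals K, so a miscomputed output is obvious at a glance. Below is a minimal sketch of a constant-fill functor, assuming only the call syntax visible in the hunk above; the actual `ck::utils::FillConstant` in `fill.hpp` may differ in detail:

```cpp
#include <algorithm>
#include <vector>

// Sketch of a constant-fill functor matching the call syntax
// utils::FillConstant<T>{value}(tensor); not the actual CK implementation.
template <typename T>
struct FillConstant
{
    T value_;

    template <typename Container>
    void operator()(Container& c) const
    {
        std::fill(c.begin(), c.end(), value_);
    }
};

int main()
{
    std::vector<float> a(4 * 8, 0.f);
    FillConstant<float>{1.f}(a); // every element of A becomes 1.0
    return 0;
}
```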
@@ -191,10 +194,10 @@ bool profile_grouped_gemm_impl(int do_verification,
         DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get()));
         gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer());

+        std::string gemm_name = gemm_ptr->GetTypeString();
+
         if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            std::string gemm_name = gemm_ptr->GetTypeString();
-
             if(kbatch > 1)
             {
@@ -221,35 +224,41 @@ bool profile_grouped_gemm_impl(int do_verification,
             float ave_time =
                 invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

-            std::size_t flop = 0, num_btype = 0;
-            for(std::size_t i = 0; i < gemm_descs.size(); i++)
+            if(time_kernel)
             {
-                flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
-                num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] +
-                             sizeof(CDataType) * Ms[i] * Ns[i];
-            }
+                std::size_t flop = 0, num_btype = 0;
+                for(std::size_t i = 0; i < gemm_descs.size(); i++)
+                {
+                    flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i];
+                    num_btype += sizeof(ADataType) * Ms[i] * Ks[i] +
+                                 sizeof(BDataType) * Ks[i] * Ns[i] +
+                                 sizeof(CDataType) * Ms[i] * Ns[i];
+                }

-            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-            float gb_per_sec = num_btype / 1.E6 / ave_time;
+                float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+                float gb_per_sec = num_btype / 1.E6 / ave_time;

-            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                      << gb_per_sec << " GB/s, " << gemm_name << std::endl;
+                std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
+                          << " TFlops, " << gb_per_sec << " GB/s, " << gemm_name << std::endl;

-            if(tflops > best_tflops)
-            {
-                best_gemm_name = gemm_name;
-                best_tflops = tflops;
-                best_ave_time = ave_time;
-                best_gb_per_sec = gb_per_sec;
+                if(tflops > best_tflops)
+                {
+                    best_gemm_name = gemm_name;
+                    best_tflops = tflops;
+                    best_ave_time = ave_time;
+                    best_gb_per_sec = gb_per_sec;
+                }
             }

             if(do_verification)
             {
+                bool instance_pass = true;
                 for(std::size_t i = 0; i < gemm_descs.size(); i++)
                 {
                     c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data());
+                    c_device_buf[i]->SetZero();

                     Tensor<CDataType> c_m_n_host_result(
                         f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}));
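A note on the units in the perf computation above: with `ave_time` in milliseconds, `flop / 1.E9 / ave_time` is Gflop per ms, which equals Tflop/s, and `num_btype / 1.E6 / ave_time` is MB per ms, i.e. GB/s. A self-contained check with a hypothetical single group (M = N = K = 1024, 2-byte A/B/C data types; the sizes and timing are assumptions for illustration):

```cpp
#include <cstddef>
#include <iostream>

// Standalone check of the unit conversions used in the hunk above.
int main()
{
    const std::size_t M = 1024, N = 1024, K = 1024;
    const std::size_t flop      = std::size_t(2) * M * N * K;  // 2*M*N*K multiply-adds
    const std::size_t num_btype = 2 * (M * K + K * N + M * N); // bytes moved, 2 B/element

    const float ave_time   = 0.05f; // ms (assumed kernel time)
    const float tflops     = static_cast<float>(flop) / 1.E9f / ave_time;      // Gflop/ms == Tflop/s
    const float gb_per_sec = static_cast<float>(num_btype) / 1.E6f / ave_time; // MB/ms == GB/s

    std::cout << tflops << " TFlops, " << gb_per_sec << " GB/s\n"; // ~42.9 TFlops, ~125.8 GB/s
    return 0;
}
```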
@@ -274,7 +283,8 @@ bool profile_grouped_gemm_impl(int do_verification,
                         c_element_op);

                     ref_invoker.Run(ref_argument);

-                    pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result);
+                    instance_pass = instance_pass && ck::utils::check_err(c_m_n_device_results[i],
+                                                                          c_m_n_host_result);

                     if(do_log)
                     {
@@ -289,16 +299,25 @@ bool profile_grouped_gemm_impl(int do_verification,
                                   << std::endl;
                     }
                 }
+
+                std::cout << "Instance: " << gemm_name << " verification "
+                          << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
+
+                pass = pass && instance_pass;
             }
         }
         else
         {
-            std::cout << "does not support this GEMM problem" << std::endl;
+            std::cout << "Instance: " << gemm_name << ", does not support this GEMM problem"
+                      << std::endl;
         }
     }

-    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
-              << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    if(time_kernel)
+    {
+        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+                  << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl;
+    }

     return pass;
 }
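Two behavioral points in the verification path are worth noting: each instance now reports its own SUCCEED/FAILED line and the overall `pass` is the AND over all instances, and the device output buffer is zeroed right after its contents are copied back, presumably so a later instance cannot appear correct by leaving the previous instance's results in place. An illustrative sketch of that flow (names are placeholders, not CK API; `std::vector` stands in for `DeviceMem`):

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Run one instance over all groups' output buffers, report its own result,
// and clear each buffer after copy-back so no stale results survive.
bool run_and_verify(std::vector<std::vector<float>>& device_out_bufs,
                    const std::string& instance_name)
{
    bool instance_pass = true;
    for(auto& buf : device_out_bufs)
    {
        // ... launch the kernel, copy buf to the host, compare with a reference ...
        std::fill(buf.begin(), buf.end(), 0.f); // analogous to c_device_buf[i]->SetZero()
    }
    std::cout << "Instance: " << instance_name << " verification "
              << (instance_pass ? "SUCCEED" : "FAILED") << std::endl;
    return instance_pass;
}

int main()
{
    std::vector<std::vector<float>> bufs(2, std::vector<float>(16));
    bool pass = true;
    for(const auto& name : {"instance_a", "instance_b"})
    {
        const bool ok = run_and_verify(bufs, name);
        pass = pass && ok; // overall result is the AND across all instances
    }
    return pass ? 0 : 1;
}
```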