Unverified Commit f015c776 authored by Anthony Chang, committed by GitHub
Browse files

use single threaded tensor generator (#161)

parent c8f3acf9
...@@ -261,7 +261,7 @@ int main(int argc, char* argv[]) ...@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
float alpha = args.scales[0]; float alpha = args.scales[0];
float beta = args.scales[1]; float beta = args.scales[1];
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
if(args.do_verification) if(args.do_verification)
{ {
......
...@@ -277,7 +277,7 @@ struct ReductionHost ...@@ -277,7 +277,7 @@ struct ReductionHost
out_indices[dst_offset] = accuIndex; out_indices[dst_offset] = accuIndex;
}; };
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
std::size_t work_per_thread = std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread; (invariant_dim_indexes.size() + num_thread - 1) / num_thread;
...@@ -374,7 +374,7 @@ struct ReductionHost ...@@ -374,7 +374,7 @@ struct ReductionHost
out_data[dst_offset] = type_convert<OutDataType>(accuVal); out_data[dst_offset] = type_convert<OutDataType>(accuVal);
}; };
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
std::size_t work_per_thread = std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread; (invariant_dim_indexes.size() + num_thread - 1) / num_thread;
......
...@@ -163,7 +163,7 @@ struct ParallelTensorFunctor ...@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
return indices; return indices;
} }
void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const void operator()(std::size_t num_thread = 1) const
{ {
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
...@@ -213,7 +213,7 @@ struct Tensor ...@@ -213,7 +213,7 @@ struct Tensor
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {} Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
template <typename G> template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency()) void GenerateTensorValue(G g, std::size_t num_thread = 1)
{ {
switch(mDesc.GetNumOfDimension()) switch(mDesc.GetNumOfDimension())
{ {
......
...@@ -302,7 +302,7 @@ int main(int argc, char* argv[]) ...@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -317,7 +317,7 @@ int main(int argc, char* argv[]) ...@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -319,7 +319,7 @@ int main(int argc, char* argv[]) ...@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -282,7 +282,7 @@ int main(int argc, char* argv[]) ...@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -300,7 +300,7 @@ int main(int argc, char* argv[]) ...@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -289,7 +289,7 @@ int main(int argc, char* argv[]) ...@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w)); print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w)); print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -313,7 +313,7 @@ int main(int argc, char* argv[]) ...@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: "); ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: "); ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification, ...@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
......
...@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification, ...@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl; std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
......
...@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification, ...@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "c0_n: " << c0_n.mDesc << std::endl; std::cout << "c0_n: " << c0_n.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
......
...@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification, ...@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
...@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification, ...@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
// re-init C to zero before profiling next kernel
c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
c_device_buf.ToDevice(c_m_n_device_result.mData.data());
std::string gemm_name = gemm_ptr->GetTypeString(); std::string gemm_name = gemm_ptr->GetTypeString();
float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
......
...@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification, ...@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl; std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl; std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
......
...@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification, ...@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
<< "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
<< "]:" << c_m_n_device_results[i].mDesc << std::endl; << "]:" << c_m_n_device_results[i].mDesc << std::endl;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
case 0: break; case 0: break;
......
...@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification, ...@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
if(do_verification) if(do_verification)
{ {
......
...@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args) ...@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major)); f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
// init data // init data
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread); a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread); b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
// set zero to c_device_buf // set zero to c_device_buf
......
...@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method, ...@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
...@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method, ...@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
std::size_t num_thread = std::thread::hardware_concurrency(); std::size_t num_thread = 1;
switch(init_method) switch(init_method)
{ {
......
Markdown is supported
Attach a file by dragging & dropping, selecting or pasting it (upload progress: 0%).
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment