Unverified commit f015c776 authored by Anthony Chang, committed by GitHub

use single threaded tensor generator (#161)

parent c8f3acf9
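
This commit changes every std::thread::hardware_concurrency() default to 1, so host-side tensor generation (and the host reference code that reuses the same thread-count parameter) runs on the calling thread unless a caller explicitly asks for more. The sketch below is a minimal, self-contained stand-in for that pattern, not the repository's ParallelTensorFunctor or Tensor::GenerateTensorValue; the function name generate_tensor_value and its signature are hypothetical. It shows the ceiling-division work split that appears in the diff and how a default of 1 bypasses thread creation entirely.

    // Sketch only: a stand-in for the library's parallel tensor generation.
    // Names here are hypothetical, not the repository's API.
    #include <algorithm>
    #include <cstddef>
    #include <thread>
    #include <vector>

    // Fill data by evaluating gen(i) for every flat index i, split across
    // num_thread workers with ceiling division. The default of 1 mirrors the
    // new behavior in this commit; passing std::thread::hardware_concurrency()
    // restores the old default.
    template <typename T, typename Generator>
    void generate_tensor_value(std::vector<T>& data, Generator gen, std::size_t num_thread = 1)
    {
        const std::size_t n               = data.size();
        const std::size_t work_per_thread = (n + num_thread - 1) / num_thread; // ceiling division

        auto worker = [&](std::size_t begin, std::size_t end) {
            for(std::size_t i = begin; i < end; ++i)
                data[i] = gen(i);
        };

        if(num_thread <= 1)
        {
            worker(0, n); // single-threaded path: no thread spawn per tensor
            return;
        }

        std::vector<std::thread> threads;
        for(std::size_t t = 0; t < num_thread; ++t)
        {
            const std::size_t begin = t * work_per_thread;
            const std::size_t end   = std::min(n, begin + work_per_thread);
            if(begin < end)
                threads.emplace_back(worker, begin, end);
        }
        for(auto& th : threads)
            th.join();
    }

Only the default changes: the num_thread parameter remains, so any call site can still pass std::thread::hardware_concurrency() explicitly to get the old multithreaded behavior back.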
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
 float alpha = args.scales[0];
 float beta = args.scales[1];
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 if(args.do_verification)
 {

@@ -277,7 +277,7 @@ struct ReductionHost
 out_indices[dst_offset] = accuIndex;
 };
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 std::size_t work_per_thread =
 (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

@@ -374,7 +374,7 @@ struct ReductionHost
 out_data[dst_offset] = type_convert<OutDataType>(accuVal);
 };
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 std::size_t work_per_thread =
 (invariant_dim_indexes.size() + num_thread - 1) / num_thread;

@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
 return indices;
 }
-void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
+void operator()(std::size_t num_thread = 1) const
 {
 std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

@@ -213,7 +213,7 @@ struct Tensor
 Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
 template <typename G>
-void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
+void GenerateTensorValue(G g, std::size_t num_thread = 1)
 {
 switch(mDesc.GetNumOfDimension())
 {

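The host tensor changes above touch only the defaults of ParallelTensorFunctor::operator() and Tensor::GenerateTensorValue; the work split itself is unchanged. With num_thread = 1, the ceiling-division formula in the hunk degenerates to a single chunk covering every element, which the standalone check below (not repository code) confirms.

    // Standalone check (not repository code): with num_thread == 1 the
    // ceiling-division split covers the whole flat index range in one chunk.
    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t mN1d            = 1000003; // arbitrary total element count
        const std::size_t num_thread      = 1;       // new default
        const std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;

        assert(work_per_thread == mN1d); // a single worker handles [0, mN1d)
        return 0;
    }
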
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
 print_array("ConvStrides", make_tuple(conv_stride_h, conv_stride_w));
 print_array("ConvDilations", make_tuple(conv_dilation_h, conv_dilation_w));
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
 ostream_HostTensorDescriptor(b.mDesc, std::cout << "b: ");
 ostream_HostTensorDescriptor(c_host.mDesc, std::cout << "c: ");
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
 std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
 std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
 std::cout << "c0_m_n: " << c0_m_n.mDesc << std::endl;
 std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
 std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
 std::cout << "c0_n: " << c0_n.mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
 std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
 std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
 if(gemm_ptr->IsSupportedArgument(argument_ptr.get()))
 {
+// re-init C to zero before profiling next kernel
+c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
+c_device_buf.ToDevice(c_m_n_device_result.mData.data());
 std::string gemm_name = gemm_ptr->GetTypeString();
 float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);

@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
 std::cout << "d0_m: " << d0_m_host_result.mDesc << std::endl;
 std::cout << "d1_m: " << d1_m_host_result.mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
 << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i
 << "]:" << c_m_n_device_results[i].mDesc << std::endl;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {
 case 0: break;

@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
 size_t invariant_total_length = out.mDesc.GetElementSize();
 size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 if(do_verification)
 {

@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
 f_host_tensor_descriptor(args.M, args.N, args.StrideC, c_row_major));
 // init data
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 a_m_k.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
 b_k_n.GenerateTensorValue(GeneratorTensor_2<float>{-5, 5}, num_thread);
 // set zero to c_device_buf

@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
 size_t invariant_total_length = out.mDesc.GetElementSize();
 size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {

@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
 size_t invariant_total_length = out.mDesc.GetElementSize();
 size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
-std::size_t num_thread = std::thread::hardware_concurrency();
+std::size_t num_thread = 1;
 switch(init_method)
 {