Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f015c776
Unverified
Commit
f015c776
authored
Mar 31, 2022
by
Anthony Chang
Committed by
GitHub
Mar 30, 2022
Browse files
use single threaded tensor generator (#161)
parent
c8f3acf9
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
26 additions
and
22 deletions
+26
-22
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+1
-1
library/include/ck/library/host_tensor/host_reduction.hpp
library/include/ck/library/host_tensor/host_reduction.hpp
+2
-2
library/include/ck/library/host_tensor/host_tensor.hpp
library/include/ck/library/host_tensor/host_tensor.hpp
+2
-2
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
...lete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
...obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
..._driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
...y/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/gemm_driver_offline.cpp
library/src/obselete_driver_offline/gemm_driver_offline.cpp
+1
-1
profiler/include/profile_batched_gemm_impl.hpp
profiler/include/profile_batched_gemm_impl.hpp
+1
-1
profiler/include/profile_gemm_bias_2d_impl.hpp
profiler/include/profile_gemm_bias_2d_impl.hpp
+1
-1
profiler/include/profile_gemm_bias_relu_impl.hpp
profiler/include/profile_gemm_bias_relu_impl.hpp
+1
-1
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+5
-1
profiler/include/profile_gemm_reduce_impl.hpp
profiler/include/profile_gemm_reduce_impl.hpp
+1
-1
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+1
-1
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+1
-1
test/gemm_split_k/gemm_split_k.cpp
test/gemm_split_k/gemm_split_k.cpp
+1
-1
test/reduce/reduce_no_index.cpp
test/reduce/reduce_no_index.cpp
+1
-1
test/reduce/reduce_with_index.cpp
test/reduce/reduce_with_index.cpp
+1
-1
No files found.
example/12_reduce/reduce_blockwise.cpp
View file @
f015c776
...
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
...
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
float
alpha
=
args
.
scales
[
0
];
float
alpha
=
args
.
scales
[
0
];
float
beta
=
args
.
scales
[
1
];
float
beta
=
args
.
scales
[
1
];
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
if
(
args
.
do_verification
)
if
(
args
.
do_verification
)
{
{
...
...
library/include/ck/library/host_tensor/host_reduction.hpp
View file @
f015c776
...
@@ -277,7 +277,7 @@ struct ReductionHost
...
@@ -277,7 +277,7 @@ struct ReductionHost
out_indices
[
dst_offset
]
=
accuIndex
;
out_indices
[
dst_offset
]
=
accuIndex
;
};
};
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
work_per_thread
=
std
::
size_t
work_per_thread
=
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
...
@@ -374,7 +374,7 @@ struct ReductionHost
...
@@ -374,7 +374,7 @@ struct ReductionHost
out_data
[
dst_offset
]
=
type_convert
<
OutDataType
>
(
accuVal
);
out_data
[
dst_offset
]
=
type_convert
<
OutDataType
>
(
accuVal
);
};
};
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
work_per_thread
=
std
::
size_t
work_per_thread
=
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
...
...
library/include/ck/library/host_tensor/host_tensor.hpp
View file @
f015c776
...
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
...
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
return
indices
;
return
indices
;
}
}
void
operator
()(
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
)
const
void
operator
()(
std
::
size_t
num_thread
=
1
)
const
{
{
std
::
size_t
work_per_thread
=
(
mN1d
+
num_thread
-
1
)
/
num_thread
;
std
::
size_t
work_per_thread
=
(
mN1d
+
num_thread
-
1
)
/
num_thread
;
...
@@ -213,7 +213,7 @@ struct Tensor
...
@@ -213,7 +213,7 @@ struct Tensor
Tensor
(
const
HostTensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
Tensor
(
const
HostTensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
template
<
typename
G
>
template
<
typename
G
>
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
)
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
{
{
switch
(
mDesc
.
GetNumOfDimension
())
switch
(
mDesc
.
GetNumOfDimension
())
{
{
...
...
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
...
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
View file @
f015c776
...
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
...
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
View file @
f015c776
...
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
...
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
...
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
...
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
View file @
f015c776
...
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
...
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/gemm_driver_offline.cpp
View file @
f015c776
...
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
...
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
ostream_HostTensorDescriptor
(
b
.
mDesc
,
std
::
cout
<<
"b: "
);
ostream_HostTensorDescriptor
(
b
.
mDesc
,
std
::
cout
<<
"b: "
);
ostream_HostTensorDescriptor
(
c_host
.
mDesc
,
std
::
cout
<<
"c: "
);
ostream_HostTensorDescriptor
(
c_host
.
mDesc
,
std
::
cout
<<
"c: "
);
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
profiler/include/profile_batched_gemm_impl.hpp
View file @
f015c776
...
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
...
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
std
::
cout
<<
"b_g_k_n: "
<<
b_g_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_g_k_n: "
<<
b_g_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_g_m_n: "
<<
c_g_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_g_m_n: "
<<
c_g_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_bias_2d_impl.hpp
View file @
f015c776
...
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
...
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
std
::
cout
<<
"c0_m_n: "
<<
c0_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_m_n: "
<<
c0_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_bias_relu_impl.hpp
View file @
f015c776
...
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
...
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_n: "
<<
c0_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_n: "
<<
c0_n
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_impl.hpp
View file @
f015c776
...
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
...
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_device_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_device_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
...
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
// re-init C to zero before profiling next kernel
c_m_n_device_result
.
GenerateTensorValue
(
GeneratorTensor_0
<
CDataType
>
{},
num_thread
);
c_device_buf
.
ToDevice
(
c_m_n_device_result
.
mData
.
data
());
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
...
...
profiler/include/profile_gemm_reduce_impl.hpp
View file @
f015c776
...
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
...
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
std
::
cout
<<
"d0_m: "
<<
d0_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d0_m: "
<<
d0_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d1_m: "
<<
d1_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d1_m: "
<<
d1_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
f015c776
...
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
...
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_reduce_impl.hpp
View file @
f015c776
...
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
if
(
do_verification
)
if
(
do_verification
)
{
{
...
...
test/gemm_split_k/gemm_split_k.cpp
View file @
f015c776
...
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
...
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
f_host_tensor_descriptor
(
args
.
M
,
args
.
N
,
args
.
StrideC
,
c_row_major
));
f_host_tensor_descriptor
(
args
.
M
,
args
.
N
,
args
.
StrideC
,
c_row_major
));
// init data
// init data
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
// set zero to c_device_buf
// set zero to c_device_buf
...
...
test/reduce/reduce_no_index.cpp
View file @
f015c776
...
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
...
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
test/reduce/reduce_with_index.cpp
View file @
f015c776
...
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
...
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment