Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel
Commits
f015c776
You need to sign in or sign up before continuing.
Unverified
Commit
f015c776
authored
Mar 31, 2022
by
Anthony Chang
Committed by
GitHub
Mar 30, 2022
Browse files
use single threaded tensor generator (#161)
parent
c8f3acf9
Changes
20
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
26 additions
and
22 deletions
+26
-22
example/12_reduce/reduce_blockwise.cpp
example/12_reduce/reduce_blockwise.cpp
+1
-1
library/include/ck/library/host_tensor/host_reduction.hpp
library/include/ck/library/host_tensor/host_reduction.hpp
+2
-2
library/include/ck/library/host_tensor/host_tensor.hpp
library/include/ck/library/host_tensor/host_tensor.hpp
+2
-2
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
...lete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
...y/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
...obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
..._driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
+1
-1
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
...y/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
+1
-1
library/src/obselete_driver_offline/gemm_driver_offline.cpp
library/src/obselete_driver_offline/gemm_driver_offline.cpp
+1
-1
profiler/include/profile_batched_gemm_impl.hpp
profiler/include/profile_batched_gemm_impl.hpp
+1
-1
profiler/include/profile_gemm_bias_2d_impl.hpp
profiler/include/profile_gemm_bias_2d_impl.hpp
+1
-1
profiler/include/profile_gemm_bias_relu_impl.hpp
profiler/include/profile_gemm_bias_relu_impl.hpp
+1
-1
profiler/include/profile_gemm_impl.hpp
profiler/include/profile_gemm_impl.hpp
+5
-1
profiler/include/profile_gemm_reduce_impl.hpp
profiler/include/profile_gemm_reduce_impl.hpp
+1
-1
profiler/include/profile_grouped_gemm_impl.hpp
profiler/include/profile_grouped_gemm_impl.hpp
+1
-1
profiler/include/profile_reduce_impl.hpp
profiler/include/profile_reduce_impl.hpp
+1
-1
test/gemm_split_k/gemm_split_k.cpp
test/gemm_split_k/gemm_split_k.cpp
+1
-1
test/reduce/reduce_no_index.cpp
test/reduce/reduce_no_index.cpp
+1
-1
test/reduce/reduce_with_index.cpp
test/reduce/reduce_with_index.cpp
+1
-1
No files found.
example/12_reduce/reduce_blockwise.cpp
View file @
f015c776
...
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
...
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
float
alpha
=
args
.
scales
[
0
];
float
alpha
=
args
.
scales
[
0
];
float
beta
=
args
.
scales
[
1
];
float
beta
=
args
.
scales
[
1
];
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
if
(
args
.
do_verification
)
if
(
args
.
do_verification
)
{
{
...
...
library/include/ck/library/host_tensor/host_reduction.hpp
View file @
f015c776
...
@@ -277,7 +277,7 @@ struct ReductionHost
...
@@ -277,7 +277,7 @@ struct ReductionHost
out_indices
[
dst_offset
]
=
accuIndex
;
out_indices
[
dst_offset
]
=
accuIndex
;
};
};
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
work_per_thread
=
std
::
size_t
work_per_thread
=
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
...
@@ -374,7 +374,7 @@ struct ReductionHost
...
@@ -374,7 +374,7 @@ struct ReductionHost
out_data
[
dst_offset
]
=
type_convert
<
OutDataType
>
(
accuVal
);
out_data
[
dst_offset
]
=
type_convert
<
OutDataType
>
(
accuVal
);
};
};
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
std
::
size_t
work_per_thread
=
std
::
size_t
work_per_thread
=
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
(
invariant_dim_indexes
.
size
()
+
num_thread
-
1
)
/
num_thread
;
...
...
library/include/ck/library/host_tensor/host_tensor.hpp
View file @
f015c776
...
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
...
@@ -163,7 +163,7 @@ struct ParallelTensorFunctor
return
indices
;
return
indices
;
}
}
void
operator
()(
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
)
const
void
operator
()(
std
::
size_t
num_thread
=
1
)
const
{
{
std
::
size_t
work_per_thread
=
(
mN1d
+
num_thread
-
1
)
/
num_thread
;
std
::
size_t
work_per_thread
=
(
mN1d
+
num_thread
-
1
)
/
num_thread
;
...
@@ -213,7 +213,7 @@ struct Tensor
...
@@ -213,7 +213,7 @@ struct Tensor
Tensor
(
const
HostTensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
Tensor
(
const
HostTensorDescriptor
&
desc
)
:
mDesc
(
desc
),
mData
(
mDesc
.
GetElementSpace
())
{}
template
<
typename
G
>
template
<
typename
G
>
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
)
void
GenerateTensorValue
(
G
g
,
std
::
size_t
num_thread
=
1
)
{
{
switch
(
mDesc
.
GetNumOfDimension
())
switch
(
mDesc
.
GetNumOfDimension
())
{
{
...
...
library/src/obselete_driver_offline/conv_add_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
...
@@ -302,7 +302,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_bwd_driver_offline.cpp
View file @
f015c776
...
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
...
@@ -317,7 +317,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline.cpp
View file @
f015c776
...
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
...
@@ -319,7 +319,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
...
@@ -282,7 +282,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_maxpool_fwd_driver_offline_nchwc.cpp
View file @
f015c776
...
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
...
@@ -300,7 +300,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/conv_wrw_driver_offline.cpp
View file @
f015c776
...
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
...
@@ -289,7 +289,7 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvStrides"
,
make_tuple
(
conv_stride_h
,
conv_stride_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
print_array
(
"ConvDilations"
,
make_tuple
(
conv_dilation_h
,
conv_dilation_w
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
library/src/obselete_driver_offline/gemm_driver_offline.cpp
View file @
f015c776
...
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
...
@@ -313,7 +313,7 @@ int main(int argc, char* argv[])
ostream_HostTensorDescriptor
(
b
.
mDesc
,
std
::
cout
<<
"b: "
);
ostream_HostTensorDescriptor
(
b
.
mDesc
,
std
::
cout
<<
"b: "
);
ostream_HostTensorDescriptor
(
c_host
.
mDesc
,
std
::
cout
<<
"c: "
);
ostream_HostTensorDescriptor
(
c_host
.
mDesc
,
std
::
cout
<<
"c: "
);
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
profiler/include/profile_batched_gemm_impl.hpp
View file @
f015c776
...
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
...
@@ -103,7 +103,7 @@ bool profile_batched_gemm_impl(int do_verification,
std
::
cout
<<
"b_g_k_n: "
<<
b_g_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_g_k_n: "
<<
b_g_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_g_m_n: "
<<
c_g_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_g_m_n: "
<<
c_g_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_bias_2d_impl.hpp
View file @
f015c776
...
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
...
@@ -98,7 +98,7 @@ void profile_gemm_bias_2d_impl(int do_verification,
std
::
cout
<<
"c0_m_n: "
<<
c0_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_m_n: "
<<
c0_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_bias_relu_impl.hpp
View file @
f015c776
...
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
...
@@ -83,7 +83,7 @@ void profile_gemm_bias_relu_impl(int do_verification,
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_n: "
<<
c0_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c0_n: "
<<
c0_n
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_gemm_impl.hpp
View file @
f015c776
...
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
...
@@ -120,7 +120,7 @@ void profile_gemm_impl(int do_verification,
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_device_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"c_m_n: "
<<
c_m_n_device_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
...
@@ -408,6 +408,10 @@ void profile_gemm_impl(int do_verification,
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
if
(
gemm_ptr
->
IsSupportedArgument
(
argument_ptr
.
get
()))
{
{
// re-init C to zero before profiling next kernel
c_m_n_device_result
.
GenerateTensorValue
(
GeneratorTensor_0
<
CDataType
>
{},
num_thread
);
c_device_buf
.
ToDevice
(
c_m_n_device_result
.
mData
.
data
());
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
std
::
string
gemm_name
=
gemm_ptr
->
GetTypeString
();
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
float
ave_time
=
invoker_ptr
->
Run
(
argument_ptr
.
get
(),
nrepeat
);
...
...
profiler/include/profile_gemm_reduce_impl.hpp
View file @
f015c776
...
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
...
@@ -98,7 +98,7 @@ bool profile_gemm_reduce_impl(int do_verification,
std
::
cout
<<
"d0_m: "
<<
d0_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d0_m: "
<<
d0_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d1_m: "
<<
d1_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d1_m: "
<<
d1_m_host_result
.
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_grouped_gemm_impl.hpp
View file @
f015c776
...
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
...
@@ -95,7 +95,7 @@ void profile_grouped_gemm_impl(int do_verification,
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
b_k_n
[
i
].
mDesc
<<
", c_m_n_device_results["
<<
i
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
<<
"]:"
<<
c_m_n_device_results
[
i
].
mDesc
<<
std
::
endl
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
case
0
:
break
;
case
0
:
break
;
...
...
profiler/include/profile_reduce_impl.hpp
View file @
f015c776
...
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
...
@@ -242,7 +242,7 @@ void profile_reduce_impl_impl(bool do_verification,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
if
(
do_verification
)
if
(
do_verification
)
{
{
...
...
test/gemm_split_k/gemm_split_k.cpp
View file @
f015c776
...
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
...
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
f_host_tensor_descriptor
(
args
.
M
,
args
.
N
,
args
.
StrideC
,
c_row_major
));
f_host_tensor_descriptor
(
args
.
M
,
args
.
N
,
args
.
StrideC
,
c_row_major
));
// init data
// init data
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
// set zero to c_device_buf
// set zero to c_device_buf
...
...
test/reduce/reduce_no_index.cpp
View file @
f015c776
...
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
...
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
test/reduce/reduce_with_index.cpp
View file @
f015c776
...
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
...
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
switch
(
init_method
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment