Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
a6e310af
Commit
a6e310af
authored
Apr 01, 2022
by
carlushuang
Browse files
support omp and multi-core
parent
7e7640ce
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
57 additions
and
30 deletions
+57
-30
test/cpu_ukernel/CMakeLists.txt
test/cpu_ukernel/CMakeLists.txt
+4
-0
test/cpu_ukernel/cpu_gemm_uk.cpp
test/cpu_ukernel/cpu_gemm_uk.cpp
+53
-30
No files found.
test/cpu_ukernel/CMakeLists.txt
View file @
a6e310af
add_test_executable
(
test_cpu_gemm_uk cpu_gemm_uk.cpp
)
add_test_executable
(
test_cpu_gemm_uk cpu_gemm_uk.cpp
)
target_link_libraries
(
test_cpu_gemm_uk PRIVATE host_tensor
)
target_link_libraries
(
test_cpu_gemm_uk PRIVATE host_tensor
)
# 3.13 introduce target_link_directories, which is better
set_target_properties
(
test_cpu_gemm_uk PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib
)
target_link_libraries
(
test_cpu_gemm_uk PRIVATE /opt/rocm/llvm/lib/libomp.so
)
target_compile_options
(
test_cpu_gemm_uk PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument
)
test/cpu_ukernel/cpu_gemm_uk.cpp
View file @
a6e310af
...
@@ -7,6 +7,7 @@
...
@@ -7,6 +7,7 @@
#include <tuple>
#include <tuple>
#include <memory>
#include <memory>
#include <half.hpp>
#include <half.hpp>
#include <omp.h>
#include "host_tensor.hpp"
#include "host_tensor.hpp"
#include "device.hpp"
#include "device.hpp"
#include "config.hpp"
#include "config.hpp"
...
@@ -228,22 +229,14 @@ void test_ukernel(ukenrel_t uk,
...
@@ -228,22 +229,14 @@ void test_ukernel(ukenrel_t uk,
uint32_t
n
,
uint32_t
n
,
uint32_t
k
)
uint32_t
k
)
{
{
ck
::
cpu
::
ThreadwiseGemmParam
param
;
int
max_threads
=
omp_get_max_threads
();
param
.
p_a
=
mat_a
;
param
.
p_b
=
mat_b
;
auto
invoke_uk
=
[
&
](
ck
::
cpu
::
ThreadwiseGemmParam
&
param
,
float
*
current_mat_c
)
{
param
.
p_c
=
mat_c
;
param
.
Kr
=
k
;
param
.
lda
=
(
std
::
is_same
<
Row
,
ALayout
>::
value
?
k
:
m
)
*
sizeof
(
FloatA
);
param
.
ldb
=
(
std
::
is_same
<
Row
,
BLayout
>::
value
?
n
:
k
*
8
)
*
sizeof
(
FloatB
);
param
.
ldc
=
n
*
sizeof
(
float
);
param
.
alpha
=
alpha
;
auto
invoke_uk
=
[
&
]()
{
if
constexpr
(
std
::
is_same
<
Row
,
ALayout
>::
value
&&
std
::
is_same
<
Row
,
BLayout
>::
value
)
if
constexpr
(
std
::
is_same
<
Row
,
ALayout
>::
value
&&
std
::
is_same
<
Row
,
BLayout
>::
value
)
{
{
assert
(
m
%
uk
.
Mr_
==
0
&&
n
==
uk
.
Nr_
);
assert
(
m
%
uk
.
Mr_
==
0
&&
n
==
uk
.
Nr_
);
FloatA
*
p_a
=
mat_a
;
FloatA
*
p_a
=
mat_a
;
float
*
p_c
=
mat_c
;
float
*
p_c
=
current_
mat_c
;
param
.
p_a
=
p_a
;
param
.
p_a
=
p_a
;
param
.
p_c
=
p_c
;
param
.
p_c
=
p_c
;
for
(
uint32_t
i_m
=
0
;
i_m
<
m
;
i_m
+=
uk
.
Mr_
)
for
(
uint32_t
i_m
=
0
;
i_m
<
m
;
i_m
+=
uk
.
Mr_
)
...
@@ -259,7 +252,7 @@ void test_ukernel(ukenrel_t uk,
...
@@ -259,7 +252,7 @@ void test_ukernel(ukenrel_t uk,
{
{
assert
(
m
%
uk
.
Mr_
==
0
&&
n
%
uk
.
Nr_
==
0
);
assert
(
m
%
uk
.
Mr_
==
0
&&
n
%
uk
.
Nr_
==
0
);
FloatA
*
p_a
=
mat_a
;
FloatA
*
p_a
=
mat_a
;
float
*
p_c
=
mat_c
;
float
*
p_c
=
current_
mat_c
;
param
.
p_a
=
p_a
;
param
.
p_a
=
p_a
;
param
.
p_b
=
mat_b
;
param
.
p_b
=
mat_b
;
param
.
p_c
=
p_c
;
param
.
p_c
=
p_c
;
...
@@ -291,7 +284,7 @@ void test_ukernel(ukenrel_t uk,
...
@@ -291,7 +284,7 @@ void test_ukernel(ukenrel_t uk,
{
{
assert
(
m
%
uk
.
Mr_
==
0
&&
n
%
uk
.
Nr_
==
0
);
assert
(
m
%
uk
.
Mr_
==
0
&&
n
%
uk
.
Nr_
==
0
);
FloatB
*
p_b
=
mat_b
;
FloatB
*
p_b
=
mat_b
;
float
*
p_c
=
mat_c
;
float
*
p_c
=
current_
mat_c
;
param
.
p_b
=
p_b
;
param
.
p_b
=
p_b
;
param
.
p_c
=
p_c
;
param
.
p_c
=
p_c
;
for
(
uint32_t
i_n
=
0
;
i_n
<
n
;
i_n
+=
uk
.
Nr_
)
for
(
uint32_t
i_n
=
0
;
i_n
<
n
;
i_n
+=
uk
.
Nr_
)
...
@@ -308,29 +301,54 @@ void test_ukernel(ukenrel_t uk,
...
@@ -308,29 +301,54 @@ void test_ukernel(ukenrel_t uk,
printf
(
"gemm_uk_%dx%d_%c%c: "
,
uk
.
Mr_
,
uk
.
Nr_
,
ALayout
::
name
[
0
],
BLayout
::
name
[
0
]);
printf
(
"gemm_uk_%dx%d_%c%c: "
,
uk
.
Mr_
,
uk
.
Nr_
,
ALayout
::
name
[
0
],
BLayout
::
name
[
0
]);
fflush
(
stdout
);
fflush
(
stdout
);
// printf("%s: ", typeid(uk).name());fflush(stdout);
// printf("%s: ", typeid(uk).name());fflush(stdout);
memset
(
mat_c
,
0
,
m
*
n
*
sizeof
(
float
));
int
repeat
=
7e10
/
(
2
*
m
*
n
*
k
)
;
float
us
=
.0
f
;
for
(
int
i
=
0
;
i
<
(
repeat
/
5
);
i
++
)
#pragma omp parallel reduction(+ : us
)
{
{
in
voke_uk
();
in
t
tid
=
omp_get_thread_num
();
}
float
*
private_c
=
reinterpret_cast
<
float
*>
(
malloc
(
m
*
n
*
sizeof
(
float
)));
WallTimer
timer
;
ck
::
cpu
::
ThreadwiseGemmParam
param
;
param
.
p_a
=
mat_a
;
param
.
p_b
=
mat_b
;
param
.
p_c
=
private_c
;
param
.
Kr
=
k
;
param
.
lda
=
(
std
::
is_same
<
Row
,
ALayout
>::
value
?
k
:
m
)
*
sizeof
(
FloatA
);
param
.
ldb
=
(
std
::
is_same
<
Row
,
BLayout
>::
value
?
n
:
k
*
8
)
*
sizeof
(
FloatB
);
param
.
ldc
=
n
*
sizeof
(
float
);
param
.
alpha
=
alpha
;
timer
.
Start
();
memset
(
private_c
,
0
,
m
*
n
*
sizeof
(
float
));
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
int
repeat
=
7e10
/
(
2
*
m
*
n
*
k
);
invoke_uk
();
for
(
int
i
=
0
;
i
<
(
repeat
/
5
);
i
++
)
{
invoke_uk
(
param
,
private_c
);
}
WallTimer
timer
;
timer
.
Start
();
for
(
int
i
=
0
;
i
<
repeat
;
i
++
)
{
invoke_uk
(
param
,
private_c
);
}
timer
.
End
();
us
+=
timer
.
GetElapsedTime
()
*
1e3
/
repeat
;
memset
(
private_c
,
0
,
m
*
n
*
sizeof
(
float
));
invoke_uk
(
param
,
private_c
);
memcpy
(
mat_c
+
tid
*
m
*
n
,
private_c
,
m
*
n
*
sizeof
(
float
));
free
(
private_c
);
}
}
timer
.
End
();
float
us
=
timer
.
GetElapsedTime
()
*
1e3
/
repeat
;
us
=
us
/
max_threads
;
float
gflops
=
static_cast
<
float
>
(
2
*
m
*
n
*
k
)
*
1e-3
/
us
;
memset
(
mat_c
,
0
,
m
*
n
*
sizeof
(
float
));
float
gflops
=
static_cast
<
float
>
(
2
*
m
*
n
*
k
*
max_threads
)
*
1e-3
/
us
;
invoke_uk
();
printf
(
"m:%u, n:%u, k:%u, alpha:%f, cost:%lfus, GFLOPS:%lf, "
,
m
,
n
,
k
,
alpha
,
us
,
gflops
);
printf
(
"m:%u, n:%u, k:%u, alpha:%f, cost:%lfus, GFLOPS:%lf, "
,
m
,
n
,
k
,
alpha
,
us
,
gflops
);
fflush
(
stdout
);
fflush
(
stdout
);
...
@@ -340,10 +358,11 @@ void test_ukernel(ukenrel_t uk,
...
@@ -340,10 +358,11 @@ void test_ukernel(ukenrel_t uk,
template
<
typename
FloatA
,
typename
FloatB
,
typename
ALayout
,
typename
BLayout
>
template
<
typename
FloatA
,
typename
FloatB
,
typename
ALayout
,
typename
BLayout
>
void
test_cpu_ukernel
(
float
alpha
,
uint32_t
m
,
uint32_t
n
,
uint32_t
k
)
void
test_cpu_ukernel
(
float
alpha
,
uint32_t
m
,
uint32_t
n
,
uint32_t
k
)
{
{
int
max_threads
=
omp_get_max_threads
();
DeviceAlignedMemCPU
a_mem
(
m
*
k
*
sizeof
(
FloatA
),
32
);
DeviceAlignedMemCPU
a_mem
(
m
*
k
*
sizeof
(
FloatA
),
32
);
DeviceAlignedMemCPU
b_mem
(
k
*
n
*
sizeof
(
FloatB
),
32
);
DeviceAlignedMemCPU
b_mem
(
k
*
n
*
sizeof
(
FloatB
),
32
);
DeviceAlignedMemCPU
c_mem
(
m
*
n
*
sizeof
(
float
),
32
);
DeviceAlignedMemCPU
c_mem
(
m
*
n
*
sizeof
(
float
)
*
max_threads
,
32
);
DeviceAlignedMemCPU
c_mem_ref
(
m
*
n
*
sizeof
(
float
),
32
);
DeviceAlignedMemCPU
c_mem_ref
(
m
*
n
*
sizeof
(
float
),
32
);
c_mem_ref
.
SetZero
();
c_mem_ref
.
SetZero
();
...
@@ -409,6 +428,10 @@ int main(int argc, char** argv)
...
@@ -409,6 +428,10 @@ int main(int argc, char** argv)
alpha
=
std
::
atof
(
argv
[
4
]);
alpha
=
std
::
atof
(
argv
[
4
]);
}
}
dump_cache_hierarchy
();
dump_cache_hierarchy
();
if
(
std
::
getenv
(
"OMP_NUM_THREADS"
)
==
nullptr
)
omp_set_num_threads
(
1
);
printf
(
"max threads:%d
\n
"
,
omp_get_max_threads
());
test_cpu_ukernel
<
AType
,
BType
,
Row
,
Row
>
(
alpha
,
m
,
n
,
k
);
test_cpu_ukernel
<
AType
,
BType
,
Row
,
Row
>
(
alpha
,
m
,
n
,
k
);
test_cpu_ukernel
<
AType
,
BType
,
Row
,
Col
>
(
alpha
,
m
,
n
,
k
);
test_cpu_ukernel
<
AType
,
BType
,
Row
,
Col
>
(
alpha
,
m
,
n
,
k
);
test_cpu_ukernel
<
AType
,
BType
,
Col
,
Row
>
(
alpha
,
m
,
n
,
k
);
test_cpu_ukernel
<
AType
,
BType
,
Col
,
Row
>
(
alpha
,
m
,
n
,
k
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment