Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7e7640ce
Commit
7e7640ce
authored
Mar 31, 2022
by
carlushuang
Browse files
Merge remote-tracking branch 'origin/develop' into cpu_avx2
parents
cc8df39e
f015c776
Changes
54
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
114 additions
and
137 deletions
+114
-137
profiler/src/profiler.cpp
profiler/src/profiler.cpp
+5
-0
test/CMakeLists.txt
test/CMakeLists.txt
+1
-0
test/batched_gemm/batched_gemm_fp16.cpp
test/batched_gemm/batched_gemm_fp16.cpp
+26
-124
test/batched_gemm_reduce/CMakeLists.txt
test/batched_gemm_reduce/CMakeLists.txt
+9
-0
test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
+64
-0
test/conv_util/conv_util.cpp
test/conv_util/conv_util.cpp
+1
-1
test/convnd_fwd/conv1d_fwd.cpp
test/convnd_fwd/conv1d_fwd.cpp
+2
-0
test/convnd_fwd/conv2d_fwd.cpp
test/convnd_fwd/conv2d_fwd.cpp
+1
-1
test/convnd_fwd/conv3d_fwd.cpp
test/convnd_fwd/conv3d_fwd.cpp
+1
-1
test/gemm_reduce/gemm_reduce_fp16.cpp
test/gemm_reduce/gemm_reduce_fp16.cpp
+0
-6
test/gemm_split_k/gemm_split_k.cpp
test/gemm_split_k/gemm_split_k.cpp
+1
-1
test/reduce/reduce_no_index.cpp
test/reduce/reduce_no_index.cpp
+1
-1
test/reduce/reduce_with_index.cpp
test/reduce/reduce_with_index.cpp
+1
-1
test/reference_conv_fwd/reference_conv_fwd.cpp
test/reference_conv_fwd/reference_conv_fwd.cpp
+1
-1
No files found.
profiler/src/profiler.cpp
View file @
7e7640ce
...
...
@@ -17,6 +17,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]);
int
profile_conv_fwd_bias_relu_atomic_add
(
int
,
char
*
[]);
int
profile_convnd_bwd_data
(
int
,
char
*
[],
int
);
int
profile_reduce
(
int
,
char
*
[]);
int
profile_batched_gemm_reduce
(
int
,
char
*
[]);
int
main
(
int
argc
,
char
*
argv
[])
{
...
...
@@ -44,6 +45,10 @@ int main(int argc, char* argv[])
{
return
profile_batched_gemm
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"batched_gemm_reduce"
)
==
0
)
{
return
profile_batched_gemm_reduce
(
argc
,
argv
);
}
else
if
(
strcmp
(
argv
[
1
],
"grouped_gemm"
)
==
0
)
{
profile_grouped_gemm
(
argc
,
argv
);
...
...
test/CMakeLists.txt
View file @
7e7640ce
...
...
@@ -40,6 +40,7 @@ add_subdirectory(gemm)
add_subdirectory
(
gemm_split_k
)
add_subdirectory
(
gemm_reduce
)
add_subdirectory
(
batched_gemm
)
add_subdirectory
(
batched_gemm_reduce
)
add_subdirectory
(
grouped_gemm
)
add_subdirectory
(
convnd_fwd
)
add_subdirectory
(
reduce
)
...
...
test/batched_gemm/batched_gemm_fp16.cpp
View file @
7e7640ce
#include <half.hpp>
#include <tuple>
#include <vector>
#include "profile_batched_gemm_impl.hpp"
#include "batched_gemm_util.hpp"
#include "reference_batched_gemm.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "test_util.hpp"
// Identity element-wise operation applied to the A, B and C tensors.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Owning pointer type for a batched-GEMM device instance whose A/B/C
// element-wise operations are all PassThrough.
using DeviceBatchedGemmPtr =
    ck::tensor_operation::device::DeviceGemmPtr<PassThrough, PassThrough, PassThrough>;
// Forward declaration of the instance factory so this test does not have to
// include the heavy device-instance header; the definition is provided by the
// device_batched_gemm_instance library at link time.
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_batched_gemm_instance {

// Appends every registered f16/f16/f16 XDL batched-GEMM instance
// (layouts encoded in the name: gmk/gnk -> gmn) to `instances`.
void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(
    std::vector<DeviceBatchedGemmPtr>& instances);

} // namespace device_batched_gemm_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#include <iostream>
namespace
{
// Problem data types: half-precision A/B/C.
using ADataType   = ck::half_t;
using BDataType   = ck::half_t;
using CDataType   = ck::half_t;
// NOTE(review): AccDataType is not referenced anywhere in this file —
// presumably kept for symmetry with sibling tests; confirm before removing.
using AccDataType = float;

// Host-tensor layouts used by PrepareGemmTensor below:
// row-major A and C, column-major B.
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
// Builds the four host tensors for one batched-GEMM test case:
// A[g,m,k], B[g,k,n], and two C[g,m,n] buffers (host reference result and
// device result).  A and B are filled via GeneratorTensor_3 with bounds
// {-0.5, 0.5}; the C buffers are left ungenerated.  Returns the tensors by
// value as a tuple in that order.
auto PrepareGemmTensor(const std::size_t batch_count,
                       const ck::batched_gemm_util::GemmParams& params)
{
    // Maps (row, col, stride, layout) to a 3-D descriptor with lengths
    // {batch, row, col}.  For RowMajor the row axis advances by `stride`
    // and the column axis is contiguous; otherwise the roles are swapped.
    auto f_host_tensor_descriptor = [batch_count](std::size_t row,
                                                  std::size_t col,
                                                  std::size_t stride,
                                                  auto layout) {
        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            // strides: batch = row * stride, row = stride, col = 1
            return HostTensorDescriptor(
                std::vector<std::size_t>({batch_count, row, col}),
                std::vector<std::size_t>({row * stride, stride, 1}));
        }
        else
        {
            // strides: batch = col * stride, row = 1, col = stride
            return HostTensorDescriptor(
                std::vector<std::size_t>({batch_count, row, col}),
                std::vector<std::size_t>({col * stride, 1, stride}));
        }
    };

    Tensor<ADataType> a_g_m_k(
        f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
    Tensor<BDataType> b_g_k_n(
        f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
    Tensor<CDataType> c_g_m_n_host_result(
        f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
    Tensor<CDataType> c_g_m_n_device_result(
        f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));

    // Small input magnitudes keep fp16 accumulation error within the
    // tolerance used by TestBatchedGemm.
    a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-0.5, 0.5});
    b_g_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});

    return std::make_tuple(a_g_m_k, b_g_k_n, c_g_m_n_host_result, c_g_m_n_device_result);
}
// NOTE(review): removed a duplicated run of
//   using ADataType = ck::half_t;
//   using BDataType = ck::half_t;
//   using CDataType = ck::half_t;
// These identically redeclared the aliases already introduced at the top of
// this anonymous namespace.  Re-declaring an alias to the same type is legal
// C++ but pure noise, so the duplicates are dropped.
// Runs one device batched-GEMM instance against the host reference on a
// fixed 1024x1024x1024 problem (all strides 1024) with the given batch
// count.  Prints SUCCESS/FAILURE to stdout and returns true when the device
// result matches the host result within tolerance.
bool TestBatchedGemm(const std::size_t batch_count, DeviceBatchedGemmPtr& gemmPtr)
{
    // Arrange
    ck::batched_gemm_util::GemmParams params;
    params.M       = 1024;
    params.N       = 1024;
    params.K       = 1024;
    params.StrideA = 1024;
    params.StrideB = 1024;
    params.StrideC = 1024;

    auto host_tensors = PrepareGemmTensor(batch_count, params);

    // Tuple order established by PrepareGemmTensor: A, B, C(host), C(device).
    const Tensor<ADataType>& a  = std::get<0>(host_tensors);
    const Tensor<BDataType>& b  = std::get<1>(host_tensors);
    Tensor<CDataType>& c_host   = std::get<2>(host_tensors);
    Tensor<CDataType>& c_device = std::get<3>(host_tensors);

    auto a_element_op = PassThrough{};
    auto b_element_op = PassThrough{};
    auto c_element_op = PassThrough{};

    // Host reference computation.
    using ReferenceBatchedGemmInstance = ck::tensor_operation::host::ReferenceBatchedGemm<
        ADataType, BDataType, CDataType, PassThrough, PassThrough, PassThrough>;
    ck::batched_gemm_util::RunHostBatchedGemm<ReferenceBatchedGemmInstance>(
        a, b, c_host, a_element_op, b_element_op, c_element_op);

    // Act
    ck::batched_gemm_util::RunDeviceBatchedGemm(
        gemmPtr, params, a, b, c_device, a_element_op, b_element_op, c_element_op);

    // Assert
    // bool pass = test::check_err(
    //     c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
    // 0.007815 is just above 2^-7 (0.0078125) — presumably an fp16-scale
    // error bound; confirm against check_error's metric.
    bool pass = check_error(c_device, c_host) < 0.007815f;

    std::cout << (pass ? "SUCCESS" : "FAILURE") << std::endl;

    return pass;
}
// Shorthand layout aliases for the profiler-based checks in main().
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
}
// namespace
// Test driver: smoke-tests every registered f16 batched-GEMM device instance
// against the host reference, then cross-checks all four A/B layout
// combinations through the profiler implementation.  Exits 0 on success,
// 1 on any failure.
int main()
{
    std::vector<DeviceBatchedGemmPtr> batched_gemm_ptrs;
    ck::tensor_operation::device::device_batched_gemm_instance::
        add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(batched_gemm_ptrs);

    // Problem size for the profiler-based checks; the stride arguments below
    // vary with the A/B layouts of each call.
    int M          = 512;
    int N          = 256;
    int K          = 128;
    int BatchCount = 3;

    bool pass = true;

    // Batch size for the per-instance TestBatchedGemm runs.
    const std::size_t batch_count = 4;

    for(auto& gemmPtr : batched_gemm_ptrs)
    {
        pass &= TestBatchedGemm(batch_count, gemmPtr);
    }

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Row, Row>(
               true, 1, false, 1, M, N, K, K, N, N, BatchCount);

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Row, Col, Row>(
               true, 1, false, 1, M, N, K, K, K, N, BatchCount);

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Row, Row>(
               true, 1, false, 1, M, N, K, M, N, N, BatchCount);

    // NOTE(review): this summary is printed before the fourth (Col, Col)
    // combination runs — looks like a leftover; consider printing it after
    // all four profiler checks instead.
    std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl;

    pass = pass &&
           ck::profiler::profile_batched_gemm_impl<ADataType, BDataType, CDataType, Col, Col, Row>(
               true, 1, false, 1, M, N, K, M, K, N, BatchCount);

    std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl;

    return pass ? 0 : 1;
}
test/batched_gemm_reduce/CMakeLists.txt
0 → 100644
View file @
7e7640ce
# Headers used by the batched GEMM + reduce test: profiler helpers, shared
# test utilities, and the bundled half-precision library.
include_directories(BEFORE
    ${PROJECT_SOURCE_DIR}/profiler/include
    ${PROJECT_SOURCE_DIR}/test/include
    ${PROJECT_SOURCE_DIR}/external/include/half
)

add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp)

# Link the host-tensor utilities and the pre-built device instance library.
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE host_tensor)
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance)
test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp
0 → 100644
View file @
7e7640ce
#include <iostream>
#include "profile_batched_gemm_reduce_impl.hpp"
int
main
()
{
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
int
M
=
512
;
int
N
=
256
;
int
K
=
128
;
int
BatchCount
=
3
;
bool
pass
=
true
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
Row
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_reduce_impl
<
ck
::
half_t
,
ck
::
half_t
,
ck
::
half_t
,
float
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
BatchCount
);
if
(
pass
)
{
std
::
cout
<<
"test BatchedGEMM+Reduce fp16: Pass"
<<
std
::
endl
;
return
0
;
}
else
{
std
::
cout
<<
"test BatchedGEMM+Reduce fp16: Fail"
<<
std
::
endl
;
return
-
1
;
}
}
test/conv_util/conv_util.cpp
View file @
7e7640ce
...
...
@@ -193,5 +193,5 @@ int main(void)
<<
std
::
endl
;
res
=
TestGetHostTensorDescriptor
();
std
::
cout
<<
"TestGetHostTensorDescriptor ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
0
;
return
res
?
0
:
1
;
}
test/convnd_fwd/conv1d_fwd.cpp
View file @
7e7640ce
...
...
@@ -146,4 +146,6 @@ int main()
res
=
TestConv1DNWCInt8Instances
();
std
::
cout
<<
"
\n
TestConv1DNWCInt8Instances ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
res
?
0
:
1
;
}
test/convnd_fwd/conv2d_fwd.cpp
View file @
7e7640ce
...
...
@@ -143,5 +143,5 @@ int main()
std
::
cout
<<
"
\n
TestConv2DNHWCInt8Instances ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
0
;
return
res
?
0
:
1
;
}
test/convnd_fwd/conv3d_fwd.cpp
View file @
7e7640ce
...
...
@@ -290,5 +290,5 @@ int main()
std
::
cout
<<
"
\n
TestConv3DNDHWCInt8Instances ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
0
;
return
res
?
0
:
1
;
}
test/gemm_reduce/gemm_reduce_fp16.cpp
View file @
7e7640ce
#include <algorithm>
#include <cstdlib>
#include <half.hpp>
#include <iostream>
#include <numeric>
#include <tuple>
#include <vector>
#include "profile_gemm_reduce_impl.hpp"
...
...
test/gemm_split_k/gemm_split_k.cpp
View file @
7e7640ce
...
...
@@ -120,7 +120,7 @@ int test_gemm(const gemmArgs& args)
f_host_tensor_descriptor
(
args
.
M
,
args
.
N
,
args
.
StrideC
,
c_row_major
));
// init data
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
float
>
{
-
5
,
5
},
num_thread
);
// set zero to c_device_buf
...
...
test/reduce/reduce_no_index.cpp
View file @
7e7640ce
...
...
@@ -101,7 +101,7 @@ bool test_reduce_no_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
{
...
...
test/reduce/reduce_with_index.cpp
View file @
7e7640ce
...
...
@@ -99,7 +99,7 @@ bool test_reduce_with_index_impl(int init_method,
size_t
invariant_total_length
=
out
.
mDesc
.
GetElementSize
();
size_t
reduce_total_length
=
in
.
mDesc
.
GetElementSize
()
/
invariant_total_length
;
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
()
;
std
::
size_t
num_thread
=
1
;
switch
(
init_method
)
{
...
...
test/reference_conv_fwd/reference_conv_fwd.cpp
View file @
7e7640ce
...
...
@@ -422,5 +422,5 @@ int main(void)
std
::
cout
<<
"TestConv1DNHWC ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
res
=
TestConv3DNCDHW
();
std
::
cout
<<
"TestConv3DNCDHW ..... "
<<
(
res
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
0
;
return
res
?
0
:
1
;
}
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment