Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c8f3acf9
Unverified
Commit
c8f3acf9
authored
Mar 30, 2022
by
Jianfeng Yan
Committed by
GitHub
Mar 30, 2022
Browse files
batched_gemm: use profiler in ctest (#163)
parent
982f8bbc
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
48 additions
and
134 deletions
+48
-134
include/ck/tensor_operation/gpu/device/device_gemm.hpp
include/ck/tensor_operation/gpu/device/device_gemm.hpp
+2
-0
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+1
-3
library/include/ck/library/host_tensor/host_tensor_generator.hpp
.../include/ck/library/host_tensor/host_tensor_generator.hpp
+3
-4
profiler/include/profile_batched_gemm_impl.hpp
profiler/include/profile_batched_gemm_impl.hpp
+16
-3
test/batched_gemm/batched_gemm_fp16.cpp
test/batched_gemm/batched_gemm_fp16.cpp
+26
-124
No files found.
include/ck/tensor_operation/gpu/device/device_gemm.hpp
View file @
c8f3acf9
#pragma once
#pragma once
#include <iostream>
#include <iostream>
#include <vector>
#include "device_base.hpp"
#include "device_base.hpp"
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
View file @
c8f3acf9
#ifndef TENSOR_LAYOUT_HPP
#pragma once
#define TENSOR_LAYOUT_HPP
namespace
ck
{
namespace
ck
{
namespace
tensor_layout
{
namespace
tensor_layout
{
...
@@ -128,4 +127,3 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
...
@@ -128,4 +127,3 @@ std::ostream& operator<<(std::ostream& os, const Layout&)
}
// namespace tensor_layout
}
// namespace tensor_layout
}
// namespace ck
}
// namespace ck
#endif
library/include/ck/library/host_tensor/host_tensor_generator.hpp
View file @
c8f3acf9
#ifndef HOST_TENSOR_GENERATOR_HPP
#pragma once
#define HOST_TENSOR_GENERATOR_HPP
#include <cmath>
#include <cmath>
#include <numeric>
#include "config.hpp"
#include "config.hpp"
template
<
typename
T
>
template
<
typename
T
>
...
@@ -147,5 +148,3 @@ struct GeneratorTensor_Sequential
...
@@ -147,5 +148,3 @@ struct GeneratorTensor_Sequential
return
dims
[
Dim
];
return
dims
[
Dim
];
}
}
};
};
#endif
profiler/include/profile_batched_gemm_impl.hpp
View file @
c8f3acf9
#pragma once
#pragma once
#include <memory>
#include <memory>
#include "config.hpp"
#include "element_wise_operation.hpp"
#include "tensor_layout.hpp"
#include "device.hpp"
#include "host_tensor_generator.hpp"
#include "device_gemm.hpp"
#include "reference_batched_gemm.hpp"
#include "reference_batched_gemm.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -52,7 +59,7 @@ template <typename ADataType,
...
@@ -52,7 +59,7 @@ template <typename ADataType,
typename
ALayout
,
typename
ALayout
,
typename
BLayout
,
typename
BLayout
,
typename
CLayout
>
typename
CLayout
>
void
profile_batched_gemm_impl
(
int
do_verification
,
bool
profile_batched_gemm_impl
(
int
do_verification
,
int
init_method
,
int
init_method
,
bool
do_log
,
bool
do_log
,
int
nrepeat
,
int
nrepeat
,
...
@@ -64,6 +71,8 @@ void profile_batched_gemm_impl(int do_verification,
...
@@ -64,6 +71,8 @@ void profile_batched_gemm_impl(int do_verification,
int
StrideC
,
int
StrideC
,
int
BatchCount
=
1
)
int
BatchCount
=
1
)
{
{
bool
pass
=
true
;
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
batch_count
,
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
batch_count
,
std
::
size_t
row
,
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
col
,
...
@@ -379,12 +388,14 @@ void profile_batched_gemm_impl(int do_verification,
...
@@ -379,12 +388,14 @@ void profile_batched_gemm_impl(int do_verification,
{
{
bf16_to_f32_
(
c_g_m_n_device_result
,
*
c_f32_g_m_n_device_result
);
bf16_to_f32_
(
c_g_m_n_device_result
,
*
c_f32_g_m_n_device_result
);
check_error
(
*
c_f32_g_m_n_host_result
,
*
c_f32_g_m_n_device_result
);
float
err
=
check_error
(
*
c_f32_g_m_n_host_result
,
*
c_f32_g_m_n_device_result
);
pass
=
pass
&&
(
err
<
1E-6
);
}
}
else
else
{
{
check_error
(
c_g_m_n_host_result
,
c_g_m_n_device_result
);
float
err
=
check_error
(
c_g_m_n_host_result
,
c_g_m_n_device_result
);
pass
=
pass
&&
(
err
<
1E-6
);
}
}
if
(
do_log
)
if
(
do_log
)
...
@@ -408,6 +419,8 @@ void profile_batched_gemm_impl(int do_verification,
...
@@ -408,6 +419,8 @@ void profile_batched_gemm_impl(int do_verification,
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
std
::
cout
<<
"Best Perf: "
<<
best_ave_time
<<
" ms, "
<<
best_tflops
<<
" TFlops, "
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
<<
best_gb_per_sec
<<
" GB/s, "
<<
best_gemm_name
<<
std
::
endl
;
return
pass
;
}
}
}
// namespace profiler
}
// namespace profiler
...
...
test/batched_gemm/batched_gemm_fp16.cpp
View file @
c8f3acf9
#include <half.hpp>
#include "profile_batched_gemm_impl.hpp"
#include <tuple>
#include <vector>
#include "batched_gemm_util.hpp"
#include <iostream>
#include "reference_batched_gemm.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_batched_gemm_xdl.hpp"
#include "element_wise_operation.hpp"
#include "test_util.hpp"
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
DeviceBatchedGemmPtr
=
ck
::
tensor_operation
::
device
::
DeviceGemmPtr
<
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
>
;
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
device_batched_gemm_instance
{
void
add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances
(
std
::
vector
<
DeviceBatchedGemmPtr
>&
instances
);
}
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
{
namespace
{
using
ADataType
=
ck
::
half_t
;
using
ADataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
using
BDataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
CDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
ALayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
BLayout
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
CLayout
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
auto
PrepareGemmTensor
(
const
std
::
size_t
batch_count
,
const
ck
::
batched_gemm_util
::
GemmParams
&
params
)
{
auto
f_host_tensor_descriptor
=
[
batch_count
](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
row
*
stride
,
stride
,
1
}));
}
else
{
return
HostTensorDescriptor
(
std
::
vector
<
std
::
size_t
>
({
batch_count
,
row
,
col
}),
std
::
vector
<
std
::
size_t
>
({
col
*
stride
,
1
,
stride
}));
}
};
Tensor
<
ADataType
>
a_g_m_k
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
K
,
params
.
StrideA
,
ALayout
{}));
Tensor
<
BDataType
>
b_g_k_n
(
f_host_tensor_descriptor
(
params
.
K
,
params
.
N
,
params
.
StrideB
,
BLayout
{}));
Tensor
<
CDataType
>
c_g_m_n_host_result
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
N
,
params
.
StrideC
,
CLayout
{}));
Tensor
<
CDataType
>
c_g_m_n_device_result
(
f_host_tensor_descriptor
(
params
.
M
,
params
.
N
,
params
.
StrideC
,
CLayout
{}));
a_g_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
-
0.5
,
0.5
});
b_g_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
return
std
::
make_tuple
(
a_g_m_k
,
b_g_k_n
,
c_g_m_n_host_result
,
c_g_m_n_device_result
);
}
bool
TestBatchedGemm
(
const
std
::
size_t
batch_count
,
DeviceBatchedGemmPtr
&
gemmPtr
)
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
{
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
// Arrange
ck
::
batched_gemm_util
::
GemmParams
params
;
params
.
M
=
1024
;
params
.
N
=
1024
;
params
.
K
=
1024
;
params
.
StrideA
=
1024
;
params
.
StrideB
=
1024
;
params
.
StrideC
=
1024
;
auto
host_tensors
=
PrepareGemmTensor
(
batch_count
,
params
);
const
Tensor
<
ADataType
>&
a
=
std
::
get
<
0
>
(
host_tensors
);
const
Tensor
<
BDataType
>&
b
=
std
::
get
<
1
>
(
host_tensors
);
Tensor
<
CDataType
>&
c_host
=
std
::
get
<
2
>
(
host_tensors
);
Tensor
<
CDataType
>&
c_device
=
std
::
get
<
3
>
(
host_tensors
);
auto
a_element_op
=
PassThrough
{};
auto
b_element_op
=
PassThrough
{};
auto
c_element_op
=
PassThrough
{};
using
ReferenceBatchedGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceBatchedGemm
<
ADataType
,
BDataType
,
CDataType
,
PassThrough
,
PassThrough
,
PassThrough
>
;
ck
::
batched_gemm_util
::
RunHostBatchedGemm
<
ReferenceBatchedGemmInstance
>
(
a
,
b
,
c_host
,
a_element_op
,
b_element_op
,
c_element_op
);
// Act
ck
::
batched_gemm_util
::
RunDeviceBatchedGemm
(
gemmPtr
,
params
,
a
,
b
,
c_device
,
a_element_op
,
b_element_op
,
c_element_op
);
// Assert
// bool pass = test::check_err(
// c_device.mData, c_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
bool
pass
=
check_error
(
c_device
,
c_host
)
<
0.007815
f
;
std
::
cout
<<
(
pass
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
return
pass
;
}
}
// namespace
}
// namespace
int
main
()
int
main
()
{
{
std
::
vector
<
DeviceBatchedGemmPtr
>
batched_gemm_ptrs
;
int
M
=
512
;
ck
::
tensor_operation
::
device
::
device_batched_gemm_instance
::
int
N
=
256
;
add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances
(
batched_gemm_ptrs
);
int
K
=
128
;
int
BatchCount
=
3
;
bool
pass
=
true
;
bool
pass
=
true
;
const
std
::
size_t
batch_count
=
4
;
pass
=
pass
&&
for
(
auto
&
gemmPtr
:
batched_gemm_ptrs
)
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Row
,
Row
>
(
{
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
N
,
N
,
BatchCount
);
pass
&=
TestBatchedGemm
(
batch_count
,
gemmPtr
);
}
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Row
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
K
,
K
,
N
,
BatchCount
);
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Row
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
N
,
N
,
BatchCount
);
std
::
cout
<<
"TestGemm ..... "
<<
(
pass
?
"SUCCESS"
:
"FAILURE"
)
<<
std
::
endl
;
pass
=
pass
&&
ck
::
profiler
::
profile_batched_gemm_impl
<
ADataType
,
BDataType
,
CDataType
,
Col
,
Col
,
Row
>
(
true
,
1
,
false
,
1
,
M
,
N
,
K
,
M
,
K
,
N
,
BatchCount
);
std
::
cout
<<
"test BatchedGEMM fp16: "
<<
(
pass
?
"Pass"
:
"Fail"
)
<<
std
::
endl
;
return
pass
?
0
:
1
;
return
pass
?
0
:
1
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment