gaoqiong / composable_kernel

Commit 5b7c2432
Authored Oct 20, 2022 by Adam Osewski

Merge remote-tracking branch 'rosenrodt/gemm-standalone-bench' into wavelet_model

Parents: 7e493730, 5a995b14
Changes: 353 files in total. Showing 20 changed files with 209 additions and 28 deletions (+209, -28) on this page.
Jenkinsfile  +4 -2
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp  +1 -1
client_example/05_layernorm/layernorm2d.cpp  +9 -9
client_example/07_conv2d_fwd/CMakeLists.txt  +2 -0
client_example/07_conv2d_fwd/conv2d_fwd.cpp  +177 -0
example/01_gemm/gemm_dl_fp16.cpp  +1 -1
example/01_gemm/gemm_dl_fp32.cpp  +1 -1
example/01_gemm/gemm_dl_int4.cpp  +1 -1
example/01_gemm/gemm_dl_int8.cpp  +1 -1
example/01_gemm/gemm_xdl_bf16.cpp  +1 -1
example/01_gemm/gemm_xdl_fp64.cpp  +1 -1
example/01_gemm/gemm_xdl_int4.cpp  +1 -1
example/01_gemm/gemm_xdl_int8.cpp  +1 -1
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp  +2 -2
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp  +1 -1
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp  +1 -1
example/04_gemm_add_add_fastgelu/common.hpp  +1 -1
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp  +1 -1
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp  +1 -1
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp  +1 -1
Jenkinsfile

@@ -233,6 +233,7 @@ def buildHipClangJob(Map conf=[:]){
         def variant = env.STAGE_NAME
         def retimage
+        (retimage, image) = getDockerImage(conf)

         gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') {
             withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') {

@@ -548,8 +549,9 @@ def process_results(Map conf=[:]){
     }
 }

-//launch develop branch daily at 23:00 in FULL_QA mode
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true''' : ""
+//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;COMPILER_VERSION=release
+0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open''' : ""

 pipeline {
     agent none
client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp

@@ -8,7 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp"
-#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp"
client_example/05_layernorm/layernorm2d.cpp

@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

 using XDataType     = ck::half_t;
 using GammaDataType = ck::half_t;

@@ -51,14 +51,14 @@ int main(int argc, char* argv[])
     SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
     SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);

-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   Rank,
-                                                                   NumReduceDim>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       Rank,
+                                                                       NumReduceDim>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
client_example/07_conv2d_fwd/CMakeLists.txt (new file, mode 100644)

add_executable(client_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
client_example/07_conv2d_fwd/conv2d_fwd.cpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iomanip>
#include <iostream>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr ck::index_t NumDimSpatial = 2;

static constexpr ck::index_t N  = 16;
static constexpr ck::index_t K  = 32;
static constexpr ck::index_t C  = 3;
static constexpr ck::index_t Y  = 3;
static constexpr ck::index_t X  = 3;
static constexpr ck::index_t Hi = 224;
static constexpr ck::index_t Wi = 224;
static constexpr ck::index_t Ho = 113;
static constexpr ck::index_t Wo = 113;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int main(int argc, char* argv[])
{
    std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
    std::vector<ck::index_t> filter_spatial_lengths{Y, X};
    std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
    std::vector<ck::index_t> filter_strides{2, 2};
    std::vector<ck::index_t> filter_dilations{1, 1};
    std::vector<ck::index_t> input_left_pads{2, 2};
    std::vector<ck::index_t> input_right_pads{2, 2};

    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);

    using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
                                                                 InLayout,
                                                                 WeiLayout,
                                                                 OutLayout,
                                                                 InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
                                                        in_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        out_spatial_lengths,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        PassThrough{},
                                                        PassThrough{},
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X;

            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
                                    sizeof(WeiDataType) * K * Y * X * C +
                                    sizeof(OutDataType) * N * Ho * Wo * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, "
                      << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
                                                        in_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        out_spatial_lengths,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        PassThrough{},
                                                        PassThrough{},
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
\ No newline at end of file
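Side note on the new example's hard-coded shapes (not part of the commit): with Hi = Wi = 224, a 3x3 filter, stride 2, dilation 1, and padding 2 on each side, the standard convolution output-size formula gives Ho = Wo = 113, which matches the constants above. A minimal standalone check follows; the file name conv_out_size_check.cpp and the helper conv_out_size are hypothetical, for illustration only.

// conv_out_size_check.cpp -- hypothetical standalone sanity check, not part of this commit.
// Verifies the hard-coded Ho/Wo in conv2d_fwd.cpp against the usual output-size formula:
// out = floor((in + pad_l + pad_r - dilation*(filter - 1) - 1) / stride) + 1.
#include <cstdio>

constexpr int conv_out_size(int in, int filter, int stride, int dilation, int pad_l, int pad_r)
{
    return (in + pad_l + pad_r - dilation * (filter - 1) - 1) / stride + 1;
}

// Hi = Wi = 224, Y = X = 3, stride 2, dilation 1, pads {2, 2} -> Ho = Wo = 113.
static_assert(conv_out_size(224, 3, 2, 1, 2, 2) == 113, "output size should match the example");

int main()
{
    std::printf("Ho = Wo = %d\n", conv_out_size(224, 3, 2, 1, 2, 2));
    return 0;
}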
example/01_gemm/gemm_dl_fp16.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

 using ADataType = ck::half_t;
 using BDataType = ck::half_t;
example/01_gemm/gemm_dl_fp32.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

 using ADataType = float;
 using BDataType = float;
example/01_gemm/gemm_dl_int4.cpp

@@ -7,7 +7,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

 using ADataType = ck::int4_t;
 using BDataType = ck::int4_t;
example/01_gemm/gemm_dl_int8.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_dl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp"

 using ADataType = int8_t;
 using BDataType = int8_t;
example/01_gemm/gemm_xdl_bf16.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"

 using ADataType = ck::bhalf_t;
 using BDataType = ck::bhalf_t;
example/01_gemm/gemm_xdl_fp64.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"

 using ADataType = double;
 using BDataType = double;
example/01_gemm/gemm_xdl_int4.cpp

@@ -7,7 +7,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"

 using ADataType = ck::int4_t;
 using BDataType = ck::int4_t;
example/01_gemm/gemm_xdl_int8.cpp

@@ -3,7 +3,7 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"

 using ADataType = int8_t;
 using BDataType = int8_t;
example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp

@@ -3,8 +3,8 @@
 #include "common.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_xdl_skip_b_lds.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp"

 using F16 = ck::half_t;
 using F32 = float;
example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp

@@ -8,7 +8,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp

@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/utility/device_memory.hpp"
example/04_gemm_add_add_fastgelu/common.hpp

@@ -12,7 +12,7 @@
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/utility/data_type.hpp"
example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp

@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp

@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp

@@ -3,7 +3,7 @@
 #include "convnd_fwd_common.hpp"
-#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"

 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"