Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
0cd78566
You need to sign in or sign up before continuing.
Unverified
Commit
0cd78566
authored
Feb 15, 2023
by
zjing14
Committed by
GitHub
Feb 15, 2023
Browse files
Merge branch 'develop' into lwpck-471
parents
07905e77
19490ac4
Changes
38
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3283 additions
and
38 deletions
+3283
-38
CHANGELOG.md
CHANGELOG.md
+1
-1
client_example/05_layernorm/layernorm2d.cpp
client_example/05_layernorm/layernorm2d.cpp
+7
-7
example/01_gemm/CMakeLists.txt
example/01_gemm/CMakeLists.txt
+5
-3
example/02_gemm_bilinear/CMakeLists.txt
example/02_gemm_bilinear/CMakeLists.txt
+3
-0
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
+304
-0
example/26_contraction/CMakeLists.txt
example/26_contraction/CMakeLists.txt
+3
-0
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
+427
-0
example/26_contraction/contraction_scale_xdl_fp64.cpp
example/26_contraction/contraction_scale_xdl_fp64.cpp
+409
-0
example/27_layernorm/layernorm_blockwise.cpp
example/27_layernorm/layernorm_blockwise.cpp
+8
-8
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
+4
-0
example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
..._bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
+431
-0
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
+3
-0
example/30_grouped_conv_fwd_multiple_d/common.hpp
example/30_grouped_conv_fwd_multiple_d/common.hpp
+1
-1
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
+355
-0
example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
...d_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
+26
-0
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
...ple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+286
-0
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
+7
-7
include/ck/host_utility/kernel_launch.hpp
include/ck/host_utility/kernel_launch.hpp
+5
-4
include/ck/tensor_operation/gpu/device/device_normalization.hpp
...e/ck/tensor_operation/gpu/device/device_normalization.hpp
+7
-7
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
...l/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+991
-0
No files found.
CHANGELOG.md
View file @
0cd78566
...
@@ -9,7 +9,7 @@ Full documentation for Composable Kernel is not yet available.
...
@@ -9,7 +9,7 @@ Full documentation for Composable Kernel is not yet available.
-
Fixed grouped ConvBwdWeight test case failure (#524).
-
Fixed grouped ConvBwdWeight test case failure (#524).
### Optimizations
### Optimizations
-
Optimized ...
-
Improve proformance of normalization kernel
### Added
### Added
-
Added user tutorial (#563).
-
Added user tutorial (#563).
...
...
client_example/05_layernorm/layernorm2d.cpp
View file @
0cd78566
...
@@ -12,12 +12,12 @@
...
@@ -12,12 +12,12 @@
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
using
XDataType
=
ck
::
half_t
;
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
Acc
DataType
=
float
;
using
Compute
DataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
constexpr
int
Rank
=
2
;
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
constexpr
int
NumReduceDim
=
1
;
...
@@ -54,7 +54,7 @@ int main(int argc, char* argv[])
...
@@ -54,7 +54,7 @@ int main(int argc, char* argv[])
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalization
<
XDataType
,
using
DeviceOp
=
ck
::
tensor_operation
::
device
::
DeviceNormalization
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
PassThrough
,
PassThrough
,
Rank
,
Rank
,
...
...
example/01_gemm/CMakeLists.txt
View file @
0cd78566
...
@@ -38,7 +38,9 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
...
@@ -38,7 +38,9 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp64
)
add_dependencies
(
example_gemm_xdl example_gemm_xdl_fp64
)
add_custom_target
(
example_gemm_wmma
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_custom_target
(
example_gemm_wmma
)
add_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
add_example_executable
(
example_gemm_wmma_fp16 gemm_wmma_fp16.cpp
)
add_dependencies
(
example_gemm_wmma example_gemm_wmma_fp16
)
endif
()
example/02_gemm_bilinear/CMakeLists.txt
View file @
0cd78566
add_example_executable
(
example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp
)
add_example_executable
(
example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp
)
endif
()
example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp
0 → 100644
View file @
0cd78566
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
struct
AlphaBetaAdd
{
AlphaBetaAdd
(
float
alpha
,
float
beta
)
:
alpha_
(
alpha
),
beta_
(
beta
){};
template
<
typename
E
,
typename
C
,
typename
D
>
__host__
__device__
constexpr
void
operator
()(
E
&
e
,
const
C
&
c
,
const
D
&
d
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
ck
::
half_t
,
float
,
ck
::
half_t
>
(
ck
::
half_t
&
e
,
const
float
&
c
,
const
ck
::
half_t
&
d
)
const
{
e
=
ck
::
type_convert
<
ck
::
half_t
>
(
alpha_
*
c
+
beta_
*
ck
::
type_convert
<
float
>
(
d
));
};
float
alpha_
;
float
beta_
;
};
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
Row
=
ck
::
tensor_layout
::
gemm
::
RowMajor
;
using
Col
=
ck
::
tensor_layout
::
gemm
::
ColumnMajor
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F32
;
using
DDataType
=
F16
;
using
EDataType
=
F16
;
using
ALayout
=
Row
;
using
BLayout
=
Col
;
using
DLayout
=
Row
;
using
ELayout
=
Row
;
using
AElementOp
=
PassThrough
;
using
BElementOp
=
PassThrough
;
using
CDEElementOp
=
AlphaBetaAdd
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
using
DeviceOpInstance
=
ck
::
tensor_operation
::
device
::
DeviceGemmMultipleD_Wmma_CShuffle
<
ALayout
,
BLayout
,
ck
::
Tuple
<
DLayout
>
,
ELayout
,
ADataType
,
BDataType
,
ck
::
Tuple
<
DDataType
>
,
EDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
true
;
// GEMM shape
ck
::
index_t
M
=
3840
;
ck
::
index_t
N
=
4096
;
ck
::
index_t
K
=
4096
;
ck
::
index_t
StrideA
=
4096
;
ck
::
index_t
StrideB
=
4096
;
ck
::
index_t
StrideD
=
4096
;
ck
::
index_t
StrideE
=
4096
;
float
alpha
=
1.0
f
;
float
beta
=
1.0
f
;
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
if
(
argc
==
6
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
alpha
=
std
::
stof
(
argv
[
4
]);
beta
=
std
::
stof
(
argv
[
5
]);
}
else
if
(
argc
==
13
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
M
=
std
::
stoi
(
argv
[
4
]);
N
=
std
::
stoi
(
argv
[
5
]);
K
=
std
::
stoi
(
argv
[
6
]);
StrideA
=
std
::
stoi
(
argv
[
7
]);
StrideB
=
std
::
stoi
(
argv
[
8
]);
StrideD
=
std
::
stoi
(
argv
[
9
]);
StrideE
=
std
::
stoi
(
argv
[
10
]);
alpha
=
std
::
stof
(
argv
[
11
]);
beta
=
std
::
stof
(
argv
[
12
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
printf
(
"arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
"beta
\n
"
);
exit
(
0
);
}
auto
f_host_tensor_descriptor
=
[](
std
::
size_t
row
,
std
::
size_t
col
,
std
::
size_t
stride
,
auto
layout
)
{
using
namespace
ck
::
literals
;
if
(
std
::
is_same
<
decltype
(
layout
),
ck
::
tensor_layout
::
gemm
::
RowMajor
>::
value
)
{
return
HostTensorDescriptor
({
row
,
col
},
{
stride
,
1
_uz
});
}
else
{
return
HostTensorDescriptor
({
row
,
col
},
{
1
_uz
,
stride
});
}
};
Tensor
<
ADataType
>
a_m_k
(
f_host_tensor_descriptor
(
M
,
K
,
StrideA
,
ALayout
{}));
Tensor
<
BDataType
>
b_k_n
(
f_host_tensor_descriptor
(
K
,
N
,
StrideB
,
BLayout
{}));
Tensor
<
DDataType
>
d_m_n
(
f_host_tensor_descriptor
(
M
,
N
,
StrideD
,
DLayout
{}));
Tensor
<
EDataType
>
e_m_n_host_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
Tensor
<
EDataType
>
e_m_n_device_result
(
f_host_tensor_descriptor
(
M
,
N
,
StrideE
,
ELayout
{}));
std
::
cout
<<
"a_m_k: "
<<
a_m_k
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_k_n: "
<<
b_k_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d_m_n: "
<<
d_m_n
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"e_m_n: "
<<
e_m_n_host_result
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_2
<
DDataType
>
{
-
5
,
5
});
break
;
default:
a_m_k
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b_k_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
d_m_n
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_m_k
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_k_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
d_m_n
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_m_n_device_result
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a_m_k
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_k_n
.
mData
.
data
());
d_device_buf
.
ToDevice
(
d_m_n
.
mData
.
data
());
e_device_buf
.
ToDevice
(
e_m_n_device_result
.
mData
.
data
());
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
cde_element_op
=
CDEElementOp
{
alpha
,
beta
};
// do GEMM
auto
device_op
=
DeviceOpInstance
{};
auto
invoker
=
device_op
.
MakeInvoker
();
auto
argument
=
device_op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
M
,
N
,
K
,
StrideA
,
StrideB
,
std
::
array
<
ck
::
index_t
,
1
>
{
StrideD
},
StrideE
,
a_element_op
,
b_element_op
,
cde_element_op
);
if
(
!
device_op
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem"
);
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
M
*
K
+
sizeof
(
BDataType
)
*
K
*
N
+
sizeof
(
EDataType
)
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
endl
;
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
if
(
do_verification
)
{
Tensor
<
CShuffleDataType
>
c_m_n
({
M
,
N
});
using
ReferenceGemmInstance
=
ck
::
tensor_operation
::
host
::
ReferenceGemm
<
ADataType
,
BDataType
,
CShuffleDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
PassThrough
>
;
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_m_k
,
b_k_n
,
c_m_n
,
a_element_op
,
b_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
for
(
int
m
=
0
;
m
<
M
;
++
m
)
{
for
(
int
n
=
0
;
n
<
N
;
++
n
)
{
cde_element_op
(
e_m_n_host_result
(
m
,
n
),
c_m_n
(
m
,
n
),
d_m_n
(
m
,
n
));
}
}
e_device_buf
.
FromDevice
(
e_m_n_device_result
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
e_m_n_device_result
,
e_m_n_host_result
)
?
0
:
1
;
}
return
0
;
}
example/26_contraction/CMakeLists.txt
View file @
0cd78566
add_example_executable
(
example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp
)
add_example_executable
(
example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp
)
add_example_executable
(
example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp
)
example/26_contraction/contraction_bilinear_xdl_fp64.cpp
0 → 100644
View file @
0cd78566
This diff is collapsed.
Click to expand it.
example/26_contraction/contraction_scale_xdl_fp64.cpp
0 → 100644
View file @
0cd78566
This diff is collapsed.
Click to expand it.
example/27_layernorm/layernorm_blockwise.cpp
View file @
0cd78566
...
@@ -20,12 +20,12 @@
...
@@ -20,12 +20,12 @@
#include "ck/library/utility/literals.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
using
XDataType
=
ck
::
half_t
;
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
Acc
DataType
=
float
;
using
Compute
DataType
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
constexpr
int
Rank
=
2
;
constexpr
int
Rank
=
2
;
constexpr
int
NumReduceDim
=
1
;
constexpr
int
NumReduceDim
=
1
;
...
@@ -34,7 +34,7 @@ using DeviceInstance =
...
@@ -34,7 +34,7 @@ using DeviceInstance =
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
PassThrough
,
PassThrough
,
Rank
,
Rank
,
...
@@ -121,7 +121,7 @@ int main()
...
@@ -121,7 +121,7 @@ int main()
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
YDataType
,
YDataType
,
Acc
DataType
,
Compute
DataType
,
PassThrough
,
PassThrough
,
Rank
,
Rank
,
NumReduceDim
>
;
NumReduceDim
>
;
...
...
example/29_batched_gemm_bias_e_permute/CMakeLists.txt
View file @
0cd78566
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
add_example_executable
(
example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp
)
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp
)
endif
()
example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp
0 → 100644
View file @
0cd78566
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/numeric.hpp"
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
Add
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
ADataType
=
F16
;
using
BDataType
=
F16
;
using
AccDataType
=
F32
;
using
CShuffleDataType
=
F16
;
using
DDataType
=
F16
;
using
DsDataType
=
ck
::
Tuple
<
DDataType
>
;
using
EDataType
=
F16
;
static
constexpr
ck
::
index_t
NumDimG
=
2
;
static
constexpr
ck
::
index_t
NumDimM
=
2
;
static
constexpr
ck
::
index_t
NumDimN
=
2
;
static
constexpr
ck
::
index_t
NumDimK
=
1
;
using
AElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
BElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
CDEElementOp
=
ck
::
tensor_operation
::
element_wise
::
Add
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
Default
;
static
constexpr
auto
ABSpec
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Packed
;
static
constexpr
auto
DESpec
=
ck
::
tensor_operation
::
device
::
TensorSpecialization
::
Default
;
using
DeviceOpInstanceKKNN
=
ck
::
tensor_operation
::
device
::
DeviceBatchedContractionMultipleD_Wmma_CShuffle
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
DsDataType
,
EDataType
,
AccDataType
,
CShuffleDataType
,
AElementOp
,
BElementOp
,
CDEElementOp
,
GemmSpec
,
ABSpec
,
ABSpec
,
DESpec
,
256
,
128
,
256
,
8
,
8
,
16
,
16
,
4
,
4
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
S
<
4
,
64
,
1
>
,
S
<
1
,
0
,
2
>
,
S
<
1
,
0
,
2
>
,
2
,
8
,
8
,
true
,
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
using
DeviceOpInstance
=
DeviceOpInstanceKKNN
;
// hardcoded for NumDimM == NumDimN == NumDimK == 2
template
<
ck
::
index_t
NumDimG
,
ck
::
index_t
NumDimM
,
ck
::
index_t
NumDimN
,
ck
::
index_t
NumDimK
,
typename
ADataType
,
typename
BDataType
,
typename
EDataType
,
typename
AccDataType
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CDEElementwiseOperation
,
ck
::
enable_if_t
<
NumDimG
==
2
&&
NumDimM
==
2
&&
NumDimN
==
2
&&
NumDimK
==
1
,
bool
>
=
false
>
struct
ReferenceContraction_G2_M2_N2_K1
:
public
ck
::
tensor_operation
::
device
::
BaseOperator
{
// Argument
struct
Argument
:
public
ck
::
tensor_operation
::
device
::
BaseArgument
{
Argument
(
const
Tensor
<
ADataType
>&
a_gs_ms_ks
,
const
Tensor
<
BDataType
>&
b_gs_ns_ks
,
Tensor
<
EDataType
>&
e_gs_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
:
a_gs_ms_ks_
{
a_gs_ms_ks
},
b_gs_ns_ks_
{
b_gs_ns_ks
},
e_gs_ms_ns_
{
e_gs_ms_ns
},
a_element_op_
{
a_element_op
},
b_element_op_
{
b_element_op
},
cde_element_op_
{
cde_element_op
}
{
}
const
Tensor
<
ADataType
>&
a_gs_ms_ks_
;
const
Tensor
<
BDataType
>&
b_gs_ns_ks_
;
Tensor
<
EDataType
>&
e_gs_ms_ns_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
CDEElementwiseOperation
cde_element_op_
;
};
// Invoker
struct
Invoker
:
public
ck
::
tensor_operation
::
device
::
BaseInvoker
{
using
Argument
=
ReferenceContraction_G2_M2_N2_K1
::
Argument
;
float
Run
(
const
Argument
&
arg
)
{
auto
f_ms_ns
=
[
&
](
auto
g0
,
auto
g1
,
auto
m0
,
auto
m1
,
auto
n0
,
auto
n1
)
{
const
int
K0
=
arg
.
a_gs_ms_ks_
.
mDesc
.
GetLengths
()[
4
];
AccDataType
v_acc
=
0
;
for
(
int
k0
=
0
;
k0
<
K0
;
++
k0
)
{
AccDataType
v_a
;
AccDataType
v_b
;
arg
.
a_element_op_
(
v_a
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
a_gs_ms_ks_
(
g0
,
g1
,
m0
,
m1
,
k0
)));
arg
.
b_element_op_
(
v_b
,
ck
::
type_convert
<
const
AccDataType
>
(
arg
.
b_gs_ns_ks_
(
g0
,
g1
,
n0
,
n1
,
k0
)));
v_acc
+=
v_a
*
v_b
;
}
AccDataType
v_c
;
arg
.
cde_element_op_
(
v_c
,
v_acc
);
arg
.
e_gs_ms_ns_
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
)
=
v_c
;
};
make_ParallelTensorFunctor
(
f_ms_ns
,
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
0
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
1
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
2
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
3
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
4
],
arg
.
e_gs_ms_ns_
.
mDesc
.
GetLengths
()[
5
])(
std
::
thread
::
hardware_concurrency
());
return
0
;
}
float
Run
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
p_arg
,
const
StreamConfig
&
/* stream_config */
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
));
}
};
static
constexpr
bool
IsValidCompilationParameter
()
{
// TODO: properly implement this check
return
true
;
}
bool
IsSupportedArgument
(
const
ck
::
tensor_operation
::
device
::
BaseArgument
*
)
override
{
return
true
;
}
static
auto
MakeArgument
(
const
Tensor
<
ADataType
>&
a_gs_ms_ks
,
const
Tensor
<
BDataType
>&
b_gs_ns_ks
,
Tensor
<
EDataType
>&
e_gs_ms_ns
,
AElementwiseOperation
a_element_op
,
BElementwiseOperation
b_element_op
,
CDEElementwiseOperation
cde_element_op
)
{
return
Argument
{
a_gs_ms_ks
,
b_gs_ns_ks
,
e_gs_ms_ns
,
a_element_op
,
b_element_op
,
cde_element_op
};
}
static
auto
MakeInvoker
()
{
return
Invoker
{};
}
virtual
std
::
unique_ptr
<
ck
::
tensor_operation
::
device
::
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
(
Invoker
{});
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"ReferenceContraction_G2_M2_N2_K1"
<<
std
::
endl
;
// clang-format on
return
str
.
str
();
}
};
int
main
(
int
argc
,
char
*
argv
[])
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
true
;
ck
::
index_t
G0
=
1
;
ck
::
index_t
G1
=
2
;
ck
::
index_t
M0
=
4
;
ck
::
index_t
M1
=
128
;
ck
::
index_t
N0
=
16
;
ck
::
index_t
N1
=
256
;
ck
::
index_t
K0
=
2048
;
// A[G0, G1, M0, M1, K0]
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_lengths
{
G0
,
G1
,
M0
,
M1
,
K0
};
std
::
vector
<
ck
::
index_t
>
a_gs_ms_ks_strides
{
G1
*
M0
*
M1
*
K0
,
M0
*
M1
*
K0
,
M1
*
K0
,
K0
,
1
};
// B[G0, G1, N0, N1, K0]
std
::
vector
<
ck
::
index_t
>
b_gs_ns_ks_lengths
{
G0
,
G1
,
N0
,
N1
,
K0
};
std
::
vector
<
ck
::
index_t
>
b_gs_ns_ks_strides
{
G1
*
N0
*
N1
*
K0
,
N0
*
N1
*
K0
,
N1
*
K0
,
K0
,
1
};
// D[G0, G1, M0, N0, M1, N1]
std
::
vector
<
ck
::
index_t
>
d_gs_ms_ns_lengths
{
G0
,
G1
,
M0
,
M1
,
N0
,
N1
};
std
::
vector
<
ck
::
index_t
>
d_gs_ms_ns_strides
{
G1
*
N0
*
N1
,
N0
*
N1
,
0
,
0
,
N1
,
1
};
// E[G0, G1, M0, N0, M1, N1]
std
::
vector
<
ck
::
index_t
>
e_gs_ms_ns_lengths
{
G0
,
G1
,
M0
,
M1
,
N0
,
N1
};
std
::
vector
<
ck
::
index_t
>
e_gs_ms_ns_strides
{
G1
*
M0
*
N0
*
M1
*
N1
,
M0
*
N0
*
M1
*
N1
,
N0
*
M1
*
N1
,
N1
,
M1
*
N1
,
1
};
if
(
argc
==
1
)
{
// use default case
}
else
if
(
argc
==
4
)
{
do_verification
=
std
::
stoi
(
argv
[
1
]);
init_method
=
std
::
stoi
(
argv
[
2
]);
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
else
{
printf
(
"arg1: verification (0=no, 1=yes)
\n
"
);
printf
(
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
);
printf
(
"arg3: time kernel (0=no, 1=yes)
\n
"
);
exit
(
0
);
}
Tensor
<
ADataType
>
a_gs_ms_ks
(
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
);
Tensor
<
BDataType
>
b_gs_ns_ks
(
b_gs_ns_ks_lengths
,
b_gs_ns_ks_strides
);
Tensor
<
DDataType
>
d_gs_ms_ns
(
d_gs_ms_ns_lengths
,
d_gs_ms_ns_strides
);
Tensor
<
EDataType
>
e_gs_ms_ns_host_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
Tensor
<
EDataType
>
e_gs_ms_ns_device_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
std
::
cout
<<
"a_gs_ms_ks: "
<<
a_gs_ms_ks
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"b_gs_ns_ks: "
<<
b_gs_ns_ks
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"d_gs_ms_ns: "
<<
d_gs_ms_ns
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"e_gs_ms_ns: "
<<
e_gs_ms_ns_host_result
.
mDesc
<<
std
::
endl
;
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
a_gs_ms_ks
.
GenerateTensorValue
(
GeneratorTensor_2
<
ADataType
>
{
-
5
,
5
});
b_gs_ns_ks
.
GenerateTensorValue
(
GeneratorTensor_2
<
BDataType
>
{
-
5
,
5
});
d_gs_ms_ns
.
GenerateTensorValue
(
GeneratorTensor_2
<
DDataType
>
{
-
5
,
5
});
break
;
default:
a_gs_ms_ks
.
GenerateTensorValue
(
GeneratorTensor_3
<
ADataType
>
{
0.0
,
1.0
});
b_gs_ns_ks
.
GenerateTensorValue
(
GeneratorTensor_3
<
BDataType
>
{
-
0.5
,
0.5
});
d_gs_ms_ns
.
GenerateTensorValue
(
GeneratorTensor_3
<
DDataType
>
{
-
0.5
,
0.5
});
break
;
}
DeviceMem
a_device_buf
(
sizeof
(
ADataType
)
*
a_gs_ms_ks
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
b_device_buf
(
sizeof
(
BDataType
)
*
b_gs_ns_ks
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
d_device_buf
(
sizeof
(
DDataType
)
*
d_gs_ms_ns
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
e_device_buf
(
sizeof
(
EDataType
)
*
e_gs_ms_ns_device_result
.
mDesc
.
GetElementSpaceSize
());
a_device_buf
.
ToDevice
(
a_gs_ms_ks
.
mData
.
data
());
b_device_buf
.
ToDevice
(
b_gs_ns_ks
.
mData
.
data
());
d_device_buf
.
ToDevice
(
d_gs_ms_ns
.
mData
.
data
());
// set zero
e_device_buf
.
SetZero
();
auto
a_element_op
=
AElementOp
{};
auto
b_element_op
=
BElementOp
{};
auto
cde_element_op
=
CDEElementOp
{};
// device operation
auto
op
=
DeviceOpInstance
{};
auto
invoker
=
op
.
MakeInvoker
();
auto
argument
=
op
.
MakeArgument
(
a_device_buf
.
GetDeviceBuffer
(),
b_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
1
>
{
d_device_buf
.
GetDeviceBuffer
()},
e_device_buf
.
GetDeviceBuffer
(),
a_gs_ms_ks_lengths
,
a_gs_ms_ks_strides
,
b_gs_ns_ks_lengths
,
b_gs_ns_ks_strides
,
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_gs_ms_ns_lengths
},
std
::
array
<
std
::
vector
<
ck
::
index_t
>
,
1
>
{
d_gs_ms_ns_strides
},
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
,
a_element_op
,
b_element_op
,
cde_element_op
);
if
(
!
op
.
IsSupportedArgument
(
argument
))
{
std
::
cout
<<
op
.
GetTypeString
()
<<
" does not support this problem"
<<
std
::
endl
;
return
0
;
}
float
ave_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
ck
::
index_t
G
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
(),
NumDimG
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
M
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
()
+
NumDimG
,
NumDimM
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
N
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
e_gs_ms_ns_lengths
.
begin
()
+
NumDimG
+
NumDimM
,
NumDimN
,
1
,
std
::
multiplies
<>
{});
ck
::
index_t
K
=
ck
::
accumulate_n
<
ck
::
index_t
>
(
a_gs_ms_ks_lengths
.
begin
()
+
NumDimG
+
NumDimM
,
NumDimK
,
1
,
std
::
multiplies
<>
{});
std
::
cout
<<
"GMNK="
<<
G
<<
", "
<<
M
<<
", "
<<
N
<<
", "
<<
K
<<
std
::
endl
;
std
::
size_t
flop
=
std
::
size_t
(
2
)
*
G
*
M
*
N
*
K
;
std
::
size_t
num_btype
=
sizeof
(
ADataType
)
*
G
*
M
*
K
+
sizeof
(
BDataType
)
*
G
*
K
*
N
+
sizeof
(
DDataType
)
*
G
*
M
*
N
+
sizeof
(
EDataType
)
*
G
*
M
*
N
;
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
ave_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
ave_time
;
std
::
cout
<<
"Perf: "
<<
ave_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
op
.
GetTypeString
()
<<
std
::
endl
;
e_device_buf
.
FromDevice
(
e_gs_ms_ns_device_result
.
mData
.
data
());
if
(
do_verification
)
{
Tensor
<
CShuffleDataType
>
c_ms_ns_host_result
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
using
ReferenceOpInstance
=
ReferenceContraction_G2_M2_N2_K1
<
NumDimG
,
NumDimM
,
NumDimN
,
NumDimK
,
ADataType
,
BDataType
,
CShuffleDataType
,
AccDataType
,
AElementOp
,
BElementOp
,
PassThrough
>
;
auto
ref_gemm
=
ReferenceOpInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
a_gs_ms_ks
,
b_gs_ns_ks
,
c_ms_ns_host_result
,
a_element_op
,
b_element_op
,
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
for
(
size_t
g0
=
0
;
g0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
0
];
++
g0
)
{
for
(
size_t
g1
=
0
;
g1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
1
];
++
g1
)
{
for
(
size_t
m0
=
0
;
m0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
2
];
++
m0
)
{
for
(
size_t
m1
=
0
;
m1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
3
];
++
m1
)
{
for
(
size_t
n0
=
0
;
n0
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
4
];
++
n0
)
{
for
(
size_t
n1
=
0
;
n1
<
e_gs_ms_ns_host_result
.
mDesc
.
GetLengths
()[
5
];
++
n1
)
{
cde_element_op
(
e_gs_ms_ns_host_result
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
),
c_ms_ns_host_result
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
),
d_gs_ms_ns
(
g0
,
g1
,
m0
,
m1
,
n0
,
n1
));
}
}
}
}
}
}
return
ck
::
utils
::
check_err
(
e_gs_ms_ns_device_result
,
e_gs_ms_ns_host_result
)
?
0
:
1
;
}
return
0
;
}
example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt
View file @
0cd78566
...
@@ -16,6 +16,9 @@ if(USE_BITINT_EXTENSION_INT4)
...
@@ -16,6 +16,9 @@ if(USE_BITINT_EXTENSION_INT4)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
add_dependencies
(
example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4
)
endif
()
# USE_BITINT_EXTENSION_INT4
endif
()
# USE_BITINT_EXTENSION_INT4
if
(
GPU_TARGETS MATCHES
"gfx1100"
)
add_example_executable
(
example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
)
endif
()
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
add_example_executable
(
example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp
)
...
...
example/30_grouped_conv_fwd_multiple_d/common.hpp
View file @
0cd78566
...
@@ -137,7 +137,7 @@ inline bool parse_cmd_args(int argc,
...
@@ -137,7 +137,7 @@ inline bool parse_cmd_args(int argc,
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
threshold_to_catch_partial_args
,
argv
);
num_dim_spatial
,
threshold_to_catch_partial_args
+
1
,
argv
);
}
}
else
else
{
{
...
...
example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp
0 → 100644
View file @
0cd78566
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <algorithm>
#include <array>
#include <iostream>
#include <string>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using
BF16
=
ck
::
bhalf_t
;
using
FP16
=
ck
::
half_t
;
using
FP32
=
float
;
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
using
I4
=
ck
::
int4_t
;
#endif
using
I8
=
std
::
int8_t
;
using
I32
=
std
::
int32_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
typename
InputLay
,
typename
WeightLay
,
typename
OutputLay
>
struct
CommonLayoutSetting
{
using
InputLayout
=
InputLay
;
using
WeightLayout
=
WeightLay
;
using
OutputLayout
=
OutputLay
;
};
template
<
ck
::
index_t
NDimSpatial
>
struct
CommonLayoutSettingSelector
;
namespace
ctl
=
ck
::
tensor_layout
::
convolution
;
template
<
>
struct
CommonLayoutSettingSelector
<
1
>
final
:
CommonLayoutSetting
<
ctl
::
G_NW_C
,
ctl
::
G_K_X_C
,
ctl
::
G_NW_K
>
{
};
template
<
>
struct
CommonLayoutSettingSelector
<
2
>
final
:
CommonLayoutSetting
<
ctl
::
G_NHW_C
,
ctl
::
G_K_YX_C
,
ctl
::
G_NHW_K
>
{
};
template
<
>
struct
CommonLayoutSettingSelector
<
3
>
final
:
CommonLayoutSetting
<
ctl
::
G_NDHW_C
,
ctl
::
G_K_ZYX_C
,
ctl
::
G_NDHW_K
>
{
};
template
<
ck
::
index_t
NDimSpatial
>
using
InputLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
InputLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
WeightLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
WeightLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
OutputLayout
=
typename
CommonLayoutSettingSelector
<
NDimSpatial
>::
OutputLayout
;
struct
ExecutionConfig
final
{
bool
do_verification
=
true
;
int
init_method
=
1
;
bool
time_kernel
=
true
;
};
#define DefaultConvParam \
ck::utils::conv::ConvParam \
{ \
2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \
}
inline
void
print_help_msg
()
{
std
::
cerr
<<
"arg1: verification (0=no, 1=yes)
\n
"
<<
"arg2: initialization (0=no init, 1=integer value, 2=decimal value)
\n
"
<<
"arg3: time kernel (0=no, 1=yes)
\n
"
<<
ck
::
utils
::
conv
::
get_conv_param_parser_helper_msg
()
<<
std
::
endl
;
}
inline
bool
parse_cmd_args
(
int
argc
,
char
*
argv
[],
ExecutionConfig
&
config
,
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
constexpr
int
num_execution_config_args
=
3
;
// arguments for do_verification, init_method, time_kernel
constexpr
int
num_conv_param_leading_args
=
5
;
// arguments for num_dim_spatial_, G_, N_, K_, C_
constexpr
int
threshold_to_catch_partial_args
=
1
+
num_execution_config_args
;
constexpr
int
threshold_to_catch_all_args
=
threshold_to_catch_partial_args
+
num_conv_param_leading_args
;
if
(
argc
==
1
)
{
// use default
}
// catch only ExecutionConfig arguments
else
if
(
argc
==
threshold_to_catch_partial_args
)
{
config
.
do_verification
=
std
::
stoi
(
argv
[
1
]);
config
.
init_method
=
std
::
stoi
(
argv
[
2
]);
config
.
time_kernel
=
std
::
stoi
(
argv
[
3
]);
}
// catch both ExecutionConfig & ConvParam arguments
else
if
(
threshold_to_catch_all_args
<
argc
&&
((
argc
-
threshold_to_catch_all_args
)
%
3
==
0
))
{
config
.
do_verification
=
std
::
stoi
(
argv
[
1
]);
config
.
init_method
=
std
::
stoi
(
argv
[
2
]);
config
.
time_kernel
=
std
::
stoi
(
argv
[
3
]);
const
ck
::
index_t
num_dim_spatial
=
std
::
stoi
(
argv
[
4
]);
conv_param
=
ck
::
utils
::
conv
::
parse_conv_param
(
num_dim_spatial
,
threshold_to_catch_partial_args
+
1
,
argv
);
}
else
{
print_help_msg
();
return
false
;
}
return
true
;
}
inline
HostTensorDescriptor
make_input_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
G_
*
conv_param
.
C_
// wi
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
],
conv_param
.
input_spatial_lengths_
[
1
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// hi
conv_param
.
G_
*
conv_param
.
C_
// wi
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
C_
,
conv_param
.
input_spatial_lengths_
[
0
],
conv_param
.
input_spatial_lengths_
[
1
],
conv_param
.
input_spatial_lengths_
[
2
]},
{
conv_param
.
C_
,
// g
conv_param
.
input_spatial_lengths_
[
0
]
*
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// n
1
,
// c
conv_param
.
input_spatial_lengths_
[
1
]
*
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// di
conv_param
.
input_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
C_
,
// hi
conv_param
.
G_
*
conv_param
.
C_
// wi
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_weight_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
C_
// x
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
],
conv_param
.
filter_spatial_lengths_
[
1
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
C_
,
// y
conv_param
.
C_
// x
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
K_
,
conv_param
.
C_
,
conv_param
.
filter_spatial_lengths_
[
0
],
conv_param
.
filter_spatial_lengths_
[
1
],
conv_param
.
filter_spatial_lengths_
[
2
]},
{
conv_param
.
K_
*
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// g
conv_param
.
filter_spatial_lengths_
[
0
]
*
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// k
1
,
// c
conv_param
.
filter_spatial_lengths_
[
1
]
*
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// z
conv_param
.
filter_spatial_lengths_
[
2
]
*
conv_param
.
C_
,
// y
conv_param
.
C_
// x
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_bias_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
]},
{
conv_param
.
K_
,
// g
0
,
// k
1
,
// c
0
// x
});
case
2
:
return
HostTensorDescriptor
({
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
]},
{
conv_param
.
K_
,
// g
0
,
// n
1
,
// k
0
,
// ho
0
// wo
});
case
3
:
return
HostTensorDescriptor
({
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
],
conv_param
.
output_spatial_lengths_
[
2
]},
{
conv_param
.
K_
,
// g
0
,
// n
1
,
// k
0
,
// z
0
,
// y
0
// x
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
inline
HostTensorDescriptor
make_output_descriptor
(
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
G_
*
conv_param
.
K_
// wo
});
case
2
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// ho
conv_param
.
G_
*
conv_param
.
K_
// wo
});
case
3
:
return
HostTensorDescriptor
(
{
conv_param
.
G_
,
conv_param
.
N_
,
conv_param
.
K_
,
conv_param
.
output_spatial_lengths_
[
0
],
conv_param
.
output_spatial_lengths_
[
1
],
conv_param
.
output_spatial_lengths_
[
2
]},
{
conv_param
.
K_
,
// g
conv_param
.
output_spatial_lengths_
[
0
]
*
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// n
1
,
// k
conv_param
.
output_spatial_lengths_
[
1
]
*
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// do
conv_param
.
output_spatial_lengths_
[
2
]
*
conv_param
.
G_
*
conv_param
.
K_
,
// ho
conv_param
.
G_
*
conv_param
.
K_
// wo
});
}
throw
std
::
runtime_error
(
"unsuppored # dim spatial"
);
}
example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp
0 → 100644
View file @
0cd78566
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "common_wmma.hpp"
// kernel data types
using
InKernelDataType
=
FP16
;
using
WeiKernelDataType
=
FP16
;
using
AccDataType
=
FP32
;
using
CShuffleDataType
=
FP16
;
using
BiasKernelDataType
=
FP16
;
using
ResidualKernelDataType
=
FP16
;
using
OutKernelDataType
=
FP16
;
// tensor data types
using
InUserDataType
=
InKernelDataType
;
using
WeiUserDataType
=
WeiKernelDataType
;
using
OutUserDataType
=
OutKernelDataType
;
using
InElementOp
=
PassThrough
;
using
WeiElementOp
=
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
AddReluAdd
;
#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_grouped_conv_fwd_bias_relu_add_example
(
argc
,
argv
);
}
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
0 → 100644
View file @
0cd78566
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
template
<
typename
BiasLay
,
typename
ResidualLay
>
struct
LayoutSetting
{
using
BiasLayout
=
BiasLay
;
using
ResidualLayout
=
ResidualLay
;
};
template
<
ck
::
index_t
NDimSpatial
>
struct
LayoutSettingSelector
;
template
<>
struct
LayoutSettingSelector
<
1
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NW_K
>
{
};
template
<>
struct
LayoutSettingSelector
<
2
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NHW_K
>
{
};
template
<>
struct
LayoutSettingSelector
<
3
>
final
:
LayoutSetting
<
ctl
::
G_K
,
ctl
::
G_NDHW_K
>
{
};
template
<
ck
::
index_t
NDimSpatial
>
using
BiasLayout
=
typename
LayoutSettingSelector
<
NDimSpatial
>::
BiasLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
ResidualLayout
=
typename
LayoutSettingSelector
<
NDimSpatial
>::
ResidualLayout
;
template
<
ck
::
index_t
NDimSpatial
>
using
DeviceConvFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
<
NDimSpatial
,
InputLayout
<
NDimSpatial
>
,
WeightLayout
<
NDimSpatial
>
,
ck
::
Tuple
<
BiasLayout
<
NDimSpatial
>
,
ResidualLayout
<
NDimSpatial
>>
,
OutputLayout
<
NDimSpatial
>
,
InKernelDataType
,
WeiKernelDataType
,
ck
::
Tuple
<
BiasKernelDataType
,
ResidualKernelDataType
>
,
OutKernelDataType
,
AccDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
256
,
// BlockSize
128
,
// MPerBlock
128
,
// NPerBlock
4
,
// K0PerBlock
8
,
// K1
16
,
// MPerWMMA
16
,
// NPerWMMA
4
,
// MRepeat
2
,
// NRepeat
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
true
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
true
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
template
<
ck
::
index_t
NDimSpatial
>
using
HostConvFwdInstance
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InUserDataType
,
WeiUserDataType
,
CShuffleDataType
,
InElementOp
,
WeiElementOp
,
PassThrough
>
;
template
<
ck
::
index_t
NDimSpatial
>
bool
run_grouped_conv_fwd_bias_relu_add
(
const
ExecutionConfig
&
config
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
)
{
static_assert
(
1
<=
NDimSpatial
&&
NDimSpatial
<=
3
,
"Unsupported NDimSpatial"
);
const
auto
in_g_n_c_wis_desc
=
make_input_descriptor
(
conv_param
);
const
auto
wei_g_k_c_xs_desc
=
make_weight_descriptor
(
conv_param
);
const
auto
bias_g_n_k_wos_desc
=
make_bias_descriptor
(
conv_param
);
const
auto
out_g_n_k_wos_desc
=
make_output_descriptor
(
conv_param
);
Tensor
<
InUserDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiUserDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
OutUserDataType
>
bias
(
bias_g_n_k_wos_desc
);
Tensor
<
OutUserDataType
>
residual
(
bias_g_n_k_wos_desc
);
Tensor
<
OutUserDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutKernelDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"bias: "
<<
bias
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"residual: "
<<
residual
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
switch
(
config
.
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InUserDataType
>
{
-
5
,
5
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiUserDataType
>
{
-
5
,
5
});
bias
.
GenerateTensorValue
(
GeneratorTensor_2
<
OutUserDataType
>
{
-
5
,
5
});
break
;
default
:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InUserDataType
>
{
0.0
,
1.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiUserDataType
>
{
-
0.5
,
0.5
});
bias
.
GenerateTensorValue
(
GeneratorTensor_3
<
OutUserDataType
>
{
-
0.5
,
0.5
});
}
DeviceMem
in_device_buf
(
sizeof
(
InKernelDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiKernelDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
bias_device_buf
(
sizeof
(
OutKernelDataType
)
*
bias
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
residual_device_buf
(
sizeof
(
OutKernelDataType
)
*
residual
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutKernelDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
InKernelDataType
>
in_converted
(
in
);
const
Tensor
<
WeiKernelDataType
>
wei_converted
(
wei
);
const
Tensor
<
OutKernelDataType
>
bias_converted
(
bias
);
const
Tensor
<
OutKernelDataType
>
residual_converted
(
residual
);
in_device_buf
.
ToDevice
(
in_converted
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei_converted
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias_converted
.
mData
.
data
());
residual_device_buf
.
ToDevice
(
residual_converted
.
mData
.
data
());
#else
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
bias_device_buf
.
ToDevice
(
bias
.
mData
.
data
());
residual_device_buf
.
ToDevice
(
residual
.
mData
.
data
());
#endif
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d0_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d1_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
d1_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
bias_g_n_k_wos_desc
.
GetLengths
(),
d0_g_n_k_wos_lengths
);
copy
(
bias_g_n_k_wos_desc
.
GetStrides
(),
d0_g_n_k_wos_strides
);
copy
(
bias_g_n_k_wos_desc
.
GetLengths
(),
d1_g_n_k_wos_lengths
);
copy
(
bias_g_n_k_wos_desc
.
GetStrides
(),
d1_g_n_k_wos_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
// do Conv
auto
conv
=
DeviceConvFwdInstance
<
NDimSpatial
>
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
std
::
array
<
const
void
*
,
2
>
{
bias_device_buf
.
GetDeviceBuffer
(),
residual_device_buf
.
GetDeviceBuffer
()},
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
2
>
{
{
d0_g_n_k_wos_lengths
,
d1_g_n_k_wos_lengths
}},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
2
>
{
{
d0_g_n_k_wos_strides
,
d1_g_n_k_wos_strides
}},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
InElementOp
{},
WeiElementOp
{},
OutElementOp
{});
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"wrong! device_conv with the specified compilation parameters does "
"not support this Conv problem"
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
config
.
time_kernel
});
std
::
size_t
flop
=
conv_param
.
GetFlops
();
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InUserDataType
,
WeiUserDataType
,
OutUserDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
config
.
do_verification
)
{
Tensor
<
CShuffleDataType
>
c_host
(
out_g_n_k_wos_desc
);
auto
ref_conv
=
HostConvFwdInstance
<
NDimSpatial
>
{};
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
c_host
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
InElementOp
{},
WeiElementOp
{},
PassThrough
{});
ref_invoker
.
Run
(
ref_argument
);
// TODO: implement elementwise operation for host
out_host
.
ForEach
([
&
](
auto
&
,
auto
idx
)
{
OutElementOp
{}(
out_host
(
idx
),
c_host
(
idx
),
bias
(
idx
),
residual
(
idx
));
});
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
#ifdef BUILD_INT4_EXAMPLE
const
Tensor
<
OutUserDataType
>
out_device_converted
(
out_device
);
return
ck
::
utils
::
check_err
(
out_device_converted
,
out_host
,
"Error: incorrect results!"
,
1
e
-
5
f
,
1
e
-
4
f
);
#else
return
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
,
1
e
-
5
f
,
1
e
-
4
f
);
#endif
}
return
true
;
}
bool
run_grouped_conv_fwd_bias_relu_add_example
(
int
argc
,
char
*
argv
[])
{
ExecutionConfig
config
;
ck
::
utils
::
conv
::
ConvParam
conv_param
=
DefaultConvParam
;
if
(
!
parse_cmd_args
(
argc
,
argv
,
config
,
conv_param
))
{
return
false
;
}
switch
(
conv_param
.
num_dim_spatial_
)
{
case
1
:
return
run_grouped_conv_fwd_bias_relu_add
<
1
>
(
config
,
conv_param
);
case
2
:
return
run_grouped_conv_fwd_bias_relu_add
<
2
>
(
config
,
conv_param
);
case
3
:
return
run_grouped_conv_fwd_bias_relu_add
<
3
>
(
config
,
conv_param
);
}
return
false
;
}
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp
View file @
0cd78566
...
@@ -23,11 +23,11 @@
...
@@ -23,11 +23,11 @@
constexpr
int
Rank
=
5
;
constexpr
int
Rank
=
5
;
constexpr
int
NumReduceDim
=
3
;
constexpr
int
NumReduceDim
=
3
;
using
XDataType
=
ck
::
half_t
;
using
XDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
GammaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
BetaDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
YDataType
=
ck
::
half_t
;
using
Acc
DataType
=
float
;
using
Compute
DataType
=
float
;
struct
YElementOp
struct
YElementOp
{
{
...
@@ -50,7 +50,7 @@ using DeviceInstance =
...
@@ -50,7 +50,7 @@ using DeviceInstance =
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
ck
::
tensor_operation
::
device
::
DeviceNormalizationImpl
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
YElementOp
,
YElementOp
,
Rank
,
Rank
,
...
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
...
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
YDataType
,
YDataType
,
Acc
DataType
,
Compute
DataType
,
YElementOp
>
;
YElementOp
>
;
ReferenceInstance
ref
;
ReferenceInstance
ref
;
...
...
include/ck/host_utility/kernel_launch.hpp
View file @
0cd78566
...
@@ -20,6 +20,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
...
@@ -20,6 +20,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
#if CK_TIME_KERNEL
#if CK_TIME_KERNEL
if
(
stream_config
.
time_kernel_
)
if
(
stream_config
.
time_kernel_
)
{
{
#if DEBUG_LOG
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
printf
(
"%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d}
\n
"
,
__func__
,
__func__
,
grid_dim
.
x
,
grid_dim
.
x
,
...
@@ -29,15 +30,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
...
@@ -29,15 +30,15 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
block_dim
.
y
,
block_dim
.
y
,
block_dim
.
z
);
block_dim
.
z
);
const
int
nrepeat
=
10
;
printf
(
"Warm up 1 time
\n
"
);
printf
(
"Warm up 1 time
\n
"
);
#endif
// warm up
// warm up
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
kernel
<<<
grid_dim
,
block_dim
,
lds_byte
,
stream_config
.
stream_id_
>>>
(
args
...);
const
int
nrepeat
=
10
;
#if DEBUG_LOG
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
printf
(
"Start running %d times...
\n
"
,
nrepeat
);
#endif
hipEvent_t
start
,
stop
;
hipEvent_t
start
,
stop
;
hip_check_error
(
hipEventCreate
(
&
start
));
hip_check_error
(
hipEventCreate
(
&
start
));
...
...
include/ck/tensor_operation/gpu/device/device_normalization.hpp
View file @
0cd78566
...
@@ -14,9 +14,9 @@ namespace device {
...
@@ -14,9 +14,9 @@ namespace device {
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
typename
Acc
DataType
,
typename
Compute
DataType
,
typename
YDataType
,
typename
YDataType
,
typename
Acc
ElementwiseOperation
,
typename
Y
ElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
struct
DeviceNormalization
:
public
BaseOperator
struct
DeviceNormalization
:
public
BaseOperator
...
@@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator
...
@@ -35,7 +35,7 @@ struct DeviceNormalization : public BaseOperator
void
*
p_y
,
void
*
p_y
,
void
*
p_savedMean
,
void
*
p_savedMean
,
void
*
p_savedInvVar
,
void
*
p_savedInvVar
,
Acc
ElementwiseOperation
acc
_elementwise_op
)
=
0
;
Y
ElementwiseOperation
y
_elementwise_op
)
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
virtual
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
=
0
;
};
};
...
@@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator
...
@@ -43,17 +43,17 @@ struct DeviceNormalization : public BaseOperator
template
<
typename
XDataType
,
template
<
typename
XDataType
,
typename
GammaDataType
,
typename
GammaDataType
,
typename
BetaDataType
,
typename
BetaDataType
,
typename
Acc
DataType
,
typename
Compute
DataType
,
typename
YDataType
,
typename
YDataType
,
typename
Acc
ElementwiseOperation
,
typename
Y
ElementwiseOperation
,
index_t
Rank
,
index_t
Rank
,
index_t
NumReduceDim
>
index_t
NumReduceDim
>
using
DeviceNormalizationPtr
=
std
::
unique_ptr
<
DeviceNormalization
<
XDataType
,
using
DeviceNormalizationPtr
=
std
::
unique_ptr
<
DeviceNormalization
<
XDataType
,
GammaDataType
,
GammaDataType
,
BetaDataType
,
BetaDataType
,
Acc
DataType
,
Compute
DataType
,
YDataType
,
YDataType
,
Acc
ElementwiseOperation
,
Y
ElementwiseOperation
,
Rank
,
Rank
,
NumReduceDim
>>
;
NumReduceDim
>>
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
0 → 100644
View file @
0cd78566
This diff is collapsed.
Click to expand it.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment