Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel_ROCM
Commits
7db609fe
"tests/pipelines/vscode:/vscode.git/clone" did not exist on "07f1fbb18e26c81353cec4790ede5de3c3daa9fa"
Commit
7db609fe
authored
Oct 27, 2024
by
dummycoderfe
Browse files
build ok
parent
5c9cba82
Changes
18
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
579 additions
and
47 deletions
+579
-47
CMakeLists.txt
CMakeLists.txt
+0
-4
cmake/EnableCompilerWarnings.cmake
cmake/EnableCompilerWarnings.cmake
+0
-1
example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt
example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt
+21
-0
example/ck_tile/06_layernorm2d_bwd/README.md
example/ck_tile/06_layernorm2d_bwd/README.md
+22
-0
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp
...tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp
+44
-0
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp
..._bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp
+11
-0
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp
...rnorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp
+55
-0
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp
+179
-0
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp
+113
-0
example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh
example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh
+38
-0
example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh
example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh
+31
-0
example/ck_tile/CMakeLists.txt
example/ck_tile/CMakeLists.txt
+1
-0
include/ck_tile/ops/layernorm2d.hpp
include/ck_tile/ops/layernorm2d.hpp
+7
-0
include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp
.../layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp
+2
-4
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp
...rm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp
+14
-2
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp
...ernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp
+40
-4
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp
...layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp
+1
-3
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
...layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
+0
-29
No files found.
CMakeLists.txt
View file @
7db609fe
...
@@ -495,10 +495,6 @@ include_directories(BEFORE
...
@@ -495,10 +495,6 @@ include_directories(BEFORE
)
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
SET
(
BUILD_DEV ON CACHE BOOL
"BUILD_DEV"
)
if
(
BUILD_DEV
)
add_compile_options
(
-Werror
)
add_compile_options
(
-Weverything
)
endif
()
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
message
(
"CMAKE_CXX_FLAGS:
${
CMAKE_CXX_FLAGS
}
"
)
if
(
"
${
CMAKE_CXX_COMPILER_ID
}
"
MATCHES
"Clang"
)
if
(
"
${
CMAKE_CXX_COMPILER_ID
}
"
MATCHES
"Clang"
)
...
...
cmake/EnableCompilerWarnings.cmake
View file @
7db609fe
...
@@ -66,7 +66,6 @@ else()
...
@@ -66,7 +66,6 @@ else()
-Wunreachable-code
-Wunreachable-code
-Wunused
-Wunused
-Wno-reserved-identifier
-Wno-reserved-identifier
-Werror
-Wno-option-ignored
-Wno-option-ignored
-Wsign-compare
-Wsign-compare
-Wno-extra-semi-stmt
-Wno-extra-semi-stmt
...
...
example/ck_tile/06_layernorm2d_bwd/CMakeLists.txt
0 → 100644
View file @
7db609fe
set
(
EXAMPLE_LAYERNORM2D_BWD
"tile_example_layernorm2d_bwd"
)
# not using add_example_executable() to add this target, since we don't want this to have
# to be included in "make all/install/check"
message
(
"adding example
${
EXAMPLE_LAYERNORM2D_BWD
}
"
)
file
(
GLOB INSTANCE_SRCS instances/*.cpp
)
add_executable
(
${
EXAMPLE_LAYERNORM2D_BWD
}
EXCLUDE_FROM_ALL layernorm2d_bwd.cpp
)
target_include_directories
(
${
EXAMPLE_LAYERNORM2D_BWD
}
PRIVATE
${
CMAKE_CURRENT_LIST_DIR
}
)
target_sources
(
${
EXAMPLE_LAYERNORM2D_BWD
}
PRIVATE
${
INSTANCE_SRCS
}
)
set
(
EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS
)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
list
(
APPEND EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal
)
target_compile_options
(
${
EXAMPLE_LAYERNORM2D_BWD
}
PRIVATE
${
EXAMPLE_layernorm2d_bwd_COMPILE_OPTIONS
}
)
# TODO: we have to turn off this global prop, otherwise the progress bar generated
# by cmake will print too many files, execvp: /bin/sh: Argument list too long
# however, this property may affect global
# TODO: consider codegen a makefile by us
set_property
(
GLOBAL PROPERTY RULE_MESSAGES OFF
)
example/ck_tile/06_layernorm2d_bwd/README.md
0 → 100644
View file @
7db609fe
# Layernorm2D forward
This folder contains example for Layernorm2D forward using ck_tile tile-programming implementation.
## build
```
# in the root of ck_tile
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
make tile_example_layernorm2d_bwd -j
```
This will result in an executable
`build/bin/tile_example_layernorm2d_bwd`
## example
```
args:
-m m dimension (default:3328)
-n m dimension (default:4096)
-e epsilon (default:1e-5)
-v cpu validation or not (default:1)
-prec precision (default:fp16)
```
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_api.cpp
0 → 100644
View file @
7db609fe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "layernorm2d_bwd.hpp"
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
bool
kPadN_
>
using
trait_
=
layernorm2d_bwd_traits_
<
DataType_
,
Repeat_M_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
kPadN_
>
;
template
<
typename
data_type
>
float
layernorm2d_bwd_b16_
(
layernorm2d_bwd_traits
/*t*/
,
layernorm2d_bwd_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
return
layernorm2d_bwd_
<
trait_
<
data_type
,
1
,
1
,
64
,
true
>>
(
s
,
a
);
}
float
layernorm2d_bwd
(
layernorm2d_bwd_traits
t
,
layernorm2d_bwd_args
a
,
const
ck_tile
::
stream_config
&
s
)
{
float
r
=
-
1
;
if
(
t
.
data_type
.
compare
(
"fp16"
)
==
0
)
{
return
layernorm2d_bwd_b16_
<
ck_tile
::
fp16_t
>
(
t
,
a
,
s
);
}
else
if
(
t
.
data_type
.
compare
(
"bf16"
)
==
0
)
{
return
layernorm2d_bwd_b16_
<
ck_tile
::
bf16_t
>
(
t
,
a
,
s
);
}
if
(
r
<
0
)
throw
std
::
runtime_error
(
"Without supported instances!"
);
return
r
;
}
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_bf16_n64_n128_instance.cpp
0 → 100644
View file @
7db609fe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include "layernorm2d_bwd_instance_common.hpp"
// clang-format off
// rm tm tn pd
template
float
layernorm2d_bwd_
<
trait_
<
ck_tile
::
bf16_t
,
1
,
1
,
64
,
true
>
>
(
const
S
&
,
A
);
template
float
layernorm2d_bwd_
<
trait_
<
ck_tile
::
fp16_t
,
1
,
1
,
64
,
true
>
>
(
const
S
&
,
A
);
// clang-format on
example/ck_tile/06_layernorm2d_bwd/instances/layernorm2d_bwd_instance_common.hpp
0 → 100644
View file @
7db609fe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#include <ck_tile/core.hpp>
#include "layernorm2d_bwd.hpp"
#include <iostream>
#pragma once
using
S
=
ck_tile
::
stream_config
;
using
A
=
layernorm2d_bwd_args
;
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
bool
kPadN_
>
using
trait_
=
layernorm2d_bwd_traits_
<
DataType_
,
Repeat_M_
,
ThreadPerBlock_M_
,
ThreadPerBlock_N_
,
kPadN_
>
;
template
<
typename
Traits_
>
float
layernorm2d_bwd_
(
const
S
&
s
,
A
a
)
{
using
DataType
=
typename
Traits_
::
DataType
;
using
PipelineProblem
=
ck_tile
::
Layernorm2dBwdGammaBetaPipelineProblem
<
typename
LayerNormTypeConfig
<
DataType
>::
XDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
GammaDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
BetaDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
ComputeDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
YDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
MeanDataType
,
typename
LayerNormTypeConfig
<
DataType
>::
InvStdDataType
,
typename
Traits_
::
Shape
,
Traits_
::
kPadN
>
;
using
Pipeline
=
ck_tile
::
Layernorm2dBwdGammaBetaPipeline
<
PipelineProblem
>
;
using
Kernel
=
ck_tile
::
Layernorm2dBwdGammaBeta
<
Pipeline
>
;
const
dim3
grids
=
Kernel
::
GridSize
(
a
);
constexpr
dim3
blocks
=
Kernel
::
BlockSize
();
constexpr
ck_tile
::
index_t
kBlockPerCu
=
1
;
auto
kargs
=
Kernel
::
MakeKargs
(
a
);
if
(
s
.
log_level_
>
0
)
std
::
cout
<<
", "
<<
Kernel
::
GetName
()
<<
std
::
flush
;
return
ck_tile
::
launch_kernel
(
s
,
ck_tile
::
make_kernel
<
blocks
.
x
,
kBlockPerCu
>
(
Kernel
{},
grids
,
blocks
,
0
,
kargs
));
}
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.cpp
0 → 100644
View file @
7db609fe
#include "ck_tile/host.hpp"
#include "layernorm2d_bwd.hpp"
#include <cstring>
// different threshold for different dtype
template
<
typename
DataType
>
auto
get_elimit
()
{
double
rtol
=
1e-2
;
double
atol
=
1e-2
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
template
<
>
auto
get_elimit
<
ck_tile
::
bf16_t
>
()
{
double
rtol
=
1e-2
;
double
atol
=
1e-2
;
return
ck_tile
::
make_tuple
(
rtol
,
atol
);
}
auto
create_args
(
int
argc
,
char
*
argv
[])
{
ck_tile
::
ArgParser
arg_parser
;
arg_parser
.
insert
(
"m"
,
"3328"
,
"m dimension"
)
.
insert
(
"n"
,
"4096"
,
"n dimension"
)
.
insert
(
"stride"
,
"-1"
,
"stride per row, if -1 then equal to n"
)
.
insert
(
"v"
,
"1"
,
"cpu validation or not"
)
.
insert
(
"kname"
,
"1"
,
"print kernel name or not"
)
.
insert
(
"prec"
,
"fp16"
,
"precision"
)
.
insert
(
"warmup"
,
"5"
,
"cold iter"
)
.
insert
(
"repeat"
,
"20"
,
"hot iter"
);
bool
result
=
arg_parser
.
parse
(
argc
,
argv
);
return
std
::
make_tuple
(
result
,
arg_parser
);
}
template
<
typename
DataType
>
bool
run
(
const
ck_tile
::
ArgParser
&
arg_parser
)
{
ck_tile
::
index_t
m
=
arg_parser
.
get_int
(
"m"
);
ck_tile
::
index_t
n
=
arg_parser
.
get_int
(
"n"
);
ck_tile
::
index_t
stride
=
arg_parser
.
get_int
(
"stride"
);
if
(
stride
<
0
)
stride
=
n
;
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
int
kname
=
arg_parser
.
get_int
(
"kname"
);
int
do_validation
=
arg_parser
.
get_int
(
"v"
);
int
warmup
=
arg_parser
.
get_int
(
"warmup"
);
int
repeat
=
arg_parser
.
get_int
(
"repeat"
);
assert
(
stride
>=
n
);
using
TypeConfig
=
LayerNormTypeConfig
<
DataType
>
;
using
XDataType
=
typename
TypeConfig
::
XDataType
;
using
YDataType
=
typename
TypeConfig
::
YDataType
;
using
GammaDataType
=
typename
TypeConfig
::
GammaDataType
;
using
BetaDataType
=
typename
TypeConfig
::
BetaDataType
;
using
MeanDataType
=
typename
TypeConfig
::
MeanDataType
;
using
InvStdDataType
=
typename
TypeConfig
::
InvStdDataType
;
using
ComputeDataType
=
typename
TypeConfig
::
ComputeDataType
;
// host verify
ck_tile
::
HostTensor
<
YDataType
>
dy_host
({
m
,
n
},
{
stride
,
1
});
ck_tile
::
HostTensor
<
MeanDataType
>
mean_host
({
m
});
ck_tile
::
HostTensor
<
InvStdDataType
>
invStd_host
({
m
});
ck_tile
::
HostTensor
<
GammaDataType
>
dgamma_host_dev
({
n
});
ck_tile
::
HostTensor
<
BetaDataType
>
dbeta_host_dev
({
n
});
ck_tile
::
HostTensor
<
GammaDataType
>
dgamma_host_ref
({
n
});
ck_tile
::
HostTensor
<
BetaDataType
>
dbeta_host_ref
({
n
});
ck_tile
::
FillUniformDistribution
<
YDataType
>
{
-
.5
f
,
.5
f
}(
dy_host
);
// ck_tile::FillUniformDistribution<MeanDataType>{-.5f, .5f}(mean_host);
ck_tile
::
FillMonotonicSeq
<
MeanDataType
>
{}(
mean_host
);
ck_tile
::
FillUniformDistribution
<
InvStdDataType
>
{
-
.5
f
,
.5
f
}(
invStd_host
);
ck_tile
::
DeviceMem
dy_buf
(
dy_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
mean_buf
(
mean_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
invStd_buf
(
invStd_host
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
dgamma_buf
(
dgamma_host_dev
.
get_element_space_size_in_bytes
());
ck_tile
::
DeviceMem
dbeta_buf
(
dbeta_host_dev
.
get_element_space_size_in_bytes
());
dy_buf
.
ToDevice
(
dy_host
.
data
());
mean_buf
.
ToDevice
(
mean_host
.
data
());
invStd_buf
.
ToDevice
(
invStd_host
.
data
());
std
::
cout
<<
"["
<<
data_type
<<
"]"
<<
" m:"
<<
m
<<
", n:"
<<
n
<<
", stride:"
<<
stride
<<
std
::
flush
;
layernorm2d_bwd_traits
traits
{
data_type
};
layernorm2d_bwd_args
args
{
dy_buf
.
GetDeviceBuffer
(),
mean_buf
.
GetDeviceBuffer
(),
invStd_buf
.
GetDeviceBuffer
(),
dgamma_buf
.
GetDeviceBuffer
(),
dbeta_buf
.
GetDeviceBuffer
(),
nullptr
,
m
,
n
,
stride
};
float
ave_time
=
layernorm2d_bwd
(
traits
,
args
,
ck_tile
::
stream_config
{
nullptr
,
true
,
kname
?
1
:
0
,
warmup
,
repeat
});
std
::
size_t
num_byte
=
sizeof
(
XDataType
)
*
m
*
n
+
sizeof
(
GammaDataType
)
*
n
+
sizeof
(
BetaDataType
)
*
n
+
sizeof
(
YDataType
)
*
m
*
n
;
float
gb_per_sec
=
num_byte
/
1.E6
/
ave_time
;
std
::
cout
<<
sizeof
(
ComputeDataType
)
<<
", "
<<
ave_time
*
1.E3
<<
" us, "
<<
gb_per_sec
<<
" GB/s"
<<
std
::
flush
;
bool
pass
=
true
;
if
(
do_validation
)
{
// // reference
// ck_tile::reference_layernorm2d_bwd<XDataType,
// GammaDataType,
// BetaDataType,
// ComputeDataType,
// YDataType,
// MeanDataType,
// InvStdDataType>(
// x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
// y_buf.FromDevice(y_host_dev.data());
// auto [rtol, atol] = get_elimit<DataType>();
// if(stride == n)
// {
// pass = ck_tile::check_err(
// y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
// }
// else
// {
// for(int i_r = 0; i_r < m; i_r++)
// {
// std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * stride,
// y_host_dev.begin() + i_r * stride + n);
// std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * stride,
// y_host_ref.begin() + i_r * stride + n);
// pass &= ck_tile::check_err(y_host_dev_row,
// y_host_ref_row,
// std::string("OUT[") + std::to_string(i_r) +
// std::string("] Error: Incorrect results!"),
// rtol,
// atol);
// }
// }
std
::
cout
<<
", valid:"
<<
(
pass
?
"y"
:
"n"
)
<<
std
::
flush
<<
std
::
endl
;
}
return
pass
;
}
int
main
(
int
argc
,
char
*
argv
[])
{
auto
[
result
,
arg_parser
]
=
create_args
(
argc
,
argv
);
if
(
!
result
)
return
-
1
;
const
std
::
string
data_type
=
arg_parser
.
get_str
(
"prec"
);
if
(
data_type
==
"fp16"
)
{
return
run
<
ck_tile
::
half_t
>
(
arg_parser
)
?
0
:
-
2
;
}
else
if
(
data_type
==
"bf16"
)
{
return
run
<
ck_tile
::
bf16_t
>
(
arg_parser
)
?
0
:
-
2
;
}
return
-
3
;
}
example/ck_tile/06_layernorm2d_bwd/layernorm2d_bwd.hpp
0 → 100644
View file @
7db609fe
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/layernorm2d.hpp"
#include <string>
template
<
typename
DataType
>
struct
LayerNormTypeConfig
;
template
<
>
struct
LayerNormTypeConfig
<
ck_tile
::
half_t
>
{
using
XDataType
=
ck_tile
::
half_t
;
using
YDataType
=
ck_tile
::
half_t
;
using
GammaDataType
=
ck_tile
::
half_t
;
using
BetaDataType
=
ck_tile
::
half_t
;
using
MeanDataType
=
ck_tile
::
half_t
;
using
InvStdDataType
=
ck_tile
::
half_t
;
using
ComputeDataType
=
float
;
};
template
<
>
struct
LayerNormTypeConfig
<
ck_tile
::
bf16_t
>
{
using
XDataType
=
ck_tile
::
bf16_t
;
using
YDataType
=
ck_tile
::
bf16_t
;
using
GammaDataType
=
ck_tile
::
bf16_t
;
using
BetaDataType
=
ck_tile
::
bf16_t
;
using
MeanDataType
=
ck_tile
::
bf16_t
;
using
InvStdDataType
=
ck_tile
::
bf16_t
;
using
ComputeDataType
=
float
;
};
// runtime args
struct
layernorm2d_bwd_args
:
public
ck_tile
::
Layernorm2dBwdGammaBetaHostArgs
{
};
// this is used to pattern-match internl kernel implementation, not to instantiate kernel
template
<
typename
DataType_
,
ck_tile
::
index_t
Repeat_M_
,
// each thread repeat along M
ck_tile
::
index_t
ThreadPerBlock_M_
,
// num threads along M
ck_tile
::
index_t
ThreadPerBlock_N_
,
// num threads along N
bool
kPadN_
>
struct
layernorm2d_bwd_traits_
{
using
DataType
=
ck_tile
::
remove_cvref_t
<
DataType_
>
;
static
constexpr
bool
is_warp_per_row
=
ThreadPerBlock_N_
<=
warpSize
;
static_assert
((
ThreadPerBlock_M_
*
ThreadPerBlock_N_
)
%
warpSize
==
0
);
static
constexpr
ck_tile
::
index_t
total_warps
=
(
ThreadPerBlock_M_
*
ThreadPerBlock_N_
)
/
warpSize
;
// num of warps along m
static
constexpr
ck_tile
::
index_t
BlockWarps_M
=
[]()
{
if
constexpr
(
is_warp_per_row
)
{
static_assert
(
warpSize
%
ThreadPerBlock_N_
==
0
);
return
total_warps
*
(
warpSize
/
ThreadPerBlock_N_
);
}
else
{
// static_assert(warpSize % ThreadPerBlock_M_ == 0);
return
total_warps
/
(
ThreadPerBlock_N_
/
warpSize
);
}
}();
// num of warps along n
static
constexpr
ck_tile
::
index_t
BlockWarps_N
=
[]()
{
if
constexpr
(
is_warp_per_row
)
{
static_assert
(
warpSize
%
ThreadPerBlock_N_
==
0
);
return
1
;
}
else
{
static_assert
(
ThreadPerBlock_N_
%
warpSize
==
0
);
return
ThreadPerBlock_N_
/
warpSize
;
}
}();
static
constexpr
ck_tile
::
index_t
Repeat_M
=
Repeat_M_
;
static
constexpr
ck_tile
::
index_t
Block_M
=
Repeat_M_
*
ThreadPerBlock_M_
;
static
constexpr
ck_tile
::
index_t
Block_N
=
ThreadPerBlock_N_
;
static
constexpr
ck_tile
::
index_t
Warp_M
=
ThreadPerBlock_M_
/
BlockWarps_M
;
static
constexpr
ck_tile
::
index_t
Warp_N
=
ThreadPerBlock_N_
/
BlockWarps_N
;
using
BlockTile
=
ck_tile
::
sequence
<
Block_M
,
Block_N
>
;
using
BlockWarps
=
ck_tile
::
sequence
<
BlockWarps_M
,
BlockWarps_N
>
;
using
WarpTile
=
ck_tile
::
sequence
<
Warp_M
,
Warp_N
>
;
using
Vector
=
ck_tile
::
sequence
<
1
,
1
>
;
using
Shape
=
ck_tile
::
Layernorm2dShape
<
BlockTile
,
BlockWarps
,
WarpTile
,
Vector
>
;
static
constexpr
bool
kPadN
=
kPadN_
;
};
template
<
typename
Traits_
>
float
layernorm2d_bwd_
(
const
ck_tile
::
stream_config
&
s
,
layernorm2d_bwd_args
a
);
// This is the public API, will be generated by script
struct
layernorm2d_bwd_traits
{
std
::
string
data_type
;
};
float
layernorm2d_bwd
(
layernorm2d_bwd_traits
,
layernorm2d_bwd_args
,
const
ck_tile
::
stream_config
&
);
example/ck_tile/06_layernorm2d_bwd/script/perf_test.sh
0 → 100755
View file @
7db609fe
# run from top of ck folder
EXE
=
build/bin/tile_example_layernorm2d_bwd
$EXE
-m
=
1
-n
=
1
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
80
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
128
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
144
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
168
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
184
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
256
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
288
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
344
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
376
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
448
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
512
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
924
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1024
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1078
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1996
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
4080
-e
=
1e-12
-v
=
1
-prec
=
bf16
-repeat
=
1000
$EXE
-m
=
700
-n
=
80
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
128
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
144
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
168
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
184
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
256
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
288
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
344
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
376
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
448
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
512
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
924
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1024
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1078
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
1996
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
$EXE
-m
=
700
-n
=
4080
-e
=
1e-12
-v
=
1
-prec
=
fp16
-repeat
=
1000
\ No newline at end of file
example/ck_tile/06_layernorm2d_bwd/script/smoke_test.sh
0 → 100755
View file @
7db609fe
#!/bin/sh
# call from top of CK folder
EXE
=
./build/bin/tile_example_layernorm2d_bwd
for
pr_i
in
"fp16"
"bf16"
;
do
$EXE
-prec
=
$pr_i
-m
=
99
-n
=
13
$EXE
-prec
=
$pr_i
-m
=
17
-n
=
16
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
100
$EXE
-prec
=
$pr_i
-m
=
4
-n
=
128
$EXE
-prec
=
$pr_i
-m
=
80
-n
=
127
$EXE
-prec
=
$pr_i
-m
=
22
-n
=
255
-stride
=
256
$EXE
-prec
=
$pr_i
-m
=
7
-n
=
599
$EXE
-prec
=
$pr_i
-m
=
19
-n
=
512
$EXE
-prec
=
$pr_i
-m
=
33
-n
=
313
-stride
=
1000
$EXE
-prec
=
$pr_i
-m
=
11
-n
=
510
$EXE
-prec
=
$pr_i
-m
=
171
-n
=
676
-stride
=
818
$EXE
-prec
=
$pr_i
-m
=
91
-n
=
636
$EXE
-prec
=
$pr_i
-m
=
12
-n
=
768
-stride
=
800
$EXE
-prec
=
$pr_i
-m
=
100
-n
=
766
-stride
=
812
$EXE
-prec
=
$pr_i
-m
=
31
-n
=
1024
$EXE
-prec
=
$pr_i
-m
=
64
-n
=
1000
-stride
=
1004
$EXE
-prec
=
$pr_i
-m
=
8
-n
=
1501
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
1826
$EXE
-prec
=
$pr_i
-m
=
5
-n
=
2040
$EXE
-prec
=
$pr_i
-m
=
7
-n
=
2734
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
3182
$EXE
-prec
=
$pr_i
-m
=
9
-n
=
4096
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
8192
$EXE
-prec
=
$pr_i
-m
=
1
-n
=
10547
$EXE
-prec
=
$pr_i
-m
=
3
-n
=
17134
done
example/ck_tile/CMakeLists.txt
View file @
7db609fe
...
@@ -7,3 +7,4 @@ add_subdirectory(02_layernorm2d)
...
@@ -7,3 +7,4 @@ add_subdirectory(02_layernorm2d)
add_subdirectory
(
03_gemm
)
add_subdirectory
(
03_gemm
)
add_subdirectory
(
04_img2col
)
add_subdirectory
(
04_img2col
)
add_subdirectory
(
05_reduce
)
add_subdirectory
(
05_reduce
)
add_subdirectory
(
06_layernorm2d_bwd
)
include/ck_tile/ops/layernorm2d.hpp
View file @
7db609fe
...
@@ -9,4 +9,11 @@
...
@@ -9,4 +9,11 @@
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp"
#include "ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_gamma_beta.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
include/ck_tile/ops/layernorm2d/kernel/layernorm2d_bwd_gamma_beta_kernel.hpp
View file @
7db609fe
...
@@ -104,15 +104,13 @@ struct Layernorm2dBwdGammaBeta
...
@@ -104,15 +104,13 @@ struct Layernorm2dBwdGammaBeta
auto
surfix
=
[
&
]
()
{
auto
surfix
=
[
&
]
()
{
std
::
string
n
;
std
::
string
n
;
if
(
kPadN
)
n
+=
"_pn"
;
if
(
kPadN
)
n
+=
"_pn"
;
if
(
kSaveMeanInvStd
)
n
+=
"_mv"
;
if
(
kTwoPass
)
n
+=
"_2p"
;
return
n
;
}();
return
n
;
}();
#define _SS_ std::string
#define _SS_ std::string
#define _TS_ std::to_string
#define _TS_ std::to_string
return
_SS_
(
"layernorm2d_
f
wd_"
)
+
_SS_
(
t2s
<
XDataType
>::
name
)
+
"_"
+
return
_SS_
(
"layernorm2d_
b
wd_"
)
+
_SS_
(
t2s
<
XDataType
>::
name
)
+
"_"
+
_TS_
(
S_
::
Block_M
)
+
"x"
+
_TS_
(
S_
::
Block_N
)
+
"_"
+
_TS_
(
S_
::
WarpPerBlock_M
)
+
"x"
+
_TS_
(
S_
::
WarpPerBlock_N
)
+
"_"
+
_TS_
(
S_
::
Block_M
)
+
"x"
+
_TS_
(
S_
::
Block_N
)
+
"_"
+
_TS_
(
S_
::
WarpPerBlock_M
)
+
"x"
+
_TS_
(
S_
::
WarpPerBlock_N
)
+
"_"
+
_TS_
(
S_
::
Warp_M
)
+
"x"
+
_TS_
(
S_
::
Warp_N
)
+
"_"
+
_TS_
(
S_
::
Vector_M
)
+
"x"
+
_TS_
(
S_
::
Vector_N
)
+
"_"
+
_TS_
(
S_
::
Warp_M
)
+
"x"
+
_TS_
(
S_
::
Warp_N
)
+
"_"
+
_TS_
(
S_
::
Vector_M
)
+
"x"
+
_TS_
(
1
)
+
"_"
+
_SS_
(
Pipeline
::
name
)
+
surfix
;
_SS_
(
Pipeline
::
name
)
+
surfix
;
#undef _SS_
#undef _SS_
#undef _TS_
#undef _TS_
...
...
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_default_policy.hpp
View file @
7db609fe
...
@@ -4,8 +4,6 @@
...
@@ -4,8 +4,6 @@
#pragma once
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/ops/welford/block/block_welford_problem.hpp"
#include "ck_tile/ops/welford/block/block_welford.hpp"
namespace
ck_tile
{
namespace
ck_tile
{
...
@@ -41,6 +39,20 @@ struct Layernorm2dBwdGammaBetaPipelineDefaultPolicy
...
@@ -41,6 +39,20 @@ struct Layernorm2dBwdGammaBetaPipelineDefaultPolicy
sequence
<
0
>>
{});
sequence
<
0
>>
{});
}
}
// template <typename Problem>
// CK_TILE_DEVICE static constexpr auto MakeGammaBetaBlockTileDistribution()
// {
// using S = typename Problem::BlockShape;
// return make_static_tile_distribution(
// tile_distribution_encoding<
// sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M>,
// tuple<sequence<S::WarpPerBlock_N, S::ThreadPerWarp_N>>,
// tuple<sequence<0, 1>, sequence<0, 1>>,
// tuple<sequence<1, 0>, sequence<2, 1>>,
// sequence<0>,
// sequence<0>>{});
// }
template
<
typename
Problem
>
template
<
typename
Problem
>
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
{
{
...
...
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_gamma_beta.hpp
→
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_
pipeline_
gamma_beta.hpp
View file @
7db609fe
...
@@ -4,7 +4,7 @@
...
@@ -4,7 +4,7 @@
#pragma once
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_
f
wd_pipeline_default_policy.hpp"
#include "ck_tile/ops/layernorm2d/pipeline/layernorm2d_
b
wd_pipeline_default_policy.hpp"
#include <string>
#include <string>
#include <type_traits>
#include <type_traits>
...
@@ -28,14 +28,32 @@ struct Layernorm2dBwdGammaBetaPipeline
...
@@ -28,14 +28,32 @@ struct Layernorm2dBwdGammaBetaPipeline
static
constexpr
bool
kPadN
=
Problem
::
kPadN
;
static
constexpr
bool
kPadN
=
Problem
::
kPadN
;
static
constexpr
const
char
*
name
=
[]()
{
static
constexpr
const
char
*
name
=
[]()
{
return
"bwd_gamma_beta"
return
"bwd_gamma_beta"
;
}();
}();
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
CK_TILE_HOST_DEVICE
static
constexpr
index_t
GetSmemSize
()
{
{
return
Policy
::
template
GetSmemSize
<
Problem
>();
return
Policy
::
template
GetSmemSize
<
Problem
>();
}
}
// template <typename DumpTensor_>
// CK_TILE_DEVICE void dump(const DumpTensor_& x) const
// {
// constexpr auto I0 = number<0>{};
// constexpr auto I1 = number<1>{};
// constexpr auto spans = DumpTensor_::get_distributed_spans();
// sweep_tile_span(spans[I1], [&](auto i1) {
// sweep_tile_span(spans[I0], [&](auto i0) {
// constexpr auto in_dstr_idx = make_tuple(i0, i1);
// auto v = ck_tile::type_convert<float>(x[in_dstr_idx]);
// index_t tid =
// (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
// printf("%d %f\n", tid, v);
// });
// });
// }
template
<
typename
DYWindow
,
template
<
typename
DYWindow
,
typename
MeanWindow
,
typename
MeanWindow
,
typename
InvStdWindow
,
typename
InvStdWindow
,
...
@@ -49,10 +67,12 @@ struct Layernorm2dBwdGammaBetaPipeline
...
@@ -49,10 +67,12 @@ struct Layernorm2dBwdGammaBetaPipeline
ck_tile
::
index_t
row_size
,
ck_tile
::
index_t
row_size
,
void
*
smem
)
const
void
*
smem
)
const
{
{
const
auto
dy_window
=
const
auto
dy_window
=
make_tile_window
(
dy_window_
,
make_tile_window
(
dy_window_
,
Policy
::
template
MakeDyBlockTileDistribution
<
Problem
>());
Policy
::
template
MakeDyBlockTileDistribution
<
Problem
>());
const
auto
mean_window
=
make_tile_window
(
const
auto
mean_window
=
make_tile_window
(
mean_window_
,
Policy
::
template
MakeMeanBlockTileDistribution
<
Problem
>());
mean_window_
,
Policy
::
template
MakeMeanBlockTileDistribution
<
Problem
>());
const
auto
inv_std_window
=
make_tile_window
(
inv_std_window_
,
Policy
::
template
MakeMeanBlockTileDistribution
<
Problem
>());
// const auto gamma_window = make_tile_window(
// const auto gamma_window = make_tile_window(
// gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
// gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
// const auto beta_window = make_tile_window(
// const auto beta_window = make_tile_window(
...
@@ -60,7 +80,23 @@ struct Layernorm2dBwdGammaBetaPipeline
...
@@ -60,7 +80,23 @@ struct Layernorm2dBwdGammaBetaPipeline
const
auto
dy
=
load_tile
(
dy_window
);
const
auto
dy
=
load_tile
(
dy_window
);
const
auto
mean
=
load_tile
(
mean_window
);
const
auto
mean
=
load_tile
(
mean_window
);
const
auto
inv_std
=
load_tile
(
inv_std_window
);
// auto y = make_static_distributed_tensor<YDataType>(dy.get_tile_distribution());
sweep_tile
(
mean
,
[
&
](
auto
idx
)
{
constexpr
auto
i_idx
=
make_tuple
(
idx
[
number
<
0
>
{}]);
// constexpr auto j_idx = make_tuple(idx[number<1>{}]);
index_t
tid
=
(
threadIdx
.
y
*
blockDim
.
x
)
+
threadIdx
.
x
;
const
auto
m
=
type_convert
<
float
>
(
mean
[
i_idx
]);
if
(
blockIdx
.
x
==
0
&&
blockIdx
.
y
==
0
)
printf
(
"%d %f
\n
"
,
tid
,
m
);
});
// dump(dy);
// dump(mean);
// dump(inv_std);
*
reinterpret_cast
<
char
*>
(
smem
)
=
row_size
;
// layernorm computation
// layernorm computation
// auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
// auto y = make_static_distributed_tensor<YDataType>(x.get_tile_distribution());
...
...
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_bwd_pipeline_problem.hpp
View file @
7db609fe
...
@@ -15,9 +15,7 @@ template <typename XDataType_,
...
@@ -15,9 +15,7 @@ template <typename XDataType_,
typename
MeanDataType_
,
typename
MeanDataType_
,
typename
InvStdDataType_
,
typename
InvStdDataType_
,
typename
BlockShape_
,
typename
BlockShape_
,
bool
kPadN_
,
bool
kPadN_
>
bool
kSaveMeanInvStd_
,
bool
kTwoPass_
>
struct
Layernorm2dBwdGammaBetaPipelineProblem
struct
Layernorm2dBwdGammaBetaPipelineProblem
{
{
using
XDataType
=
remove_cvref_t
<
XDataType_
>
;
using
XDataType
=
remove_cvref_t
<
XDataType_
>
;
...
...
include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
View file @
7db609fe
...
@@ -37,33 +37,4 @@ struct Layernorm2dFwdPipelineProblem
...
@@ -37,33 +37,4 @@ struct Layernorm2dFwdPipelineProblem
static
constexpr
bool
kTwoPass
=
kTwoPass_
;
static
constexpr
bool
kTwoPass
=
kTwoPass_
;
};
};
template
<
typename
XDataType_
,
typename
GammaDataType_
,
typename
BetaDataType_
,
typename
ComputeDataType_
,
typename
YDataType_
,
typename
MeanDataType_
,
typename
InvStdDataType_
,
typename
BlockShape_
,
bool
kPadN_
,
bool
kSaveMeanInvStd_
,
bool
kTwoPass_
>
struct
Layernorm2dFwdPipelineProblem
{
using
XDataType
=
remove_cvref_t
<
XDataType_
>
;
using
GammaDataType
=
remove_cvref_t
<
GammaDataType_
>
;
using
BetaDataType
=
remove_cvref_t
<
BetaDataType_
>
;
using
ComputeDataType
=
remove_cvref_t
<
ComputeDataType_
>
;
using
YDataType
=
remove_cvref_t
<
YDataType_
>
;
using
MeanDataType
=
remove_cvref_t
<
MeanDataType_
>
;
using
InvStdDataType
=
remove_cvref_t
<
InvStdDataType_
>
;
using
BlockShape
=
remove_cvref_t
<
BlockShape_
>
;
static
constexpr
bool
kNeedCrossLaneSync
=
BlockShape
::
ThreadPerWarp_N
>
1
;
static
constexpr
bool
kNeedCrossWarpSync
=
BlockShape
::
WarpPerBlock_N
>
1
;
static
constexpr
bool
kPadN
=
kPadN_
;
static
constexpr
bool
kSaveMeanInvStd
=
kSaveMeanInvStd_
;
static
constexpr
bool
kTwoPass
=
kTwoPass_
;
};
}
// namespace ck_tile
}
// namespace ck_tile
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment