gaoqiong / composable_kernel / Commits / 667d7f0f

Commit 667d7f0f authored May 26, 2022 by rocking
Parent: bd34d666

    Add layernorm example

Showing 5 changed files with 292 additions and 0 deletions (+292, -0):
    example/21_gemm_normalize_xdl/CMakeLists.txt                                        +1   -0
    example/21_gemm_normalize_xdl/gemm_layernorm_xdl_fp16.cpp                           +234 -0
    example/CMakeLists.txt                                                              +1   -0
    include/ck/tensor_operation/gpu/device/device_5ary_elementwise_xdl_cshuffle.hpp     +38  -0
    include/ck/tensor_operation/gpu/element/element_wise_operation.hpp                  +18  -0
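Taken together, the changes add a fused GEMM + LayerNorm example: a GEMM-with-reduction kernel writes C = A x B and, per output row, the mean E[x] and mean of squares E[x^2] of C; a 5-ary elementwise kernel then normalizes each row with the new Normalize functor. As a worked equation, reading off that functor (added in element_wise_operation.hpp below, with epsilon defaulting to 1e-4), each output element is

    y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{E}[x^{2}] - \mathrm{E}[x]^{2} + \epsilon}} \, \gamma + \beta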
example/21_gemm_normalize_xdl/CMakeLists.txt  (new file, mode 100644)

add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp)
example/21_gemm_normalize_xdl/gemm_layernorm_xdl_fp16.cpp  (new file, mode 100644)

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>

#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_5ary_elementwise_xdl_cshuffle.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp"

template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using F16 = ck::half_t;
using F32 = float;

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

using ADataType                = F16;
using BDataType                = F16;
using CDataType                = F16;
using ReduceAccDataType        = F32;
using DDataType                = F32;
using DPtrsGlobal              = ck::Tuple<DDataType*, DDataType*>;
using GammaDataType            = F16;
using BetaDataType             = F16;
using LayerNormOutDataType     = F16;
using NormalizeComputeDataType = F32;

using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;

using AElementOp  = ck::tensor_operation::element_wise::PassThrough;
using BElementOp  = ck::tensor_operation::element_wise::PassThrough;
using CElementOp  = ck::tensor_operation::element_wise::PassThrough;
using D0ReduceOp  = ck::reduce::Add<ReduceAccDataType>;
using D1ReduceOp  = ck::reduce::Add<ReduceAccDataType>;
using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;

using UnaryIdenticElementOp =
    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
using UnaryDivElementOp =
    ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, true>;
using UnarySquareElementOp =
    ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
using DxsInElementOp  = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
using DxsOutElementOp = ck::Tuple<UnaryDivElementOp, UnaryDivElementOp>;

using DGlobalMemOp =
    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
                                          ck::InMemoryDataOperationEnum::AtomicAdd>;

static constexpr auto GemmSpecialization =
    ck::tensor_operation::device::GemmSpecialization::Default;

// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
         <Row, Col, Row, F16, F16, F16, F32, F32, F32, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DxsReduceOp, DxsInElementOp, DxsOutElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on

using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;

using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize;

// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta, F:y
using DeviceNormalizeInstance =
    ck::tensor_operation::device::Device5AryElementwise_Xdl_CShuffle<CDataType,
                                                                     DDataType,
                                                                     DDataType,
                                                                     GammaDataType,
                                                                     BetaDataType,
                                                                     LayerNormOutDataType,
                                                                     NormalizeComputeDataType,
                                                                     NormalizeFunctor,
                                                                     2,
                                                                     8,
                                                                     8,  // scalarPerVector: gemm_out
                                                                     1,  // scalarPerVector: reduce_mean
                                                                     1,  // scalarPerVector: reduce_mean_square
                                                                     8,  // scalarPerVector: Gamma
                                                                     8,  // scalarPerVector: Beta
                                                                     8>; // scalarPerVector: LayerNorm_out

int main()
{
    bool time_kernel = false;

    // GEMM shape
    ck::index_t M = 1024;
    ck::index_t N = 1024;
    ck::index_t K = 1024;

    ck::index_t StrideA = 1024;
    ck::index_t StrideB = 1024;
    ck::index_t StrideC = 1024;

    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({len}),
                                    std::vector<std::size_t>({stride}));
    };

    auto f_host_tensor_descriptor2d =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{}));
    Tensor<CDataType> c_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));
    Tensor<DDataType> reduceMean_m(f_host_tensor_descriptor1d(M, 1));
    Tensor<DDataType> reduceMeanSquare_m(f_host_tensor_descriptor1d(M, 1));
    Tensor<GammaDataType> gamma_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<BetaDataType> beta_n(f_host_tensor_descriptor1d(N, 1));
    Tensor<LayerNormOutDataType> layerNorm_m_n(
        f_host_tensor_descriptor2d(M, N, StrideC, CLayout{}));

    a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-5, 5});
    b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-5, 5});
    gamma_n.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
    beta_n.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});

    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpace());
    DeviceMem reduceMean_device_buf(sizeof(DDataType) * reduceMean_m.mDesc.GetElementSpace());
    DeviceMem reduceMeanSquare_device_buf(sizeof(DDataType) *
                                          reduceMeanSquare_m.mDesc.GetElementSpace());
    DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpace());
    DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpace());
    DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) *
                                   layerNorm_m_n.mDesc.GetElementSpace());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    gamma_device_buf.ToDevice(gamma_n.mData.data());
    beta_device_buf.ToDevice(beta_n.mData.data());

    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};
    auto dxs_global =
        ck::make_tuple(static_cast<DDataType*>(reduceMean_device_buf.GetDeviceBuffer()),
                       static_cast<DDataType*>(reduceMeanSquare_device_buf.GetDeviceBuffer()));
    auto dxs_in_element_op  = DxsInElementOp{};
    auto dxs_out_element_op = DxsOutElementOp{M, M};

    // Prepare GEMM, reduce_mean, reduce_mean_square
    auto gemmReduce         = DeviceGemmReduceInstance{};
    auto gemmReduce_invoker = gemmReduce.MakeInvoker();
    auto gemmReduce_argument =
        gemmReduce.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
                                static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
                                static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
                                dxs_global,
                                M,
                                N,
                                K,
                                StrideA,
                                StrideB,
                                StrideC,
                                a_element_op,
                                b_element_op,
                                c_element_op,
                                dxs_in_element_op,
                                dxs_out_element_op);

    if(!gemmReduce.IsSupportedArgument(gemmReduce_argument))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    reduceMean_device_buf.SetZero();
    reduceMeanSquare_device_buf.SetZero();

    // Prepare LayerNorm
    auto layerNorm             = DeviceNormalizeInstance{};
    auto layerNorm_invoker_ptr = layerNorm.MakeInvokerPointer();
    auto layerNorm_argument =
        layerNorm.MakeArgumentPointer(c_device_buf.GetDeviceBuffer(),
                                      reduceMean_device_buf.GetDeviceBuffer(),
                                      reduceMeanSquare_device_buf.GetDeviceBuffer(),
                                      gamma_device_buf.GetDeviceBuffer(),
                                      beta_device_buf.GetDeviceBuffer(),
                                      layerNorm_device_buf.GetDeviceBuffer(),
                                      {M, N},
                                      {StrideC, 1},
                                      {1, 0},
                                      {1, 0},
                                      {0, 1},
                                      {0, 1},
                                      {StrideC, 1},
                                      NormalizeFunctor{});

    if(!layerNorm.IsSupportedArgument(layerNorm_argument.get()))
    {
        throw std::runtime_error("The runtime parameters seems not supported by the "
                                 "Device5AryElementwise_Xdl_CShuffle instance, exiting!");
    }

    // run kernel
    gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel});
    layerNorm_invoker_ptr->Run(layerNorm_argument.get(), StreamConfig{nullptr, time_kernel});

    bool pass = true;
    return pass ? 0 : 1;
}
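Note that the example returns pass = true without comparing the GPU result against a host reference. For readers who want to see the arithmetic the two kernels are expected to produce, below is a minimal, self-contained host sketch of the same pipeline. It is illustrative only: it computes in float rather than fp16, and the function name host_gemm_layernorm and its signature are hypothetical, not part of this commit or of the composable_kernel utilities.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

// Host sketch of the fused pipeline: C = A * B (row-major A, column-major B),
// then per-row mean and mean-of-squares over C, then
// y = (c - mean) / sqrt(var + eps) * gamma + beta with var = E[c^2] - E[c]^2,
// matching the Normalize functor added in this commit.
std::vector<float> host_gemm_layernorm(const std::vector<float>& A,     // M x K, row-major
                                       const std::vector<float>& B,     // K x N, column-major
                                       const std::vector<float>& gamma, // N
                                       const std::vector<float>& beta,  // N
                                       std::size_t M,
                                       std::size_t N,
                                       std::size_t K,
                                       float eps = 1e-4f)
{
    std::vector<float> y(M * N);
    for(std::size_t m = 0; m < M; ++m)
    {
        std::vector<float> c(N);
        float sum = 0.f, sum_sq = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(std::size_t k = 0; k < K; ++k)
                acc += A[m * K + k] * B[n * K + k]; // column-major B: element (k, n)
            c[n] = acc;
            sum += acc;
            sum_sq += acc * acc;
        }
        const float mean        = sum / N;
        const float mean_square = sum_sq / N;
        const float variance    = mean_square - mean * mean;
        for(std::size_t n = 0; n < N; ++n)
            y[m * N + n] = (c[n] - mean) / std::sqrt(variance + eps) * gamma[n] + beta[n];
    }
    return y;
}

int main()
{
    // Tiny 2x2x2 smoke test: B is the identity (column-major), so C = A and each
    // row normalizes to approximately {-1, +1}.
    const std::size_t M = 2, N = 2, K = 2;
    const std::vector<float> A{1.f, 2.f, 3.f, 4.f}; // rows {1,2} and {3,4}
    const std::vector<float> B{1.f, 0.f, 0.f, 1.f};
    const std::vector<float> gamma{1.f, 1.f}, beta{0.f, 0.f};

    const auto y = host_gemm_layernorm(A, B, gamma, beta, M, N, K);
    for(std::size_t m = 0; m < M; ++m)
        std::printf("%f %f\n", y[m * N + 0], y[m * N + 1]);
    return 0;
}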
example/CMakeLists.txt

@@ -53,3 +53,4 @@ add_subdirectory(16_gemm_reduce)
 add_subdirectory(18_batched_gemm_reduce)
 add_subdirectory(19_binary_elementwise)
 add_subdirectory(20_convnd_bwd_weight_xdl)
+add_subdirectory(21_gemm_normalize_xdl)
include/ck/tensor_operation/gpu/device/device_5ary_elementwise_xdl_cshuffle.hpp

@@ -181,6 +181,9 @@ struct Device5AryElementwise_Xdl_CShuffle : public BaseOperator
                                                 AGridDesc_M,
                                                 BGridDesc_M,
                                                 CGridDesc_M,
+                                                DGridDesc_M,
+                                                EGridDesc_M,
+                                                FGridDesc_M,
                                                 ElementwiseFunctor>;

         float elapsed_time = launch_and_time_kernel(stream_config,
@@ -253,7 +256,42 @@ struct Device5AryElementwise_Xdl_CShuffle : public BaseOperator
         if(!IsScalarPerVectorValid(pArg->f_strides_.back() == 1, FScalarPerVector))
             return false;

+        return true;
     };

+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      const void* p_c,
+                                                      const void* p_d,
+                                                      const void* p_e,
+                                                      void* p_f,
+                                                      std::vector<index_t> lengths,
+                                                      std::vector<index_t> a_strides,
+                                                      std::vector<index_t> b_strides,
+                                                      std::vector<index_t> c_strides,
+                                                      std::vector<index_t> d_strides,
+                                                      std::vector<index_t> e_strides,
+                                                      std::vector<index_t> f_strides,
+                                                      ElementwiseFunctor functor)
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<const CDataType*>(p_c),
+                                          static_cast<const DDataType*>(p_d),
+                                          static_cast<const EDataType*>(p_e),
+                                          static_cast<FDataType*>(p_f),
+                                          lengths,
+                                          a_strides,
+                                          b_strides,
+                                          c_strides,
+                                          d_strides,
+                                          e_strides,
+                                          f_strides,
+                                          functor);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
 };
 } // namespace device
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp

@@ -143,6 +143,24 @@ struct AddHardswishAdd
     }
 };

+struct Normalize
+{
+    Normalize(float epsilon = 1e-4) : epsilon_(epsilon) {}
+
+    __host__ __device__ constexpr void operator()(float& y,
+                                                  const float& x,
+                                                  const float& mean,
+                                                  const float& mean_square,
+                                                  const float& gamma,
+                                                  const float& beta) const
+    {
+        float variance = mean_square - (mean * mean);
+        y              = ((x - mean) / sqrtf(variance + epsilon_)) * gamma + beta;
+    }
+
+    float epsilon_;
+};
+
 // Unary operators are usually called element-wisely before/after the reduction is executed on the
 // elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
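As a quick illustration of the functor's arithmetic, the sketch below re-implements it as a host-only struct and checks it on a two-element row. NormalizeHost is a hypothetical stand-in so the snippet compiles without the CK headers and the __host__ __device__ qualifiers; the 1e-4 default epsilon matches the commit.

#include <cassert>
#include <cmath>

// Standalone copy of the arithmetic in ck::tensor_operation::element_wise::Normalize,
// kept host-only for illustration.
struct NormalizeHost
{
    explicit NormalizeHost(float epsilon = 1e-4f) : epsilon_(epsilon) {}

    void operator()(float& y,
                    const float& x,
                    const float& mean,
                    const float& mean_square,
                    const float& gamma,
                    const float& beta) const
    {
        const float variance = mean_square - (mean * mean); // Var[x] = E[x^2] - E[x]^2
        y = ((x - mean) / std::sqrt(variance + epsilon_)) * gamma + beta;
    }

    float epsilon_;
};

int main()
{
    // Two-element "row" {1, 3}: E[x] = 2, E[x^2] = 5, so Var[x] = 5 - 4 = 1.
    const float mean = 2.f, mean_square = 5.f;
    float y = 0.f;
    NormalizeHost{}(y, /*x=*/3.f, mean, mean_square, /*gamma=*/1.f, /*beta=*/0.f);
    assert(std::fabs(y - 1.f) < 1e-3f); // (3 - 2) / sqrt(1 + 1e-4) is approximately 1
    return 0;
}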