gaoqiong / composable_kernel · Commit 6c496076
Authored Jun 02, 2022 by Anthony Chang
Parent: 93235bb4

    activation in correct order
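Taken together, the three diffs below move the activation (AccElementOp) so that it is applied to the GEMM accumulator plus the bias, before the residual add and the layer normalization. In the notation of the updated comments, the fused operation is:

    D = Layernorm(AccElementOp(A * B + bias) + c0_add) * broadcast(gamma) + broadcast(beta)

where bias, gamma, and beta are 1xN vectors broadcast across rows and c0_add is an MxN input from another layer. Previously the activation ran inside the C-shuffle output copy, i.e. on the raw accumulator before the bias was added.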
Showing 3 changed files with 62 additions and 59 deletions.
example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp                     +8  -6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp        +37 -35
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp  +17 -18
example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp
@@ -48,7 +48,8 @@ struct Relu
 using AElementOp   = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp   = ck::tensor_operation::element_wise::PassThrough;
-// Elementwise operation that operates on the output of matrix multiplication Acc = A * B
+// Elementwise operation that operates on the output of matrix multiplication
+// i.e., AccElementOp(A * B + bias)
 using AccElementOp = Relu;
 // Elementwise operation that operates on the output of layer normalization
 using CElementOp   = Relu;
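For orientation: the Relu named in the hunk header is the example's elementwise functor, defined just above this hunk and not shown in the diff. The following is only a plausible sketch of such a CK-style op, inferred from the acc_element_op(out, in) call shape visible in the gridwise diff below.

    // Sketch only -- the real definition sits above this hunk and is not shown.
    // CK elementwise ops write their result through the first (output) argument.
    struct Relu
    {
        template <typename OutT, typename InT>
        void operator()(OutT& y, const InT& x) const
        {
            y = x > 0 ? x : 0; // y = max(x, 0)
        }
    };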
@@ -227,15 +228,16 @@ int main(int argc, char* argv[])
     float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

-    // extra 5 MN flops due to: bias + gamma + beta + norm_sub + norm_div,
+    // extra 6 MN flops due to: bias + add + gamma + beta + norm_sub + norm_div,
     // excluding reduction steps
-    std::size_t flop = std::size_t(2) * M * N * K + std::size_t(5) * M * N;
-    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                            sizeof(CDataType) * M * N + sizeof(CDataType) * 3 * N;
+    std::size_t flop = std::size_t(2) * M * N * K + std::size_t(6) * M * N;
+    // extra MN and 3N due to c0_add (MxN), bias (1xN), gamma (1xN), beta (1xN)
+    std::size_t bytes = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                        sizeof(CDataType) * 2 * M * N + sizeof(C0DataType) * 3 * N;

     float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    float gb_per_sec = bytes / 1.E6 / ave_time;

     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
               << " GB/s, " << gemm.GetTypeString() << std::endl;
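To sanity-check the revised accounting, here is a self-contained sketch; the problem size and the 2-byte fp16 element width are illustrative assumptions, not values taken from the example. It shows how the TFlops and GB/s figures fall out of the formulas above when ave_time is measured in milliseconds.

    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Illustrative problem size and element width (assumptions, not from the example).
        const std::size_t M = 3840, N = 4096, K = 4096;
        const std::size_t elem_bytes = 2; // fp16 for A/B/C/C0
        const float ave_time = 1.0f;      // kernel time in milliseconds

        // 2*M*N*K for the GEMM itself, plus 6 extra flops per output element:
        // bias + add + gamma + beta + norm_sub + norm_div (reductions excluded).
        const std::size_t flop = std::size_t(2) * M * N * K + std::size_t(6) * M * N;

        // A (MxK) and B (KxN) are read, C (MxN) is written, c0_add (MxN) is read,
        // and the three 1xN vectors (bias, gamma, beta) contribute 3*N elements.
        const std::size_t bytes = elem_bytes * M * K + elem_bytes * K * N +
                                  elem_bytes * 2 * M * N + elem_bytes * 3 * N;

        // flop/1e9 per ms == flop/1e12 per s == TFlops;
        // bytes/1e6 per ms == bytes/1e9 per s == GB/s.
        const float tflops     = static_cast<float>(flop) / 1.E9f / ave_time;
        const float gb_per_sec = static_cast<float>(bytes) / 1.E6f / ave_time;

        std::cout << tflops << " TFlops, " << gb_per_sec << " GB/s\n";
    }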
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
@@ -624,7 +624,7 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                 FloatCShuffle,
                 decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
                 decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                AccElementwiseOperation,
+                tensor_operation::element_wise::PassThrough,
                 Sequence<CShuffleMXdlPerWavePerShuffle,
                          CShuffleNXdlPerWavePerShuffle,
                          I1,
@@ -648,7 +648,7 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
                     m_thread_data_on_block_idx[I3],
                     m_thread_data_on_block_idx[I4],
                     n_thread_data_on_block_idx[I2]),
-                acc_element_op};
+                tensor_operation::element_wise::PassThrough{}};

             // shuffle: blockwise copy C from LDS to global
             auto c_shuffle_block_copy_lds_to_global =
                 ThreadGroupTensorSliceTransfer_v6r1<
@@ -883,41 +883,43 @@ struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
         block_sync_lds();

-        // layernorm
-        {
-            // load from LDS and global, add bias
-            c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock,
-                                                 c_shuffle_block_buf,
-                                                 c_reduce_thread_desc_mperblock_nperblock,
-                                                 make_tuple(I0, I0),
-                                                 c_reduce_thread_buf);
-
-            c0_thread_copy_global_to_vgpr.Run(c0_grid_desc_mblock_mperblock_nblock_nperblock,
-                                              c0_bias_grid_buf,
-                                              c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
-                                              make_tuple(I0, I0, I0, I0),
-                                              c0_thread_buf);
-
-            static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
-                [&](auto i) {
-                    c_reduce_thread_buf(i) += static_cast<FloatReduceAcc>(c0_thread_buf(i)); // bias
-                });
-
-            c0_add_thread_copy_global_to_vgpr.Run(c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                  c0_add_grid_buf,
-                                                  c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
-                                                  make_tuple(I0, I0, I0, I0),
-                                                  c0_thread_buf);
-
-            static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
-                [&](auto i) {
-                    c_reduce_thread_buf(i) += static_cast<FloatReduceAcc>(c0_thread_buf(i)); // add
-                });
+        // load from LDS and global, add bias
+        c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock,
+                                             c_shuffle_block_buf,
+                                             c_reduce_thread_desc_mperblock_nperblock,
+                                             make_tuple(I0, I0),
+                                             c_reduce_thread_buf);
+
+        c0_thread_copy_global_to_vgpr.Run(c0_grid_desc_mblock_mperblock_nblock_nperblock,
+                                          c0_bias_grid_buf,
+                                          c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
+                                          make_tuple(I0, I0, I0, I0),
+                                          c0_thread_buf);
+
+        static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
+            [&](auto i) {
+                FloatReduceAcc out;
+                acc_element_op(out,
+                               c_reduce_thread_buf(i) +
+                                   static_cast<FloatReduceAcc>(c0_thread_buf(i)));
+                c_reduce_thread_buf(i) = out; // acc_element_op(acc + bias)
+            });
+
+        c0_add_thread_copy_global_to_vgpr.Run(c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                              c0_add_grid_buf,
+                                              c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
+                                              make_tuple(I0, I0, I0, I0),
+                                              c0_thread_buf);
+
+        static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}(
+            [&](auto i) {
+                c_reduce_thread_buf(i) += static_cast<FloatReduceAcc>(c0_thread_buf(i)); // add
+            });
+
+        // layernorm
+        {
             using ThreadwiseReduceD0 = ThreadwiseReduction<FloatReduceAcc,
                                                            decltype(c_reduce_thread_desc_mperblock_nperblock),
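The net effect of this hunk, per output element: the activation is applied to acc + bias first, the residual (c0_add) is added second, and only then does the layer normalization run. A scalar sketch of the new order follows; the function and parameter names are illustrative, and the real kernel works on thread-tile buffers with a blockwise reduction rather than scalars.

    #include <algorithm>
    #include <cmath>

    // One output element of the reordered epilogue (illustrative sketch).
    // row_mean / row_var are the layernorm statistics, computed per row over
    // the already-activated-and-added values x, i.e. after both steps below.
    float fused_epilogue_element(float acc, float bias, float residual,
                                 float gamma, float beta,
                                 float row_mean, float row_var,
                                 float epsilon = 1e-5f)
    {
        float x = std::max(acc + bias, 0.0f); // acc_element_op(acc + bias), Relu here
        x += residual;                        // add from the other layer (c0_add)
        return (x - row_mean) / std::sqrt(row_var + epsilon) * gamma + beta;
    }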
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
@@ -26,20 +26,17 @@ struct ReferenceGemmLayernorm : public device::BaseOperator
                                                  AccDataType,
                                                  AElementwiseOperation,
                                                  BElementwiseOperation,
-                                                 AccElementwiseOperation>;
+                                                 element_wise::PassThrough>;

     // D = Layernorm(acc + broadcast(bias)) * broadcast(gamma) + broadcast(beta)
     template <typename InDataType, typename OutDataType, typename ComputeDataType>
     static void RunLayernorm(Tensor<OutDataType>& result,
                              const Tensor<ComputeDataType>& acc, // MxN
-                             const Tensor<InDataType>& bias,     // 1xN
-                             const Tensor<InDataType>& add,      // MxN
                              const Tensor<InDataType>& gamma,    // 1xN
                              const Tensor<InDataType>& beta,     // 1xN
                              const InDataType epsilon = 1e-5)
     {
-        assert(acc.mDesc.GetLengths()[1] == bias.mDesc.GetLengths()[0] &&
-               acc.mDesc.GetLengths()[1] == gamma.mDesc.GetLengths()[0] &&
+        assert(acc.mDesc.GetLengths()[1] == gamma.mDesc.GetLengths()[0] &&
                acc.mDesc.GetLengths()[1] == beta.mDesc.GetLengths()[0]);

         size_t M = acc.mDesc.GetLengths()[0];
@@ -47,17 +44,7 @@ struct ReferenceGemmLayernorm : public device::BaseOperator
         Tensor<ComputeDataType> avg_acc_sq(HostTensorDescriptor(std::vector<size_t>({M})));
         Tensor<ComputeDataType> avg_acc(HostTensorDescriptor(std::vector<size_t>({M})));
-        Tensor<ComputeDataType> acc_layernorm(acc.mDesc);
-
-        // add bias
-        acc_layernorm.ForEach([&](auto& self, auto idx) {
-            self(idx[0], idx[1]) = acc(idx[0], idx[1]) + bias(idx[1]);
-        });
-
-        // add from other layer
-        acc_layernorm.ForEach([&](auto& self, auto idx) {
-            self(idx[0], idx[1]) += add(idx[0], idx[1]);
-        });
+        Tensor<ComputeDataType> acc_layernorm(acc);

         // reduce N dim
         for(size_t i = 0; i < M; i++)
@@ -152,13 +139,25 @@ struct ReferenceGemmLayernorm : public device::BaseOperator
                                                       acc_m_n,
                                                       arg.a_element_op_,
                                                       arg.b_element_op_,
-                                                      arg.acc_element_op_);
+                                                      element_wise::PassThrough{});

             // gemm
             ref_invoker.Run(ref_argument);

+            // activation(acc + bias)
+            acc_m_n.ForEach([&](auto& self, auto idx) {
+                AccDataType out;
+                arg.acc_element_op_(out, acc_m_n(idx[0], idx[1]) + arg.c0_n_bias_(idx[1]));
+                self(idx[0], idx[1]) = out;
+            });
+
+            // add from other layers
+            acc_m_n.ForEach([&](auto& self, auto idx) {
+                self(idx[0], idx[1]) += arg.c0_m_n_add_(idx[0], idx[1]);
+            });
+
             // layernorm
-            RunLayernorm(
-                arg.c_m_n_, acc_m_n, arg.c0_n_bias_, arg.c0_m_n_add_, arg.c0_n_gamma_, arg.c0_n_beta_);
+            RunLayernorm(arg.c_m_n_, acc_m_n, arg.c0_n_gamma_, arg.c0_n_beta_);

             // elementwise op
             arg.c_m_n_.ForEach([&](auto& self, auto idx) {
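For reference, a compact standalone host implementation of the reordered pipeline (GEMM with a pass-through accumulator op, then activation(acc + bias), then the residual add, then layernorm) could look like the sketch below. Plain std::vector stands in for CK's Tensor; every name here is illustrative, not CK API.

    #include <cassert>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // D = Layernorm(Relu(A*B + bias) + add) * gamma + beta, row-major MxK * KxN.
    // bias/gamma/beta are 1xN, add is MxN. Illustrative sketch, not CK's API.
    std::vector<float> reference_gemm_layernorm(
        const std::vector<float>& a, const std::vector<float>& b,
        const std::vector<float>& bias, const std::vector<float>& add,
        const std::vector<float>& gamma, const std::vector<float>& beta,
        std::size_t M, std::size_t N, std::size_t K, float epsilon = 1e-5f)
    {
        assert(a.size() == M * K && b.size() == K * N && add.size() == M * N);
        assert(bias.size() == N && gamma.size() == N && beta.size() == N);
        std::vector<float> acc(M * N, 0.0f), d(M * N);

        // gemm; the accumulator elementwise op is now a pass-through
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t k = 0; k < K; ++k)
                for(std::size_t n = 0; n < N; ++n)
                    acc[m * N + n] += a[m * K + k] * b[k * N + n];

        for(std::size_t m = 0; m < M; ++m)
        {
            // activation(acc + bias), then the add from the other layer
            for(std::size_t n = 0; n < N; ++n)
            {
                const float x  = acc[m * N + n] + bias[n];
                acc[m * N + n] = (x > 0.0f ? x : 0.0f) + add[m * N + n];
            }

            // layernorm: reduce the N dimension, then normalize and scale
            float mean = 0.0f, mean_sq = 0.0f;
            for(std::size_t n = 0; n < N; ++n)
            {
                mean    += acc[m * N + n];
                mean_sq += acc[m * N + n] * acc[m * N + n];
            }
            mean /= N;
            mean_sq /= N;
            const float rstd = 1.0f / std::sqrt(mean_sq - mean * mean + epsilon);

            for(std::size_t n = 0; n < N; ++n)
                d[m * N + n] = (acc[m * N + n] - mean) * rstd * gamma[n] + beta[n];
        }
        return d;
    }

The variance here follows the reference header's E[x^2] - E[x]^2 formulation (its avg_acc_sq / avg_acc tensors).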