Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
87df7683
Commit
87df7683
authored
Dec 21, 2022
by
fsx950223
Browse files
change to elementwise op
parent
935422a4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
43 additions
and
33 deletions
+43
-33
example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
..._sparse_embedding/sparse_embedding3_forward_layernorm.cpp
+11
-11
include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
...evice/impl/device_sparse_embeddings_forward_layernorm.hpp
+8
-8
include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
...gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
+24
-14
No files found.
example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp
View file @
87df7683
...
...
@@ -10,7 +10,7 @@
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp"
#include "ck/tensor_operation/gpu/element/
binary_
element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
...
...
@@ -25,16 +25,16 @@ using GammaDataType = ck::half_t;
using
BetaDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
OutType
=
ck
::
half_t
;
using
Reduc
eOperation
=
ck
::
tensor_operation
::
element_wise
::
Add
;
using
Elementwis
eOperation
=
ck
::
tensor_operation
::
element_wise
::
Add
Add
;
using
DeviceInstance_fp16_e256
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
256
,
1
,
1
,
3
>
;
using
DeviceInstance_fp16_e512
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
512
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e768
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
768
,
1
,
1
,
3
>
;
using
DeviceInstance_fp16_e1024
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
1024
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e1536
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
1536
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e2048
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
2048
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e4096
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
4096
,
1
,
8
,
3
>
;
using
DeviceInstance_fp16_e8192
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Reduc
eOperation
,
256
,
1
,
256
,
1
,
8192
,
1
,
8
,
3
>
;
using
DeviceInstance_fp16_e256
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
256
,
1
,
1
,
3
>
;
using
DeviceInstance_fp16_e512
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
512
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e768
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
768
,
1
,
1
,
3
>
;
using
DeviceInstance_fp16_e1024
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
1024
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e1536
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
1536
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e2048
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
2048
,
1
,
2
,
3
>
;
using
DeviceInstance_fp16_e4096
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
4096
,
1
,
8
,
3
>
;
using
DeviceInstance_fp16_e8192
=
ck
::
tensor_operation
::
device
::
DeviceSparseEmbeddingsForwardLayernorm
<
EmbType
,
IndexType
,
GammaDataType
,
BetaDataType
,
AccDataType
,
OutType
,
Elementwis
eOperation
,
256
,
1
,
256
,
1
,
8192
,
1
,
8
,
3
>
;
template
<
typename
emb_type
,
ck
::
index_t
dim
>
struct
emb_kernel
{};
...
...
@@ -137,7 +137,7 @@ int main()
current_dim
,
index_length
,
epsilon
,
Reduc
eOperation
{});
Elementwis
eOperation
{});
std
::
cout
<<
"Dim:"
<<
current_dim
<<
", kernel:"
<<
device_instance
.
GetTypeString
()
<<
std
::
endl
<<
std
::
flush
;
...
...
include/ck/tensor_operation/gpu/device/impl/device_sparse_embeddings_forward_layernorm.hpp
View file @
87df7683
...
...
@@ -24,7 +24,7 @@ template <typename EmbType,
typename
BetaDataType
,
typename
AccDataType
,
typename
OutType
,
typename
Reduc
eOperation
,
typename
Elementwis
eOperation
,
ck
::
index_t
BlockSize
,
ck
::
index_t
DimClusterSize
,
ck
::
index_t
RowClusterSize
,
...
...
@@ -50,7 +50,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
const
ck
::
index_t
EmbeddingDim
,
const
ck
::
index_t
IndexLength
,
const
AccDataType
epsilon
,
const
Reduc
eOperation
reduc
e_op
)
const
Elementwis
eOperation
elementwis
e_op
)
:
p_out_
(
p_out
),
p_embs_
(
p_embs
),
p_indexs_
(
p_indexs
),
...
...
@@ -59,7 +59,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
EmbeddingDim_
(
EmbeddingDim
),
IndexLength_
(
IndexLength
),
epsilon_
(
epsilon
),
reduce_op_
(
reduc
e_op
)
reduce_op_
(
elementwis
e_op
)
{
grid_size_
=
(
IndexLength
+
DimClusterSize
-
1
)
/
DimClusterSize
;
}
...
...
@@ -72,7 +72,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
ck
::
index_t
EmbeddingDim_
;
ck
::
index_t
IndexLength_
;
AccDataType
epsilon_
;
Reduc
eOperation
reduce_op_
;
Elementwis
eOperation
reduce_op_
;
size_t
grid_size_
;
};
...
...
@@ -86,7 +86,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
ck
::
index_t
EmbeddingDim
,
ck
::
index_t
IndexLength
,
const
AccDataType
epsilon
,
const
Reduc
eOperation
reduc
e_op
)
const
Elementwis
eOperation
elementwis
e_op
)
{
return
std
::
make_unique
<
Argument
>
(
reinterpret_cast
<
OutType
*>
(
p_out
),
p_embs
,
...
...
@@ -96,7 +96,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
EmbeddingDim
,
IndexLength
,
epsilon
,
reduc
e_op
);
elementwis
e_op
);
}
using
GridwiseSparseEmbedding
=
...
...
@@ -107,7 +107,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
AccDataType
,
OutType
,
decltype
(
MakeOutputDescriptor
(
1
,
1
)),
Reduc
eOperation
,
Elementwis
eOperation
,
BlockSize
,
DimClusterSize
,
RowClusterSize
,
...
...
@@ -131,7 +131,7 @@ struct DeviceSparseEmbeddingsForwardLayernorm : public BaseOperator
AccDataType
,
OutType
,
decltype
(
out_desc
),
Reduc
eOperation
,
Elementwis
eOperation
,
NumEmbeddings
>
;
float
avg_time
=
0
;
avg_time
+=
launch_and_time_kernel
(
stream_config
,
...
...
include/ck/tensor_operation/gpu/grid/gridwise_sparse_embeddings_forward_layernorm.hpp
View file @
87df7683
...
...
@@ -18,7 +18,7 @@ template <typename GridwiseSparseEmbedding,
typename
AccDataType
,
typename
OutType
,
typename
OutGridDesc
,
typename
Reduc
eOperation
,
typename
Elementwis
eOperation
,
ck
::
index_t
NumEmbeddings
>
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__
(
CK_MAX_THREAD_PER_BLOCK
,
CK_MIN_BLOCK_PER_CU
)
...
...
@@ -31,9 +31,10 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
const
BetaDataType
*
p_beta
,
const
OutGridDesc
out_grid_desc
,
const
AccDataType
epsilon
,
const
Reduc
eOperation
reduc
e_op
)
const
Elementwis
eOperation
elementwis
e_op
)
{
GridwiseSparseEmbedding
::
Run
(
p_out
,
p_embs
,
p_indexes
,
p_gamma
,
p_beta
,
out_grid_desc
,
epsilon
,
reduce_op
);
GridwiseSparseEmbedding
::
Run
(
p_out
,
p_embs
,
p_indexes
,
p_gamma
,
p_beta
,
out_grid_desc
,
epsilon
,
elementwise_op
);
}
template
<
typename
EmbType
,
...
...
@@ -43,7 +44,7 @@ template <typename EmbType,
typename
AccDataType
,
typename
OutType
,
typename
OutGridDesc
,
typename
Reduc
eOperation
,
typename
Elementwis
eOperation
,
ck
::
index_t
BlockSize
,
ck
::
index_t
DimClusterSize
,
ck
::
index_t
RowClusterSize
,
...
...
@@ -95,7 +96,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
const
BetaDataType
*
p_beta
,
const
OutGridDesc
,
const
AccDataType
epsilon
,
const
Reduc
eOperation
reduc
e_op
)
const
Elementwis
eOperation
elementwis
e_op
)
{
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
...
...
@@ -127,7 +128,7 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
constexpr
auto
gamma_beta_buf_desc
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
RowSubBlocks
,
RowVectorSize
));
ck
::
Array
<
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
Emb
Type
,
thread_buf_size
,
true
>
,
ck
::
Array
<
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccData
Type
,
thread_buf_size
,
true
>
,
NumEmbeddings
>
in_thread_bufs
;
ck
::
Array
<
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexType
,
DimPerBlock
,
true
>
,
NumEmbeddings
>
...
...
@@ -166,7 +167,8 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
make_tuple
(
i_dim_sub_
,
i_dim_vec_
,
i_row_sub_
,
i_row_vec_
));
static_for
<
0
,
NumEmbeddings
,
1
>
{}([
&
](
auto
i_embedding_
)
{
in_thread_bufs
(
i_embedding_
)(
Number
<
register_offset
>
{})
=
emb_vectors
[
i_embedding_
].
template
AsType
<
EmbType
>()[
i_row_vec_
];
ck
::
type_convert
<
AccDataType
>
(
emb_vectors
[
i_embedding_
].
template
AsType
<
EmbType
>()[
i_row_vec_
]);
});
});
});
...
...
@@ -177,10 +179,17 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
static_for
<
0
,
RowVectorSize
,
1
>
{}([
&
](
auto
i_row_vec_
)
{
constexpr
auto
register_offset
=
thread_buf_desc
.
CalculateOffset
(
make_tuple
(
i_dim_sub_
,
i_dim_vec_
,
i_row_sub_
,
i_row_vec_
));
static_for
<
0
,
NumEmbeddings
,
1
>
{}([
&
](
auto
i_embedding_
)
{
reduce_op
(
acc_thread_buf
(
Number
<
register_offset
>
{}),
acc_thread_buf
(
Number
<
register_offset
>
{}),
ck
::
type_convert
<
AccDataType
>
(
in_thread_bufs
(
i_embedding_
)(
Number
<
register_offset
>
{})));
});
auto
in_data_refs
=
generate_tie
(
[
&
](
auto
i_embedding_
)
->
const
auto
&
{
return
in_thread_bufs
(
i_embedding_
)(
Number
<
register_offset
>
{});
},
Number
<
NumEmbeddings
>
{});
auto
out_data_refs
=
generate_tie
(
[
&
](
auto
output_index_
)
->
auto
&
{
return
acc_thread_buf
(
Number
<
register_offset
>
{});
},
Number
<
1
>
{});
unpack2
(
elementwise_op
,
out_data_refs
,
in_data_refs
);
});
});
};
...
...
@@ -210,7 +219,8 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
constexpr
auto
mean_var_offset
=
mean_var_buf_desc
.
CalculateOffset
(
make_tuple
(
i_dim_sub_
,
i_dim_vec_
));
auto
divisor
=
1
/
__builtin_amdgcn_sqrtf
(
var_thread_buf
(
Number
<
mean_var_offset
>
{})
+
epsilon
);
auto
divisor
=
1
/
__builtin_amdgcn_sqrtf
(
var_thread_buf
(
Number
<
mean_var_offset
>
{})
+
epsilon
);
static_for
<
0
,
RowVectorSize
,
1
>
{}([
&
](
auto
i_row_vec_
)
{
constexpr
auto
register_offset
=
thread_buf_desc
.
CalculateOffset
(
make_tuple
(
i_dim_sub_
,
i_dim_vec_
,
i_row_sub_
,
i_row_vec_
));
...
...
@@ -218,8 +228,8 @@ struct GridwiseSparseEmbeddingsForwardLayernorm
gamma_beta_buf_desc
.
CalculateOffset
(
make_tuple
(
i_row_sub_
,
i_row_vec_
));
auto
acc_val
=
acc_thread_buf
[
Number
<
register_offset
>
{}];
acc_val
=
(
acc_val
-
mean_thread_buf
(
Number
<
mean_var_offset
>
{}))
*
divisor
;
acc_val
=
acc_val
*
gamma_thread_buf
[
Number
<
gamma_beta_offset
>
{}]
+
acc_val
=
(
acc_val
-
mean_thread_buf
(
Number
<
mean_var_offset
>
{}))
*
divisor
;
acc_val
=
acc_val
*
gamma_thread_buf
[
Number
<
gamma_beta_offset
>
{}]
+
beta_thread_buf
[
Number
<
gamma_beta_offset
>
{}];
out_vector
.
template
AsType
<
OutType
>()(
Number
<
i_row_vec_
>
{})
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment