Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
23262ab6
Commit
23262ab6
authored
Jul 21, 2022
by
ltqin
Browse files
add class blockwise softmax
parent
480d6219
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
132 additions
and
91 deletions
+132
-91
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
...e/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+4
-1
include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
...de/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
+117
-0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
...k/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+11
-90
No files found.
include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
View file @
23262ab6
...
@@ -263,7 +263,10 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
...
@@ -263,7 +263,10 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
make_tuple
(
Sequence
<
3
>
{},
Sequence
<
0
,
1
,
2
>
{}));
make_tuple
(
Sequence
<
3
>
{},
Sequence
<
0
,
1
,
2
>
{}));
}
}
// Expose the per-thread C accumulator descriptor (MRepeat, NRepeat, reg)
// so that external consumers (e.g. the blockwise softmax added in this
// commit) can index this thread's slice of the C tile.
__host__ __device__ static constexpr auto GetCThreadDesc() { return c_thread_desc_; }
// Number of accumulator registers produced per xdlops instruction,
// forwarded from the underlying xdlops_gemm instance.
__host__ __device__ static constexpr index_t GetRegSizePerXdlops()
{
    return xdlops_gemm.GetRegSizePerXdlops();
}
static
constexpr
auto
a_block_desc_m0_m1_m2_k
=
MakeABlockDescriptor_M0_M1_M2_K
();
static
constexpr
auto
a_block_desc_m0_m1_m2_k
=
MakeABlockDescriptor_M0_M1_M2_K
();
static
constexpr
auto
b_block_desc_n0_n1_n2_k
=
MakeBBlockDescriptor_N0_N1_N2_K
();
static
constexpr
auto
b_block_desc_n0_n1_n2_k
=
MakeBBlockDescriptor_N0_N1_N2_K
();
...
...
include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
0 → 100644
View file @
23262ab6
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
namespace
ck
{
// Blockwise softmax over the xdlops accumulator registers of a GEMM C-tile.
//
// Each thread holds an (MRepeat, NRepeat, RegSizePerXdlops) slice of the C
// accumulator in VGPRs. Run() replaces it, in place, with a numerically
// stable softmax along the register (reduced) dimension:
//   1. threadwise + blockwise max-reduction,
//   2. x := exp(x - max),
//   3. threadwise + blockwise sum-reduction,
//   4. x := x / sum.
// Blockwise reduction stages partial results through LDS.
template <index_t BlockSize,
          typename AccDataType,
          index_t MPerXDL, // NOTE(review): currently unused in this struct
          index_t NPerXDL, // NOTE(review): currently unused in this struct
          index_t RegSizePerXdlops,
          index_t MRepeat,
          index_t NRepeat,
          index_t MThreadSliceSize,
          index_t NThreadSliceSize> // NOTE(review): currently unused in this struct
struct BlockwiseSoftmax_V1
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

    // Per-thread view of the C accumulator registers.
    constexpr static auto c_thread_desc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<RegSizePerXdlops>{}));

    // In-place softmax of this thread's C accumulator buffer.
    template <typename CThreadBuffer>
    __host__ __device__ static void Run(CThreadBuffer& c_thread_buf)
    {
        // LDS scratch for the partitioned blockwise reductions.
        __shared__ AccDataType p_reduce_work_buffer[BlockSize];

        // Running max per M-slice, seeded with the Max identity value.
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> max_value_buf;
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            max_value_buf(I) = reduce::Max::template GetIdentityValue<AccDataType>();
        });

        // Running sum per M-slice, seeded with the Add identity value.
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;
        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            accu_value_buf(I) = reduce::Add::template GetIdentityValue<AccDataType>();
        });

        constexpr index_t c_offset = c_thread_desc.CalculateOffset(make_tuple(0, 0, 0));

        auto& xdlops_out = c_thread_buf.GetVectorTypeReference(Number<c_offset>{});

        // Threadwise reduction works on one M row of RegSizePerXdlops values.
        using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
            make_tuple(Number<1>{}, Number<c_thread_desc.GetLength(I2)>{})));
        using ThreadReduceDstDesc_M =
            decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})));

        using ThreadwiseMaxReduce =
            ThreadwiseReduction<AccDataType,
                                ThreadReduceSrcDesc_M_K,
                                ThreadReduceDstDesc_M,
                                reduce::Max,
                                false, // param ignored
                                detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;

        // FIX: reinterpret the accumulator registers as AccDataType instead of
        // the previously hard-coded `float`, so the softmax remains correct
        // when the accumulator type is not float.
        ThreadwiseMaxReduce::Reduce(xdlops_out.template AsType<AccDataType>(), max_value_buf);

        // TODO(review): the 32x2 thread cluster assumes BlockSize == 64;
        // derive it from the template parameters before supporting other
        // block sizes.
        using ThreadClusterLengths_M_K  = Sequence<32, 2>;
        using ThreadClusterArrangeOrder = Sequence<1, 0>;

        using BlockwiseMaxReduce = PartitionedBlockwiseReduction<
            AccDataType,
            BlockSize,
            ThreadClusterLengths_M_K,
            ThreadClusterArrangeOrder,
            reduce::Max,
            false, // param ignored
            detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;

        auto reduce_work_buf =
            make_dynamic_buffer<AddressSpaceEnum::Lds>(p_reduce_work_buffer, BlockSize);

        block_sync_lds();
        BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I0));
        block_sync_lds();

        // softmax
        using BlockwiseSumReduce = PartitionedBlockwiseReduction<
            AccDataType,
            BlockSize,
            ThreadClusterLengths_M_K,
            ThreadClusterArrangeOrder,
            reduce::Add,
            false, // ignored
            detail::AccumulateWithNanIgnore<reduce::Add, AccDataType>>;

        using ThreadwiseSumReduce =
            ThreadwiseReduction<AccDataType,
                                ThreadReduceSrcDesc_M_K,
                                ThreadReduceDstDesc_M,
                                reduce::Add,
                                false, // ignored
                                detail::AccumulateWithNanIgnore<reduce::Add, AccDataType>>;

        // x := exp(x - max); subtracting the block max keeps exp() in range.
        static_for<0, c_thread_desc.GetLength(I2), 1>{}([&](auto iK) {
            xdlops_out.template AsType<AccDataType>()(iK) =
                math::exp(xdlops_out.template AsType<AccDataType>()[iK] - max_value_buf(I0));
        });

        ThreadwiseSumReduce::Reduce(xdlops_out.template AsType<AccDataType>(), accu_value_buf);

        block_sync_lds();
        BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I0));
        block_sync_lds();

        // x := x / sum — normalize to complete the softmax.
        static_for<0, c_thread_desc.GetLength(I2), 1>{}([&](auto iK) {
            xdlops_out.template AsType<AccDataType>()(iK) =
                xdlops_out.template AsType<AccDataType>()[iK] / accu_value_buf(I0);
        });
    }
};
}
// namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
View file @
23262ab6
...
@@ -13,12 +13,7 @@
...
@@ -13,12 +13,7 @@
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp"
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -478,90 +473,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
...
@@ -478,90 +473,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
c_thread_buf
,
c_thread_buf
,
num_k_block_main_loop
);
num_k_block_main_loop
);
{
{
// LDS
using
BlockwiseSoftmax
=
BlockwiseSoftmax_V1
<
BlockSize
,
__shared__
AccDataType
p_reduce_work_buffer
[
BlockSize
];
FloatAcc
,
MPerXDL
,
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
1
,
true
>
max_value_buf
;
NPerXDL
,
static_for
<
0
,
1
,
1
>
{}([
&
](
auto
I
)
{
blockwise_gemm
.
GetRegSizePerXdlops
(),
max_value_buf
(
I
)
=
reduce
::
Max
::
template
GetIdentityValue
<
AccDataType
>();
MXdlPerWave
,
});
NXdlPerWave
,
1
,
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
1
,
true
>
accu_value_buf
;
1
>
;
static_for
<
0
,
1
,
1
>
{}([
&
](
auto
I
)
{
BlockwiseSoftmax
::
Run
(
c_thread_buf
);
accu_value_buf
(
I
)
=
reduce
::
Add
::
template
GetIdentityValue
<
AccDataType
>();
});
constexpr
auto
c_thread_desc
=
blockwise_gemm
.
GetCThreadDesc
();
// printf("c_thread_desc: {%d, %d, %d}", c_thread_desc.GetLength(I0).value,
// c_thread_desc.GetLength(I1).value, c_thread_desc.GetLength(I2));
constexpr
index_t
c_offset
=
c_thread_desc
.
CalculateOffset
(
make_tuple
(
0
,
0
,
0
));
auto
&
xdlops_out
=
c_thread_buf
.
GetVectorTypeReference
(
Number
<
c_offset
>
{});
using
ThreadReduceSrcDesc_M_K
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{},
Number
<
c_thread_desc
.
GetLength
(
I2
)
>
{})));
using
ThreadReduceDstDesc_M
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
1
>
{})));
using
ThreadwiseMaxReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
reduce
::
Max
,
false
,
// param ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Max
,
AccDataType
>>
;
ThreadwiseMaxReduce
::
Reduce
(
xdlops_out
.
template
AsType
<
float
>(),
max_value_buf
);
// const index_t thread_local_id = get_thread_local_1d_id();
// printf("thread id: %d, Max: %f\t\t",thread_local_id,max_value_buf[I0]);
using
ThreadClusterLengths_M_K
=
Sequence
<
32
,
2
>
;
using
ThreadClusterArrangeOrder
=
Sequence
<
1
,
0
>
;
using
BlockwiseMaxReduce
=
PartitionedBlockwiseReduction
<
AccDataType
,
BlockSize
,
ThreadClusterLengths_M_K
,
ThreadClusterArrangeOrder
,
reduce
::
Max
,
false
,
// param ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Max
,
AccDataType
>>
;
auto
reduce_work_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_work_buffer
,
BlockSize
);
block_sync_lds
();
BlockwiseMaxReduce
::
Reduce
(
reduce_work_buf
,
max_value_buf
(
I0
));
block_sync_lds
();
// printf("\n");
// printf("thread id: %d, Max: %f\t\t",thread_local_id,max_value_buf[I0]);
// softmax
using
BlockwiseSumReduce
=
PartitionedBlockwiseReduction
<
AccDataType
,
BlockSize
,
ThreadClusterLengths_M_K
,
ThreadClusterArrangeOrder
,
reduce
::
Add
,
false
,
// ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Add
,
AccDataType
>>
;
using
ThreadwiseSumReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
reduce
::
Add
,
false
,
// ignored
detail
::
AccumulateWithNanIgnore
<
reduce
::
Add
,
AccDataType
>>
;
static_for
<
0
,
c_thread_desc
.
GetLength
(
I2
),
1
>
{}([
&
](
auto
iK
)
{
xdlops_out
.
template
AsType
<
float
>()(
iK
)
=
math
::
exp
(
xdlops_out
.
template
AsType
<
float
>()[
iK
]
-
max_value_buf
(
I0
));
});
ThreadwiseSumReduce
::
Reduce
(
xdlops_out
.
template
AsType
<
float
>(),
accu_value_buf
);
block_sync_lds
();
BlockwiseSumReduce
::
Reduce
(
reduce_work_buf
,
accu_value_buf
(
I0
));
block_sync_lds
();
static_for
<
0
,
c_thread_desc
.
GetLength
(
I2
),
1
>
{}([
&
](
auto
iK
)
{
xdlops_out
.
template
AsType
<
float
>()(
iK
)
=
xdlops_out
.
template
AsType
<
float
>()[
iK
]
/
accu_value_buf
(
I0
);
});
}
}
// output: register to global memory
// output: register to global memory
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment