Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
d08aa99e
Commit
d08aa99e
authored
May 30, 2022
by
Anthony Chang
Browse files
clang-format
parent
f906b23d
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
196 additions
and
184 deletions
+196
-184
include/ck/tensor_operation/gpu/device/device_gemm_xdl_layernorm_cshuffle.hpp
...eration/gpu/device/device_gemm_xdl_layernorm_cshuffle.hpp
+4
-5
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
...tion/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
+167
-157
include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
...r_operation/gpu/thread/reduction_functions_threadwise.hpp
+1
-1
include/ck/utility/debug.hpp
include/ck/utility/debug.hpp
+2
-8
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
...ference_tensor_operation/cpu/reference_gemm_layernorm.hpp
+22
-13
No files found.
include/ck/tensor_operation/gpu/device/device_gemm_xdl_layernorm_cshuffle.hpp
View file @
d08aa99e
...
...
@@ -67,8 +67,7 @@ template <typename ALayout,
index_t
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
index_t
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
,
LoopScheduler
LoopSched
=
make_default_loop_scheduler
()>
struct
DeviceGemmLayerNorm_Xdl_CShuffle
:
public
BaseOperator
struct
DeviceGemmLayerNorm_Xdl_CShuffle
:
public
BaseOperator
{
using
DeviceOp
=
DeviceGemmLayerNorm_Xdl_CShuffle
;
...
...
@@ -463,7 +462,8 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle
GridwiseGemm
::
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
c_grid_desc_m_n_
);
c0_grid_desc_nblock_nperblock_
=
GridwiseGemm
::
MakeC0GridDescriptor_NBlock_NPerBlock
(
c0_grid_desc_n_
);
c0_grid_desc_nblock_nperblock_
=
GridwiseGemm
::
MakeC0GridDescriptor_NBlock_NPerBlock
(
c0_grid_desc_n_
);
// TODO ANT: adopt tensile style workgroup mapping
block_2_ctile_map_
=
GridwiseGemm
::
MakeDefaultBlock2CTileMap
(
c_grid_desc_m_n_
);
...
...
@@ -483,8 +483,7 @@ struct DeviceGemmLayerNorm_Xdl_CShuffle
C0GridDesc_N
c0_grid_desc_n_
;
typename
GridwiseGemm
::
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
c_grid_desc_mblock_mperblock_nblock_nperblock_
;
typename
GridwiseGemm
::
C0GridDescriptor_NBlock_NPerBlock
c0_grid_desc_nblock_nperblock_
;
typename
GridwiseGemm
::
C0GridDescriptor_NBlock_NPerBlock
c0_grid_desc_nblock_nperblock_
;
typename
GridwiseGemm
::
DefaultBlock2CTileMap
block_2_ctile_map_
;
AElementwiseOperation
a_element_op_
;
BElementwiseOperation
b_element_op_
;
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp
View file @
d08aa99e
This diff is collapsed.
Click to expand it.
include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp
View file @
d08aa99e
...
...
@@ -51,7 +51,7 @@ struct ThreadwiseReduction
static_assert
(
src_length_m
==
dst_length_m
,
"lengths of source and dst buffer must match!"
);
using
Op
=
OpReduce
;
using
Op
=
OpReduce
;
using
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
OpReduce
,
AccDataType
>
;
template
<
typename
SrcBufferType
,
typename
DstBufferType
>
...
...
include/ck/utility/debug.hpp
View file @
d08aa99e
...
...
@@ -12,10 +12,7 @@ template <typename T>
struct
PrintAsType
<
T
,
typename
std
::
enable_if
<
std
::
is_floating_point
<
T
>::
value
>::
type
>
{
using
type
=
float
;
__host__
__device__
static
void
Print
(
const
T
&
p
)
{
printf
(
"%.3f "
,
static_cast
<
type
>
(
p
));
}
__host__
__device__
static
void
Print
(
const
T
&
p
)
{
printf
(
"%.3f "
,
static_cast
<
type
>
(
p
));
}
};
template
<
>
...
...
@@ -32,10 +29,7 @@ template <typename T>
struct
PrintAsType
<
T
,
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
>::
type
>
{
using
type
=
int
;
__host__
__device__
static
void
Print
(
const
T
&
p
)
{
printf
(
"%d "
,
static_cast
<
type
>
(
p
));
}
__host__
__device__
static
void
Print
(
const
T
&
p
)
{
printf
(
"%d "
,
static_cast
<
type
>
(
p
));
}
};
}
// namespace detail
...
...
library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp
View file @
d08aa99e
...
...
@@ -18,21 +18,26 @@ template <typename ADataType,
typename
CElementwiseOperation
>
struct
ReferenceGemmLayernorm
:
public
device
::
BaseOperator
{
using
ReferenceGemmInstance
=
ReferenceGemm
<
ADataType
,
BDataType
,
AccDataType
,
AccDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
;
using
ReferenceGemmInstance
=
ReferenceGemm
<
ADataType
,
BDataType
,
AccDataType
,
AccDataType
,
AElementwiseOperation
,
BElementwiseOperation
,
CElementwiseOperation
>
;
// D = Layernorm(acc + broadcast(bias)) * broadcast(gamma) + broadcast(beta)
template
<
typename
InDataType
,
typename
OutDataType
,
typename
ComputeDataType
>
static
void
RunLayernorm
(
Tensor
<
OutDataType
>&
result
,
const
Tensor
<
ComputeDataType
>&
acc
,
// MxN
const
Tensor
<
InDataType
>&
bias
,
// 1xN
const
Tensor
<
InDataType
>&
gamma
,
// 1xN
const
Tensor
<
InDataType
>&
beta
,
// 1xN
const
InDataType
epsilon
=
1e-5
)
const
Tensor
<
ComputeDataType
>&
acc
,
// MxN
const
Tensor
<
InDataType
>&
bias
,
// 1xN
const
Tensor
<
InDataType
>&
gamma
,
// 1xN
const
Tensor
<
InDataType
>&
beta
,
// 1xN
const
InDataType
epsilon
=
1e-5
)
{
assert
(
acc
.
mDesc
.
GetLengths
()[
1
]
==
bias
.
mDesc
.
GetLengths
()[
0
]
&&
acc
.
mDesc
.
GetLengths
()[
1
]
==
gamma
.
mDesc
.
GetLengths
()[
0
]
&&
acc
.
mDesc
.
GetLengths
()[
1
]
==
beta
.
mDesc
.
GetLengths
()[
0
]);
acc
.
mDesc
.
GetLengths
()[
1
]
==
gamma
.
mDesc
.
GetLengths
()[
0
]
&&
acc
.
mDesc
.
GetLengths
()[
1
]
==
beta
.
mDesc
.
GetLengths
()[
0
]);
size_t
M
=
acc
.
mDesc
.
GetLengths
()[
0
];
size_t
N
=
acc
.
mDesc
.
GetLengths
()[
1
];
...
...
@@ -127,10 +132,14 @@ struct ReferenceGemmLayernorm : public device::BaseOperator
Tensor
<
AccDataType
>
acc_m_n
(
arg
.
c_m_n_
.
mDesc
);
acc_m_n
.
GenerateTensorValue
(
GeneratorTensor_1
<
AccDataType
>
{
0
});
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
arg
.
a_m_k_
,
arg
.
b_k_n_
,
acc_m_n
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
);
auto
ref_gemm
=
ReferenceGemmInstance
{};
auto
ref_invoker
=
ref_gemm
.
MakeInvoker
();
auto
ref_argument
=
ref_gemm
.
MakeArgument
(
arg
.
a_m_k_
,
arg
.
b_k_n_
,
acc_m_n
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
);
ref_invoker
.
Run
(
ref_argument
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment