gaoqiong / composable_kernel_ROCM / Commits / 6ea9257e

Unverified commit 6ea9257e, authored Oct 25, 2022 by guangzlu, committed by GitHub on Oct 25, 2022.

Revert "Fused elementwise layernorm (#468)" (#491)

This reverts commit efbcc6ed.

parent efbcc6ed
Showing 14 changed files with 4 additions and 1814 deletions:

example/27_layernorm/CMakeLists.txt  +1 -1
example/45_elementwise_normalization/CMakeLists.txt  +0 -1
example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp  +0 -195
include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp  +0 -68
include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp  +0 -592
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp  +0 -500
library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp  +0 -79
library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt  +0 -3
library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp  +0 -54
profiler/include/profile_elementwise_layernorm_impl.hpp  +0 -264
test/CMakeLists.txt  +0 -1
test/elementwise_normalization/CMakeLists.txt  +0 -7
test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp  +0 -47
test/normalization/CMakeLists.txt  +3 -2
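For orientation before the per-file diffs: the reverted operator fuses an elementwise stage with LayerNorm, i.e. X = Elementwise(input1, input2, ...) followed by Y = LayerNorm(X) * gamma + beta, with mean and variance taken over the reduced dimension. The following is only a minimal host-side sketch of that math in plain C++ (the function name fused_add_layernorm_ref is illustrative and is not part of the codebase or of this diff):

#include <cmath>
#include <vector>

// X = A + B (elementwise stage), then per-row LayerNorm over N (reduce dimension),
// scaled by gamma and shifted by beta. Row-major M x N tensors.
void fused_add_layernorm_ref(const std::vector<float>& A, const std::vector<float>& B,
                             const std::vector<float>& gamma, const std::vector<float>& beta,
                             std::vector<float>& Y, int M, int N, float epsilon = 1e-4f)
{
    for(int m = 0; m < M; ++m)
    {
        // elementwise stage for one row
        std::vector<float> x(N);
        for(int n = 0; n < N; ++n)
            x[n] = A[m * N + n] + B[m * N + n];

        // mean and (population) variance over the reduced dimension
        float mean = 0.f, var = 0.f;
        for(int n = 0; n < N; ++n)
            mean += x[n];
        mean /= N;
        for(int n = 0; n < N; ++n)
            var += (x[n] - mean) * (x[n] - mean);
        var /= N;

        // normalize, then apply gamma and beta
        const float inv_std = 1.f / std::sqrt(var + epsilon);
        for(int n = 0; n < N; ++n)
            Y[m * N + n] = (x[n] - mean) * inv_std * gamma[n] + beta[n];
    }
}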
example/27_layernorm/CMakeLists.txt  (+1 -1)

add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp)
\ No newline at end of file

(The only change to this file is the trailing newline; the removed and added lines are otherwise identical.)
example/45_elementwise_normalization/CMakeLists.txt
deleted 100644 → 0

add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp)
example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>

#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

using ADataType     = ck::half_t; // Input 1
using BDataType     = ck::half_t; // Input 2
using XDataType     = ck::half_t;
using GammaDataType = ck::half_t;
using BetaDataType  = ck::half_t;
using YDataType     = ck::half_t;
using AccDataType   = float;

using XElementwiseOperation = ck::tensor_operation::element_wise::Add;
using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough;

constexpr int Rank         = 2;
constexpr int NumReduceDim = 1;

// X = Elementwise(input1, input2, input3, ...)
// Y = Layernorm(X, beta, gamma)
using DeviceInstance =
    ck::tensor_operation::device::DeviceElementwiseNormalizationImpl<ck::Tuple<ADataType, BDataType>,
                                                                     GammaDataType,
                                                                     BetaDataType,
                                                                     AccDataType,
                                                                     YDataType,
                                                                     XElementwiseOperation,
                                                                     YElementwiseOperation,
                                                                     Rank,
                                                                     NumReduceDim,
                                                                     256, // BlockSize
                                                                     8,   // ClusterM
                                                                     32,  // ClusterK
                                                                     1,   // SliceM
                                                                     32,  // SliceK
                                                                     1,   // SrcVecDim (0=M, 1=K)
                                                                     8,   // SrcScalarPerVector
                                                                     1,   // GammaVecDim (0=M, 1=K)
                                                                     8,   // GammaScalarPerVector
                                                                     1,   // BetaVecDim (0=M, 1=K)
                                                                     8,   // BetaScalarPerVector
                                                                     8>;  // OutScalarPerVector

template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise2D(HostTensorC& C,
                        const HostTensorA& A,
                        const HostTensorB& B,
                        const std::vector<std::size_t>& shape,
                        Functor functor)
{
    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;

    for(std::size_t m = 0; m < shape[0]; ++m)
        for(std::size_t n = 0; n < shape[1]; ++n)
        {
            auto a_val  = A(m, n);
            auto b_val  = B(m, n);
            ctype c_val = 0;
            functor(c_val, a_val, b_val);
            C(m, n) = c_val;
        }
}

int main()
{
    bool time_kernel = true;

    ck::index_t M      = 48 * 256;
    ck::index_t N      = 1024;
    ck::index_t Stride = N;

    auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({len}),
                                    std::vector<std::size_t>({stride}));
    };

    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                    std::vector<std::size_t>({stride, 1}));
    };

    Tensor<ADataType> a(f_host_tensor_descriptor2d(M, N, Stride));
    Tensor<BDataType> b(f_host_tensor_descriptor2d(M, N, Stride));
    Tensor<GammaDataType> gamma(f_host_tensor_descriptor1d(N, 1));
    Tensor<BetaDataType> beta(f_host_tensor_descriptor1d(N, 1));
    Tensor<YDataType> y(f_host_tensor_descriptor2d(M, N, Stride));

    a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
    b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
    gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
    beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});

    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());

    a_dev.ToDevice(a.mData.data());
    b_dev.ToDevice(b.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());
    beta_dev.ToDevice(beta.mData.data());

    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};

    auto device_instance = DeviceInstance{};
    auto argument_ptr    = device_instance.MakeArgumentPointer(
        {M, N},
        {
            std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
            std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()},
        },
        {0, 1},
        {0, 1},
        std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
        {1},
        1e-4,
        input,
        gamma_dev.GetDeviceBuffer(),
        beta_dev.GetDeviceBuffer(),
        y_dev.GetDeviceBuffer(),
        XElementwiseOperation{},
        YElementwiseOperation{});

    if(!device_instance.IsSupportedArgument(argument_ptr.get()))
    {
        std::cout << "The runtime parameters are not supported" << std::endl;
        return 1;
    };

    auto invoker_ptr = device_instance.MakeInvokerPointer();

    float ela_time = 0;
    ela_time       = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

    float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) +
                          M * N * sizeof(YDataType) + N * sizeof(GammaDataType) +
                          N * sizeof(BetaDataType);
    float bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024;

    std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl;
    std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl;

    bool pass = true;
    {
        std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
                                       static_cast<unsigned long>(N)};
        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));
        host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>,
                           XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{});

        Tensor<YDataType> host_y(f_host_tensor_descriptor2d(M, N, Stride));
        using ReferenceInstance =
            ck::tensor_operation::host::ReferenceLayernorm<XDataType,
                                                           GammaDataType,
                                                           BetaDataType,
                                                           YDataType,
                                                           AccDataType,
                                                           YElementwiseOperation,
                                                           Rank,
                                                           NumReduceDim>;

        ReferenceInstance ref;
        auto ref_argument =
            ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);

        y_dev.FromDevice(y.mData.data());
        pass &= ck::utils::check_err(
            y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3);

        if(!(pass))
        {
            std::cout << "layernorm wrong" << std::endl;
        }
    }

    return (pass ? 0 : 1);
}
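A note on the bandwidth figure the example above prints: data_mem_size counts every byte read or written (a, b, y of size M*N plus gamma and beta of size N), and bandwidth = data_mem_size * 1000 / ela_time / 1024^3 converts the elapsed time from milliseconds to seconds and the byte count to GiB. As a worked illustration with the defaults (M = 48*256 = 12288, N = 1024, 2-byte fp16 everywhere): data_mem_size = 3*12288*1024*2 + 2*1024*2 = 75,501,568 bytes; if the kernel hypothetically took 1 ms, the reported value would be 75,501,568 * 1000 / 1 / 1024^3 ≈ 70.3 "GB/s" (strictly GiB/s). The 1 ms figure is only a placeholder for the calculation, not a measured result.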
include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataTypeTuple,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType,
          typename XElementwiseOperation,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
struct DeviceElementwiseNormalization : public BaseOperator
{
    static constexpr int NumInput = InDataTypeTuple::Size();

    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> lengths,
                        const std::array<std::vector<index_t>, NumInput> inStridesArray,
                        const std::vector<index_t> gammaStrides,
                        const std::vector<index_t> betaStrides,
                        const std::vector<index_t> yStrides,
                        const std::vector<index_t> reduceDims,
                        AccDataType epsilon,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const void* p_gamma,
                        const void* p_beta,
                        void* p_y,
                        XElementwiseOperation x_elementwise_op,
                        YElementwiseOperation y_elementwise_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

template <typename InDataTypeTuple,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType,
          typename XElementwiseOperation,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim>
using DeviceElementwiseNormalizationPtr =
    std::unique_ptr<DeviceElementwiseNormalization<InDataTypeTuple,
                                                   GammaDataType,
                                                   BetaDataType,
                                                   AccDataType,
                                                   YDataType,
                                                   XElementwiseOperation,
                                                   YElementwiseOperation,
                                                   Rank,
                                                   NumReduceDim>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

// X = Elementwise(input1, input2, input3, ...)
// Y = Normalization(X, beta, gamma)
namespace ck {

template <typename GridwiseElementwiseReduction,
          typename InDataTypePointerTuple, // Datatype tuple of inputs
          typename XDataType,              // Datatype of X
          typename GammaDataType,          // Datatype of Gamma
          typename BetaDataType,           // Datatype of Beta
          typename YDataType,              // Datatype of Y
          typename AccDataType,            // AccDatatype
          typename XElementwiseOperation,  // Operation of input
          typename YElementwiseOperation,  // Operation of output of normalization
          typename InGrid2dDescTuple,      // Descriptor tuple of inputs
          typename GridDesc_M_K>           // Descriptor of inputs, Gamma, Beta
__global__ void
kernel_elementwise_layernorm(const InGrid2dDescTuple in_grid_2d_desc_tuple, // Descriptor tuple of inputs
                             const GridDesc_M_K x_grid_desc_m_k,            // Descriptor of X
                             const GridDesc_M_K gamma_grid_desc_m_k,        // Descriptor of gamma
                             const GridDesc_M_K beta_grid_desc_m_k,         // Descriptor of beta
                             const GridDesc_M_K y_grid_desc_m_k,            // Descriptor of Y
                             index_t num_k_block_tile_iteration,
                             AccDataType epsilon,                           // Datatype of epsilon
                             const InDataTypePointerTuple p_in_global_tuple, // Ptr tuple of input matrixs
                             const GammaDataType* const __restrict__ p_gamma_global, // Ptr of gamma
                             const BetaDataType* const __restrict__ p_beta_global,   // Ptr of beta
                             YDataType* const __restrict__ p_y_global,               // Ptr of y
                             const XElementwiseOperation x_elementwise_op,  // Operation of input
                             const YElementwiseOperation y_elementwise_op)  // Operation of output of normalization
{
    extern __shared__ XDataType p_x_lds[];
    GridwiseElementwiseReduction::Run(in_grid_2d_desc_tuple, // Descriptor tuple of inputs
                                      x_grid_desc_m_k,       // Descriptor of X
                                      gamma_grid_desc_m_k,   // Descriptor of Gamma
                                      beta_grid_desc_m_k,    // Descriptor of Beta
                                      y_grid_desc_m_k,       // Descriptor of Y
                                      num_k_block_tile_iteration,
                                      epsilon,               // epsilon
                                      p_in_global_tuple,     // Ptr tuple of inputs
                                      p_x_lds,               // Ptr of X
                                      p_gamma_global,        // Ptr of gamma
                                      p_beta_global,         // Ptr of beta
                                      p_y_global,            // Ptr of Y
                                      x_elementwise_op,      // Operation of input
                                      y_elementwise_op);     // Operation of output of normalization
};
} // namespace ck

namespace ck {
namespace tensor_operation {
namespace device {

// Y = LayerNorm(A + B, Beta, Gamma)
template <typename InDataTypeTuple,   // Datatype of inputs
          typename GammaDataType,     // Datatype of gamma
          typename BetaDataType,      // Datatype of beta
          typename AccDataType,
          typename YDataType,
          typename XElementwiseOperation,
          typename YElementwiseOperation,
          index_t Rank,
          index_t NumReduceDim,
          index_t BlockSize,
          index_t MThreadClusterSize, // Num of threads in a block on M direction
          index_t KThreadClusterSize, // Num of threads in a block on N direction
          index_t MThreadSliceSize,   // Each thread calculate rows
          index_t KThreadSliceSize,   // Each thread calculate columns
          index_t XYSrcVectorDim,     // Dimension to do reduce
          index_t XSrcVectorSize,     // Size to fetch source x
          index_t GammaSrcVectorDim,  // Dimension for gamma to do reduce
          index_t GammaSrcVectorSize, // Size to fetch source gamma
          index_t BetaSrcVectorDim,   // Dimension for beta to do reduce
          index_t BetaSrcVectorSize,  // Size to fetch source beta
          index_t YDstVectorSize>     // Size to write destination Y
struct DeviceElementwiseNormalizationImpl
    : public DeviceElementwiseNormalization<InDataTypeTuple,
                                            GammaDataType,
                                            BetaDataType,
                                            AccDataType,
                                            YDataType,
                                            XElementwiseOperation,
                                            YElementwiseOperation,
                                            Rank,
                                            NumReduceDim>
{
    static constexpr int NumInput = InDataTypeTuple::Size();

    using XDataType = YDataType;

    static_assert((KThreadSliceSize % GammaSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!");

    static_assert((KThreadSliceSize % BetaSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or beta vector sizes configuration, please check!");

    static constexpr index_t M_BlockTileSize =
        MThreadClusterSize * MThreadSliceSize; // num of rows calculated in a block
    static constexpr index_t K_BlockTileSize =
        KThreadClusterSize * KThreadSliceSize; // num of columns calculated in a block

    static auto GenerateInDataTypePointerTuple()
    {
        return generate_tuple(
            [&](auto I) {
                using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
                return static_cast<const DataType*>(nullptr);
            },
            Number<NumInput>{});
    };

    using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple());

    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
                                    const std::vector<index_t>& inStrides,
                                    int blkGroupSize,
                                    int numBlockTileIteration)
    {
        constexpr index_t NumInvariantDim  = Rank - NumReduceDim;
        static constexpr index_t numSrcDim = Rank;
        static constexpr bool reduceAllDim = (NumInvariantDim == 0);

        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    template <index_t TupleSize>
    static auto GenerateSrcGrid2dDescTuple(Number<TupleSize>)
    {
        return generate_tuple([&](auto) { return MakeSrc2dDescriptor({1}, {1}, 1, 1); },
                              Number<TupleSize>{});
    };

    using InGrid2dDescTuple = decltype(GenerateSrcGrid2dDescTuple(Number<NumInput>{}));

    using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));

    using GridwiseReduceLayernormGeneric =
        GridwiseElementwiseLayernormWelfordVariance_mk_to_mk<InDataTypePointerTuple,
                                                             XDataType, GammaDataType, BetaDataType,
                                                             YDataType, AccDataType,
                                                             XElementwiseOperation, YElementwiseOperation,
                                                             InGrid2dDescTuple, GridDesc_M_K,
                                                             BlockSize,
                                                             MThreadClusterSize, KThreadClusterSize,
                                                             MThreadSliceSize, KThreadSliceSize,
                                                             XYSrcVectorDim, XSrcVectorSize,
                                                             GammaSrcVectorDim, GammaSrcVectorSize,
                                                             BetaSrcVectorDim, BetaSrcVectorSize,
                                                             XYSrcVectorDim, YDstVectorSize,
                                                             false>;

    using GridwiseReduceLayernormSweepOnce =
        GridwiseElementwiseLayernormWelfordVariance_mk_to_mk<InDataTypePointerTuple,
                                                             XDataType, GammaDataType, BetaDataType,
                                                             YDataType, AccDataType,
                                                             XElementwiseOperation, YElementwiseOperation,
                                                             InGrid2dDescTuple, GridDesc_M_K,
                                                             BlockSize,
                                                             MThreadClusterSize, KThreadClusterSize,
                                                             MThreadSliceSize, KThreadSliceSize,
                                                             XYSrcVectorDim, XSrcVectorSize,
                                                             GammaSrcVectorDim, GammaSrcVectorSize,
                                                             BetaSrcVectorDim, BetaSrcVectorSize,
                                                             XYSrcVectorDim, YDstVectorSize,
                                                             true>;

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<index_t> lengths,
                 const std::array<std::vector<index_t>, NumInput> inStridesArray,
                 const std::vector<index_t> gammaStrides,
                 const std::vector<index_t> betaStrides,
                 const std::vector<index_t> yStrides,
                 const std::vector<index_t> reduceDims,
                 XElementwiseOperation x_elementwise_op,
                 YElementwiseOperation y_elementwise_op,
                 AccDataType epsilon,
                 const std::array<const void*, NumInput> in_dev_buffers,
                 const GammaDataType* p_gamma,
                 const BetaDataType* p_beta,
                 YDataType* p_y)
            : epsilon_(epsilon),
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              p_y_(p_y),
              x_elementwise_op_(x_elementwise_op),
              y_elementwise_op_(y_elementwise_op)
        {
            Lengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(lengths, reduceDims);
            for(int i = 0; i < NumInput; i++)
            {
                inStridesArray_[i] =
                    shuffle_tensor_dimensions<Rank, NumReduceDim>(inStridesArray[i], reduceDims);
            }
            yStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
            xStrides_     = shuffle_tensor_dimensions<Rank, NumReduceDim>(yStrides, reduceDims);
            gammaStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(gammaStrides, reduceDims);
            betaStrides_  = shuffle_tensor_dimensions<Rank, NumReduceDim>(betaStrides, reduceDims);

            in_dev_buffers_ = generate_tuple(
                [&](auto I) {
                    using DataType = remove_cvref_t<decltype(InDataTypeTuple{}[I])>;
                    return static_cast<const DataType*>(in_dev_buffers[I.value]);
                },
                Number<NumInput>{});

            long_index_t invariant_total_length;
            long_index_t reduce_total_length;

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(Lengths_);

            blkGroupSize_          = 1;
            numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;

            gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                        M_BlockTileSize * blkGroupSize_;

            in_grid_2d_desc_tuple_ = generate_tuple(
                [&](auto I) {
                    return MakeSrc2dDescriptor(
                        Lengths_, inStridesArray_[I.value], blkGroupSize_, numBlockTileIteration_);
                },
                Number<NumInput>{});

            x_grid_desc_m_k_ =
                MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_);
            gamma_grid_desc_m_k_ =
                MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_);
            beta_grid_desc_m_k_ =
                MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_);
            y_grid_desc_m_k_ =
                MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_);

            sweep_once_ =
                x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;

            if(!sweep_once_) // if not sweep once, compute memory size for matrix X in lds for
                             // store Intermediate results
            {
                int block_TileSize = M_BlockTileSize * reduce_total_length;
                x_lds_size_        = block_TileSize * sizeof(XDataType);
            }
            else
                x_lds_size_ = 0;
        }

        AccDataType epsilon_;

        InDataTypePointerTuple in_dev_buffers_;
        const GammaDataType* p_gamma_;
        const BetaDataType* p_beta_;
        YDataType* p_y_;

        std::vector<index_t> Lengths_;
        std::array<std::vector<index_t>, NumInput> inStridesArray_;
        std::vector<index_t> xStrides_;
        std::vector<index_t> gammaStrides_;
        std::vector<index_t> betaStrides_;
        std::vector<index_t> yStrides_;

        XElementwiseOperation x_elementwise_op_;
        YElementwiseOperation y_elementwise_op_;

        int blkGroupSize_;
        int numBlockTileIteration_;
        size_t gridSize_;

        InGrid2dDescTuple in_grid_2d_desc_tuple_;
        GridDesc_M_K x_grid_desc_m_k_;
        GridDesc_M_K gamma_grid_desc_m_k_;
        GridDesc_M_K beta_grid_desc_m_k_;
        GridDesc_M_K y_grid_desc_m_k_;
        bool sweep_once_;
        int x_lds_size_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto kernel_main =
                arg.sweep_once_
                    ? kernel_elementwise_layernorm<GridwiseReduceLayernormSweepOnce,
                                                   InDataTypePointerTuple, XDataType,
                                                   GammaDataType, BetaDataType, YDataType,
                                                   AccDataType, XElementwiseOperation,
                                                   YElementwiseOperation, InGrid2dDescTuple,
                                                   GridDesc_M_K>
                    : kernel_elementwise_layernorm<GridwiseReduceLayernormGeneric,
                                                   InDataTypePointerTuple, XDataType,
                                                   GammaDataType, BetaDataType, YDataType,
                                                   AccDataType, XElementwiseOperation,
                                                   YElementwiseOperation, InGrid2dDescTuple,
                                                   GridDesc_M_K>;

            float avg_time = 0;
            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.gridSize_),
                                               dim3(BlockSize),
                                               arg.x_lds_size_,
                                               arg.in_grid_2d_desc_tuple_,
                                               arg.x_grid_desc_m_k_,
                                               arg.gamma_grid_desc_m_k_,
                                               arg.beta_grid_desc_m_k_,
                                               arg.y_grid_desc_m_k_,
                                               arg.numBlockTileIteration_,
                                               arg.epsilon_,
                                               arg.in_dev_buffers_,
                                               arg.p_gamma_,
                                               arg.p_beta_,
                                               arg.p_y_,
                                               arg.x_elementwise_op_,
                                               arg.y_elementwise_op_);
            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        constexpr index_t NumInvariantDim = Rank - NumReduceDim;

        if constexpr(XYSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return false;
            }
            else
            {
                for(int i = 0; i < NumInput; i++)
                {
                    if(p_arg_->inStridesArray_[i][NumInvariantDim - 1] != 1)
                        return false;
                }

                if(p_arg_->inStridesArray_[0][NumInvariantDim - 1] != 1 &&
                   p_arg_->inStridesArray_[1][NumInvariantDim - 1] != 1)
                    return false;

                if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0)
                    return false;
            };
        }
        else
        {
            for(int i = 0; i < NumInput; i++)
            {
                if(p_arg_->inStridesArray_[i][Rank - 1] != 1)
                    return false;
            }

            if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0)
                return false;
        };

        if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0)
        {
            return false;
        }

        auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) {
            bool ret = true;

            if(!isLastDimensionCoalesced)
                ret = scalarPerVector == 1;
            else
                ret = KThreadSliceSize % scalarPerVector == 0;

            return ret;
        };

        if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize))
            return false;

        if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize))
            return false;

        // if fastest dim is not reduced
        if constexpr(XYSrcVectorDim == 0)
        {
            if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1)
                return (false);

            if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
                return (false);
        }
        else // if fastest dim is reduced
        {
            if(p_arg_->gammaStrides_[Rank - 1] != 1)
                return (false);

            if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0)
                return (false);
        }

        // if fastest dim is not reduced
        if constexpr(XYSrcVectorDim == 0)
        {
            if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1)
                return (false);

            if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0)
                return (false);
        }
        else // if fastest dim is reduced
        {
            if(p_arg_->betaStrides_[Rank - 1] != 1)
                return (false);

            if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0)
                return (false);
        }

        return true;
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> lengths,
                        const std::array<std::vector<index_t>, NumInput> inStridesArray,
                        const std::vector<index_t> gammaStrides,
                        const std::vector<index_t> betaStrides,
                        const std::vector<index_t> yStrides,
                        const std::vector<index_t> reduceDims,
                        AccDataType epsilon,
                        const std::array<const void*, NumInput> in_dev_buffers,
                        const void* p_gamma,
                        const void* p_beta,
                        void* p_y,
                        XElementwiseOperation x_elementwise_op,
                        YElementwiseOperation y_elementwise_op) override
    {
        return std::make_unique<Argument>(lengths,
                                          inStridesArray,
                                          gammaStrides,
                                          betaStrides,
                                          yStrides,
                                          reduceDims,
                                          x_elementwise_op,
                                          y_elementwise_op,
                                          epsilon,
                                          in_dev_buffers,
                                          static_cast<const GammaDataType*>(p_gamma),
                                          static_cast<const BetaDataType*>(p_beta),
                                          static_cast<YDataType*>(p_y));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceElementwiseNormalizationImpl<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "XYSrcVectorDim_" << XYSrcVectorDim << ",";
        str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

// X = Elementwise(input1, input2, input3, ...)
// Y = Normalization(X, beta, gamma)
template <typename InDataTypePointerTuple,
          typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
          typename AccDataType,
          typename XElementwiseOperation,
          typename YElementwiseOperation,
          typename InGrid2dDescTuple,
          typename GridDesc_M_K,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t XSrcVectorDim,
          index_t XSrcVectorSize,
          index_t GammaSrcVectorDim,
          index_t GammaSrcVectorSize,
          index_t BetaSrcVectorDim,
          index_t BetaSrcVectorSize,
          index_t YDstVectorDim,
          index_t YDstVectorSize,
          bool SweepOnce>
struct GridwiseElementwiseLayernormWelfordVariance_mk_to_mk
{
    static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
                      (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) ||
                      (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static constexpr index_t NumInput = InDataTypePointerTuple::Size();

    static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0);

    using ThreadClusterLengths_M_K = Sequence<MThreadClusterSize, KThreadClusterSize>;

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadClusterArrangeOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    static constexpr auto thread_cluster_desc =
        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseWelford =
        ThreadwiseWelford<AccDataType, ThreadReduceSrcDesc_M_K, ThreadReduceDstDesc_M>;

    using BlockwiseWelford = BlockwiseWelford<AccDataType,
                                              BlockSize,
                                              ThreadClusterLengths_M_K,
                                              ThreadClusterArrangeOrder>;

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

    static constexpr index_t M_BlockTileSize     = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize     = KThreadClusterSize * KThreadSliceSize;
    static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize;

    static constexpr auto XThreadBufferNumber     = Number<KThreadSliceSize / XSrcVectorSize>{};
    static constexpr auto GammaThreadBufferNumber = Number<KThreadSliceSize / GammaSrcVectorSize>{};
    static constexpr auto BetaThreadBufferNumber  = Number<KThreadSliceSize / BetaSrcVectorSize>{};
    static constexpr auto YThreadBufferNumber     = Number<KThreadSliceSize / YDstVectorSize>{};

    __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k,
                                        int thread_k_cluster_id)
    {
        int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
        int kPerThread =
            kPerBlock < K_BlockTileSize ? 0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize);
        int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize;

        if(kPerBlockTail > 0)
        {
            static_for<0, XThreadBufferNumber, 1>{}([&](auto i) {
                int thread_max_len =
                    (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i;
                int delta = thread_max_len - kPerBlockTail;
                delta     = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize);
                kPerThread += XSrcVectorSize - delta;
            });
        }

        return kPerThread;
    }

    __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple,
                               const GridDesc_M_K& x_grid_desc_m_k,
                               const GridDesc_M_K& gamma_grid_desc_m_k,
                               const GridDesc_M_K& beta_grid_desc_m_k,
                               const GridDesc_M_K& y_grid_desc_m_k,
                               index_t num_k_block_tile_iteration,
                               AccDataType epsilon,
                               const InDataTypePointerTuple p_in_global_tuple,
                               XDataType* const __restrict__ p_x_lds,
                               const GammaDataType* const __restrict__ p_gamma_global,
                               const BetaDataType* const __restrict__ p_beta_global,
                               YDataType* const __restrict__ p_y_global,
                               const XElementwiseOperation x_elementwise_op,
                               const YElementwiseOperation y_elementwise_op)
    {
        if constexpr(SweepOnce)
        {
            num_k_block_tile_iteration = 1;
        }

        const index_t thread_local_id = get_thread_local_1d_id();
        const index_t block_global_id = get_block_1d_id();
        const index_t grid_size       = get_grid_size();

        auto in_global_buf_tuple = generate_tuple(
            [&](auto I) {
                static_assert(in_grid_2d_desc_tuple[I].GetNumOfDimension() == 2); // matrix dimension

                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize());
            },
            Number<NumInput>{});

        auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_y_global, y_grid_desc_m_k.GetElementSpaceSize());

        auto x_lds_val_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            p_x_lds, x_grid_desc_m_k.GetElementSpaceSize() / grid_size);

        auto in_thread_buf_tuple = generate_tuple(
            [&](auto) {
                return generate_tuple(
                    [&](auto) {
                        return StaticBuffer<AddressSpaceEnum::Vgpr,
                                            AccDataType,
                                            MThreadSliceSize * XSrcVectorSize,
                                            true>{};
                    },
                    Number<NumInput>{});
            },
            Number<XThreadBufferNumber>{});

        auto x_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
                                    AccDataType,
                                    MThreadSliceSize * XSrcVectorSize,
                                    true>{};
            },
            Number<XThreadBufferNumber>{});

        auto gamma_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
                                    AccDataType,
                                    MThreadSliceSize * GammaSrcVectorSize,
                                    true>{};
            },
            Number<GammaThreadBufferNumber>{});

        auto beta_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
                                    AccDataType,
                                    MThreadSliceSize * BetaSrcVectorSize,
                                    true>{};
            },
            Number<BetaThreadBufferNumber>{});

        auto y_thread_buf = generate_tuple(
            [&](auto) {
                return StaticBuffer<AddressSpaceEnum::Vgpr,
                                    AccDataType,
                                    MThreadSliceSize * YDstVectorSize,
                                    true>{};
            },
            Number<YThreadBufferNumber>{});

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> mean_thread_buf;
        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> var_thread_buf;

        const auto thread_cluster_idx =
            thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id));

        const auto thread_m_cluster_id = thread_cluster_idx[I0];
        const auto thread_k_cluster_id = thread_cluster_idx[I1];

        using ThreadBufferLengths_M_K         = Sequence<MThreadSliceSize, XSrcVectorSize>;
        constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));

        auto in_global_load_tuple = generate_tuple(
            [&](auto I) {
                using DataTypePointer = remove_cvref_t<decltype(InDataTypePointerTuple{}[I])>;
                using DataType        = remove_cv_t<remove_pointer_t<DataTypePointer>>;

                return ThreadwiseTensorSliceTransfer_v2<DataType,
                                                        AccDataType,
                                                        decltype(in_grid_2d_desc_tuple[I]),
                                                        decltype(thread_buffer_desc_m_k),
                                                        ThreadBufferLengths_M_K,
                                                        ThreadBufferDimAccessOrder,
                                                        XSrcVectorDim,
                                                        XSrcVectorSize,
                                                        1,
                                                        false>{
                    in_grid_2d_desc_tuple[I],
                    make_multi_index(block_global_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize,
                                     thread_k_cluster_id * XSrcVectorSize)};
            },
            Number<NumInput>{});

        auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2<XDataType,
                                                                  AccDataType,
                                                                  GridDesc_M_K,
                                                                  decltype(thread_buffer_desc_m_k),
                                                                  ThreadBufferLengths_M_K,
                                                                  ThreadBufferDimAccessOrder,
                                                                  XSrcVectorDim,
                                                                  XSrcVectorSize,
                                                                  1,
                                                                  true>(
            x_grid_desc_m_k,
            make_multi_index(thread_m_cluster_id * MThreadSliceSize,
                             thread_k_cluster_id * XSrcVectorSize));

        auto threadwise_gamma_load =
            ThreadwiseTensorSliceTransfer_v2<GammaDataType,
                                             AccDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             GammaSrcVectorDim,
                                             GammaSrcVectorSize,
                                             1,
                                             true>(
                gamma_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * GammaSrcVectorSize));

        auto threadwise_beta_load =
            ThreadwiseTensorSliceTransfer_v2<BetaDataType,
                                             AccDataType,
                                             GridDesc_M_K,
                                             decltype(thread_buffer_desc_m_k),
                                             ThreadBufferLengths_M_K,
                                             ThreadBufferDimAccessOrder,
                                             BetaSrcVectorDim,
                                             BetaSrcVectorSize,
                                             1,
                                             true>(
                beta_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * BetaSrcVectorSize));

        using PassThrough = tensor_operation::element_wise::PassThrough;
        PassThrough pass_through_op;

        auto threadwise_x_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               XDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               GridDesc_M_K,
                                               YElementwiseOperation,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               XSrcVectorDim,
                                               XSrcVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                x_grid_desc_m_k,
                make_multi_index(thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * XSrcVectorSize),
                pass_through_op);

        auto threadwise_y_store =
            ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                               YDataType,
                                               decltype(thread_buffer_desc_m_k),
                                               GridDesc_M_K,
                                               YElementwiseOperation,
                                               ThreadBufferLengths_M_K,
                                               ThreadBufferDimAccessOrder,
                                               YDstVectorDim,
                                               YDstVectorSize,
                                               InMemoryDataOperationEnum::Set,
                                               1,
                                               true>(
                y_grid_desc_m_k,
                make_multi_index(block_global_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 thread_k_cluster_id * YDstVectorSize),
                y_elementwise_op);

        // Copy x from Cache
        // one pass: fwd, second pass: bwd
        constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
        constexpr auto thread_copy_bwd_step_m_k =
            make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);

        const auto gamma_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize());

        const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize());

        auto threadwise_welford       = ThreadwiseWelford();
        threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id);

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            mean_thread_buf(I) = type_convert<AccDataType>(0.0f);
            var_thread_buf(I)  = type_convert<AccDataType>(0.0f);
        });

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) {
                static_for<0, NumInput, 1>{}([&](auto I) {
                    // input load loop
                    in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I],
                                                in_global_buf_tuple[I],
                                                thread_buffer_desc_m_k,
                                                make_tuple(I0, I0),
                                                in_thread_buf_tuple(iK0)(I));

                    in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I],
                                                               thread_copy_fwd_step_m_k);
                });

                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                    // input add loop
                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                        constexpr auto offset_m_k =
                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));

                        // get reference to in data
                        const auto in_data_refs = generate_tie(
                            // return type should be lvalue
                            [&](auto I) -> const auto& {
                                return in_thread_buf_tuple(iK0)(I)(Number<offset_m_k>{});
                            },
                            Number<NumInput>{});

                        // get reference to dst data
                        auto out_data_refs = generate_tie(
                            // return type should be lvalue
                            [&](auto) -> auto& {
                                return x_thread_buf(iK0)(Number<offset_m_k>{});
                            },
                            I1);

                        unpack2(x_elementwise_op, out_data_refs, in_data_refs);
                    });
                });

                threadwise_welford.Run(x_thread_buf[iK0], mean_thread_buf, var_thread_buf);

                if constexpr(!SweepOnce)
                {
                    threadwise_x_store.Run(thread_buffer_desc_m_k,
                                           make_tuple(I0, I0),
                                           x_thread_buf(iK0),
                                           x_grid_desc_m_k,
                                           x_lds_val_buf);
                    threadwise_x_store.MoveDstSliceWindow(x_grid_desc_m_k,
                                                          thread_copy_fwd_step_m_k);
                }
            });
        }

        static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
            if constexpr(I > 0)
                block_sync_lds();

            int count = threadwise_welford.cur_count_;
            BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
        });

        auto thread_copy_tail_m_k =
            (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k;

        if constexpr(!SweepOnce)
            threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_tail_m_k);
        threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k);
        threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k);
        threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k);

        for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles)
        {
            if constexpr(!SweepOnce)
            {
                static_for<0, XThreadBufferNumber, 1>{}([&](auto i) {
                    threadwise_x_load.Run(x_grid_desc_m_k,
                                          x_lds_val_buf,
                                          thread_buffer_desc_m_k,
                                          make_tuple(I0, I0),
                                          x_thread_buf(i));
                    threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k,
                                                         thread_copy_fwd_step_m_k);
                });
            }

            static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) {
                threadwise_gamma_load.Run(gamma_grid_desc_m_k,
                                          gamma_global_val_buf,
                                          thread_buffer_desc_m_k,
                                          make_tuple(I0, I0),
                                          gamma_thread_buf(i));
                threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
                                                         thread_copy_fwd_step_m_k);
            });

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon);
                static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) {
                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                        constexpr auto offset_m_k =
                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));

                        // normalize
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
                            (x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
                            divisor;

                        // gamma
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
                            y_thread_buf(iK0)(Number<offset_m_k>{}) *
                            gamma_thread_buf(iK0)(Number<offset_m_k>{});
                    });
                });
            });

            static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) {
                threadwise_beta_load.Run(beta_grid_desc_m_k,
                                         beta_global_val_buf,
                                         thread_buffer_desc_m_k,
                                         make_tuple(I0, I0),
                                         beta_thread_buf(i));
                threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
                                                        thread_copy_fwd_step_m_k);
            });

            static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) {
                    static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
                        constexpr auto offset_m_k =
                            thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1));

                        // beta
                        y_thread_buf(iK0)(Number<offset_m_k>{}) =
                            y_thread_buf(iK0)(Number<offset_m_k>{}) +
                            beta_thread_buf(iK0)(Number<offset_m_k>{});
                    });
                });
            });

            static_for<0, YThreadBufferNumber, 1>{}([&](auto i) {
                threadwise_y_store.Run(thread_buffer_desc_m_k,
                                       make_tuple(I0, I0),
                                       y_thread_buf(i),
                                       y_grid_desc_m_k,
                                       y_global_val_buf);
                threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k);
            });

            if constexpr(!SweepOnce)
                threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k,
                                                     2 * thread_copy_bwd_step_m_k);
            threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k,
                                                     2 * thread_copy_bwd_step_m_k);
            threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k,
                                                    2 * thread_copy_bwd_step_m_k);
            threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k);
        }
    }
};

} // namespace ck
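The deleted kernel above accumulates per-row mean and variance with the ThreadwiseWelford and BlockwiseWelford helpers, which live elsewhere in the repository and are not part of this diff. For readers unfamiliar with the scheme, the following is only a minimal scalar sketch of Welford's online update that those helpers implement in a threadwise/blockwise form; the struct and member names here are illustrative, not the library's API:

// Scalar Welford update: one pass over the data, numerically stable,
// population variance (divide by count), matching LayerNorm's convention.
struct WelfordState
{
    int count  = 0;
    float mean = 0.f;
    float m2   = 0.f; // running sum of squared deviations from the current mean

    void update(float x)
    {
        ++count;
        const float delta = x - mean;
        mean += delta / count;
        m2 += delta * (x - mean); // note: uses the already-updated mean
    }

    float variance() const { return count > 0 ? m2 / count : 0.f; }
};

In the kernel, each thread runs such an update over its slice of the reduced dimension, and the blockwise stage then merges the per-thread (count, mean, m2) partial states before the normalization pass.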
library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

// FP16
void add_device_elementwise_normalization_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceElementwiseNormalization<ck::Tuple<F16, F16>,
                                                               F16,
                                                               F16,
                                                               F32,
                                                               F16,
                                                               element_wise::Add,
                                                               PassThrough,
                                                               2,
                                                               1>>>&);

template <typename InDataTypeTuple,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
          index_t Rank,
          index_t NumReduceDim>
struct DeviceOperationInstanceFactory<
    ck::tensor_operation::device::DeviceElementwiseNormalization<
        InDataTypeTuple,
        GammaDataType,
        BetaDataType,
        F32,
        YDataType,
        ck::tensor_operation::element_wise::Add,
        ck::tensor_operation::element_wise::PassThrough,
        Rank,
        NumReduceDim>>
{
    using DeviceOp =
        DeviceElementwiseNormalization<InDataTypeTuple,
                                       GammaDataType,
                                       BetaDataType,
                                       F32,
                                       YDataType,
                                       ck::tensor_operation::element_wise::Add,
                                       ck::tensor_operation::element_wise::PassThrough,
                                       Rank,
                                       NumReduceDim>;

    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;

        if constexpr(is_same_v<GammaDataType, F16> && is_same_v<BetaDataType, F16> &&
                     is_same_v<YDataType, F16>)
        {
            if constexpr(Rank == 2 && NumReduceDim == 1)
            {
                add_device_elementwise_normalization_rank_2_1_f16_instances(op_ptrs);
            }
        }

        return op_ptrs;
    }
};

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
deleted 100644 → 0

add_instance_library(device_elementwise_normalization_instance
    device_elementwise_normalization_f16_instance.cpp
)
library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp
deleted 100644 → 0

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16 = ck::half_t;
using F32 = float;

using Add  = ck::tensor_operation::element_wise::Add;
using Pass = ck::tensor_operation::element_wise::PassThrough;

template <typename XElementwise, typename YElementwise, index_t Rank, index_t Reduce>
// clang-format off
using device_elementwise_normalization_f16_instances = std::tuple<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 8,   32, 1,  8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 8,   32, 1,  8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 8,   32, 1,  8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 8,   32, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 4,   64, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 2,  128, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 2,  128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 2,  128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 1,  256, 1,  8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 1,  256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce,  256, 1,  256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceElementwiseNormalizationImpl<ck::Tuple<F16, F16>, F16, F16, F32, F16, XElementwise, YElementwise, Rank, Reduce, 1024, 1, 1024, 1,  8, 1, 2, 1, 2, 1, 2, 2>
    >;
// clang-format on

void add_device_elementwise_normalization_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceElementwiseNormalization<ck::Tuple<F16, F16>,
                                                               F16,
                                                               F16,
                                                               F32,
                                                               F16,
                                                               Add,
                                                               Pass,
                                                               2,
                                                               1>>>& instances)
{
    add_device_operation_instances(
        instances, device_elementwise_normalization_f16_instances<Add, Pass, 2, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
profiler/include/profile_elementwise_layernorm_impl.hpp deleted 100644 → 0 View file @ efbcc6ed

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"

namespace ck {
namespace profiler {

template <typename HostTensorA, typename HostTensorB, typename HostTensorC, typename Functor>
void host_elementwise2D(HostTensorC& C,
                        const HostTensorA& A,
                        const HostTensorB& B,
                        const std::vector<std::size_t>& shape,
                        Functor functor)
{
    using ctype = ck::remove_reference_t<decltype(C(0, 0))>;

    for(std::size_t m = 0; m < shape[0]; ++m)
        for(std::size_t n = 0; n < shape[1]; ++n)
        {
            auto a_val  = A(m, n);
            auto b_val  = B(m, n);
            ctype c_val = 0;
            functor(c_val, a_val, b_val);
            C(m, n) = c_val;
        }
}

template <typename ADataType,
          typename BDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType>
bool profile_elementwise_layernorm_impl(int do_verification,
                                        int init_method,
                                        bool do_log,
                                        bool time_kernel,
                                        std::vector<index_t> length)
{
    using Add         = ck::tensor_operation::element_wise::Add;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    if(length.size() != 2)
        return false;

    index_t M      = length[0];
    index_t N      = length[1];
    index_t Stride = N;

    constexpr int Rank         = 2;
    constexpr int NumReduceDim = 1;

    std::vector<index_t> reduce_dim      = {1};
    std::vector<index_t> gammaBetaLength = {N};
    std::vector<index_t> gammaBetaStride = {0, 1};

    auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) {
        return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                    std::vector<std::size_t>({stride, 1}));
    };

    Tensor<ADataType> a(length);
    Tensor<BDataType> b(length);
    Tensor<GammaDataType> gamma(gammaBetaLength);
    Tensor<BetaDataType> beta(gammaBetaLength);
    Tensor<YDataType> y(length);
    Tensor<YDataType> host_y(length);

    switch(init_method)
    {
    case 0:
        a.GenerateTensorValue(GeneratorTensor_1<ADataType>{});
        b.GenerateTensorValue(GeneratorTensor_1<BDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
        break;
    case 1:
        a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
        break;
    default:
        a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0, 1});
        b.GenerateTensorValue(GeneratorTensor_3<BDataType>{0, 1});
        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
    }

    DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());

    a_dev.ToDevice(a.mData.data());
    b_dev.ToDevice(b.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());
    beta_dev.ToDevice(beta.mData.data());

    std::array<const void*, 2> input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()};

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization<
        ck::Tuple<ADataType, BDataType>,
        GammaDataType,
        BetaDataType,
        AccDataType,
        YDataType,
        Add,
        PassThrough,
        2,
        1>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
            DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using XDataType = ADataType;

        std::vector<std::size_t> mn = {static_cast<unsigned long>(M),
                                       static_cast<unsigned long>(N)};

        Tensor<XDataType> x(f_host_tensor_descriptor2d(M, N, Stride));

        host_elementwise2D<Tensor<ADataType>, Tensor<BDataType>, Tensor<XDataType>, Add>(
            x, a, b, mn, Add{});

        using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm<XDataType,
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
                                                                                 AccDataType,
                                                                                 PassThrough,
                                                                                 Rank,
                                                                                 NumReduceDim>;

        ReferenceInstance ref;
        auto ref_argument =
            ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4);
        auto ref_invoker = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            length,
            {std::vector<ck::index_t>{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()},
             std::vector<ck::index_t>{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}},
            gammaBetaStride,
            gammaBetaStride,
            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
            reduce_dim,
            1e-4,
            input,
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
            Add{},
            PassThrough{});

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            continue;
        }

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) +
                                b.mDesc.GetElementSize() * sizeof(BDataType) +
                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
                                y.mDesc.GetElementSize() * sizeof(YDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            y_dev.FromDevice(y.mData.data());

            bool pass =
                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "a : ", a.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "b : ", b.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
            }

            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is tested" << std::endl;
        return false;
    }

    return true;
}

} // namespace profiler
} // namespace ck
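
For orientation, here is a minimal, hypothetical driver showing how the removed profiling helper above could be invoked on its own. The data types mirror the FP16 test further down; the standalone main(), the problem size {1024, 1024}, and the flag values are illustrative assumptions, not code from this repository.

// Hypothetical standalone driver (illustration only, not part of this repository).
#include <vector>

#include "ck/ck.hpp"
#include "profiler/include/profile_elementwise_layernorm_impl.hpp"

int main()
{
    using F16 = ck::half_t;
    using F32 = float;

    // do_verification = true, init_method = 2 (falls into the default random-init branch),
    // do_log = false, time_kernel = true, problem size M x N = 1024 x 1024.
    bool ok = ck::profiler::profile_elementwise_layernorm_impl<F16, F16, F16, F16, F32, F16>(
        true, 2, false, true, std::vector<ck::index_t>{1024, 1024});

    return ok ? 0 : 1;
}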
test/CMakeLists.txt View file @ 6ea9257e

@@ -52,4 +52,3 @@ add_subdirectory(block_to_ctile_map)
add_subdirectory(softmax)
add_subdirectory(normalization)
add_subdirectory(data_type)
add_subdirectory(elementwise_normalization)
test/elementwise_normalization/CMakeLists.txt deleted 100644 → 0 View file @ efbcc6ed

add_custom_target(test_elementwise_normalization)
add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp)
target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance)
add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16)
test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp deleted 100644 → 0 View file @ efbcc6ed

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/include/profile_elementwise_layernorm_impl.hpp"

using F16 = ck::half_t;
using F32 = float;

using ck::index_t;

template <typename Tuple>
class TestElementwiseLayernorm : public ::testing::Test
{
    protected:
    using ADataType     = std::tuple_element_t<0, Tuple>;
    using BDataType     = std::tuple_element_t<1, Tuple>;
    using GammaDataType = std::tuple_element_t<2, Tuple>;
    using BetaDataType  = std::tuple_element_t<3, Tuple>;
    using AccDataType   = std::tuple_element_t<4, Tuple>;
    using YDataType     = std::tuple_element_t<5, Tuple>;

    void Run()
    {
        // M, N
        std::vector<std::vector<ck::index_t>> lengths = {
            {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}};

        for(auto length : lengths)
        {
            bool success = ck::profiler::profile_elementwise_layernorm_impl<ADataType,
                                                                            BDataType,
                                                                            GammaDataType,
                                                                            BetaDataType,
                                                                            AccDataType,
                                                                            YDataType>(
                true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
    std::tuple<F16, F16, F16, F16, F32, F16>>;

TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes);
TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); }
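
As a reading aid (editorial illustration, not repository code): the test above exercises an elementwise Add fused with row-wise layernorm, i.e. each output row is the normalized sum of the two inputs, scaled by gamma and shifted by beta, using the 1e-4 epsilon passed through the profiler. A plain scalar sketch of that reference computation, with hypothetical raw-pointer inputs in place of the Tensor<> containers used above:

// Illustrative scalar reference only:
// y[m][n] = gamma[n] * (x - mean_m) / sqrt(var_m + eps) + beta[n], where x = a + b.
#include <cmath>
#include <cstddef>

void elementwise_layernorm_ref(const float* a, const float* b, const float* gamma,
                               const float* beta, float* y,
                               std::size_t M, std::size_t N, float eps = 1e-4f)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        // First pass over the row: accumulate mean and mean of squares of x = a + b.
        float mean = 0.f, mean_sq = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            const float x = a[m * N + n] + b[m * N + n]; // elementwise Add
            mean += x;
            mean_sq += x * x;
        }
        mean /= N;
        const float var = mean_sq / N - mean * mean;

        // Second pass: normalize, then apply gamma/beta.
        for(std::size_t n = 0; n < N; ++n)
        {
            const float x = a[m * N + n] + b[m * N + n];
            y[m * N + n]  = gamma[n] * (x - mean) / std::sqrt(var + eps) + beta[n];
        }
    }
}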
test/normalization/CMakeLists.txt View file @ 6ea9257e

@@ -3,9 +3,9 @@ add_custom_target(test_layernorm)
add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
target_link_libraries(test_layernorm2d_fp16 PRIVATE utility)
target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
@@ -14,3 +14,4 @@ add_dependencies(test_layernorm test_layernorm2d_fp32)
add_dependencies(test_layernorm test_layernorm2d_fp16)
add_dependencies(test_layernorm test_groupnorm_fp16)
add_dependencies(test_layernorm test_groupnorm_fp32)