gaoqiong / composable_kernel · Commits

Unverified commit 48c85879, authored Oct 13, 2022 by rocking5566, committed by GitHub on Oct 13, 2022.

Merge branch 'develop' into conv_quant_int8

Parents: aa71a478, 1b62bfaa

Changes: 29. Showing 20 changed files on this page, with 531 additions and 202 deletions.
client_example/05_layernorm/layernorm2d.cpp (+9, -9)
client_example/07_conv2d_fwd/CMakeLists.txt (+2, -0)
client_example/07_conv2d_fwd/conv2d_fwd.cpp (+177, -0)
example/27_layernorm/layernorm_blockwise.cpp (+21, -21)
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp (+21, -21)
include/ck/tensor_operation/gpu/device/device_normalization.hpp (+9, -36)
include/ck/tensor_operation/gpu/device/device_normalization_impl.hpp (+9, -9)
library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp (+109, -0)
library/src/tensor_operation_instance/gpu/CMakeLists.txt (+0, -1)
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt (+2, -4)
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp (+0, -61)
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp (+65, -0)
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp (+60, -0)
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt (+4, -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp (+0, -0)
library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp (+0, -0)
profiler/CMakeLists.txt (+2, -1)
profiler/include/profile_groupnorm_impl.hpp (+9, -9)
profiler/include/profile_layernorm_impl.hpp (+22, -20)
profiler/include/profile_softmax_impl.hpp (+10, -10)
client_example/05_layernorm/layernorm2d.cpp (view file @ 48c85879)

@@ -10,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"

 using XDataType     = ck::half_t;
 using GammaDataType = ck::half_t;

@@ -51,14 +51,14 @@ int main(int argc, char* argv[])
     SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
     SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size);

-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   Rank,
-                                                                   NumReduceDim>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       Rank,
+                                                                       NumReduceDim>;

     // get device op instances
     const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
client_example/07_conv2d_fwd/CMakeLists.txt (new file, 0 → 100644, view file @ 48c85879)

add_executable(client_conv2d_fwd conv2d_fwd.cpp)
target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations)
client_example/07_conv2d_fwd/conv2d_fwd.cpp (new file, 0 → 100644, view file @ 48c85879)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iomanip>
#include <iostream>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using InDataType  = ck::half_t;
using WeiDataType = ck::half_t;
using OutDataType = ck::half_t;

using InLayout  = ck::tensor_layout::convolution::NHWC;
using WeiLayout = ck::tensor_layout::convolution::KYXC;
using OutLayout = ck::tensor_layout::convolution::NHWK;

using PassThrough = ck::tensor_operation::element_wise::PassThrough;

static constexpr ck::index_t NumDimSpatial = 2;

static constexpr ck::index_t N  = 16;
static constexpr ck::index_t K  = 32;
static constexpr ck::index_t C  = 3;
static constexpr ck::index_t Y  = 3;
static constexpr ck::index_t X  = 3;
static constexpr ck::index_t Hi = 224;
static constexpr ck::index_t Wi = 224;
static constexpr ck::index_t Ho = 113;
static constexpr ck::index_t Wo = 113;

struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
    }

    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }

    void* p_mem_;
};

int main(int argc, char* argv[])
{
    std::vector<ck::index_t> in_spatial_lengths{Hi, Wi};
    std::vector<ck::index_t> filter_spatial_lengths{Y, X};
    std::vector<ck::index_t> out_spatial_lengths{Ho, Wo};
    std::vector<ck::index_t> filter_strides{2, 2};
    std::vector<ck::index_t> filter_dilations{1, 1};
    std::vector<ck::index_t> input_left_pads{2, 2};
    std::vector<ck::index_t> input_right_pads{2, 2};

    SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C);
    SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C);
    SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K);

    using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NumDimSpatial,
                                                                 InLayout,
                                                                 WeiLayout,
                                                                 OutLayout,
                                                                 InDataType,
                                                                 WeiDataType,
                                                                 OutDataType,
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>;

    // get device op instances
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    std::string best_op_name;
    int best_op_id        = -1;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;
    float best_tflops     = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;

    for(int i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr = op_ptrs[i];

        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
                                                        in_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        out_spatial_lengths,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        PassThrough{},
                                                        PassThrough{},
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X;

            std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C +
                                    sizeof(WeiDataType) * K * Y * X * C +
                                    sizeof(OutDataType) * N * Ho * Wo * K;

            float tflops     = static_cast<float>(flop) / 1.E9 / avg_time;
            float gb_per_sec = num_bytes / 1.E6 / avg_time;

            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
                      << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;

            if(tflops > best_tflops)
            {
                best_op_id      = i;
                best_op_name    = op_name;
                best_avg_time   = avg_time;
                best_gb_per_sec = gb_per_sec;
                best_tflops     = tflops;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops
              << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;

    // run the best instance
    {
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
                                                        wei.GetDeviceBuffer(),
                                                        out.GetDeviceBuffer(),
                                                        N,
                                                        K,
                                                        C,
                                                        in_spatial_lengths,
                                                        filter_spatial_lengths,
                                                        out_spatial_lengths,
                                                        filter_strides,
                                                        filter_dilations,
                                                        input_left_pads,
                                                        input_right_pads,
                                                        PassThrough{},
                                                        PassThrough{},
                                                        PassThrough{});

        auto invoker_ptr = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
\ No newline at end of file
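A quick sanity check of the hard-coded shape and the performance formulas in this file (arithmetic only; every constant comes from the file itself): with Hi = 224, padding 2 on each side, a 3×3 filter, dilation 1, and stride 2, the output extent is

    Ho = (Hi + pad_left + pad_right - dilation * (Y - 1) - 1) / stride + 1
       = (224 + 2 + 2 - 2 - 1) / 2 + 1 = 113   (integer division)

which matches the hard-coded Ho = Wo = 113. The work estimate is flop = 2 * N * K * C * Ho * Wo * Y * X = 2 * 16 * 32 * 3 * 113 * 113 * 3 * 3 ≈ 3.53e8, about 0.35 GFLOP per run, and num_bytes ≈ 4.82 MB (input) + 1.7 KB (weights) + 13.1 MB (output) ≈ 17.9 MB for fp16. Dividing flop by 1.E9 and by the runtime in milliseconds yields TFLOPS (flop / (ms · 1e-3 s) / 1e12 = flop / 1e9 / ms), and num_bytes / 1.E6 / ms is MB per ms, i.e. GB/s, so the printed units are consistent.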
example/27_layernorm/layernorm_blockwise.cpp (view file @ 48c85879)

@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/library/utility/check_err.hpp"

@@ -30,26 +30,26 @@ constexpr int Rank = 2;
 constexpr int NumReduceDim = 1;

 using DeviceInstance =
-    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
-                                                      GammaDataType,
-                                                      BetaDataType,
-                                                      AccDataType,
-                                                      YDataType,
-                                                      PassThrough,
-                                                      Rank,
-                                                      NumReduceDim,
-                                                      256, // BlockSize
-                                                      8,   // ClusterM
-                                                      32,  // ClusterK
-                                                      1,   // SliceM
-                                                      8,   // SliceK
-                                                      1,   // SrcVecDim (0=M, 1=K)
-                                                      8,   // SrcScalarPerVector
-                                                      1,   // GammaVecDim (0=M, 1=K)
-                                                      8,   // GammaScalarPerVector
-                                                      1,   // BetaVecDim (0=M, 1=K)
-                                                      8,   // BetaScalarPerVector
-                                                      8>;  // OutScalarPerVector
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          AccDataType,
+                                                          YDataType,
+                                                          PassThrough,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          256, // BlockSize
+                                                          8,   // ClusterM
+                                                          32,  // ClusterK
+                                                          1,   // SliceM
+                                                          8,   // SliceK
+                                                          1,   // SrcVecDim (0=M, 1=K)
+                                                          8,   // SrcScalarPerVector
+                                                          1,   // GammaVecDim (0=M, 1=K)
+                                                          8,   // GammaScalarPerVector
+                                                          1,   // BetaVecDim (0=M, 1=K)
+                                                          8,   // BetaScalarPerVector
+                                                          8>;  // OutScalarPerVector

 int main()
 {
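As a reading aid for the tuning parameters above (this decomposition is inferred from the parameter names, not stated in the diff): the 256 threads of a block appear to be arranged as an 8 × 32 grid, with ClusterM × ClusterK = BlockSize, and with SliceM = 1, SliceK = 8 each block covers a tile of (8 × 1) M-rows by (32 × 8) = 256 reduced elements per pass; SrcScalarPerVector = 8 then presumably requires the innermost (K) extent to be a multiple of 8 for vectorized fp16 loads.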
example/42_groupnorm/groupnorm_sigmoid_fp16.cpp (view file @ 48c85879)

@@ -9,7 +9,7 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/library/utility/fill.hpp"

@@ -47,26 +47,26 @@ struct YElementOp
 };

 using DeviceInstance =
-    ck::tensor_operation::device::DeviceLayernormImpl<XDataType,
-                                                      GammaDataType,
-                                                      BetaDataType,
-                                                      AccDataType,
-                                                      YDataType,
-                                                      YElementOp,
-                                                      Rank,
-                                                      NumReduceDim,
-                                                      1024, // BlockSize
-                                                      1,    // ClusterM
-                                                      1024, // ClusterK
-                                                      1,    // SliceM
-                                                      32,   // SliceK
-                                                      1,    // SrcVecDim (0=M, 1=K)
-                                                      2,    // SrcScalarPerVector
-                                                      1,    // GammaVecDim (0=M, 1=K)
-                                                      2,    // GammaScalarPerVector
-                                                      1,    // BetaVecDim (0=M, 1=K)
-                                                      2,    // BetaScalarPerVector
-                                                      2>;   // OutScalarPerVector
+    ck::tensor_operation::device::DeviceNormalizationImpl<XDataType,
+                                                          GammaDataType,
+                                                          BetaDataType,
+                                                          AccDataType,
+                                                          YDataType,
+                                                          YElementOp,
+                                                          Rank,
+                                                          NumReduceDim,
+                                                          1024, // BlockSize
+                                                          1,    // ClusterM
+                                                          1024, // ClusterK
+                                                          1,    // SliceM
+                                                          32,   // SliceK
+                                                          1,    // SrcVecDim (0=M, 1=K)
+                                                          2,    // SrcScalarPerVector
+                                                          1,    // GammaVecDim (0=M, 1=K)
+                                                          2,    // GammaScalarPerVector
+                                                          1,    // BetaVecDim (0=M, 1=K)
+                                                          2,    // BetaScalarPerVector
+                                                          2>;   // OutScalarPerVector

 int main(int argc, char* argv[])
 {
include/ck/tensor_operation/gpu/device/device_normalization.hpp (view file @ 48c85879)

@@ -11,33 +11,6 @@
 namespace ck {
 namespace tensor_operation {
 namespace device {

-struct DeviceNormalization : public BaseOperator
-{
-    // inLengths: input tensor extent(s) from high to low dimension
-    // inStrides: input tensor stride(s) from high to low dimension
-    // reduceDims: the dimension(s) the normalization operation is applied
-    // alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType
-    // beta: typeless pointer in host memory storing the beta scaling value of type AccDataType
-    // in_dev: typeless const pointer in device memory storing the input tensor
-    // out_dev: typeless pointer in device memory storing the output tensor
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
-                                                              const std::vector<index_t> inStrides,
-                                                              const std::vector<int> reduceDims,
-                                                              const void* alpha,
-                                                              const void* beta,
-                                                              const void* in_dev,
-                                                              void* out_dev) = 0;
-
-    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-
-    virtual index_t GetRank() const = 0;
-
-    virtual index_t GetNumReduceDim() const = 0;
-};
-
-using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization>;
-
 template <typename XDataType,
           typename GammaDataType,
           typename BetaDataType,

@@ -46,7 +19,7 @@ template <typename XDataType,
           typename AccElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceLayernorm : public BaseOperator
+struct DeviceNormalization : public BaseOperator
 {
     virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> lengths,

@@ -73,14 +46,14 @@ template <typename XDataType,
           typename AccElementwiseOperation,
           index_t Rank,
           index_t NumReduceDim>
-using DeviceLayernormPtr = std::unique_ptr<DeviceLayernorm<XDataType,
-                                                           GammaDataType,
-                                                           BetaDataType,
-                                                           AccDataType,
-                                                           YDataType,
-                                                           AccElementwiseOperation,
-                                                           Rank,
-                                                           NumReduceDim>>;
+using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization<XDataType,
+                                                                   GammaDataType,
+                                                                   BetaDataType,
+                                                                   AccDataType,
+                                                                   YDataType,
+                                                                   AccElementwiseOperation,
+                                                                   Rank,
+                                                                   NumReduceDim>>;

 } // namespace device
 } // namespace tensor_operation
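Net effect of this file's diff: the old type-erased DeviceNormalization base class (runtime rank via inLengths/inStrides vectors, host-side alpha/beta scaling pointers, and GetRank/GetNumReduceDim queries) is deleted, and the templated DeviceLayernorm, which fixes the data types, Rank, and NumReduceDim at compile time, is renamed to take over the DeviceNormalization name; DeviceNormalizationPtr is rebound to the templated type accordingly.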
include/ck/tensor_operation/gpu/device/device_layernorm_impl.hpp → include/ck/tensor_operation/gpu/device/device_normalization_impl.hpp (view file @ 48c85879)

@@ -75,14 +75,14 @@ template <typename XDataType,
           index_t BetaSrcVectorDim,
           index_t BetaSrcVectorSize,
           index_t YDstVectorSize>
-struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
-                                                    GammaDataType,
-                                                    BetaDataType,
-                                                    AccDataType,
-                                                    YDataType,
-                                                    AccElementwiseOperation,
-                                                    Rank,
-                                                    NumReduceDim>
+struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
+                                                            GammaDataType,
+                                                            BetaDataType,
+                                                            AccDataType,
+                                                            YDataType,
+                                                            AccElementwiseOperation,
+                                                            Rank,
+                                                            NumReduceDim>
 {
     static_assert(((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) ||

@@ -452,7 +452,7 @@ struct DeviceLayernormImpl : public DeviceLayernorm<XDataType,
         auto str = std::stringstream();
         // clang-format off
-        str << "DeviceLayernormImpl<" << BlockSize << ",";
+        str << "DeviceNormalizationImpl<" << BlockSize << ",";
         str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
         str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
         str << "XYSrcVectorDim_" << XYSrcVectorDim << ",";
library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp → library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp (view file @ 48c85879)

@@ -18,24 +18,24 @@ namespace device {
 namespace instance {

 // FP16
-void add_device_layernorm_rank_2_1_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
+void add_device_normalization_rank_2_1_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);

-void add_device_layernorm_rank_4_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
+void add_device_normalization_rank_4_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);

-void add_device_layernorm_rank_5_3_f16_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
+void add_device_normalization_rank_5_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);

 // FP32
-void add_device_layernorm_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
+void add_device_normalization_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);

-void add_device_layernorm_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
+void add_device_normalization_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);

-void add_device_layernorm_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
+void add_device_normalization_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);

 template <typename XDataType,
           typename GammaDataType,

@@ -43,24 +43,24 @@ template <typename XDataType,
           typename YDataType,
           index_t Rank,
           index_t NumReduceDim>
-struct DeviceOperationInstanceFactory<
-    ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                  GammaDataType,
-                                                  BetaDataType,
-                                                  F32,
-                                                  YDataType,
-                                                  ck::tensor_operation::element_wise::PassThrough,
-                                                  Rank,
-                                                  NumReduceDim>>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                      GammaDataType,
+                                                      BetaDataType,
+                                                      F32,
+                                                      YDataType,
+                                                      ck::tensor_operation::element_wise::PassThrough,
+                                                      Rank,
+                                                      NumReduceDim>>
 {
-    using DeviceOp = DeviceLayernorm<XDataType,
-                                     GammaDataType,
-                                     BetaDataType,
-                                     F32,
-                                     YDataType,
-                                     ck::tensor_operation::element_wise::PassThrough,
-                                     Rank,
-                                     NumReduceDim>;
+    using DeviceOp = DeviceNormalization<XDataType,
+                                         GammaDataType,
+                                         BetaDataType,
+                                         F32,
+                                         YDataType,
+                                         ck::tensor_operation::element_wise::PassThrough,
+                                         Rank,
+                                         NumReduceDim>;

     static auto GetInstances()
     {

@@ -71,15 +71,15 @@ struct DeviceOperationInstanceFactory<
         {
             if constexpr(Rank == 2 && NumReduceDim == 1)
             {
-                add_device_layernorm_rank_2_1_f16_instances(op_ptrs);
+                add_device_normalization_rank_2_1_f16_instances(op_ptrs);
             }
             else if constexpr(Rank == 4 && NumReduceDim == 3)
             {
-                add_device_layernorm_rank_4_3_f16_instances(op_ptrs);
+                add_device_normalization_rank_4_3_f16_instances(op_ptrs);
             }
             else if constexpr(Rank == 5 && NumReduceDim == 3)
             {
-                add_device_layernorm_rank_5_3_f16_instances(op_ptrs);
+                add_device_normalization_rank_5_3_f16_instances(op_ptrs);
             }
         }
         else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&

@@ -87,15 +87,15 @@ struct DeviceOperationInstanceFactory<
         {
             if constexpr(Rank == 2 && NumReduceDim == 1)
             {
-                add_device_layernorm_rank_2_1_f32_instances(op_ptrs);
+                add_device_normalization_rank_2_1_f32_instances(op_ptrs);
             }
             else if constexpr(Rank == 4 && NumReduceDim == 3)
             {
-                add_device_layernorm_rank_4_3_f32_instances(op_ptrs);
+                add_device_normalization_rank_4_3_f32_instances(op_ptrs);
             }
             else if constexpr(Rank == 5 && NumReduceDim == 3)
             {
-                add_device_layernorm_rank_5_3_f32_instances(op_ptrs);
+                add_device_normalization_rank_5_3_f32_instances(op_ptrs);
             }
         }
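For orientation, this factory specialization is what client code instantiates to enumerate the registered kernels; a minimal sketch, mirroring client_example/05_layernorm and assuming fp16 data with rank 2 and one reduced dimension:

    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<
        ck::half_t, ck::half_t, ck::half_t, float, ck::half_t,
        ck::tensor_operation::element_wise::PassThrough, 2, 1>;

    // Rank == 2 && NumReduceDim == 1 selects the
    // add_device_normalization_rank_2_1_f16_instances list declared above.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();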
library/src/tensor_operation_instance/gpu/CMakeLists.txt (view file @ 48c85879)

@@ -17,7 +17,6 @@ IF(IS_DIRECTORY "${subdir_path}")
   ENDIF()
 ENDFOREACH()

 add_library(device_operations STATIC ${CK_DEVICE_INSTANCES})
 add_library(composablekernels::device_operations ALIAS device_operations)
library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt (view file @ 48c85879)

 add_instance_library(device_normalization_instance
-    device_layernorm_f16_instance.cpp
-    device_layernorm_f32_instance.cpp
-    device_softmax_f32_f32_instance.cpp
-    device_softmax_f16_f16_instance.cpp
+    device_normalization_f16_instance.cpp
+    device_normalization_f32_instance.cpp
 )
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp (deleted, 100644 → 0, view file @ aa71a478)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;

template <typename OutElementwise, index_t Rank, index_t Reduce>
using device_layernorm_f16_instances = std::tuple<
    // clang-format off
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,       // fallback kernel
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,       // fallback kernel
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,       // fallback kernel
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2>
    // clang-format on
    >;

void add_device_layernorm_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{});
}

void add_device_layernorm_rank_4_3_f16_instances(
    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{});
}

void add_device_layernorm_rank_5_3_f16_instances(
    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp (new file, 0 → 100644, view file @ 48c85879)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F16  = ck::half_t;
using F32  = float;
using Pass = ck::tensor_operation::element_wise::PassThrough;

template <typename OutElementwise, index_t Rank, index_t Reduce>
// clang-format off
using device_normalization_f16_instances = std::tuple<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,       // fallback kernel
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,       // fallback kernel
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,       // fallback kernel
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
    DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 2, 1, 2, 1, 2, 2>
    >;
// clang-format on

void add_device_normalization_rank_2_1_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
}

void add_device_normalization_rank_4_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
}

void add_device_normalization_rank_5_3_f16_instances(
    std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
{
    add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
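Why the list registers narrow "fallback kernel" variants (vector widths 1, 2, 4) alongside the wide vector-8 configurations: an instance's vector width must evenly divide the innermost extent of the tensors it reads and writes, and callers filter candidates with IsSupportedArgument (as the client examples above do), so the narrow-vector fallbacks keep odd problem sizes working while the wide-vector instances win on aligned shapes. A hedged sketch of the kind of check involved (illustrative only; not the actual CK implementation):

    // Hypothetical helper: an instance built with ScalarPerVector = v is
    // only usable when the innermost (contiguous) length is a multiple of v.
    bool vector_width_ok(ck::index_t inner_length, ck::index_t scalar_per_vector)
    {
        return inner_length % scalar_per_vector == 0;
    }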
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp → library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp (view file @ 48c85879)

@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/device_layernorm_impl.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization_impl.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

@@ -20,33 +20,36 @@ template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_layernorm_f32_instances = std::tuple<
     // clang-format off
     // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,       // fallback kernel
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,       // fallback kernel
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,       // fallback kernel
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,       // fallback kernel
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>
     // clang-format on
     >;

-void add_device_layernorm_rank_2_1_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
+void add_device_normalization_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
 }

-void add_device_layernorm_rank_4_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
+void add_device_normalization_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
 }

-void add_device_layernorm_rank_5_3_f32_instances(
-    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
+void add_device_normalization_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
 {
     add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
 }
library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt (new file, 0 → 100644, view file @ 48c85879)

add_instance_library(device_softmax_instance
    device_softmax_f16_f16_instance.cpp
    device_softmax_f32_f32_instance.cpp
)
library/src/tensor_operation_instance/gpu/normalization/device_softmax_f16_f16_instance.cpp → library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp (view file @ 48c85879): file moved.

library/src/tensor_operation_instance/gpu/normalization/device_softmax_f32_f32_instance.cpp → library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp (view file @ 48c85879): file moved.
profiler/CMakeLists.txt (view file @ 48c85879)

@@ -25,7 +25,7 @@ set(PROFILER_SOURCE
     src/profile_reduce.cpp
     src/profile_groupnorm.cpp
     src/profile_layernorm.cpp
-    src/profile_normalization.cpp
+    src/profile_softmax.cpp
 )

 add_executable(ckProfiler ${PROFILER_SOURCE})

@@ -55,4 +55,5 @@ target_link_libraries(ckProfiler PRIVATE device_conv3d_bwd_weight_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
 target_link_libraries(ckProfiler PRIVATE device_normalization_instance)
+target_link_libraries(ckProfiler PRIVATE device_softmax_instance)
 target_link_libraries(ckProfiler PRIVATE device_reduce_instance)
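Taken together with the instance-library CMake changes above, the softmax kernels now build into their own device_softmax_instance library (see softmax/CMakeLists.txt), so ckProfiler links device_normalization_instance and device_softmax_instance separately instead of getting the softmax instances from the normalization library.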
profiler/include/profile_groupnorm_impl.hpp (view file @ 48c85879)

@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"

@@ -75,14 +75,14 @@ bool profile_groupnorm_impl(int do_verification,
     beta_dev.ToDevice(beta.mData.data());

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   5,
-                                                                   3>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       5,
+                                                                       3>;

     // get device op instances
     const auto instance_ptrs =
profiler/include/profile_layernorm_impl.hpp (view file @ 48c85879)

@@ -7,7 +7,7 @@
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"
+#include "ck/library/tensor_operation_instance/gpu/normalization.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"

@@ -28,27 +28,29 @@ void profile_layernorm_impl(int do_verification,
                             int init_method,
                             bool do_log,
                             bool time_kernel,
-                            std::vector<index_t> length,
-                            std::vector<index_t> strideXY,
-                            std::vector<index_t> strideGamma,
-                            std::vector<index_t> strideBeta)
+                            std::vector<index_t> length)
 {
     using PassThrough = ck::tensor_operation::element_wise::PassThrough;

     if(length.size() < 2)
         return;

-    // Assume normalize dimension except for first dimension
+    // Assume normalize dimension except for batch (first) dimension
     std::vector<index_t> reduce_length{length.begin() + 1, length.end()};
     std::vector<index_t> reduce_dim;
     for(int i = 1; i < Rank; ++i)
         reduce_dim.push_back(i);

     Tensor<XDataType> x(length);
-    Tensor<GammaDataType> gamma(reduce_length, strideGamma);
-    Tensor<BetaDataType> beta(reduce_length, strideBeta);
-    Tensor<YDataType> y(length, strideXY);
-    Tensor<YDataType> host_y(length, strideXY);
+    Tensor<GammaDataType> gamma(reduce_length);
+    Tensor<BetaDataType> beta(reduce_length);
+    Tensor<YDataType> y(length);
+    Tensor<YDataType> host_y(length);
+
+    std::vector<index_t> strideXY =
+        std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()};
+    std::vector<index_t> strideGammaBeta = strideXY;
+    strideGammaBeta[0]                   = 0;

     switch(init_method)
     {

@@ -84,14 +86,14 @@ void profile_layernorm_impl(int do_verification,
     constexpr int NumReduceDim = Rank - 1;

     // add device normalization instances
-    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
-                                                                   GammaDataType,
-                                                                   BetaDataType,
-                                                                   AccDataType,
-                                                                   YDataType,
-                                                                   PassThrough,
-                                                                   Rank,
-                                                                   NumReduceDim>;
+    using DeviceOp = ck::tensor_operation::device::DeviceNormalization<XDataType,
+                                                                       GammaDataType,
+                                                                       BetaDataType,
+                                                                       AccDataType,
+                                                                       YDataType,
+                                                                       PassThrough,
+                                                                       Rank,
+                                                                       NumReduceDim>;

     // get device op instances
     const auto instance_ptrs =

@@ -126,8 +128,8 @@ void profile_layernorm_impl(int do_verification,
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                           strideXY,
-                                                          strideGamma,
-                                                          strideBeta,
+                                                          strideGammaBeta,
+                                                          strideGammaBeta,
                                                           strideXY,
                                                           reduce_dim,
                                                           1e-4,
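The strideGammaBeta[0] = 0 assignment above deserves a note: reusing the x strides but zeroing the leading (batch) stride lets the rank-(Rank-1) gamma/beta tensors be addressed like rank-Rank tensors that repeat across the batch. An illustrative sketch of the addressing this implies (assumed semantics, not CK code):

    // Hypothetical 2-D case: x is M x K with strides {K, 1}; gamma holds K values.
    // Passing strides {0, 1} for gamma lets the same indexing code read gamma[k]
    // for every row m:
    //   x_offset(m, k)     = m * K + k
    //   gamma_offset(m, k) = m * 0 + k   // broadcast across the batch dimension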
profiler/include/profile_normalization_impl.hpp → profiler/include/profile_softmax_impl.hpp (view file @ 48c85879)

@@ -69,16 +69,16 @@ template <> std::string type_to_string<int32_t>() { return "int32"; }
 // clang-format on

 template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
-void profile_normalization_impl(int do_verification,
-                                int init_method,
-                                bool do_log,
-                                bool time_kernel,
-                                std::vector<index_t> in_length,
-                                std::vector<index_t> in_strides,
-                                std::vector<index_t> reduce_dims,
-                                AccDataType alpha,
-                                AccDataType beta,
-                                NormType norm_type)
+void profile_softmax_impl(int do_verification,
+                          int init_method,
+                          bool do_log,
+                          bool time_kernel,
+                          std::vector<index_t> in_length,
+                          std::vector<index_t> in_strides,
+                          std::vector<index_t> reduce_dims,
+                          AccDataType alpha,
+                          AccDataType beta,
+                          NormType norm_type)
 {
     if(Rank != in_length.size())
     {