gaoqiong / composable_kernel

Commit 24f99138, authored Sep 20, 2022 by wangshaojie6

    Merge branch 'develop' into att_with_MNKOPadding

    parents 31d2d52a 4eba345f

Changes: 42 files changed in this commit; showing 20 changed files with 1265 additions and 176 deletions (+1265, -176).
include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp                                   +339  -0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp            +1    -0
include/ck/utility/span.hpp                                                                 +67   -0
library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp           +191  -0
library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp                      +36   -12
library/include/ck/library/utility/check_err.hpp                                            +25   -13
library/include/ck/library/utility/fill.hpp                                                 +12   -0
library/include/ck/library/utility/host_tensor.hpp                                          +45   -13
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp   +25   -19
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp   +23   -17
profiler/CMakeLists.txt                                                                     +1    -0
profiler/include/profile_groupnorm_impl.hpp                                                 +207  -0
profiler/include/profile_layernorm_impl.hpp                                                 +21   -56
profiler/src/profile_groupnorm.cpp                                                          +106  -0
profiler/src/profile_layernorm.cpp                                                          +2    -8
profiler/src/profiler.cpp                                                                   +26   -19
test/layernorm/CMakeLists.txt                                                               +13   -6
test/layernorm/test_groupnorm_fp16.cpp                                                      +56   -0
test/layernorm/test_groupnorm_fp32.cpp                                                      +56   -0
test/layernorm/test_layernorm2d_fp16.cpp                                                    +13   -13
include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp  0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <functional>
#include <numeric>
#include <iterator>

#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {

template <typename GridwisePermute,
          typename InGridDesc,
          typename OutGridDesc,
          typename InDataType,
          typename OutDataType,
          typename ElementwiseOperation,
          typename Block2TileMap>
__global__ void kernel_nd_permute(const InGridDesc in_grid_desc,
                                  const OutGridDesc out_grid_desc,
                                  const InDataType* p_in_global,
                                  OutDataType* p_out_global,
                                  const ElementwiseOperation elementwise_op,
                                  const Block2TileMap block_2_tile_map)
{
    __shared__ char p_shared[GridwisePermute::GetSharedMemoryNumberOfByte()];

    GridwisePermute::Run(in_grid_desc,
                         out_grid_desc,
                         p_in_global,
                         p_out_global,
                         p_shared,
                         elementwise_op,
                         block_2_tile_map);
}

template <typename InGridDesc,
          typename OutGridDesc,
          typename InDataType,
          typename OutDataType,
          typename ElementwiseOperation,
          index_t BlockSize,
          index_t NPerBlock,
          index_t HPerBlock,
          index_t WPerBlock,
          index_t InBlockLdsExtraW,
          typename InBlockTransferThreadClusterLengths,
          typename InBlockTransferThreadClusterArrangeOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector>
struct GridwisePermute
{
    static_assert(InGridDesc::GetNumOfDimension() == OutGridDesc::GetNumOfDimension());
    static_assert(3 <= InGridDesc::GetNumOfDimension());
    static_assert((InGridDesc::GetNumOfDimension() - 2) <= SrcVectorDim &&
                  SrcVectorDim < InGridDesc::GetNumOfDimension());
    static_assert((OutGridDesc::GetNumOfDimension() - 2) <= DstVectorDim &&
                  DstVectorDim < OutGridDesc::GetNumOfDimension());
    static_assert(SrcVectorDim != DstVectorDim);

    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};

    using ThisThreadBlock = ThisThreadBlock<BlockSize>;

    struct Block2TileMap
    {
        static constexpr index_t NumDim = InGridDesc::GetNumOfDimension();
        static_assert(3 <= NumDim);

        static constexpr auto I0 = Number<0>{};

        Block2TileMap()                     = delete;
        Block2TileMap(const Block2TileMap&) = default;
        Block2TileMap(Block2TileMap&&)      = delete;

        ~Block2TileMap() = default;

        Block2TileMap& operator=(const Block2TileMap&) = delete;
        Block2TileMap& operator=(Block2TileMap&&) = delete;

        explicit Block2TileMap(const InGridDesc& desc) : desc_(desc) {}

        __host__ constexpr index_t CalculateGridSize(const InGridDesc& desc) const
        {
            const auto N0 =
                math::integer_divide_ceil(desc.GetLength(Number<NumDim - 3>{}), NPerBlock);
            const auto H0 =
                math::integer_divide_ceil(desc.GetLength(Number<NumDim - 2>{}), HPerBlock);
            const auto W0 =
                math::integer_divide_ceil(desc.GetLength(Number<NumDim - 1>{}), WPerBlock);

            const index_t grid_size = N0 * H0 * W0;

            return grid_size;
        }

        template <typename TopIdx>
        __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
        {
            static_assert(TopIdx::Size() == 1);

            auto block_1d_id = idx_top[I0];

            const auto N0 =
                math::integer_divide_ceil(desc_.GetLength(Number<NumDim - 3>{}), NPerBlock);
            const auto H0 =
                math::integer_divide_ceil(desc_.GetLength(Number<NumDim - 2>{}), HPerBlock);
            const auto W0 =
                math::integer_divide_ceil(desc_.GetLength(Number<NumDim - 1>{}), WPerBlock);

            block_1d_id = block_1d_id % (N0 * H0 * W0);

            index_t idx_N0 = block_1d_id / (H0 * W0);
            index_t idx_H0 = (block_1d_id % (H0 * W0)) / W0;
            index_t idx_W0 = block_1d_id % W0;

            return make_tuple(idx_N0, idx_H0, idx_W0);
        }

        private:
        const InGridDesc desc_;
    };

    using DefaultBlock2TileMap = Block2TileMap;

    // use an [NPerBlock, HPerBlock, WPerBlock] tensor as element-copy relay
    __host__ __device__ static constexpr auto GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock()
    {
        return make_naive_tensor_descriptor(
            make_tuple(Number<NPerBlock>{}, Number<HPerBlock>{}, Number<WPerBlock>{}),
            make_tuple(Number<HPerBlock * (WPerBlock + InBlockLdsExtraW)>{},
                       Number<WPerBlock + InBlockLdsExtraW>{},
                       I1));
    }

    // for N-dimension descriptor, reserve its last 2 dimensions, then merge its leading dimensions
    // into single one. finally, form a 3D descriptor: [d(0), d(1), ..., d(N - 2), d(N - 1)] ->
    // [(d(0) x d(1) x ...), d(N - 2), d(N - 1)]
    template <typename GridDesc>
    __host__ __device__ static constexpr auto GetMergedDesc(const GridDesc& desc)
    {
        constexpr index_t NumDim = GridDesc::GetNumOfDimension();
        static_assert(3 <= NumDim);

        const auto merged_desc = transform_tensor_descriptor(
            desc,
            make_tuple(make_merge_transform(generate_tuple(
                           [&](auto I) { return desc.GetLength(I); }, Number<NumDim - 2>{})),
                       make_pass_through_transform(desc.GetLength(Number<NumDim - 2>{})),
                       make_pass_through_transform(desc.GetLength(Number<NumDim - 1>{}))),
            make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim - 2>{}),
                       Sequence<NumDim - 2>{},
                       Sequence<NumDim - 1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
        return merged_desc;
    }

    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
    {
        constexpr auto in_block_desc_nperblock_hperblock_wperblock =
            GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock();

        return in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize() *
               sizeof(InDataType);
    }

    __host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGridDesc& desc)
    {
        return DefaultBlock2TileMap{desc};
    }

    __host__ __device__ static constexpr bool CheckValidity(const InGridDesc& in_grid_desc,
                                                            const OutGridDesc& out_grid_desc)
    {
        constexpr index_t NumDim = InGridDesc::GetNumOfDimension();

        // check if we only swap last 2 dimensions
        bool valid = true;
        static_for<0, NumDim - 2, 1>{}([&](auto I) {
            if(valid && in_grid_desc.GetLength(I) != out_grid_desc.GetLength(I))
            {
                valid = false;
            }
        });

        return valid &&
               (in_grid_desc.GetLength(Number<NumDim - 1>{}) ==
                out_grid_desc.GetLength(Number<NumDim - 2>{})) &&
               (in_grid_desc.GetLength(Number<NumDim - 2>{}) ==
                out_grid_desc.GetLength(Number<NumDim - 1>{}));
    }

    template <typename Block2TileMap>
    __device__ static void Run(const InGridDesc in_grid_desc,
                               const OutGridDesc out_grid_desc,
                               const InDataType* p_in_global,
                               OutDataType* p_out_global,
                               void* __restrict__ p_shared,
                               const ElementwiseOperation elementwise_op,
                               const Block2TileMap& block_2_tile_map)
    {
        auto in_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_global, in_grid_desc.GetElementSpaceSize());

        auto out_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_out_global, out_grid_desc.GetElementSpaceSize());

        // each workgroup handles an [NPerBlock, HPerBlock, WPerBLock] slice-transpose problem
        const auto block_work_idx =
            block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id()));

        const index_t n_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock);

        const index_t h_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I1] * HPerBlock);

        const index_t w_block_data_idx_on_grid =
            __builtin_amdgcn_readfirstlane(block_work_idx[I2] * WPerBlock);

        // create [NPerBlock, HPerBlock, WPerBLock] shaped LDS buffer
        constexpr auto in_block_desc_nperblock_hperblock_wperblock =
            GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock();

        auto in_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<InDataType*>(p_shared),
            in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize());

        using BlockSliceLengths          = Sequence<NPerBlock, HPerBlock, WPerBlock>;
        using InBlockTransferAccessOrder = Sequence<0, 1, 2>;

        constexpr index_t SrcVectorDimAfterMerge =
            SrcVectorDim - (InGridDesc::GetNumOfDimension() - 3);
        constexpr index_t DstVectorDimAfterMerge = SrcVectorDimAfterMerge;

        using ck::tensor_operation::element_wise::PassThrough;

        // merge input descriptor into [(in_grid_desc.GetLength(0) x in_grid_desc.GetLength(1) x
        // ...), in_grid_desc.GetLength(NumDim - 2), in_grid_desc.GetLength(NumDim - 1)]
        const auto in_grid_desc_n_h_w = GetMergedDesc(in_grid_desc);

        // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from global memory to LDS
        auto in_global_load = ThreadGroupTensorSliceTransfer_v4r1<
            ThisThreadBlock,
            ElementwiseOperation,
            PassThrough,
            InMemoryDataOperationEnum::Set,
            BlockSliceLengths,
            InBlockTransferThreadClusterLengths,
            InBlockTransferThreadClusterArrangeOrder,
            InDataType,
            InDataType,
            decltype(in_grid_desc_n_h_w),
            decltype(in_block_desc_nperblock_hperblock_wperblock),
            InBlockTransferAccessOrder,
            InBlockTransferAccessOrder,
            SrcVectorDimAfterMerge,
            2,
            SrcScalarPerVector,
            1,
            1,
            1,
            true,
            true>(in_grid_desc_n_h_w,
                  make_multi_index(
                      n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid),
                  PassThrough{},
                  in_block_desc_nperblock_hperblock_wperblock,
                  make_multi_index(0, 0, 0),
                  PassThrough{});

        // merge output descriptor into [(out_grid_desc.GetLength(0) x out_grid_desc.GetLength(1) x
        // ...), out_grid_desc.GetLength(NumDim - 2), out_grid_desc.GetLength(NumDim - 1)]
        const auto out_grid_desc_n_w_h = GetMergedDesc(out_grid_desc);

        // create transposed view of output tensor
        const auto out_grid_desc_n_h_w = transform_tensor_descriptor(
            out_grid_desc_n_w_h,
            make_tuple(make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I0)),
                       make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I1)),
                       make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I2))),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
            make_tuple(Sequence<0>{}, Sequence<2>{}, Sequence<1>{}));

        // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from LDS to global memory
        auto out_global_store = ThreadGroupTensorSliceTransfer_v4r1<
            ThisThreadBlock,
            ElementwiseOperation,
            PassThrough,
            InMemoryDataOperationEnum::Set,
            BlockSliceLengths,
            InBlockTransferThreadClusterLengths,
            InBlockTransferThreadClusterArrangeOrder,
            InDataType,
            OutDataType,
            decltype(in_block_desc_nperblock_hperblock_wperblock),
            decltype(out_grid_desc_n_h_w),
            InBlockTransferAccessOrder,
            InBlockTransferAccessOrder,
            2,
            DstVectorDimAfterMerge,
            1,
            DstScalarPerVector,
            1,
            1,
            true,
            true>(in_block_desc_nperblock_hperblock_wperblock,
                  make_multi_index(0, 0, 0),
                  PassThrough{},
                  out_grid_desc_n_h_w,
                  make_multi_index(
                      n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid),
                  elementwise_op);

        in_global_load.Run(in_grid_desc_n_h_w,
                           in_global_buf,
                           in_block_desc_nperblock_hperblock_wperblock,
                           in_block_buf,
                           I0);

        out_global_store.Run(in_block_desc_nperblock_hperblock_wperblock,
                             in_block_buf,
                             out_grid_desc_n_h_w,
                             out_global_buf,
                             I0);
    }
};

} // namespace ck
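The Block2TileMap above decomposes a flat workgroup id into an (N, H, W) tile coordinate. Below is a minimal host-side sketch of that same index arithmetic, using plain integers instead of the CK descriptor types; the tensor shape and tile sizes are made-up values chosen only for illustration.

#include <cassert>
#include <cstdio>

int main()
{
    // Hypothetical problem: a 3D view of shape [N, H, W] = [4, 96, 80],
    // tiled by [NPerBlock, HPerBlock, WPerBlock] = [1, 32, 32].
    const int N = 4, H = 96, W = 80;
    const int NPerBlock = 1, HPerBlock = 32, WPerBlock = 32;

    // Same ceiling division as math::integer_divide_ceil in CalculateGridSize.
    const int N0 = (N + NPerBlock - 1) / NPerBlock; // 4
    const int H0 = (H + HPerBlock - 1) / HPerBlock; // 3
    const int W0 = (W + WPerBlock - 1) / WPerBlock; // 3
    const int grid_size = N0 * H0 * W0;             // 36 workgroups

    // CalculateBottomIndex: flat block id -> (idx_N0, idx_H0, idx_W0).
    for(int block_1d_id = 0; block_1d_id < grid_size; ++block_1d_id)
    {
        const int id     = block_1d_id % (N0 * H0 * W0);
        const int idx_N0 = id / (H0 * W0);
        const int idx_H0 = (id % (H0 * W0)) / W0;
        const int idx_W0 = id % W0;
        assert(idx_N0 < N0 && idx_H0 < H0 && idx_W0 < W0);
        if(block_1d_id < 4)
            std::printf("block %d -> tile (%d, %d, %d)\n", block_1d_id, idx_N0, idx_H0, idx_W0);
    }
    return 0;
}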
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp

@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
 #include "ck/tensor/static_tensor.hpp"
 
 namespace ck {
include/ck/utility/span.hpp  0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstddef>
#include <array>
#include <type_traits>

namespace ck {

template <typename T>
class span
{
    public:
    using element_type    = T;
    using value_type      = std::remove_cv_t<element_type>;
    using size_type       = std::size_t;
    using difference_type = std::ptrdiff_t;
    using pointer         = element_type*;
    using const_pointer   = const element_type*;
    using reference       = element_type&;
    using const_reference = const element_type&;
    using iterator        = pointer;
    using const_iterator  = pointer;

    constexpr span() : span(nullptr, size_type{0}) {}

    constexpr span(pointer first, size_type count) : ptr_(first), size_(count) {}

    constexpr span(pointer first, pointer last) : span(first, last - first) {}

    template <std::size_t N>
    constexpr span(element_type (&arr)[N]) noexcept : span(arr, N)
    {
    }

    template <std::size_t N>
    constexpr span(std::array<value_type, N>& arr) noexcept : span(arr.data(), N)
    {
    }

    template <typename Container>
    constexpr span(const Container& container) : span(container.data(), container.size())
    {
    }

    constexpr iterator begin() const noexcept { return ptr_; }
    constexpr const_iterator cbegin() const noexcept { return begin(); }

    constexpr iterator end() const noexcept { return begin() + size(); }
    constexpr const_iterator cend() const noexcept { return end(); }

    constexpr reference front() const { return *begin(); }
    constexpr reference back() const { return *(--end()); }

    constexpr reference operator[](size_type idx) const { return *(begin() + idx); }
    constexpr pointer data() const noexcept { return ptr_; }

    constexpr size_type size() const noexcept { return size_; }

    private:
    pointer ptr_;
    size_type size_;
};

} // namespace ck
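A short usage sketch of the new ck::span, exercising only the constructors and accessors defined above; the vector contents are arbitrary.

#include <cassert>
#include <vector>

#include "ck/utility/span.hpp"

int main()
{
    std::vector<float> buf{1.f, 2.f, 3.f, 4.f};

    // Non-owning view over any container that exposes data()/size().
    ck::span<const float> view{buf};
    assert(view.size() == buf.size());
    assert(view.front() == 1.f && view[3] == 4.f);

    // Pointer + count constructor gives a sub-view of the same storage.
    ck::span<const float> tail{buf.data() + 2, 2};
    assert(tail[0] == 3.f && tail[1] == 4.f);

    float sum = 0.f;
    for(float v : view) // iterators are raw pointers
        sum += v;
    assert(sum == 10.f);

    return 0;
}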
library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp  0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>
#include <vector>
#include <algorithm>

#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"

namespace ck {
namespace tensor_operation {
namespace host {

template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename YDataType,
          typename AccDataType,
          typename AccElementwiseOperation>
struct ReferenceGroupnorm : public device::BaseOperator
{
    // x = [N, H, W, G, C]
    // y = [N, H, W, G, C]
    // reduce dim [H, W, C], mean, var = [N, G]
    // gamma, beta = [G, C]
    struct Argument : public device::BaseArgument
    {
        Argument(const Tensor<XDataType>& x,
                 const Tensor<GammaDataType>& gamma,
                 const Tensor<BetaDataType>& beta,
                 Tensor<YDataType>& y,
                 AccElementwiseOperation acc_elementwise_op,
                 const std::vector<index_t> lengths,
                 AccDataType epsilon)
            : x_(x),
              gamma_(gamma),
              beta_(beta),
              y_(y),
              acc_elementwise_op_(acc_elementwise_op),
              lengths_(lengths),
              epsilon_(epsilon)
        {
        }

        const Tensor<XDataType> x_;
        const Tensor<XDataType> gamma_;
        const Tensor<XDataType> beta_;
        Tensor<YDataType>& y_;
        AccElementwiseOperation acc_elementwise_op_;
        std::vector<index_t> lengths_;
        AccDataType epsilon_;
    };

    // Invoker
    struct Invoker : public device::BaseInvoker
    {
        float Run(const Argument& arg)
        {
            int N = arg.lengths_[0];
            int H = arg.lengths_[1];
            int W = arg.lengths_[2];
            int G = arg.lengths_[3];
            int C = arg.lengths_[4];

            Tensor<AccDataType> mean({N, G});
            Tensor<AccDataType> var({N, G});

            // Compute mean & var in [H, W, C] by Welford Algorithm
            // TODO - parallel for each HWC
            // TODO - address calculation
            for(int n = 0; n < N; ++n)
            {
                for(int g = 0; g < G; ++g)
                {
                    AccDataType mean_val = type_convert<AccDataType>(0.0f);
                    AccDataType var_val  = type_convert<AccDataType>(0.0f);
                    int32_t curr_count   = 0;

                    for(int h = 0; h < H; ++h)
                    {
                        for(int w = 0; w < W; ++w)
                        {
                            for(int c = 0; c < C; ++c)
                            {
                                curr_count++;
                                AccDataType x = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
                                AccDataType delta = x - mean_val;
                                mean_val += delta / curr_count;
                                AccDataType delta2 = x - mean_val;
                                var_val += delta * delta2;
                            }
                        }
                    }

                    mean(n, g) = mean_val;
                    var(n, g)  = var_val / curr_count;
                }
            }

            // Normalization
            for(int n = 0; n < N; ++n)
            {
                for(int h = 0; h < H; ++h)
                {
                    for(int w = 0; w < W; ++w)
                    {
                        for(int g = 0; g < G; ++g)
                        {
                            for(int c = 0; c < C; ++c)
                            {
                                AccDataType x = type_convert<AccDataType>(arg.x_(n, h, w, g, c));
                                AccDataType gamma    = type_convert<AccDataType>(arg.gamma_(g, c));
                                AccDataType beta     = type_convert<AccDataType>(arg.beta_(g, c));
                                AccDataType mean_val = type_convert<AccDataType>(mean(n, g));
                                AccDataType var_val  = type_convert<AccDataType>(var(n, g));
                                AccDataType y =
                                    gamma * (x - mean_val) / ck::math::sqrt(arg.epsilon_ + var_val) +
                                    beta;
                                arg.acc_elementwise_op_(y, y);
                                arg.y_(n, h, w, g, c) = type_convert<YDataType>(y);
                            }
                        }
                    }
                }
            }

            return 0;
        }

        float Run(const device::BaseArgument* p_arg,
                  const StreamConfig& /* stream_config */ = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg));
        }
    };

    static constexpr bool IsValidCompilationParameter()
    {
        // TODO: properly implement this check
        return true;
    }

    bool IsSupportedArgument(const device::BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        if(p_arg_->lengths_.size() != 5)
            return false;

        return true;
    }

    static auto MakeArgument(const Tensor<XDataType>& x,
                             const Tensor<GammaDataType>& gamma,
                             const Tensor<BetaDataType>& beta,
                             Tensor<YDataType>& y,
                             AccElementwiseOperation acc_elementwise_op,
                             const std::vector<index_t> lengths,
                             AccDataType epsilon)
    {
        return Argument{x, gamma, beta, y, acc_elementwise_op, lengths, epsilon};
    }

    static auto MakeInvoker() { return Invoker{}; }

    virtual std::unique_ptr<device::BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "ReferenceLayernorm" << std::endl;
        // clang-format on

        return str.str();
    }
};

} // namespace host
} // namespace tensor_operation
} // namespace ck
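The mean/variance pass above uses Welford's online update (delta = x - mean; mean += delta / n; M2 += delta * (x - mean)). Below is a small standalone check of that recurrence against the naive two-pass formulas, with made-up input values; it is only an illustration of the arithmetic, not part of the commit.

#include <cassert>
#include <cmath>
#include <vector>

int main()
{
    const std::vector<double> xs{0.5, -1.0, 2.0, 3.5, -0.25};

    // Welford running update, as in ReferenceGroupnorm::Invoker::Run.
    double mean = 0.0, m2 = 0.0;
    int count = 0;
    for(double x : xs)
    {
        ++count;
        const double delta = x - mean;
        mean += delta / count;
        const double delta2 = x - mean;
        m2 += delta * delta2;
    }
    const double var = m2 / count; // population variance, matching var_val / curr_count

    // Naive two-pass reference.
    double sum = 0.0;
    for(double x : xs)
        sum += x;
    const double ref_mean = sum / xs.size();
    double sq = 0.0;
    for(double x : xs)
        sq += (x - ref_mean) * (x - ref_mean);
    const double ref_var = sq / xs.size();

    assert(std::abs(mean - ref_mean) < 1e-12);
    assert(std::abs(var - ref_var) < 1e-12);
    return 0;
}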
library/include/ck/library/tensor_operation_instance/gpu/layernorm.hpp

@@ -17,17 +17,25 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_layernorm_f16_rank2_instances(
-    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 2, 1>>&);
-
-void add_device_layernorm_f16_rank4_instances(
-    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, PassThrough, 4, 3>>&);
-
-void add_device_layernorm_f32_rank2_instances(
-    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 2, 1>>&);
-
-void add_device_layernorm_f32_rank4_instances(
-    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, PassThrough, 4, 3>>&);
+// FP16
+void add_device_layernorm_rank_2_1_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 2, 1>>>&);
+
+void add_device_layernorm_rank_4_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 4, 3>>>&);
+
+void add_device_layernorm_rank_5_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>>>&);
+
+// FP32
+void add_device_layernorm_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 2, 1>>>&);
+
+void add_device_layernorm_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 4, 3>>>&);
+
+void add_device_layernorm_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, PassThrough, 5, 3>>>&);
 
 template <typename XDataType,
           typename GammaDataType,

@@ -62,17 +70,33 @@ struct DeviceOperationInstanceFactory<
                      is_same_v<BetaDataType, F16> && is_same_v<YDataType, F16>)
         {
             if constexpr(Rank == 2 && NumReduceDim == 1)
-                add_device_layernorm_f16_rank2_instances(op_ptrs);
+            {
+                add_device_layernorm_rank_2_1_f16_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4 && NumReduceDim == 3)
-                add_device_layernorm_f16_rank4_instances(op_ptrs);
+            {
+                add_device_layernorm_rank_4_3_f16_instances(op_ptrs);
+            }
+            else if constexpr(Rank == 5 && NumReduceDim == 3)
+            {
+                add_device_layernorm_rank_5_3_f16_instances(op_ptrs);
+            }
         }
         else if constexpr(is_same_v<XDataType, F32> && is_same_v<GammaDataType, F32> &&
                           is_same_v<BetaDataType, F32> && is_same_v<YDataType, F32>)
         {
             if constexpr(Rank == 2 && NumReduceDim == 1)
-                add_device_layernorm_f32_rank2_instances(op_ptrs);
+            {
+                add_device_layernorm_rank_2_1_f32_instances(op_ptrs);
+            }
             else if constexpr(Rank == 4 && NumReduceDim == 3)
-                add_device_layernorm_f32_rank4_instances(op_ptrs);
+            {
+                add_device_layernorm_rank_4_3_f32_instances(op_ptrs);
+            }
+            else if constexpr(Rank == 5 && NumReduceDim == 3)
+            {
+                add_device_layernorm_rank_5_3_f32_instances(op_ptrs);
+            }
         }
 
         return op_ptrs;
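After this change the rank-5 / 3-reduce-dim layernorm instances (used by the new groupnorm profiler) are reachable through the same instance factory as the other ranks. Here is a sketch of how a caller might enumerate them; it mirrors the factory usage in profile_groupnorm_impl.hpp further down in this commit and assumes nothing beyond the types declared in this header.

#include <iostream>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"

int main()
{
    using F16         = ck::half_t;
    using F32         = float;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // Rank-5 input, reducing 3 dimensions: the shape used for groupnorm (N, H, W, G, C).
    using DeviceOp = ck::tensor_operation::device::
        DeviceLayernorm<F16, F16, F16, F32, F16, PassThrough, 5, 3>;

    const auto instance_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " layernorm rank-5/3 instances" << std::endl;
    for(const auto& p : instance_ptrs)
        std::cout << p->GetTypeString() << std::endl;

    return 0;
}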
library/include/ck/library/utility/check_err.hpp

@@ -15,6 +15,7 @@
 
 #include "ck/ck.hpp"
 #include "ck/utility/data_type.hpp"
+#include "ck/utility/span.hpp"
 #include "ck/utility/type.hpp"
 #include "ck/host_utility/io.hpp"
 
@@ -32,7 +33,7 @@ check_err(const std::vector<T>& out,
 {
     if(out.size() != ref.size())
     {
-        std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl;
         return false;
     }
@@ -50,7 +51,7 @@ check_err(const std::vector<T>& out,
             err_count++;
             if(err_count < 5)
             {
-                std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
+                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl;
             }
             res = false;
@@ -58,7 +59,7 @@ check_err(const std::vector<T>& out,
     }
     if(!res)
     {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
@@ -73,7 +74,7 @@ check_err(const std::vector<T>& out,
 {
     if(out.size() != ref.size())
     {
-        std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl;
         return false;
     }
@@ -94,7 +95,7 @@ check_err(const std::vector<T>& out,
             err_count++;
             if(err_count < 5)
             {
-                std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
+                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
             }
             res = false;
@@ -102,22 +103,22 @@ check_err(const std::vector<T>& out,
     }
     if(!res)
     {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
 
 template <typename T>
-typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
-check_err(const std::vector<T>& out,
-          const std::vector<T>& ref,
+typename std::enable_if<std::is_same_v<T, half_t>, bool>::type
+check_err(span<const T> out,
+          span<const T> ref,
           const std::string& msg = "Error: Incorrect results!",
           double rtol            = 1e-3,
           double atol            = 1e-3)
 {
     if(out.size() != ref.size())
     {
-        std::cout << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
                   << std::endl;
         return false;
     }
@@ -137,7 +138,7 @@ check_err(const std::vector<T>& out,
             err_count++;
             if(err_count < 5)
             {
-                std::cout << msg << std::setw(12) << std::setprecision(7) << " out[" << i
+                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
                           << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
             }
             res = false;
@@ -145,11 +146,22 @@ check_err(const std::vector<T>& out,
     }
     if(!res)
     {
-        std::cout << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
+        std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl;
     }
     return res;
 }
 
+template <typename T>
+typename std::enable_if<std::is_same<T, half_t>::value, bool>::type
+check_err(const std::vector<T>& out,
+          const std::vector<T>& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol            = 1e-3,
+          double atol            = 1e-3)
+{
+    return check_err(span<const T>{out}, span<const T>{ref}, msg, rtol, atol);
+}
+
 template <typename T>
 std::enable_if_t<(std::is_integral_v<T> && !std::is_same_v<T, bhalf_t>)
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
@@ -194,7 +206,7 @@ check_err(const std::vector<T>& out,
     }
     if(!res)
     {
-        std::cout << "max err: " << max_err << std::endl;
+        std::cerr << "max err: " << max_err << std::endl;
     }
     return res;
 }
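With the span-based overload, half_t results can be checked either from vectors (via the thin forwarding overload added above) or directly from spans. A brief sketch with arbitrary data, assuming only the signatures visible in this diff:

#include <vector>

#include "ck/utility/data_type.hpp"
#include "ck/utility/span.hpp"
#include "ck/library/utility/check_err.hpp"

int main()
{
    std::vector<ck::half_t> out(16, ck::type_convert<ck::half_t>(1.0f));
    std::vector<ck::half_t> ref(16, ck::type_convert<ck::half_t>(1.0f));

    // Forwarding overload: wraps the vectors in ck::span<const half_t> internally.
    bool ok_vec = ck::utils::check_err(out, ref, "Error: Incorrect results!", 1e-3, 1e-3);

    // Equivalent explicit span call.
    bool ok_span =
        ck::utils::check_err(ck::span<const ck::half_t>{out}, ck::span<const ck::half_t>{ref});

    return (ok_vec && ok_span) ? 0 : 1;
}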
library/include/ck/library/utility/fill.hpp

@@ -5,7 +5,10 @@
 
 #include <algorithm>
 #include <cmath>
+#include <iterator>
 #include <random>
+#include <type_traits>
+#include <utility>
 
 #include "ck/utility/data_type.hpp"
 
@@ -25,6 +28,15 @@ struct FillUniformDistribution
         std::uniform_real_distribution<float> dis(a_, b_);
         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
     }
+
+    template <typename ForwardRange>
+    auto operator()(ForwardRange&& range) -> std::void_t<decltype(
+        std::declval<FillUniformDistribution>()(std::begin(std::forward<ForwardRange>(range)),
+                                                std::end(std::forward<ForwardRange>(range))))>
+    {
+        (*this)(std::begin(std::forward<ForwardRange>(range)),
+                std::end(std::forward<ForwardRange>(range)));
+    }
 };
 
 // Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below.
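The new operator() overload lets a whole container be filled in one call instead of passing an iterator pair. A usage sketch follows; it assumes the functor lives in ck::utils and can be brace-initialized from the lower/upper bounds stored in a_ and b_, which is not shown in this hunk.

#include <vector>

#include "ck/library/utility/fill.hpp"

int main()
{
    std::vector<float> buf(64);

    // Iterator-pair form (pre-existing).
    ck::utils::FillUniformDistribution<float>{-1.f, 1.f}(buf.begin(), buf.end());

    // Range form added in this commit: forwards to the iterator-pair overload.
    ck::utils::FillUniformDistribution<float>{-1.f, 1.f}(buf);

    return 0;
}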
library/include/ck/library/utility/host_tensor.hpp

@@ -3,15 +3,16 @@
 
 #pragma once
 
-#include <thread>
-#include <vector>
-#include <numeric>
 #include <algorithm>
-#include <utility>
 #include <cassert>
 #include <iostream>
+#include <numeric>
+#include <thread>
+#include <utility>
+#include <vector>
 
 #include "ck/utility/data_type.hpp"
+#include "ck/utility/span.hpp"
 
 template <typename Range>
 std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)

@@ -235,6 +236,9 @@ auto make_ParallelTensorFunctor(F f, Xs... xs)
 template <typename T>
 struct Tensor
 {
+    using Descriptor = HostTensorDescriptor;
+    using Data       = std::vector<T>;
+
     template <typename X>
     Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize())
     {

@@ -251,7 +255,7 @@ struct Tensor
     {
     }
 
-    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
+    Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {}
 
     template <typename OutT>
     Tensor<OutT> CopyAsType() const

@@ -278,9 +282,9 @@ struct Tensor
     {
     }
 
-    const std::vector<std::size_t>& GetLengths() const { return mDesc.GetLengths(); }
+    decltype(auto) GetLengths() const { return mDesc.GetLengths(); }
 
-    const std::vector<std::size_t>& GetStrides() const { return mDesc.GetStrides(); }
+    decltype(auto) GetStrides() const { return mDesc.GetStrides(); }
 
     std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); }

@@ -288,6 +292,8 @@ struct Tensor
 
     std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); }
 
+    std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); }
+
     void SetZero()
     {
         for(auto& v : mData)

@@ -425,14 +431,40 @@ struct Tensor
         return mData[mDesc.GetOffsetFromMultiIndex(idx)];
     }
 
-    typename std::vector<T>::iterator begin() { return mData.begin(); }
+    typename Data::iterator begin() { return mData.begin(); }
 
-    typename std::vector<T>::iterator end() { return mData.end(); }
+    typename Data::iterator end() { return mData.end(); }
 
-    typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
+    typename Data::pointer data() { return mData.data(); }
 
-    typename std::vector<T>::const_iterator end() const { return mData.end(); }
+    typename Data::const_iterator begin() const { return mData.begin(); }
+
+    typename Data::const_iterator end() const { return mData.end(); }
+
+    typename Data::const_pointer data() const { return mData.data(); }
+
+    typename Data::size_type size() const { return mData.size(); }
+
+    template <typename U = T>
+    auto AsSpan() const
+    {
+        constexpr std::size_t FromSize = sizeof(T);
+        constexpr std::size_t ToSize   = sizeof(U);
+
+        using Element = std::add_const_t<std::remove_reference_t<U>>;
+        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
+    }
+
+    template <typename U = T>
+    auto AsSpan()
+    {
+        constexpr std::size_t FromSize = sizeof(T);
+        constexpr std::size_t ToSize   = sizeof(U);
+
+        using Element = std::remove_reference_t<U>;
+        return ck::span<Element>{reinterpret_cast<Element*>(data()), size() * FromSize / ToSize};
+    }
 
-    HostTensorDescriptor mDesc;
-    std::vector<T> mData;
+    Descriptor mDesc;
+    Data mData;
 };
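AsSpan() gives a ck::span view over the tensor's storage, optionally reinterpreted to another element type U, with the element count rescaled by sizeof(T)/sizeof(U). A small sketch with an arbitrary shape, using only the members shown in this hunk:

#include <cstdint>

#include "ck/library/utility/host_tensor.hpp"

int main()
{
    Tensor<float> t({2, 3});
    t.SetZero();

    // Default: span over the element type itself; const tensor -> span<const float>.
    const Tensor<float>& ct = t;
    auto s = ct.AsSpan(); // ck::span<const float>, same length as ct.size()

    // Reinterpreted view: the same bytes seen as uint32_t words (size() * 4 / 4 elements).
    auto words = t.AsSpan<std::uint32_t>(); // ck::span<std::uint32_t>

    return (s.size() == ct.size() && words.size() == t.size()) ? 0 : 1;
}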
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f16_instance.cpp

@@ -17,34 +17,40 @@ using F32 = float;
 using Pass = ck::tensor_operation::element_wise::PassThrough;
 
-template <index_t Rank, index_t Reduce>
+template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_layernorm_f16_instances = std::tuple<
     // clang-format off
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    // fallback kernel
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8, 8, 8>,
-    DeviceLayernormImpl<F16, F16, F16, F32, F16, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8, 8, 8>
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+    // fallback kernel
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>,
+    DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>
     // clang-format on
     >;
 
-void add_device_layernorm_f16_rank2_instances(
-    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 2, 1>>& instances)
+void add_device_layernorm_rank_2_1_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 2, 1>>>& instances)
 {
-    add_device_operation_instances(instances, device_layernorm_f16_instances<2, 1>{});
+    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 2, 1>{});
 }
 
-void add_device_layernorm_f16_rank4_instances(
-    std::vector<DeviceLayernormPtr<F16, F16, F16, F32, F16, Pass, 4, 3>>& instances)
+void add_device_layernorm_rank_4_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 4, 3>>>& instances)
 {
-    add_device_operation_instances(instances, device_layernorm_f16_instances<4, 3>{});
+    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 4, 3>{});
+}
+
+void add_device_layernorm_rank_5_3_f16_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F16, F16, F16, F32, F16, Pass, 5, 3>>>& instances)
+{
+    add_device_operation_instances(instances, device_layernorm_f16_instances<Pass, 5, 3>{});
 }
 
 } // namespace instance
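Reading one instance row against the column legend in the comment line above the tuple, the two *SrcVectorDim columns are the ones this commit inserts. Applying the legend to the second f16 instance gives:

// DeviceLayernormImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce,
//                     256,  // BlockSize
//                     8,    // MThreadClusterSize
//                     32,   // KThreadClusterSize
//                     1,    // MThreadSliceSize
//                     8,    // KThreadSliceSize
//                     1,    // XYSrcVectorDim
//                     2,    // XSrcVectorSize
//                     1,    // GammaSrcVectorDim  (new column in this commit)
//                     2,    // GammaSrcVectorSize
//                     1,    // BetaSrcVectorDim   (new column in this commit)
//                     2,    // BetaSrcVectorSize
//                     2>    // YDstVectorSize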
library/src/tensor_operation_instance/gpu/normalization/device_layernorm_f32_instance.cpp

@@ -16,33 +16,39 @@ using F32 = float;
 using Pass = ck::tensor_operation::element_wise::PassThrough;
 
-template <index_t Rank, index_t Reduce>
+template <typename OutElementwise, index_t Rank, index_t Reduce>
 using device_layernorm_f32_instances = std::tuple<
     // clang-format off
     // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    // fallback kernel
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 2, 2, 2>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4, 4, 4>,
-    DeviceLayernormImpl<F32, F32, F32, F32, F32, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4, 4, 4>
+    // fallback kernel
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 1, 4, 1, 4, 4>,
+    DeviceLayernormImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 1, 4, 1, 4, 4>
     // clang-format on
     >;
 
-void add_device_layernorm_f32_rank2_instances(
-    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 2, 1>>& instances)
+void add_device_layernorm_rank_2_1_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 2, 1>>>& instances)
 {
-    add_device_operation_instances(instances, device_layernorm_f32_instances<2, 1>{});
+    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 2, 1>{});
 }
 
-void add_device_layernorm_f32_rank4_instances(
-    std::vector<DeviceLayernormPtr<F32, F32, F32, F32, F32, Pass, 4, 3>>& instances)
+void add_device_layernorm_rank_4_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 4, 3>>>& instances)
 {
-    add_device_operation_instances(instances, device_layernorm_f32_instances<4, 3>{});
+    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 4, 3>{});
+}
+
+void add_device_layernorm_rank_5_3_f32_instances(
+    std::vector<std::unique_ptr<DeviceLayernorm<F32, F32, F32, F32, F32, Pass, 5, 3>>>& instances)
+{
+    add_device_operation_instances(instances, device_layernorm_f32_instances<Pass, 5, 3>{});
 }
 
 } // namespace instance
profiler/CMakeLists.txt

@@ -23,6 +23,7 @@ set(PROFILER_SOURCE
     src/profile_conv_bwd_weight.cpp
     src/profile_grouped_conv_fwd.cpp
     src/profile_reduce.cpp
+    src/profile_groupnorm.cpp
     src/profile_layernorm.cpp
     src/profile_normalization.cpp
 )
profiler/include/profile_groupnorm_impl.hpp  0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iomanip>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/layernorm.hpp"

#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp"

namespace ck {
namespace profiler {

template <typename XDataType,
          typename GammaDataType,
          typename BetaDataType,
          typename AccDataType,
          typename YDataType>
bool profile_groupnorm_impl(int do_verification,
                            int init_method,
                            bool do_log,
                            bool time_kernel,
                            std::vector<index_t> length)
{
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    if(length.size() != 5)
        return false;

    index_t G = length[3];
    index_t C = length[4];

    std::vector<index_t> reduce_dim      = {1, 2, 4};
    std::vector<index_t> gammaBetaLength = {G, C};
    std::vector<index_t> gammaBetaStride = {0, 0, 0, C, 1};

    Tensor<XDataType> x(length);
    Tensor<GammaDataType> gamma(gammaBetaLength);
    Tensor<BetaDataType> beta(gammaBetaLength);
    Tensor<YDataType> y(length);
    Tensor<YDataType> host_y(length);

    switch(init_method)
    {
    case 0:
        x.GenerateTensorValue(GeneratorTensor_1<XDataType>{});
        gamma.GenerateTensorValue(GeneratorTensor_1<GammaDataType>{});
        beta.GenerateTensorValue(GeneratorTensor_1<BetaDataType>{});
        break;
    case 1:
        x.GenerateTensorValue(GeneratorTensor_2<XDataType>{-5, 5});
        gamma.GenerateTensorValue(GeneratorTensor_2<GammaDataType>{-5, 5});
        beta.GenerateTensorValue(GeneratorTensor_2<BetaDataType>{-5, 5});
        break;
    default:
        x.GenerateTensorValue(GeneratorTensor_3<XDataType>{0, 1});
        gamma.GenerateTensorValue(GeneratorTensor_3<GammaDataType>{-0.5, 0.5});
        beta.GenerateTensorValue(GeneratorTensor_3<BetaDataType>{-0.5, 0.5});
    }

    DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize());
    DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize());
    DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize());
    DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize());

    x_dev.ToDevice(x.mData.data());
    gamma_dev.ToDevice(gamma.mData.data());
    beta_dev.ToDevice(beta.mData.data());

    // add device normalization instances
    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
                                                                   GammaDataType,
                                                                   BetaDataType,
                                                                   AccDataType,
                                                                   YDataType,
                                                                   PassThrough,
                                                                   5,
                                                                   3>;

    // get device op instances
    const auto instance_ptrs =
        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;

    std::string best_instance_name;
    float best_avg_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    if(do_verification)
    {
        using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm<XDataType,
                                                                                 GammaDataType,
                                                                                 BetaDataType,
                                                                                 YDataType,
                                                                                 AccDataType,
                                                                                 PassThrough>;

        ReferenceInstance ref;
        auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6);
        auto ref_invoker  = ref.MakeInvoker();
        ref_invoker.Run(ref_argument);
    }

    int num_kernel = 0;

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(
            length,
            std::vector<ck::index_t>{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()},
            gammaBetaStride,
            gammaBetaStride,
            std::vector<ck::index_t>{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()},
            reduce_dim,
            1e-6,
            x_dev.GetDeviceBuffer(),
            gamma_dev.GetDeviceBuffer(),
            beta_dev.GetDeviceBuffer(),
            y_dev.GetDeviceBuffer(),
            PassThrough{});

        if(inst_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            ++num_kernel;
        }
        else
        {
            continue;
        }

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});

        std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) +
                                gamma.mDesc.GetElementSize() * sizeof(GammaDataType) +
                                beta.mDesc.GetElementSize() * sizeof(BetaDataType) +
                                y.mDesc.GetElementSize() * sizeof(YDataType);

        float gb_per_sec = num_bytes / 1.E6 / avg_time;

        if(time_kernel)
            std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec
                      << " GB/s, " << inst_ptr->GetTypeString() << std::endl;

        if(avg_time < best_avg_time)
        {
            best_instance_name = inst_ptr->GetTypeString();
            best_avg_time      = avg_time;
            best_gb_per_sec    = gb_per_sec;
        }

        if(do_verification)
        {
            y_dev.FromDevice(y.mData.data());

            bool pass =
                ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3);

            if(do_log)
            {
                LogRangeAsType<float>(std::cout << "x : ", x.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "host_y : ", host_y.mData, ",") << std::endl;
                LogRangeAsType<float>(std::cout << "y : ", y.mData, ",") << std::endl;
            }

            if(!pass)
            {
                std::cout << inst_ptr->GetTypeString() << " failed verification: ";
                LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl;
                return false;
            }
            else
            {
                if(time_kernel)
                    std::cout << "pass" << std::endl;
            }
        }
    }

    if(time_kernel)
    {
        LogRange(std::cout << "length = ", length, ",") << ", ";
        std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, "
                  << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
    }

    if(num_kernel == 0)
    {
        std::cout << "Error: No kernel is tested" << std::endl;
        return false;
    }

    return true;
}

} // namespace profiler
} // namespace ck
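A sketch of how a driver might call this entry point for an NHWGC problem; the concrete lengths are arbitrary, and profile_groupnorm.cpp below parses them from --length instead of hard-coding them.

#include <vector>

#include "ck/utility/data_type.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp"

int main()
{
    using F16 = ck::half_t;
    using F32 = float;

    // [N, H, W, G, C]; verification on, normal-distribution init, no logging, time kernels.
    std::vector<ck::index_t> length{2, 16, 16, 32, 40};

    bool pass = ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>(
        /*do_verification=*/1, /*init_method=*/2, /*do_log=*/false, /*time_kernel=*/true, length);

    return pass ? 0 : 1;
}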
profiler/include/profile_layernorm_impl.hpp
View file @
24f99138
...
@@ -6,8 +6,8 @@
...
@@ -6,8 +6,8 @@
#include <iomanip>
#include <iomanip>
#include "ck/ck.hpp"
#include "ck/ck.hpp"
#include "profiler/include/data_type_enum.hpp"
#include "ck/tensor_operation
/gpu/device/device_
layernorm
_impl
.hpp"
#include "ck/
library/
tensor_operation
_instance/gpu/
layernorm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
...
@@ -15,26 +15,6 @@
...
@@ -15,26 +15,6 @@
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
namespace
instance
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
void
add_device_layernorm_f16_rank2_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F16
,
F16
,
F16
,
F32
,
F16
,
PassThrough
,
2
,
1
>>&
);
void
add_device_layernorm_f32_rank2_instances
(
std
::
vector
<
DeviceLayernormPtr
<
F32
,
F32
,
F32
,
F32
,
F32
,
PassThrough
,
2
,
1
>>&
);
}
// namespace instance
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
namespace
ck
{
namespace
ck
{
namespace
profiler
{
namespace
profiler
{
...
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
...
@@ -53,8 +33,6 @@ void profile_layernorm_impl(int do_verification,
std
::
vector
<
index_t
>
strideGamma
,
std
::
vector
<
index_t
>
strideGamma
,
std
::
vector
<
index_t
>
strideBeta
)
std
::
vector
<
index_t
>
strideBeta
)
{
{
using
F16
=
ck
::
half_t
;
using
F32
=
float
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
PassThrough
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
if
(
length
.
size
()
<
2
)
if
(
length
.
size
()
<
2
)
@@ -103,37 +81,24 @@ void profile_layernorm_impl(int do_verification,
     gamma_dev.ToDevice(gamma.mData.data());
     beta_dev.ToDevice(beta.mData.data());
 
-    // add device normalization instances
     constexpr int NumReduceDim = Rank - 1;
-    std::vector<tensor_operation::device::DeviceLayernormPtr<XDataType,
-                                                             GammaDataType,
-                                                             BetaDataType,
-                                                             AccDataType,
-                                                             YDataType,
-                                                             PassThrough,
-                                                             Rank,
-                                                             NumReduceDim>>
-        instances;
-
-    if constexpr(is_same<XDataType, F16>::value && is_same<GammaDataType, F16>::value &&
-                 is_same<BetaDataType, F16>::value && is_same<YDataType, F16>::value &&
-                 is_same<AccDataType, F32>::value)
-    {
-        if(length.size() == 2)
-            tensor_operation::device::instance::add_device_layernorm_f16_rank2_instances(instances);
-    }
-    else if constexpr(is_same<XDataType, F32>::value && is_same<GammaDataType, F32>::value &&
-                      is_same<BetaDataType, F32>::value && is_same<YDataType, F32>::value &&
-                      is_same<AccDataType, F32>::value)
-    {
-        if(length.size() == 2)
-            tensor_operation::device::instance::add_device_layernorm_f32_rank2_instances(instances);
-    }
-
-    if(instances.size() <= 0)
-    {
-        throw std::runtime_error("wrong! no device normalization instance found");
-    }
+
+    // add device normalization instances
+    using DeviceOp = ck::tensor_operation::device::DeviceLayernorm<XDataType,
+                                                                   GammaDataType,
+                                                                   BetaDataType,
+                                                                   AccDataType,
+                                                                   YDataType,
+                                                                   PassThrough,
+                                                                   Rank,
+                                                                   NumReduceDim>;
+
+    // get device op instances
+    const auto instance_ptrs =
+        ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+            DeviceOp>::GetInstances();
+
+    std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
 
     std::string best_instance_name;
     float best_avg_time = std::numeric_limits<float>::max();
@@ -157,7 +122,7 @@ void profile_layernorm_impl(int do_verification,
         ref_invoker.Run(ref_argument);
     }
 
-    for(auto& inst_ptr : instances)
+    for(auto& inst_ptr : instance_ptrs)
     {
         auto argument_ptr = inst_ptr->MakeArgumentPointer(length,
                                                           strideXY,
@@ -175,9 +140,9 @@ void profile_layernorm_impl(int do_verification,
         if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
         {
             std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: ";
-            LogRange(std::cout << "input lengths = [", length, "], ") << std::endl;
-            return;
+            LogRange(std::cout << "input lengths = ", length, ", ") << std::endl;
+            continue;
         }
 
         auto invoker_ptr = inst_ptr->MakeInvokerPointer();
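The hunks above replace the hand-written add_device_layernorm_*_rank2_instances registration with the generic instance factory, so the profiler no longer needs the if-constexpr dispatch on data types or the rank-2 restriction, and unsupported instances are skipped with continue instead of aborting. A minimal sketch of the resulting enumeration loop, assuming the DeviceOp alias from the diff; the argument values (lengths, strides, buffers) are elided, avg_time is an illustrative name, and the StreamConfig usage follows the pattern used elsewhere in the profiler rather than anything shown in this hunk:

    // Sketch only: mirrors the factory-based pattern introduced in this diff.
    const auto instance_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(auto& inst_ptr : instance_ptrs)
    {
        auto argument_ptr = inst_ptr->MakeArgumentPointer(/* lengths, strides, buffers, ... */);

        // Skip (rather than abort) when this instance cannot handle the problem shape.
        if(!inst_ptr->IsSupportedArgument(argument_ptr.get()))
            continue;

        auto invoker_ptr = inst_ptr->MakeInvokerPointer();

        // The returned time feeds the best-perf bookkeeping in the real profiler.
        const float avg_time =
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
        (void)avg_time;
    }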
profiler/src/profile_groupnorm.cpp
0 → 100644
View file @ 24f99138
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include <iostream>
#include <vector>
#include <unordered_map>

#include "profiler/include/data_type_enum.hpp"
#include "profiler/include/profile_groupnorm_impl.hpp"

using ck::index_t;

struct GroupnormArgParser
{
    std::unordered_map<std::string, std::vector<int>> long_opts = {{"length", {}}};

    bool parse_opt(int argc, char* argv[], const std::string& key, int i)
    {
        if(std::string("--") + key == argv[i])
        {
            int pos = i;
            while(++i < argc && argv[i][0] != '-') {}
            int end = i;
            for(int j = pos + 1; j < end; j++)
            {
                long_opts[key].push_back(std::stoi(argv[j]));
            }
            return true;
        }
        return false;
    }

    void operator()(int argc, char* argv[])
    {
        for(auto& kv : long_opts)
        {
            for(int i = 1; i < argc; i++)
            {
                if(parse_opt(argc, argv, kv.first, i))
                    break;
            }
        }
    }
};

void print_help_groupnorm()
{
    std::cout << "arg1: tensor operation (groupnorm: Group normalization)\n"
              << "arg2: data type (0: fp16; 1: fp32)\n"
              << "arg3: verification (0: no; 1: yes)\n"
              << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n"
              << "arg5: print tensor value (0: no; 1: yes)\n"
              << "arg6: time kernel (0=no, 1=yes)\n"
              << "--length: tensor extents (e.g, --length 1 16 16 32 40)\n"
              << std::endl;
}

int profile_groupnorm(int argc, char* argv[])
{
    ck::DataTypeEnum data_type  = ck::DataTypeEnum::Half;
    bool do_verification        = false;
    int init_method             = 0;
    bool do_log                 = 0;
    bool time_kernel            = 1;
    std::vector<index_t> length = {64, 16, 16, 32, 40};

    if(argc != 1 && argc != 13)
    {
        print_help_groupnorm();
        return 0;
    }

    if(argc == 13)
    {
        data_type       = static_cast<ck::DataTypeEnum>(std::stoi(argv[2]));
        do_verification = std::stoi(argv[3]);
        init_method     = std::stoi(argv[4]);
        do_log          = std::stoi(argv[5]);
        time_kernel     = std::stoi(argv[6]);

        // parse the long options
        GroupnormArgParser arg_parser;
        arg_parser(argc, argv);
        length = arg_parser.long_opts["length"];
    }

    using F16 = ck::half_t;
    using F32 = float;

    if(data_type == ck::DataTypeEnum::Float)
    {
        ck::profiler::profile_groupnorm_impl<F32, F32, F32, F32, F32>(
            do_verification, init_method, do_log, time_kernel, length);
    }
    else if(data_type == ck::DataTypeEnum::Half)
    {
        ck::profiler::profile_groupnorm_impl<F16, F16, F16, F32, F16>(
            do_verification, init_method, do_log, time_kernel, length);
    }
    else
    {
        throw std::runtime_error("not implemented yet");
    }

    return 0;
}
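Given the argument handling above (argc must be 1 or 13: the operation name, five numeric options, then --length followed by five extents), an invocation would look roughly like

    <profiler binary> groupnorm 1 1 2 0 1 --length 64 16 16 32 40

where, per the help text, arg2 = 1 selects fp32, verification and kernel timing are enabled, tensors get decimal-value initialization, and tensor printing is off. The actual binary name comes from the profiler target in profiler/CMakeLists.txt and is not shown in this diff.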
profiler/src/profile_layernorm.cpp
View file @ 24f99138
@@ -5,6 +5,7 @@
 #include <vector>
 #include <unordered_map>
 
+#include "profiler/include/data_type_enum.hpp"
 #include "profiler/include/profile_layernorm_impl.hpp"
 
 using ck::index_t;

@@ -49,7 +50,7 @@ void print_help_layernorm()
               << "arg2: verification (0: no; 1: yes)\n"
               << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n"
               << "arg4: print tensor value (0: no; 1: yes)\n"
-              << "arg5: time kernel (0=n0, 1=yes)\n"
+              << "arg5: time kernel (0=no, 1=yes)\n"
               << "--length: tensor extents (e.g, --length 1024 1024)\n"
               << "--strideXY: tensor strides (e.g, --strideXY 1024 1)\n"
               << "--strideGamma: tensor strides (e.g, --strideGamma 1)\n"

@@ -114,10 +115,3 @@ int profile_layernorm(int argc, char* argv[])
     return 0;
 }
-
-// hijack main() for quick debugging
-// int main(int argc, char* argv[])
-// {
-//     profile_layernorm(argc, argv);
-//     return 0;
-// }
profiler/src/profiler.cpp
View file @ 24f99138
@@ -3,26 +3,27 @@
 #include <cstring>
 
-int profile_gemm(int, char*[]);
-int profile_gemm_splitk(int, char*[]);
-int profile_gemm_bilinear(int, char*[]);
-int profile_gemm_add_add_fastgelu(int, char*[]);
-int profile_gemm_reduce(int, char*[]);
-int profile_gemm_bias_add_reduce(int, char*[]);
-int profile_batched_gemm(int, char*[]);
-int profile_batched_gemm_gemm(int, char*[]);
-int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
-int profile_batched_gemm_reduce(int, char*[]);
-int profile_grouped_gemm(int, char*[]);
-int profile_conv_fwd(int, char*[]);
-int profile_conv_fwd_bias_relu(int, char*[]);
-int profile_conv_fwd_bias_relu_add(int, char*[]);
-int profile_conv_bwd_data(int, char*[]);
-int profile_conv_bwd_weight(int, char*[]);
-int profile_grouped_conv_fwd(int, char*[]);
-int profile_normalization(int, char*[]);
+// int profile_gemm(int, char*[]);
+// int profile_gemm_splitk(int, char*[]);
+// int profile_gemm_bilinear(int, char*[]);
+// int profile_gemm_add_add_fastgelu(int, char*[]);
+// int profile_gemm_reduce(int, char*[]);
+// int profile_gemm_bias_add_reduce(int, char*[]);
+// int profile_batched_gemm(int, char*[]);
+// int profile_batched_gemm_gemm(int, char*[]);
+// int profile_batched_gemm_add_relu_gemm_add(int, char*[]);
+// int profile_batched_gemm_reduce(int, char*[]);
+// int profile_grouped_gemm(int, char*[]);
+// int profile_conv_fwd(int, char*[]);
+// int profile_conv_fwd_bias_relu(int, char*[]);
+// int profile_conv_fwd_bias_relu_add(int, char*[]);
+// int profile_conv_bwd_data(int, char*[]);
+// int profile_conv_bwd_weight(int, char*[]);
+// int profile_grouped_conv_fwd(int, char*[]);
+// int profile_normalization(int, char*[]);
 int profile_layernorm(int, char*[]);
-int profile_reduce(int, char*[]);
+int profile_groupnorm(int, char*[]);
+// int profile_reduce(int, char*[]);
 
 static void print_helper_message()
 {

@@ -56,6 +57,7 @@ int main(int argc, char* argv[])
         return 0;
     }
+#if 0
     else if(strcmp(argv[1], "gemm") == 0)
     {
         return profile_gemm(argc, argv);

@@ -132,10 +134,15 @@ int main(int argc, char* argv[])
     {
         return profile_normalization(argc, argv);
     }
+#endif
     else if(strcmp(argv[1], "layernorm") == 0)
     {
         return profile_layernorm(argc, argv);
     }
+    else if(strcmp(argv[1], "groupnorm") == 0)
+    {
+        return profile_groupnorm(argc, argv);
+    }
     else
     {
         print_helper_message();
test/layernorm/CMakeLists.txt
View file @ 24f99138
 add_custom_target(test_layernorm)
 
-add_gtest_executable(test_layernorm_fp32 test_layernorm_fp32.cpp)
-add_gtest_executable(test_layernorm_fp16 test_layernorm_fp16.cpp)
+add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp)
+add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp)
+add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp)
+add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp)
 
-target_link_libraries(test_layernorm_fp32 PRIVATE utility)
-target_link_libraries(test_layernorm_fp16 PRIVATE utility)
+target_link_libraries(test_layernorm2d_fp32 PRIVATE utility)
+target_link_libraries(test_layernorm2d_fp16 PRIVATE utility)
+target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance)
+target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance)
 
-add_dependencies(test_layernorm test_layernorm_fp32)
-add_dependencies(test_layernorm test_layernorm_fp16)
+add_dependencies(test_layernorm test_layernorm2d_fp32)
+add_dependencies(test_layernorm test_layernorm2d_fp16)
+add_dependencies(test_layernorm test_groupnorm_fp16)
+add_dependencies(test_layernorm test_groupnorm_fp32)
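One linkage detail worth noting: the renamed test_layernorm2d_* executables still link only against utility, while the new test_groupnorm_* executables also link device_normalization_instance. That is consistent with profile_groupnorm_impl enumerating pre-built normalization instances (the inst_ptr loop at the top of this page), which live in that instance library, whereas the layernorm2d tests instantiate their kernel configurations directly from the typed-test tuples below.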
test/layernorm/test_groupnorm_fp16.cpp
0 → 100644
View file @ 24f99138
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/include/profile_groupnorm_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestGroupnorm : public ::testing::Test
{
    protected:
    using XDataType     = std::tuple_element_t<0, Tuple>;
    using GammaDataType = std::tuple_element_t<1, Tuple>;
    using BetaDataType  = std::tuple_element_t<2, Tuple>;
    using AccDataType   = std::tuple_element_t<3, Tuple>;
    using YDataType     = std::tuple_element_t<4, Tuple>;

    void Run()
    {
        // N, H, W, G, C
        std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
                                                         {1, 2, 3, 4, 5},
                                                         {256, 9, 9, 9, 9},
                                                         {1, 64, 64, 32, 10},
                                                         {1, 32, 32, 32, 20},
                                                         {1, 16, 16, 32, 40}};

        for(auto length : lengths)
        {
            bool success =
                ck::profiler::profile_groupnorm_impl<XDataType,
                                                     GammaDataType,
                                                     BetaDataType,
                                                     AccDataType,
                                                     YDataType>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>,
    std::tuple<F16, F16, F16, F32, F16>>;

TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
TYPED_TEST(TestGroupnorm, Test_FP16) { this->Run(); }
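For reference, the positional arguments passed to profile_groupnorm_impl in Run() line up with the call in the profiler entry point above: do_verification = true, init_method = 2 (decimal-value initialization per the help text), do_log = false, time_kernel = false, followed by the N, H, W, G, C extents. Every listed shape is therefore verified against the CPU reference without kernel timing or tensor dumps.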
test/layernorm/test_groupnorm_fp32.cpp
0 → 100644
View file @ 24f99138
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "gtest/gtest.h"
#include "profiler/include/profile_groupnorm_impl.hpp"

using F16 = ck::half_t;
using F32 = float;
using ck::index_t;

template <typename Tuple>
class TestGroupnorm : public ::testing::Test
{
    protected:
    using XDataType     = std::tuple_element_t<0, Tuple>;
    using GammaDataType = std::tuple_element_t<1, Tuple>;
    using BetaDataType  = std::tuple_element_t<2, Tuple>;
    using AccDataType   = std::tuple_element_t<3, Tuple>;
    using YDataType     = std::tuple_element_t<4, Tuple>;

    void Run()
    {
        // N, H, W, G, C
        std::vector<std::vector<ck::index_t>> lengths = {{1, 1, 1, 1, 1},
                                                         {1, 2, 3, 4, 5},
                                                         {256, 9, 9, 9, 9},
                                                         {1, 64, 64, 32, 10},
                                                         {1, 32, 32, 32, 20},
                                                         {1, 16, 16, 32, 40}};

        for(auto length : lengths)
        {
            bool success =
                ck::profiler::profile_groupnorm_impl<XDataType,
                                                     GammaDataType,
                                                     BetaDataType,
                                                     AccDataType,
                                                     YDataType>(true, 2, false, false, length);
            EXPECT_TRUE(success);
        }
    }
};

using KernelTypes = ::testing::Types<
    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType>
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>,
    std::tuple<F32, F32, F32, F32, F32>>;

TYPED_TEST_SUITE(TestGroupnorm, KernelTypes);
TYPED_TEST(TestGroupnorm, Test_FP32) { this->Run(); }
test/layernorm/test_layernorm_fp16.cpp → test/layernorm/test_layernorm2d_fp16.cpp
View file @ 24f99138
@@ -2,28 +2,28 @@
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "gtest/gtest.h"
-#include "test_layernorm_util.hpp"
+#include "test_layernorm2d_util.hpp"
 
 template <ck::index_t N>
 using I = ck::Number<N>;
 
 template <typename Tuple>
-class TestLayernormFP16 : public ck::TestLayernorm<Tuple>
+class TestLayernorm2dFP16 : public ck::TestLayernorm2d<Tuple>
 {
 };
 
 // clang-format off
 using KernelTypes = ::testing::Types<
-    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, , GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize>
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>,
-    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<8>, I<8>, I<8>>
+    // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize>
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<8>, I<32>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<4>, I<64>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<2>, I<128>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>,
+    std::tuple<ck::half_t, ck::half_t, ck::half_t, float, ck::half_t, I<2>, I<1>, I<256>, I<1>, I<256>, I<2>, I<8>, I<1>, I<8>, I<1>, I<8>, I<1>, I<8>, I<8>>
     >;
 // clang-format on
 
-TYPED_TEST_SUITE(TestLayernormFP16, KernelTypes);
+TYPED_TEST_SUITE(TestLayernorm2dFP16, KernelTypes);
-TYPED_TEST(TestLayernormFP16, Test_FP16) { this->Run(); }
+TYPED_TEST(TestLayernorm2dFP16, Test_FP16) { this->Run(); }