Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
7a3b49e5
Commit
7a3b49e5
authored
Jun 25, 2022
by
Chao Liu
Browse files
Merge remote-tracking branch 'origin/develop' into contraction
parents
e07b3d8e
d3051d75
Changes
592
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1095 additions
and
697 deletions
+1095
-697
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
...operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
+14
-16
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
...k/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
+38
-43
include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
+7
-5
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
...nsor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
+20
-20
include/ck/tensor_operation/gpu/device/device_reduce.hpp
include/ck/tensor_operation/gpu/device/device_reduce.hpp
+7
-6
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
...e/ck/tensor_operation/gpu/device/device_reduce_common.hpp
+7
-7
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
.../tensor_operation/gpu/device/device_reduce_multiblock.hpp
+30
-25
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
.../tensor_operation/gpu/device/device_reduce_threadwise.hpp
+11
-8
include/ck/tensor_operation/gpu/device/device_softmax.hpp
include/ck/tensor_operation/gpu/device/device_softmax.hpp
+206
-0
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
.../tensor_operation/gpu/device/device_unary_elementwise.hpp
+183
-0
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
...de/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+4
-3
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
...ensor_operation/gpu/device/reduction_operator_mapping.hpp
+111
-94
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+3
-0
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
...r_operation/gpu/element/binary_element_wise_operation.hpp
+149
-83
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
...k/tensor_operation/gpu/element/element_wise_operation.hpp
+124
-270
include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
...r_operation/gpu/element/element_wise_reduce_operation.hpp
+0
-10
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
...or_operation/gpu/element/unary_element_wise_operation.hpp
+123
-0
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+8
-8
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
...r_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
+26
-51
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
...r_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
+24
-48
No files found.
include/ck/tensor_operation/gpu/device/device_gemm_xdl_splitk_c_shuffle.hpp
View file @
7a3b49e5
#ifndef DEVICE_GEMM_XDL_SPLITK_C_SHUFFLE_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_GEMM_XDL_SPLITK_C_SHUFFLE_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "ck/utility/common_header.hpp"
#include "device_gemm.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "gridwise_gemm_xdlops_v2r4r2.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp"
#include "gemm_specialization.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
#ifndef CK_RUN_KERNEL_AND_TIME
#define CK_RUN_KERNEL_AND_TIME 1
#endif
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -641,4 +640,3 @@ struct DeviceGemmXdlSplitKCShuffle
...
@@ -641,4 +640,3 @@ struct DeviceGemmXdlSplitKCShuffle
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp
View file @
7a3b49e5
#ifndef DEVICE_GROUPED_GEMM_XDL_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_GROUPED_GEMM_XDL_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "ck/utility/common_header.hpp"
#include "device_gemm.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "gridwise_gemm_xdlops_v2r3.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
#include "gemm_specialization.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -362,7 +365,7 @@ struct DeviceGroupedGemmXdl
...
@@ -362,7 +365,7 @@ struct DeviceGroupedGemmXdl
{
{
grid_size_
=
0
;
grid_size_
=
0
;
gemm_descs_args
_workspace_
=
nullptr
;
p
_workspace_
=
nullptr
;
group_count_
=
ck
::
type_convert
<
ck
::
index_t
>
(
gemm_shapes
.
size
());
group_count_
=
ck
::
type_convert
<
ck
::
index_t
>
(
gemm_shapes
.
size
());
...
@@ -437,8 +440,6 @@ struct DeviceGroupedGemmXdl
...
@@ -437,8 +440,6 @@ struct DeviceGroupedGemmXdl
std
::
vector
<
GemmDescKernelArg
>
gemm_desc_kernel_arg_
;
std
::
vector
<
GemmDescKernelArg
>
gemm_desc_kernel_arg_
;
void
*
gemm_descs_args_workspace_
;
index_t
grid_size_
;
index_t
grid_size_
;
};
};
...
@@ -488,7 +489,7 @@ struct DeviceGroupedGemmXdl
...
@@ -488,7 +489,7 @@ struct DeviceGroupedGemmXdl
}
}
hipGetErrorString
(
hipGetErrorString
(
hipMemcpy
(
arg
.
gemm_descs_args
_workspace_
,
hipMemcpy
(
arg
.
p
_workspace_
,
arg
.
gemm_desc_kernel_arg_
.
data
(),
arg
.
gemm_desc_kernel_arg_
.
data
(),
arg
.
gemm_desc_kernel_arg_
.
size
()
*
sizeof
(
GemmDescKernelArg
),
arg
.
gemm_desc_kernel_arg_
.
size
()
*
sizeof
(
GemmDescKernelArg
),
hipMemcpyHostToDevice
));
hipMemcpyHostToDevice
));
...
@@ -507,17 +508,17 @@ struct DeviceGroupedGemmXdl
...
@@ -507,17 +508,17 @@ struct DeviceGroupedGemmXdl
CElementwiseOperation
,
CElementwiseOperation
,
true
>
;
true
>
;
ave_time
=
launch_and_time_kernel
(
ave_time
=
stream_config
,
launch_and_time_kernel
(
stream_config
,
kernel
,
kernel
,
dim3
(
arg
.
grid_size_
),
dim3
(
arg
.
grid_size_
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
cast_pointer_to_constant_address_space
(
arg
.
gemm_descs_args
_workspace_
),
cast_pointer_to_constant_address_space
(
arg
.
p
_workspace_
),
arg
.
gemm_desc_kernel_arg_
.
size
(),
arg
.
gemm_desc_kernel_arg_
.
size
(),
arg
.
a_element_op_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
);
arg
.
c_element_op_
);
}
}
else
else
{
{
...
@@ -531,17 +532,17 @@ struct DeviceGroupedGemmXdl
...
@@ -531,17 +532,17 @@ struct DeviceGroupedGemmXdl
CElementwiseOperation
,
CElementwiseOperation
,
false
>
;
false
>
;
ave_time
=
launch_and_time_kernel
(
ave_time
=
stream_config
,
launch_and_time_kernel
(
stream_config
,
kernel
,
kernel
,
dim3
(
arg
.
grid_size_
),
dim3
(
arg
.
grid_size_
),
dim3
(
BlockSize
),
dim3
(
BlockSize
),
0
,
0
,
cast_pointer_to_constant_address_space
(
arg
.
gemm_descs_args
_workspace_
),
cast_pointer_to_constant_address_space
(
arg
.
p
_workspace_
),
arg
.
gemm_desc_kernel_arg_
.
size
(),
arg
.
gemm_desc_kernel_arg_
.
size
(),
arg
.
a_element_op_
,
arg
.
a_element_op_
,
arg
.
b_element_op_
,
arg
.
b_element_op_
,
arg
.
c_element_op_
);
arg
.
c_element_op_
);
}
}
return
ave_time
;
return
ave_time
;
...
@@ -635,14 +636,8 @@ struct DeviceGroupedGemmXdl
...
@@ -635,14 +636,8 @@ struct DeviceGroupedGemmXdl
{
{
return
dynamic_cast
<
const
Argument
*>
(
p_arg
)
->
group_count_
*
sizeof
(
GemmDescKernelArg
);
return
dynamic_cast
<
const
Argument
*>
(
p_arg
)
->
group_count_
*
sizeof
(
GemmDescKernelArg
);
}
}
void
SetWorkSpacePointer
(
BaseArgument
*
p_arg
,
void
*
workspace_ptr
)
const
override
{
dynamic_cast
<
Argument
*>
(
p_arg
)
->
gemm_descs_args_workspace_
=
workspace_ptr
;
}
};
};
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp
View file @
7a3b49e5
#ifndef DEVICE_POOL2D_FWD_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_POOL2D_FWD_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <array>
#include <array>
#include "device_base.hpp"
#include "reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/reduction_enums.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -35,4 +38,3 @@ using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;
...
@@ -35,4 +38,3 @@ using DevicePool2dFwdPtr = std::unique_ptr<DevicePool2dFwd<ReduceOpId>>;
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp
View file @
7a3b49e5
#ifndef DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_POOL2D_FWD_NHWC_NHWC_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device_pool2d_fwd.hpp"
#include "tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -35,14 +40,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
...
@@ -35,14 +40,13 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
using
IndexDataType
=
int32_t
;
using
IndexDataType
=
int32_t
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
AccDataType
,
ReduceOpId
>::
opType
;
using
ReduceOperation
=
typename
reduce_binary_operator
<
ReduceOpId
>::
opType
;
using
InElementwiseOperation
=
using
InElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
InElementwiseOperation
;
using
AccElementwiseOperation
=
using
AccElementwiseOperation
=
typename
reduce_unary_operator
<
AccDataType
,
ReduceOpId
,
true
,
true
>::
typename
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
AccElementwiseOperation
;
AccElementwiseOperation
;
static
constexpr
index_t
InSrcOutDstVectorDim
=
static
constexpr
index_t
InSrcOutDstVectorDim
=
0
;
// for NHWC, the dim C is the vector Dim for both input and output in memory, which is
0
;
// for NHWC, the dim C is the vector Dim for both input and output in memory, which is
...
@@ -178,13 +182,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
...
@@ -178,13 +182,10 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
invariant_lowest_length_
=
C
;
invariant_lowest_length_
=
C
;
reduce_lowest_length_
=
window_spatial_lengths
[
1
];
reduce_lowest_length_
=
window_spatial_lengths
[
1
];
// TODO: is this correct?
int32_t
reduceLength
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
if
constexpr
(
ReduceOpId
==
ck
::
ReduceTensorOp
::
AVG
)
{
std
::
tie
(
in_element_op_
,
acc_element_op_
)
=
ck
::
index_t
divider
=
window_spatial_lengths
[
0
]
*
window_spatial_lengths
[
1
];
reduce_unary_operator
<
ReduceOpId
,
true
,
true
>::
GetElementwiseOperator
(
reduceLength
);
in_element_op_
=
InElementwiseOperation
{
divider
};
acc_element_op_
=
AccElementwiseOperation
{
divider
};
}
}
}
const
InDataType
*
p_in_dev_
;
const
InDataType
*
p_in_dev_
;
...
@@ -319,9 +320,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
...
@@ -319,9 +320,8 @@ struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd
return
str
.
str
();
return
str
.
str
();
}
}
};
// namespace device
};
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_reduce.hpp
View file @
7a3b49e5
#ifndef DEVICE_REDUCE_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <vector>
#include <memory>
#include <memory>
#include <iostream>
#include <iostream>
#include "common_header.hpp"
#include "
ck/utility/
common_header.hpp"
#include "
device_base
.hpp"
#include "
ck/utility/reduction_enums
.hpp"
#include "
reduction_enums
.hpp"
#include "
ck/tensor_operation/gpu/device/device_base
.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -41,4 +43,3 @@ using DeviceReducePtr =
...
@@ -41,4 +43,3 @@ using DeviceReducePtr =
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_reduce_common.hpp
View file @
7a3b49e5
#ifndef DEVICE_REDUCE_COMMON_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_COMMON_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <vector>
#include <cassert>
#include <cassert>
#include "common_header.hpp"
#include "
ck/utility/
common_header.hpp"
#include "reduction_enums.hpp"
#include "
ck/utility/
reduction_enums.hpp"
#include "reduction_operator.hpp"
#include "
ck/utility/
reduction_operator.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -85,6 +87,4 @@ std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origL
...
@@ -85,6 +87,4 @@ std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origL
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp
View file @
7a3b49e5
#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_MULTIBLOCK_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device.hpp"
#include "device_base.hpp"
#include "ck/utility/common_header.hpp"
#include "device_reduce.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "device_reduce_common.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "gridwise_set_buffer_value.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -61,12 +67,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -61,12 +67,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
static
constexpr
bool
use_multiblock
=
static
constexpr
bool
use_multiblock
=
(
OutMemoryDataOperation
==
InMemoryDataOperationEnum
::
AtomicAdd
);
(
OutMemoryDataOperation
==
InMemoryDataOperationEnum
::
AtomicAdd
);
static
constexpr
bool
out_type_compatible_with_atomic_op
=
static_assert
(
ck
::
reduce
::
InMemoryDataOperatonSupportedOnDataType
<
OutMemoryDataOperation
,
std
::
is_same
<
OutDataType
,
float
>::
value
||
std
::
is_same
<
OutDataType
,
double
>::
value
;
OutDataType
>::
value
,
"The OutDataType must support the specified OutMemoryDataOperation!"
);
static_assert
(
!
use_multiblock
||
(
use_multiblock
&&
out_type_compatible_with_atomic_op
),
"The OutDataType must support the atomic operation for using MultiBlock reduction"
);
static_assert
(
!
use_multiblock
||
(
use_multiblock
&&
!
OutputIndex
),
static_assert
(
!
use_multiblock
||
(
use_multiblock
&&
!
OutputIndex
),
"MultiBlock reduction can only be used when outputing index is not required"
);
"MultiBlock reduction can only be used when outputing index is not required"
);
...
@@ -348,8 +351,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -348,8 +351,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
if
constexpr
(
use_multiblock
)
if
constexpr
(
use_multiblock
)
{
{
const
auto
zero
Val
=
const
auto
identity
Val
=
ck
::
reduce
::
Get
ReductionZero
ValueForInMemoryDataOperation
<
OutDataType
>
(
ck
::
reduce
::
Get
Identity
ValueForInMemoryDataOperation
<
OutDataType
>
(
OutMemoryDataOperation
);
OutMemoryDataOperation
);
const
auto
kernel_pre
=
const
auto
kernel_pre
=
...
@@ -362,7 +365,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -362,7 +365,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
0
,
0
,
out_grid_desc_m_2
,
out_grid_desc_m_2
,
arg
.
out_dev_
,
arg
.
out_dev_
,
zero
Val
);
identity
Val
);
};
};
avg_time
+=
launch_and_time_kernel
(
stream_config
,
avg_time
+=
launch_and_time_kernel
(
stream_config
,
...
@@ -393,10 +396,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -393,10 +396,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
};
};
};
};
bool
IsSupportedArgument
(
const
Base
Argument
*
p
_a
rg
)
override
static
bool
IsSupportedArgument
(
const
Argument
*
p
A
rg
)
{
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
constexpr
(
use_multiblock
)
if
constexpr
(
use_multiblock
)
{
{
if
(
static_cast
<
float
>
(
pArg
->
beta_
)
!=
0.0
f
)
if
(
static_cast
<
float
>
(
pArg
->
beta_
)
!=
0.0
f
)
...
@@ -445,11 +446,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -445,11 +446,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
else
else
{
{
// cases with very small reduce_total_length should be handled by ThreadWise kernel
// cases with very small reduce_total_length should be handled by ThreadWise kernel
if
(
pArg
->
reduce_total_length
/
KThreadSliceSize
<
2
)
//
if(pArg->reduce_total_length / KThreadSliceSize < 2)
return
(
false
);
//
return (false);
};
};
return
(
true
);
return
(
true
);
}
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
return
IsSupportedArgument
(
dynamic_cast
<
const
Argument
*>
(
p_arg
));
};
};
std
::
unique_ptr
<
BaseArgument
>
std
::
unique_ptr
<
BaseArgument
>
...
@@ -492,7 +498,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -492,7 +498,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
auto
str
=
std
::
stringstream
();
auto
str
=
std
::
stringstream
();
// clang-format off
// clang-format off
str
<<
"DeviceReduceMultiBlock
AtomicAdd
<"
<<
BlockSize
<<
","
;
str
<<
(
OutMemoryDataOperation
==
InMemoryDataOperationEnum
::
Set
?
"DeviceReduceBlockWise<"
:
"DeviceReduceMultiBlock<"
)
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"InSrcVectorDim_"
<<
InSrcVectorDim
<<
"_InSrcVectorSize_"
<<
InSrcVectorSize
<<
"_OutDstVectorSize_"
<<
OutDstVectorSize
<<
">"
;
str
<<
"InSrcVectorDim_"
<<
InSrcVectorDim
<<
"_InSrcVectorSize_"
<<
InSrcVectorSize
<<
"_OutDstVectorSize_"
<<
OutDstVectorSize
<<
">"
;
...
@@ -505,4 +511,3 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -505,4 +511,3 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp
View file @
7a3b49e5
#ifndef DEVICE_REDUCE_THREADWISE_HPP
// SPDX-License-Identifier: MIT
#define DEVICE_REDUCE_THREADWISE_HPP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <iostream>
#include <sstream>
#include <sstream>
#include "device.hpp"
#include "device_reduce.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "device_reduce_common.hpp"
#include "ck/device_utility/kernel_launch.hpp"
#include "gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "gridwise_2d_reduction_threadwise.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -370,4 +374,3 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
...
@@ -370,4 +374,3 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/device_softmax.hpp
0 → 100644
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <sstream>
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
InDataType
,
typename
AccDataType
,
typename
OutDataType
,
index_t
Rank
,
index_t
NumReduceDim
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorSize
,
index_t
OutDstVectorSize
>
struct
DeviceSoftmax
:
public
BaseOperator
{
using
PassThrough
=
tensor_operation
::
element_wise
::
PassThrough
;
// Used for freeloading of some handy functions from DeviceReduceMultiBlock
using
Reduction
=
DeviceReduceMultiBlock
<
InDataType
,
AccDataType
,
OutDataType
,
Rank
,
NumReduceDim
,
reduce
::
Add
,
PassThrough
,
// InElementwiseOperation
PassThrough
,
// AccElementwiseOperation
InMemoryDataOperationEnum
::
Set
,
false
,
// PropagateNan
false
,
// OutputIndex
false
,
// HaveIndexInputIfOutputIndex
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
InSrcVectorDim
,
InSrcVectorSize
,
1
>
;
// OutDstVectorSize
using
GridDesc_M_K
=
decltype
(
Reduction
::
MakeSrc2dDescriptor
({
1
},
{
1
},
1
,
1
));
using
GridwiseReduce
=
GridwiseSoftmax_mk_to_mk
<
InDataType
,
OutDataType
,
AccDataType
,
GridDesc_M_K
,
BlockSize
,
MThreadClusterSize
,
KThreadClusterSize
,
MThreadSliceSize
,
KThreadSliceSize
,
InSrcVectorDim
,
InSrcVectorSize
,
OutDstVectorSize
>
;
struct
Argument
:
public
Reduction
::
Argument
{
Argument
(
const
std
::
vector
<
index_t
>
inLengths
,
const
std
::
vector
<
index_t
>
inStrides
,
const
std
::
vector
<
index_t
>
reduceDims
,
AccDataType
alpha
,
AccDataType
beta
,
const
InDataType
*
in_dev
,
OutDataType
*
out_dev
)
:
Reduction
::
Argument
(
inLengths
,
inStrides
,
{},
{},
reduceDims
,
0.0
f
,
// alpha
0.0
f
,
// beta
in_dev
,
nullptr
,
out_dev
,
nullptr
,
PassThrough
{},
PassThrough
{}),
// FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
// float32 precision. Make it support any data type so the fields can be removed.
alpha_
(
alpha
),
beta_
(
beta
)
{
// std::cout << "blkGroupSize= " << this->blkGroupSize
// << ", numBlockTileIteration= " << this->numBlockTileIteration
// << ", gridSize=" << this->gridSize
// << ", invariant_total_length=" << this->invariant_total_length <<
// std::endl;
}
AccDataType
alpha_
;
AccDataType
beta_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
const
auto
in_grid_desc_m_k
=
Reduction
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
const
auto
out_grid_desc_m_k
=
Reduction
::
MakeSrc2dDescriptor
(
arg
.
inLengths_
,
arg
.
inStrides_
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
);
const
auto
kernel_main
=
kernel_softmax
<
GridwiseReduce
,
InDataType
,
OutDataType
,
AccDataType
,
GridDesc_M_K
>
;
float
avg_time
=
0
;
avg_time
+=
launch_and_time_kernel
(
stream_config
,
kernel_main
,
dim3
(
arg
.
gridSize
),
dim3
(
BlockSize
),
0
,
in_grid_desc_m_k
,
out_grid_desc_m_k
,
arg
.
blkGroupSize
,
arg
.
numBlockTileIteration
,
arg
.
alpha_
,
arg
.
in_dev_
,
arg
.
beta_
,
arg
.
out_dev_
);
return
(
avg_time
);
};
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
};
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
p_arg_
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
!
Reduction
::
IsSupportedArgument
(
p_arg_
))
{
return
false
;
}
if
(
p_arg_
->
inLengths_
[
Rank
-
1
]
%
OutDstVectorSize
!=
0
)
{
return
false
;
}
return
true
;
};
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
std
::
vector
<
index_t
>
inLengths
,
const
std
::
vector
<
index_t
>
inStrides
,
const
std
::
vector
<
int
>
reduceDims
,
AccDataType
alpha
,
AccDataType
beta
,
const
void
*
in_dev
,
void
*
out_dev
)
{
return
std
::
make_unique
<
Argument
>
(
inLengths
,
inStrides
,
reduceDims
,
alpha
,
beta
,
static_cast
<
const
InDataType
*>
(
in_dev
),
static_cast
<
OutDataType
*>
(
out_dev
));
};
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
();
};
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceReduceSoftmax<"
<<
BlockSize
<<
","
;
str
<<
"M_C"
<<
MThreadClusterSize
<<
"_S"
<<
MThreadSliceSize
<<
","
;
str
<<
"K_C"
<<
KThreadClusterSize
<<
"_S"
<<
KThreadSliceSize
<<
","
;
str
<<
"InSrcVectorDim_"
<<
InSrcVectorDim
<<
"_InSrcVectorSize_"
<<
InSrcVectorSize
<<
"_OutDstVectorSize_"
<<
OutDstVectorSize
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp
0 → 100644
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
template
<
typename
ADataType
,
typename
BDataType
,
typename
ElementwiseFunctor
,
index_t
Dim
,
index_t
ScalarPerVector
>
struct
DeviceUnaryElementwise
:
public
BaseOperator
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
template
<
typename
Desc_M0
>
static
auto
PadDescriptor_M0_1d
(
Desc_M0
desc_m0
,
index_t
gridSize
,
index_t
blockSize
)
{
const
auto
m0
=
desc_m0
.
GetLength
(
I0
);
const
index_t
loop_step
=
gridSize
*
blockSize
*
ScalarPerVector
;
const
auto
pad
=
math
::
integer_least_multiple
(
m0
,
loop_step
)
-
m0
;
const
auto
desc_m0_pad
=
transform_tensor_descriptor
(
desc_m0
,
make_tuple
(
make_right_pad_transform
(
m0
,
pad
)),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
return
desc_m0_pad
;
}
static
auto
MakeDescriptor_M0
(
const
std
::
vector
<
index_t
>&
shape
,
const
std
::
vector
<
index_t
>&
stride
,
index_t
gridSize
,
index_t
blockSize
)
{
auto
tupleOfShape
=
generate_tuple
([
&
](
auto
I
)
{
return
shape
[
I
];
},
Number
<
Dim
>
{});
auto
tupleOfStride
=
generate_tuple
([
&
](
auto
I
)
{
return
stride
[
I
];
},
Number
<
Dim
>
{});
// nd desc - [s0, s1, s2, ...]
const
auto
desc
=
make_naive_tensor_descriptor
(
tupleOfShape
,
tupleOfStride
);
// merge nd to 1d desc - [s0 * s1 * ...]
if
constexpr
(
Dim
>
1
)
{
const
auto
desc_m0
=
transform_tensor_descriptor
(
desc
,
make_tuple
(
make_merge_transform
(
tupleOfShape
)),
make_tuple
(
generate_sequence_v2
([
&
](
auto
I
)
{
return
I
;
},
Number
<
Dim
>
{})),
make_tuple
(
Sequence
<
0
>
{}));
return
PadDescriptor_M0_1d
(
desc_m0
,
gridSize
,
blockSize
);
}
else
return
PadDescriptor_M0_1d
(
desc
,
gridSize
,
blockSize
);
}
using
GridDesc_M0
=
decltype
(
MakeDescriptor_M0
({
1
,
1
},
{
1
,
1
},
1
,
1
));
using
GridwiseUEltwise
=
GridwiseUnaryElementwise_1D
<
ADataType
,
BDataType
,
GridDesc_M0
,
ElementwiseFunctor
,
ScalarPerVector
>
;
struct
Argument
:
public
BaseArgument
{
Argument
(
const
ADataType
*
p_a
,
BDataType
*
p_b
,
const
std
::
vector
<
index_t
>&
shape
,
const
std
::
vector
<
index_t
>&
stride_a
,
const
std
::
vector
<
index_t
>&
stride_b
,
ElementwiseFunctor
functor
)
:
p_a_
(
p_a
),
p_b_
(
p_b
),
shape_
(
shape
),
functor_
(
functor
),
blockSize_
(
256
)
// FIXME - Calculate the grid size by number of CU in the future
{
index_t
tensor_size
=
std
::
accumulate
(
shape
.
begin
(),
shape
.
end
(),
1
,
std
::
multiplies
<
int
>
{});
gridSize_
=
GridwiseUEltwise
::
CalculateGridSize
(
tensor_size
);
a_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_a
,
gridSize_
,
blockSize_
);
b_grid_desc_m0_
=
MakeDescriptor_M0
(
shape
,
stride_b
,
gridSize_
,
blockSize_
);
}
const
ADataType
*
p_a_
;
BDataType
*
p_b_
;
std
::
vector
<
int
>
shape_
;
GridDesc_M0
a_grid_desc_m0_
;
GridDesc_M0
b_grid_desc_m0_
;
ElementwiseFunctor
functor_
;
index_t
blockSize_
;
index_t
gridSize_
;
};
struct
Invoker
:
public
BaseInvoker
{
float
Run
(
const
Argument
&
arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
{
const
auto
kernel
=
kernel_unary_elementwise_1d
<
GridwiseUEltwise
,
ADataType
,
BDataType
,
GridDesc_M0
,
ElementwiseFunctor
>
;
float
elapsed_time
=
launch_and_time_kernel
(
stream_config
,
kernel
,
dim3
(
arg
.
gridSize_
),
dim3
(
arg
.
blockSize_
),
0
,
arg
.
p_a_
,
arg
.
p_b_
,
arg
.
a_grid_desc_m0_
,
arg
.
b_grid_desc_m0_
,
arg
.
functor_
);
return
elapsed_time
;
}
// polymorphic
float
Run
(
const
BaseArgument
*
p_arg
,
const
StreamConfig
&
stream_config
=
StreamConfig
{})
override
{
return
Run
(
*
dynamic_cast
<
const
Argument
*>
(
p_arg
),
stream_config
);
}
};
bool
IsSupportedArgument
(
const
BaseArgument
*
p_arg
)
override
{
const
Argument
*
pArg
=
dynamic_cast
<
const
Argument
*>
(
p_arg
);
if
(
pArg
==
nullptr
)
return
false
;
if
(
pArg
->
shape_
.
back
()
%
ScalarPerVector
!=
0
)
return
false
;
return
true
;
};
std
::
unique_ptr
<
BaseArgument
>
MakeArgumentPointer
(
const
void
*
p_a
,
void
*
p_b
,
std
::
vector
<
index_t
>
shape
,
std
::
vector
<
index_t
>
stride_a
,
std
::
vector
<
index_t
>
stride_b
,
ElementwiseFunctor
functor
)
{
return
std
::
make_unique
<
Argument
>
(
static_cast
<
const
ADataType
*>
(
p_a
),
static_cast
<
BDataType
*>
(
p_b
),
shape
,
stride_a
,
stride_b
,
functor
);
}
std
::
unique_ptr
<
BaseInvoker
>
MakeInvokerPointer
()
{
return
std
::
make_unique
<
Invoker
>
();
}
std
::
string
GetTypeString
()
const
override
{
auto
str
=
std
::
stringstream
();
// clang-format off
str
<<
"DeviceBinaryElementwise"
<<
"<"
<<
"ScalarPerVector = "
<<
ScalarPerVector
<<
">"
;
// clang-format on
return
str
.
str
();
}
};
}
// namespace device
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
View file @
7a3b49e5
#ifndef GEMM_SPECIALIZATION
// SPDX-License-Identifier: MIT
#define GEMM_SPECIALIZATION
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
...
@@ -20,4 +22,3 @@ enum struct GemmSpecialization
...
@@ -20,4 +22,3 @@ enum struct GemmSpecialization
}
// namespace device
}
// namespace device
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
View file @
7a3b49e5
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
#pragma once
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
#include "ck/utility/reduction_operator.hpp"
* Permission is hereby granted, free of charge, to any person obtaining a copy
#include "ck/utility/reduction_enums.hpp"
* of this software and associated documentation files (the "Software"), to deal
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
* in the Software without restriction, including without limitation the rights
// FIXME: can it be replaced with ck::Tuple?
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#include <tuple>
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
#define CK_REDUCTION_OPERATOR_MAPPING_HPP
#include "reduction_operator.hpp"
#include "reduction_enums.hpp"
#include "element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -37,77 +16,69 @@ namespace ck {
...
@@ -37,77 +16,69 @@ namespace ck {
// The boolean member "indexable" are also provided in reduce_binary_operactor for
// The boolean member "indexable" are also provided in reduce_binary_operactor for
// easier checking by the upper-layer codes in the kernels.
// easier checking by the upper-layer codes in the kernels.
template
<
typename
T
,
ReduceTensorOp
Op
>
template
<
ReduceTensorOp
Op
>
struct
reduce_binary_operator
;
struct
reduce_binary_operator
;
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
ADD
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
ADD
>
{
{
using
opType
=
reduce
::
Add
<
T
>
;
using
opType
=
reduce
::
Add
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
false
;
static
constexpr
bool
indexable
=
false
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
MUL
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MUL
>
{
{
using
opType
=
reduce
::
Mul
<
T
>
;
using
opType
=
reduce
::
Mul
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
false
;
static
constexpr
bool
indexable
=
false
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
MIN
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MIN
>
{
{
using
opType
=
reduce
::
Min
<
T
>
;
using
opType
=
reduce
::
Min
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
true
;
static
constexpr
bool
indexable
=
true
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
MAX
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MAX
>
{
{
using
opType
=
reduce
::
Max
<
T
>
;
using
opType
=
reduce
::
Max
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
true
;
static
constexpr
bool
indexable
=
true
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
AMAX
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
AMAX
>
{
{
using
opType
=
reduce
::
AMax
<
T
>
;
using
opType
=
reduce
::
AMax
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
true
;
static
constexpr
bool
indexable
=
true
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
AVG
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
AVG
>
{
{
using
opType
=
reduce
::
Add
<
T
>
;
using
opType
=
reduce
::
Add
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
false
;
static
constexpr
bool
indexable
=
false
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
NORM1
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
NORM1
>
{
{
using
opType
=
reduce
::
Add
<
T
>
;
using
opType
=
reduce
::
Add
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
false
;
static
constexpr
bool
indexable
=
false
;
};
};
template
<
typename
T
>
template
<
>
struct
reduce_binary_operator
<
T
,
ReduceTensorOp
::
NORM2
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
NORM2
>
{
{
using
opType
=
reduce
::
Add
<
T
>
;
using
opType
=
reduce
::
Add
;
using
dataType
=
T
;
static
constexpr
bool
indexable
=
false
;
static
constexpr
bool
indexable
=
false
;
};
};
...
@@ -115,55 +86,101 @@ struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
...
@@ -115,55 +86,101 @@ struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// functor classes.
// The two unary functors are called before and afer the Reduction is executed respectively
// The two unary functors are called before and afer the Reduction is executed respectively
template
<
typename
T
,
ReduceTensorOp
Op
,
bool
IsFirstReduce
,
bool
IsLastReduce
>
template
<
ReduceTensorOp
Op
,
bool
IsFirstReduce
,
bool
IsLastReduce
>
struct
reduce_unary_operator
struct
reduce_unary_operator
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
template
<
typename
T
,
bool
IsFirstReduce
>
template
<
bool
IsFirstReduce
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
AVG
,
IsFirstReduce
,
true
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
AVG
,
IsFirstReduce
,
true
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
,
true
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryDivide
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{
reduceLength
});
};
};
};
template
<
typename
T
,
bool
IsLastReduce
>
template
<
bool
IsLastReduce
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
NORM1
,
true
,
IsLastReduce
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
NORM1
,
true
,
IsLastReduce
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryAbs
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryAbs
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
template
<
typename
T
,
bool
IsLastReduce
>
template
<
bool
IsLastReduce
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
AMAX
,
true
,
IsLastReduce
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
AMAX
,
true
,
IsLastReduce
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryAbs
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryAbs
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
template
<
typename
T
>
template
<
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
NORM2
,
true
,
false
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
NORM2
,
true
,
false
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySquare
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySquare
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
template
<
typename
T
>
template
<
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
NORM2
,
true
,
true
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
NORM2
,
true
,
true
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySquare
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySquare
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySqrt
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySqrt
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
template
<
typename
T
>
template
<
>
struct
reduce_unary_operator
<
T
,
ReduceTensorOp
::
NORM2
,
false
,
true
>
struct
reduce_unary_operator
<
ReduceTensorOp
::
NORM2
,
false
,
true
>
{
{
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
UnaryIdentic
<
T
,
T
>
;
using
InElementwiseOperation
=
tensor_operation
::
element_wise
::
PassThrough
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySqrt
<
T
,
T
>
;
using
AccElementwiseOperation
=
tensor_operation
::
element_wise
::
UnarySqrt
;
static
std
::
tuple
<
InElementwiseOperation
,
AccElementwiseOperation
>
GetElementwiseOperator
(
int32_t
reduceLength
)
{
(
void
)
reduceLength
;
return
std
::
make_tuple
(
InElementwiseOperation
{},
AccElementwiseOperation
{});
};
};
};
}
// end of namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/device/tensor_layout.hpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
namespace
ck
{
namespace
ck
{
...
...
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
View file @
7a3b49e5
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
* Copyright (c) 2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#pragma once
#pragma once
#include "data_type.hpp"
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
binary_element_wise
{
namespace
element_wise
{
template
<
typename
Y
,
typename
X1
,
typename
X2
>
struct
Add
;
template
<
>
struct
Add
struct
Add
<
double
,
double
,
double
>
{
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
double
&
dst
,
const
double
&
src1
,
const
double
&
src2
)
const
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
{
dst
=
src1
+
src2
;
y
=
x0
+
x1
;
}
};
};
template
<
>
template
<
>
struct
Add
<
float
,
float
,
float
>
{
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
float
&
dst
,
const
float
&
src1
,
const
float
&
src2
)
const
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
{
dst
=
src1
+
src2
;
y
=
x0
+
x1
;
}
};
};
template
<
>
// Question: should half_t be supported ?
struct
Add
<
half_t
,
half_t
,
half_t
>
template
<
>
{
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
x0
+
x1
;
};
// Question: should bhalf_t be supported ?
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
half_t
&
dst
,
const
half_t
&
src1
,
const
half_t
&
src2
)
const
operator
()
<
bhalf_t
>
(
b
half_t
&
y
,
const
b
half_t
&
x0
,
const
b
half_t
&
x1
)
const
{
{
dst
=
src1
+
src2
;
const
float
x1_tmp
=
ck
::
type_convert
<
float
>
(
x0
);
const
float
x2_tmp
=
ck
::
type_convert
<
float
>
(
x1
);
const
float
y_tmp
=
x1_tmp
+
x2_tmp
;
y
=
ck
::
type_convert
<
bhalf_t
>
(
y_tmp
);
}
}
};
};
template
<
>
struct
Subtract
struct
Add
<
bhalf_t
,
bhalf_t
,
bhalf_t
>
{
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
bhalf_t
&
dst
,
const
bhalf_t
&
src1
,
const
bhalf_t
&
src2
)
const
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
{
const
float
x1
=
ck
::
type_convert
<
float
>
(
src1
);
y
=
x0
-
x1
;
const
float
x2
=
ck
::
type_convert
<
float
>
(
src2
);
};
const
float
y
=
x1
+
x2
;
dst
=
ck
::
type_convert
<
bhalf_t
>
(
y
);
template
<
>
}
__host__
__device__
constexpr
void
};
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
y
=
x0
-
x1
;
};
template
<
typename
Y
,
typename
X1
,
typename
X2
>
// Question: should half_t be supported ?
struct
Substract
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
x0
-
x1
;
};
template
<
>
// Question: should bhalf_t be supported ?
struct
Substract
<
double
,
double
,
double
>
template
<
>
{
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
double
&
dst
,
const
double
&
src1
,
const
double
&
src2
)
const
operator
()
<
bhalf_t
>
(
bhalf_t
&
y
,
const
bhalf_t
&
x0
,
const
bhalf_t
&
x1
)
const
{
{
dst
=
src1
-
src2
;
const
float
x1_tmp
=
ck
::
type_convert
<
float
>
(
x0
);
const
float
x2_tmp
=
ck
::
type_convert
<
float
>
(
x1
);
const
float
y_tmp
=
x1_tmp
-
x2_tmp
;
y
=
ck
::
type_convert
<
bhalf_t
>
(
y_tmp
);
}
}
};
};
template
<
>
struct
AlphaBetaAdd
struct
Substract
<
float
,
float
,
float
>
{
{
AlphaBetaAdd
(
float
alpha
,
float
beta
)
:
alpha_
(
alpha
),
beta_
(
beta
){};
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()(
float
&
dst
,
const
float
&
src1
,
const
float
&
src2
)
const
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
{
dst
=
src1
-
src2
;
y
=
alpha_
*
x0
+
beta_
*
x1
;
}
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
y
=
static_cast
<
double
>
(
alpha_
)
*
x0
+
static_cast
<
double
>
(
beta_
)
*
x1
;
};
// Question: should half_t be supported ?
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
static_cast
<
half_t
>
(
alpha_
*
static_cast
<
float
>
(
x0
)
+
beta_
*
static_cast
<
float
>
(
x1
));
};
float
alpha_
;
float
beta_
;
};
};
template
<
>
struct
AddRelu
struct
Substract
<
half_t
,
half_t
,
half_t
>
{
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
half_t
&
dst
,
const
half_t
&
src1
,
const
half_t
&
src2
)
const
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
{
dst
=
src1
-
src2
;
const
float
a
=
x0
+
x1
;
}
y
=
a
>
0.0
f
?
a
:
0.0
f
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
const
double
a
=
x0
+
x1
;
y
=
a
>
0.0
?
a
:
0.0
;
};
// Question: should half_t be supported ?
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
const
half_t
a
=
x0
+
x1
;
y
=
a
>
static_cast
<
half_t
>
(
0.0
f
)
?
a
:
static_cast
<
half_t
>
(
0.0
f
);
};
};
};
template
<
>
struct
AddHardswish
struct
Substract
<
bhalf_t
,
bhalf_t
,
bhalf_t
>
{
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
__host__
__device__
constexpr
void
operator
()
(
bhalf_t
&
dst
,
const
bhalf_t
&
src1
,
const
bhalf_t
&
src2
)
const
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
{
const
float
x1
=
ck
::
type_convert
<
float
>
(
src1
);
float
a
=
x0
+
x1
;
const
float
x2
=
ck
::
type_convert
<
float
>
(
src2
);
float
b
=
a
+
float
{
3
};
const
float
y
=
x1
-
x2
;
float
c
=
(
b
>
0
)
*
(
b
>
6.0
f
?
6.0
f
:
b
)
*
a
*
0.166667
f
;
dst
=
ck
::
type_convert
<
bhalf_t
>
(
y
);
y
=
c
;
}
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
double
a
=
x0
+
x1
;
double
b
=
a
+
3.0
;
double
c
=
(
b
>
0
)
*
(
b
>
6.0
?
6.0
:
b
)
*
a
*
0.166667
;
y
=
c
;
};
// Question: should half_t be supported ?
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
3.0
f
;
float
c
=
(
b
>
0
)
*
(
b
>
6.0
f
?
6.0
f
:
b
)
*
a
*
0.166667
f
;
y
=
c
;
};
};
};
}
// namespace
binary_
element_wise
}
// namespace element_wise
}
// namespace tensor_operation
}
// namespace tensor_operation
}
// namespace ck
}
// namespace ck
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
element_wise
{
namespace
element_wise
{
struct
PassThrough
// Need to ensure compiler will fail if there is no matching candidate, instead of compiler
{
// siliently do implicit type conversion
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
;
}
//
// Method 1:
__host__
__device__
void
operator
()(
half_t
&
y
,
const
half_t
&
x
)
const
{
y
=
x
;
}
//
// struct ExampleElementwiseOp
__host__
__device__
void
operator
()(
bhalf_t
&
y
,
const
bhalf_t
&
x
)
const
{
y
=
x
;
}
// {
// template<typename Y, typename X>
__host__
__device__
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
)
const
{
y
=
x
;
}
// __host__ __device__ constexpr void
// operator()(Y&, const X) const;
__host__
__device__
void
operator
()(
int8_t
&
y
,
const
int8_t
&
x
)
const
{
y
=
x
;
}
//
// template<>
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
x
;
}
// __host__ __device__ constexpr void
};
// operator()<half_t, half_t>(half_t& y, const half_t& x) const
// {
struct
Add
// }
{
// };
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
//
{
// Method 2:
y
=
x0
+
x1
;
//
}
// template <typename Y, typename X>
// struct ExampleElementwiseOp;
__host__
__device__
constexpr
void
//
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
// template <>
{
// struct ExampleElementwiseOp<float, ck::bhalf_t>
// FIXME - Use float (acc type) bias in the future.
// {
y
=
x0
+
x1
;
// __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
}
// {
};
// }
// };
struct
AlphaBetaAdd
{
AlphaBetaAdd
(
float
alpha
,
float
beta
)
:
alpha_
(
alpha
),
beta_
(
beta
)
{}
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
y
=
alpha_
*
x0
+
beta_
*
x1
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
// FIXME - Let x0 be acc type
y
=
static_cast
<
half_t
>
(
alpha_
*
static_cast
<
float
>
(
x0
)
+
beta_
*
static_cast
<
float
>
(
x1
));
}
float
alpha_
;
float
beta_
;
};
struct
AddRelu
{
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
const
float
a
=
x0
+
x1
;
y
=
a
>
0
?
a
:
0
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
const
half_t
a
=
x0
+
x1
;
y
=
a
>
0
?
a
:
0
;
}
};
struct
AddHardswish
{
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
y
=
c
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
y
=
c
;
}
};
struct
AddReluAdd
struct
AddReluAdd
{
{
__host__
__device__
constexpr
void
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
half_t
a
=
x0
+
x1
;
half_t
a
=
x0
+
x1
;
half_t
b
=
a
>
0
?
a
:
0
;
half_t
b
=
a
>
0
?
a
:
0
;
y
=
b
+
x2
;
y
=
b
+
x2
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
b
=
a
>
0
?
a
:
0
;
...
@@ -110,8 +69,9 @@ struct AddReluAdd
...
@@ -110,8 +69,9 @@ struct AddReluAdd
y
=
c
;
y
=
c
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
,
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
b
=
a
>
0
?
a
:
0
;
...
@@ -122,8 +82,14 @@ struct AddReluAdd
...
@@ -122,8 +82,14 @@ struct AddReluAdd
struct
AddHardswishAdd
struct
AddHardswishAdd
{
{
__host__
__device__
constexpr
void
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
b
=
a
+
float
{
3
};
...
@@ -132,8 +98,9 @@ struct AddHardswishAdd
...
@@ -132,8 +98,9 @@ struct AddHardswishAdd
y
=
d
;
y
=
d
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
b
=
a
+
float
{
3
};
...
@@ -143,208 +110,95 @@ struct AddHardswishAdd
...
@@ -143,208 +110,95 @@ struct AddHardswishAdd
}
}
};
};
struct
Normalize
// C = A * B
{
// E = FastGelu(C + D0 + D1)
Normalize
(
float
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
struct
AddAddFastGelu
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x
,
const
float
&
mean
,
const
float
&
mean_square
,
const
float
&
gamma
,
const
float
&
beta
)
const
{
float
variance
=
mean_square
-
(
mean
*
mean
);
y
=
((
x
-
mean
)
/
sqrtf
(
variance
+
epsilon_
))
*
gamma
+
beta
;
}
float
epsilon_
;
};
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template
<
typename
Y
,
typename
X
,
bool
HasDividing
=
false
>
struct
UnaryIdentic
;
template
<
>
struct
UnaryIdentic
<
float
,
float
,
false
>
{
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
template
<
typename
E
,
typename
C
,
typename
D0
,
typename
D1
>
__host__
__device__
void
operator
()(
E
&
,
const
C
&
,
const
D0
&
,
const
D1
&
)
const
;
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
;
};
};
template
<
>
template
<
>
struct
UnaryIdentic
<
float
,
float
,
true
>
__host__
__device__
void
operator
()
<
half_t
,
float
,
half_t
,
half_t
>
(
half_t
&
e
,
{
const
float
&
c
,
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
const
half_t
&
d0
,
const
half_t
&
d1
)
const
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
{
y
=
x
/
type_convert
<
float
>
(
divider_
);
// Fast GeLU
};
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
const
auto
fast_gelu
=
[
&
](
float
x
)
{
const
float
u
=
float
(
2
)
*
x
*
(
float
(
0.035677
)
*
x
*
x
+
float
(
0.797885
));
const
float
emu
=
exp
(
-
u
);
const
float
cdf
=
float
(
0.5
)
+
float
(
0.5
)
*
(
float
(
2
)
/
(
float
(
1
)
+
emu
)
-
float
(
1
));
return
x
*
cdf
;
};
int32_t
divider_
=
1
;
const
float
y
=
fast_gelu
(
c
+
float
(
d0
)
+
float
(
d1
));
};
template
<
>
e
=
type_convert
<
half_t
>
(
y
);
struct
UnaryIdentic
<
half_t
,
half_t
,
false
>
}
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
half_t
&
y
,
const
half_t
&
x
)
const
{
y
=
x
;
};
};
};
template
<
>
struct
Normalize
struct
UnaryIdentic
<
double
,
double
,
false
>
{
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
// FIXME: is double absolutely necessary?
Normalize
(
double
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
x
;
};
template
<
typename
T
>
};
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x
,
const
T
&
mean
,
const
T
&
mean_square
,
const
T
&
gamma
,
const
T
&
beta
)
const
;
template
<
>
struct
UnaryIdentic
<
double
,
double
,
true
>
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
>
(
float
&
y
,
const
float
&
x
,
const
float
&
mean
,
const
float
&
mean_square
,
const
float
&
gamma
,
const
float
&
beta
)
const
{
{
y
=
x
/
type_convert
<
double
>
(
divider_
);
using
ck
::
math
::
sqrt
;
};
int32_t
divider_
=
1
;
};
template
<
>
struct
UnaryIdentic
<
int32_t
,
int32_t
,
false
>
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
)
const
{
y
=
x
;
};
};
template
<
>
struct
UnaryIdentic
<
int32_t
,
int32_t
,
true
>
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
__host__
__device__
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
)
const
{
y
=
x
/
divider_
;
};
int32_t
divider_
=
1
;
};
template
<
>
struct
UnaryIdentic
<
int8_t
,
int8_t
,
false
>
{
__host__
__device__
UnaryIdentic
(
const
int8_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
int8_t
&
y
,
const
int8_t
&
x
)
const
{
y
=
x
;
};
};
template
<
typename
Y
,
typename
X
,
bool
HasDividing
=
false
>
struct
UnarySquare
;
template
<
>
struct
UnarySquare
<
float
,
float
,
false
>
{
__host__
__device__
UnarySquare
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
*
x
;
};
float
variance
=
mean_square
-
(
mean
*
mean
);
};
y
=
((
x
-
mean
)
/
sqrt
(
variance
+
static_cast
<
float
>
(
epsilon_
)))
*
gamma
+
beta
;
template
<
>
struct
UnarySquare
<
float
,
float
,
true
>
{
__host__
__device__
UnarySquare
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
*
x
/
type_convert
<
float
>
(
divider_
);
};
};
int32_t
divider_
=
1
;
template
<
>
};
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x
,
template
<
>
const
double
&
mean
,
struct
UnarySquare
<
double
,
double
,
false
>
const
double
&
mean_square
,
{
const
double
&
gamma
,
__host__
__device__
UnarySquare
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
const
double
&
beta
)
const
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
x
*
x
;
};
};
template
<
>
struct
UnarySquare
<
double
,
double
,
true
>
{
__host__
__device__
UnarySquare
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
{
y
=
x
*
x
/
type_convert
<
double
>
(
divider_
);
using
ck
::
math
::
sqrt
;
double
variance
=
mean_square
-
(
mean
*
mean
);
y
=
((
x
-
mean
)
/
sqrt
(
variance
+
epsilon_
))
*
gamma
+
beta
;
};
};
int32_t
divider_
=
1
;
// FIXME: is double absolutely necessary?
double
epsilon_
;
};
};
template
<
typename
Y
,
typename
X
>
template
<
typename
Y
,
typename
X
>
struct
UnaryAbs
;
struct
UnaryTypeConvert
;
template
<
>
struct
UnaryAbs
<
float
,
float
>
{
__host__
__device__
UnaryAbs
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
abs
(
x
);
};
};
template
<
>
struct
UnaryAbs
<
half_t
,
half_t
>
{
__host__
__device__
UnaryAbs
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
half_t
&
y
,
const
half_t
&
x
)
const
{
y
=
__habs
(
x
);
};
};
template
<
>
template
<
>
struct
Unary
Abs
<
double
,
double
>
struct
Unary
TypeConvert
<
float
,
ck
::
bhalf_t
>
{
{
__host__
__device__
UnaryAbs
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
float
&
y
,
ck
::
bhalf_t
&
x
)
const
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
abs
(
x
);
};
};
template
<
>
struct
UnaryAbs
<
int8_t
,
int8_t
>
{
__host__
__device__
UnaryAbs
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
int8_t
&
y
,
const
int8_t
&
x
)
const
{
{
int8_t
sgn
=
x
>>
(
8
-
1
);
y
=
ck
::
type_convert
<
float
,
ck
::
bhalf_t
>
(
x
);
}
y
=
(
x
^
sgn
)
-
sgn
;
};
};
template
<
typename
Y
,
typename
X
>
struct
UnarySqrt
;
template
<
>
struct
UnarySqrt
<
float
,
float
>
{
__host__
__device__
UnarySqrt
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
sqrtf
(
x
);
};
};
};
template
<
>
template
<
>
struct
Unary
Sqrt
<
double
,
double
>
struct
Unary
TypeConvert
<
ck
::
bhalf_t
,
float
>
{
{
__host__
__device__
UnarySqrt
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
ck
::
bhalf_t
&
y
,
float
&
x
)
const
{
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
sqrt
(
x
);
};
y
=
ck
::
type_convert
<
ck
::
bhalf_t
,
float
>
(
x
);
}
};
};
}
// namespace element_wise
}
// namespace element_wise
...
...
include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
deleted
100644 → 0
View file @
e07b3d8e
#pragma once
#include "data_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
element_wise
{
}
// namespace element_wise
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
0 → 100644
View file @
7a3b49e5
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
element_wise
{
struct
PassThrough
{
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
||
is_same
<
T
,
half_t
>::
value
||
is_same
<
T
,
bhalf_t
>::
value
||
is_same
<
T
,
int32_t
>::
value
||
is_same
<
T
,
int8_t
>::
value
,
"Data type is not supported by this operation!"
);
y
=
x
;
};
};
struct
UnaryDivide
{
__host__
__device__
UnaryDivide
(
const
int32_t
divider
=
1
)
:
divider_
(
divider
){};
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
||
is_same
<
T
,
int32_t
>::
value
,
"Data type is not supported by this operation!"
);
y
=
x
/
type_convert
<
T
>
(
divider_
);
};
int32_t
divider_
=
1
;
};
struct
UnarySquare
{
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
,
"Data type is not supported by this operation!"
);
y
=
x
*
x
;
};
};
struct
UnaryAbs
{
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
||
is_same
<
T
,
half_t
>::
value
||
is_same
<
T
,
int32_t
>::
value
||
is_same
<
T
,
int8_t
>::
value
,
"Data type is not supported by this operation!"
);
y
=
ck
::
math
::
abs
(
x
);
};
};
struct
UnarySqrt
{
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
,
"Data type is not supported by this operation!"
);
y
=
ck
::
math
::
sqrt
(
x
);
};
};
struct
Relu
{
template
<
typename
T
>
__host__
__device__
void
operator
()(
T
&
y
,
const
T
&
x
)
const
{
static_assert
(
is_same
<
T
,
float
>::
value
||
is_same
<
T
,
double
>::
value
||
is_same
<
T
,
half_t
>::
value
||
is_same
<
T
,
int32_t
>::
value
||
is_same
<
T
,
int8_t
>::
value
,
"Data type is not supported by this operation!"
);
y
=
x
>
0
?
x
:
0
;
}
template
<
>
__host__
__device__
void
operator
()(
bhalf_t
&
y
,
const
bhalf_t
&
x
)
const
{
float
x_f32
=
ck
::
type_convert
<
float
>
(
x
);
float
y_f32
=
x_f32
>
0
?
x_f32
:
0
;
y
=
ck
::
type_convert
<
bhalf_t
>
(
y_f32
);
}
};
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
struct
FastGelu
{
template
<
typename
Y
,
typename
X
>
__host__
__device__
void
operator
()(
Y
&
y
,
const
X
&
x
)
const
;
template
<
>
__host__
__device__
void
operator
()
<
float
,
float
>
(
float
&
y
,
const
float
&
x
)
const
{
const
float
u
=
float
(
2
)
*
x
*
(
float
(
0.035677
)
*
x
*
x
+
float
(
0.797885
));
const
float
emu
=
exp
(
-
u
);
const
float
cdf
=
float
(
0.5
)
+
float
(
0.5
)
*
(
float
(
2
)
/
(
float
(
1
)
+
emu
)
-
float
(
1
));
y
=
x
*
cdf
;
}
};
}
// namespace element_wise
}
// namespace tensor_operation
}
// namespace ck
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
View file @
7a3b49e5
#ifndef UTILITY_BLOCK_TO_CTILE_MAP
// SPDX-License-Identifier: MIT
#define UTILITY_BLOCK_TO_CTILE_MAP
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include "utility/math.hpp"
#pragma once
#include "utility/number.hpp"
#include "tensor_description/tensor_adaptor.hpp"
#include "ck/utility/math.hpp"
#include "tensor_description/multi_index_transform_helper.hpp"
#include "ck/utility/number.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -485,5 +487,3 @@ __host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx,
...
@@ -485,5 +487,3 @@ __host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx,
}
}
}
// namespace ck
}
// namespace ck
#endif // UTILITY_BLOCK_TO_CTILE_MAP
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp
View file @
7a3b49e5
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
#pragma once
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
#include "ck/utility/reduction_common.hpp"
* Permission is hereby granted, free of charge, to any person obtaining a copy
#include "ck/utility/reduction_operator.hpp"
* of this software and associated documentation files (the "Software"), to deal
#include "ck/utility/reduction_functions_accumulate.hpp"
* in the Software without restriction, including without limitation the rights
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
* copies of the Software, and to permit persons to whom the Software is
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
* furnished to do so, subject to the following conditions:
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_blockwise.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -171,15 +147,15 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -171,15 +147,15 @@ struct GridwiseReduction_mk_to_m_multiblock
AccDataType
beta
,
AccDataType
beta
,
OutDataType
*
const
__restrict__
p_out_value_global
)
OutDataType
*
const
__restrict__
p_out_value_global
)
{
{
const
auto
zero
Val
=
ReduceOperation
::
GetReductionZeroVal
();
const
auto
identity
Val
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>
();
// LDS
// LDS
__shared__
AccDataType
p_reduce_work_buffer
[
BlockSize
];
__shared__
AccDataType
p_reduce_work_buffer
[
BlockSize
];
const
auto
in_global_val_buf
=
const
auto
in_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
ReduceOperation
::
template
GetIdentityValue
<
InDataType
>());
auto
out_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
auto
out_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_value_global
,
out_grid_desc_m
.
GetElementSpaceSize
());
p_out_value_global
,
out_grid_desc_m
.
GetElementSpaceSize
());
...
@@ -191,7 +167,7 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -191,7 +167,7 @@ struct GridwiseReduction_mk_to_m_multiblock
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zero
Val
;
});
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
identity
Val
;
});
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
...
@@ -358,12 +334,12 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -358,12 +334,12 @@ struct GridwiseReduction_mk_to_m_multiblock
__shared__
AccDataType
p_reduce_work_val_buffer
[
BlockSize
];
__shared__
AccDataType
p_reduce_work_val_buffer
[
BlockSize
];
__shared__
IndexDataType
p_reduce_work_idx_buffer
[
BlockSize
];
__shared__
IndexDataType
p_reduce_work_idx_buffer
[
BlockSize
];
const
auto
zero
Val
=
ReduceOperation
::
GetReductionZeroVal
();
const
auto
identity
Val
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>
();
const
auto
in_global_val_buf
=
const
auto
in_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
ReduceOperation
::
template
GetIdentityValue
<
InDataType
>());
const
auto
in_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
const
auto
in_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_index_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
());
p_in_index_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
());
auto
out_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
auto
out_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
...
@@ -418,7 +394,7 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -418,7 +394,7 @@ struct GridwiseReduction_mk_to_m_multiblock
thread_k_cluster_id
*
KThreadSliceSize
));
thread_k_cluster_id
*
KThreadSliceSize
));
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zero
Val
;
accu_value_buf
(
I
)
=
identity
Val
;
accu_index_buf
(
I
)
=
0
;
accu_index_buf
(
I
)
=
0
;
});
});
...
@@ -459,7 +435,7 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -459,7 +435,7 @@ struct GridwiseReduction_mk_to_m_multiblock
in_thread_idx_buf
);
in_thread_idx_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
AccDataType
tmpValue
=
zero
Val
;
AccDataType
tmpValue
=
identity
Val
;
IndexDataType
tmpIndex
=
0
;
IndexDataType
tmpIndex
=
0
;
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
...
@@ -512,7 +488,7 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -512,7 +488,7 @@ struct GridwiseReduction_mk_to_m_multiblock
in_thread_val_buf
(
Number
<
offset
>
{}));
in_thread_val_buf
(
Number
<
offset
>
{}));
});
});
AccDataType
tmpValue
=
zero
Val
;
AccDataType
tmpValue
=
identity
Val
;
IndexDataType
tmpIndex
=
0
;
IndexDataType
tmpIndex
=
0
;
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
...
@@ -635,4 +611,3 @@ struct GridwiseReduction_mk_to_m_multiblock
...
@@ -635,4 +611,3 @@ struct GridwiseReduction_mk_to_m_multiblock
};
};
}
// namespace ck
}
// namespace ck
#endif
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp
View file @
7a3b49e5
/*******************************************************************************
// SPDX-License-Identifier: MIT
*
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
* MIT License
*
#pragma once
* Copyright (c) 2021 Advanced Micro Devices, Inc.
*
#include "ck/utility/data_type.hpp"
* Permission is hereby granted, free of charge, to any person obtaining a copy
#include "ck/utility/reduction_common.hpp"
* of this software and associated documentation files (the "Software"), to deal
#include "ck/utility/reduction_operator.hpp"
* in the Software without restriction, including without limitation the rights
#include "ck/utility/reduction_functions_accumulate.hpp"
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
* copies of the Software, and to permit persons to whom the Software is
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
* furnished to do so, subject to the following conditions:
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
#include "data_type.hpp"
#include "reduction_common.hpp"
#include "reduction_operator.hpp"
#include "reduction_functions_accumulate.hpp"
#include "reduction_functions_threadwise.hpp"
#include "threadwise_tensor_slice_transfer.hpp"
#include "element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
...
@@ -135,12 +112,12 @@ struct GridwiseReduction_mk_to_m_threadwise
...
@@ -135,12 +112,12 @@ struct GridwiseReduction_mk_to_m_threadwise
ReduceOperation
,
ReduceOperation
,
PropagateNan
>
;
PropagateNan
>
;
const
auto
zero
Val
=
ReduceOperation
::
GetReductionZeroVal
();
const
auto
identity
Val
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>
();
const
auto
in_global_val_buf
=
const
auto
in_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
ReduceOperation
::
template
GetIdentityValue
<
InDataType
>());
auto
dst_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
auto
dst_global_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_value_global
,
out_grid_desc_m
.
GetElementSpaceSize
());
p_out_value_global
,
out_grid_desc_m
.
GetElementSpaceSize
());
...
@@ -149,7 +126,7 @@ struct GridwiseReduction_mk_to_m_threadwise
...
@@ -149,7 +126,7 @@ struct GridwiseReduction_mk_to_m_threadwise
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
accu_value_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zero
Val
;
});
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
identity
Val
;
});
const
auto
toReduceLength
=
in_grid_desc_m_k
.
GetLength
(
Number
<
1
>
{});
const
auto
toReduceLength
=
in_grid_desc_m_k
.
GetLength
(
Number
<
1
>
{});
...
@@ -276,12 +253,12 @@ struct GridwiseReduction_mk_to_m_threadwise
...
@@ -276,12 +253,12 @@ struct GridwiseReduction_mk_to_m_threadwise
(
void
)
acc_elementwise_op
;
(
void
)
acc_elementwise_op
;
const
auto
zero
Val
=
ReduceOperation
::
GetReductionZeroVal
();
const
auto
identity
Val
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>
();
const
auto
in_global_val_buf
=
const
auto
in_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
in_grid_desc_m_k
.
GetElementSpaceSize
(),
type_convert
<
InDataType
>
(
zeroVal
));
ReduceOperation
::
template
GetIdentityValue
<
InDataType
>());
const
auto
in_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
const
auto
in_global_idx_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_index_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
());
p_in_index_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
());
...
@@ -303,7 +280,7 @@ struct GridwiseReduction_mk_to_m_threadwise
...
@@ -303,7 +280,7 @@ struct GridwiseReduction_mk_to_m_threadwise
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
,
true
>
accu_index_buf
;
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
IndexDataType
,
MThreadSliceSize
,
true
>
accu_index_buf
;
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
I
)
{
accu_value_buf
(
I
)
=
zero
Val
;
accu_value_buf
(
I
)
=
identity
Val
;
accu_index_buf
(
I
)
=
0
;
accu_index_buf
(
I
)
=
0
;
});
});
...
@@ -495,4 +472,3 @@ struct GridwiseReduction_mk_to_m_threadwise
...
@@ -495,4 +472,3 @@ struct GridwiseReduction_mk_to_m_threadwise
};
};
}
// namespace ck
}
// namespace ck
#endif
Prev
1
…
4
5
6
7
8
9
10
11
12
…
30
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment