gaoqiong / composable_kernel

Commit 5d015452, authored Jul 06, 2022 by Chaitanya Inumella

    Rebased the hipTENSOR development branch with the contraction branch

Parents: b7fa6bb1, ed3feb4d
Changes: 425

Showing 20 changed files with 2100 additions and 618 deletions (+2100 / -618)
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp                      +30   -25
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp                      +11    -8
include/ck/tensor_operation/gpu/device/device_softmax.hpp                               +250    -0
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp                     +183    -0
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp                           +20    -3
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp                   +111   -94
include/ck/tensor_operation/gpu/device/tensor_layout.hpp                                  +3    -0
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp               +142   -83
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp                      +124  -270
include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp                 +0   -10
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp                +139    -0
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp                               +8    -8
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp                +26   -51
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp                +24   -48
include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp                     +7    -4
include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp                   +7    -4
include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp                  +3    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp  +995    -0
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp                           +14   -10
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp                         +3    -0
include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp (+30 -25)

-#ifndef DEVICE_REDUCE_MULTIBLOCK_HPP
-#define DEVICE_REDUCE_MULTIBLOCK_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

 #include <iostream>
 #include <sstream>

-#include "device.hpp"
-#include "device_base.hpp"
-#include "device_reduce.hpp"
-#include "device_reduce_common.hpp"
-#include "gridwise_2d_reduction_multiblock.hpp"
-#include "gridwise_set_buffer_value.hpp"
-#include "reduction_operator.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"

 namespace ck {
 namespace tensor_operation {
...
@@ -61,12 +67,9 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
     static constexpr bool use_multiblock =
         (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);

-    static constexpr bool out_type_compatible_with_atomic_op =
-        std::is_same<OutDataType, float>::value || std::is_same<OutDataType, double>::value;
-
-    static_assert(!use_multiblock || (use_multiblock && out_type_compatible_with_atomic_op),
-                  "The OutDataType must support the atomic operation for using MultiBlock reduction");
+    static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
+                                                                      OutDataType>::value,
+                  "The OutDataType must support the specified OutMemoryDataOperation!");

     static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                   "MultiBlock reduction can only be used when outputing index is not required");
...
@@ -348,8 +351,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
             if constexpr(use_multiblock)
             {
-                const auto zeroVal =
-                    ck::reduce::GetReductionZeroValueForInMemoryDataOperation<OutDataType>(
-                        OutMemoryDataOperation);
+                const auto identityVal =
+                    ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
+                        OutMemoryDataOperation);

                 const auto kernel_pre =
...
@@ -362,7 +365,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
                                                   0,
                                                   out_grid_desc_m_2,
                                                   arg.out_dev_,
-                                                  zeroVal);
+                                                  identityVal);
                 };

                 avg_time += launch_and_time_kernel(stream_config,
...
@@ -393,10 +396,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         };
     };

-    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    static bool IsSupportedArgument(const Argument* pArg)
     {
-        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);
-
         if constexpr(use_multiblock)
         {
             if(static_cast<float>(pArg->beta_) != 0.0f)
...
@@ -445,11 +446,16 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         else
         {
             // cases with very small reduce_total_length should be handled by ThreadWise kernel
-            if(pArg->reduce_total_length / KThreadSliceSize < 2)
-                return (false);
+            // if(pArg->reduce_total_length / KThreadSliceSize < 2)
+            //     return (false);
         };

         return (true);
-    };
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
+    };

     std::unique_ptr<BaseArgument>
...
@@ -492,7 +498,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
         auto str = std::stringstream();

         // clang-format off
-        str << "DeviceReduceMultiBlockAtomicAdd<" << BlockSize << ",";
+        str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set ? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
         str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
         str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
         str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
...
@@ -505,4 +511,3 @@ struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccE
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
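Note on the reworked assertion above: instead of hard-coding "AtomicAdd requires float or double output", the check is delegated to the ck::reduce::InMemoryDataOperatonSupportedOnDataType trait. A minimal standalone sketch of what such a compile-time guard expresses; the trait below is illustrative only and is not the library's definition:

    #include <type_traits>

    enum class InMemoryDataOperationEnum { Set, AtomicAdd };

    // Illustrative stand-in: plain Set accepts any type, AtomicAdd only
    // float/double (hardware atomic add support is assumed for those).
    template <InMemoryDataOperationEnum Op, typename T>
    struct InMemoryDataOperationSupportedOnDataType
        : std::bool_constant<Op == InMemoryDataOperationEnum::Set ||
                             std::is_same_v<T, float> || std::is_same_v<T, double>>
    {
    };

    static_assert(InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd, float>::value);
    static_assert(!InMemoryDataOperationSupportedOnDataType<InMemoryDataOperationEnum::AtomicAdd, int>::value);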
include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp (+11 -8)

-#ifndef DEVICE_REDUCE_THREADWISE_HPP
-#define DEVICE_REDUCE_THREADWISE_HPP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

 #include <iostream>
 #include <sstream>

-#include "device.hpp"
-#include "device_reduce.hpp"
-#include "device_reduce_common.hpp"
-#include "gridwise_2d_reduction_multiblock.hpp"
-#include "gridwise_2d_reduction_threadwise.hpp"
+#include "ck/device_utility/device_prop.hpp"
+#include "ck/device_utility/kernel_launch.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"

 namespace ck {
 namespace tensor_operation {
...
@@ -370,4 +374,3 @@ struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccE
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/device/device_softmax.hpp (new file, 0 → 100644, +250)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim, index_t BlockSize,
          index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceSoftmax : public DeviceNormalization
{
    static constexpr index_t kRank         = Rank;
    static constexpr index_t kNumReduceDim = NumReduceDim;

    virtual index_t GetRank() const override { return kRank; }

    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

    using PassThrough = tensor_operation::element_wise::PassThrough;

    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
    using Reduction =
        DeviceReduceMultiBlock<InDataType, AccDataType, OutDataType, Rank, NumReduceDim,
                               reduce::Add,
                               PassThrough, // InElementwiseOperation
                               PassThrough, // AccElementwiseOperation
                               InMemoryDataOperationEnum::Set,
                               false, // PropagateNan
                               false, // OutputIndex
                               false, // HaveIndexInputIfOutputIndex
                               BlockSize, MThreadClusterSize, KThreadClusterSize,
                               MThreadSliceSize, KThreadSliceSize,
                               InSrcVectorDim, InSrcVectorSize,
                               1>; // OutDstVectorSize

    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));

    using GridwiseSoftmaxGeneric =
        GridwiseSoftmax_mk_to_mk<InDataType, OutDataType, AccDataType, GridDesc_M_K,
                                 BlockSize, MThreadClusterSize, KThreadClusterSize,
                                 MThreadSliceSize, KThreadSliceSize,
                                 InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, false>;

    using GridwiseSoftmaxSweepOnce =
        GridwiseSoftmax_mk_to_mk<InDataType, OutDataType, AccDataType, GridDesc_M_K,
                                 BlockSize, MThreadClusterSize, KThreadClusterSize,
                                 MThreadSliceSize, KThreadSliceSize,
                                 InSrcVectorDim, InSrcVectorSize, OutDstVectorSize, true>;

    struct Argument : public Reduction::Argument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> reduceDims,
                 AccDataType alpha,
                 AccDataType beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev)
            : Reduction::Argument(inLengths, inStrides, {}, {}, reduceDims,
                                  0.0f, // alpha
                                  0.0f, // beta
                                  in_dev, nullptr, out_dev, nullptr,
                                  PassThrough{}, PassThrough{}),
              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
              // float32 precision. Make it support any data type so the fields can be removed.
              alpha_(alpha),
              beta_(beta)
        {
            // std::cout << "blkGroupSize= " << this->blkGroupSize
            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
            //           << ", gridSize=" << this->gridSize
            //           << ", invariant_total_length=" << this->invariant_total_length <<
            //           std::endl;
        }

        AccDataType alpha_;
        AccDataType beta_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);

            bool sweep_once =
                in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;

            const auto kernel_main =
                sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce, InDataType, OutDataType,
                                            AccDataType, GridDesc_M_K>
                           : kernel_softmax<GridwiseSoftmaxGeneric, InDataType, OutDataType,
                                            AccDataType, GridDesc_M_K>;

            float avg_time = 0;

            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
                                               in_grid_desc_m_k,
                                               out_grid_desc_m_k,
                                               arg.blkGroupSize,
                                               arg.numBlockTileIteration,
                                               arg.alpha_,
                                               arg.in_dev_,
                                               arg.beta_,
                                               arg.out_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        if(!Reduction::IsSupportedArgument(p_arg_))
        {
            return false;
        }

        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
        {
            return false;
        }

        return true;
    };

    // inLengths: input tensor extent(s) from high to low dimension
    // inStrides: input tensor stride(s) from high to low dimension
    // reduceDims: the dimension(s) the softmax normalization operate on
    // alpha: typeless pointer in host memory storing the alpha scaling value as type AccDataType
    // beta: typeless pointer in host memory storing the beta scaling value as type AccDataType
    // in_dev: typeless const pointer in device memory storing the input tensor
    // out_dev: typeless pointer in device memory storing the output tensor
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
                                                      const std::vector<index_t> inStrides,
                                                      const std::vector<int> reduceDims,
                                                      const void* alpha,
                                                      const void* beta,
                                                      const void* in_dev,
                                                      void* out_dev) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          reduceDims,
                                          *static_cast<const AccDataType*>(alpha),
                                          *static_cast<const AccDataType*>(beta),
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev));
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceSoftmax<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
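For reference, a rough sketch of how this new DeviceSoftmax interface might be driven from host code. The template parameter values, tensor shape, and the device buffers p_in_dev / p_out_dev below are illustrative assumptions, not taken from this commit, and error handling is omitted:

    // fp16 in/out, fp32 accumulation, rank-2 input, softmax over the last dimension.
    using Softmax = ck::tensor_operation::device::DeviceSoftmax<ck::half_t, float, ck::half_t,
                                                                2,   // Rank
                                                                1,   // NumReduceDim
                                                                256, // BlockSize
                                                                8, 32, 1, 8, 1, 8, 8>;

    Softmax op;
    float alpha = 1.0f, beta = 0.0f;
    std::vector<ck::index_t> lengths{1024, 2048}, strides{2048, 1};

    auto arg_ptr = op.MakeArgumentPointer(lengths, strides, {1}, &alpha, &beta,
                                          p_in_dev, p_out_dev); // assumed device pointers
    if(op.IsSupportedArgument(arg_ptr.get()))
        op.MakeInvokerPointer()->Run(arg_ptr.get());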
include/ck/tensor_operation/gpu/device/device_unary_elementwise.hpp (new file, 0 → 100644, +183)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <vector>

#include "ck/device_utility/device_prop.hpp"
#include "ck/device_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_unary_elementwise_1d.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename ADataType, typename BDataType, typename ElementwiseFunctor,
          index_t Dim, index_t ScalarPerVector>
struct DeviceUnaryElementwise : public BaseOperator
{
    static constexpr auto I0 = Number<0>{};

    template <typename Desc_M0>
    static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize)
    {
        const auto m0           = desc_m0.GetLength(I0);
        const index_t loop_step = gridSize * blockSize * ScalarPerVector;
        const auto pad          = math::integer_least_multiple(m0, loop_step) - m0;
        const auto desc_m0_pad =
            transform_tensor_descriptor(desc_m0,
                                        make_tuple(make_right_pad_transform(m0, pad)),
                                        make_tuple(Sequence<0>{}),
                                        make_tuple(Sequence<0>{}));
        return desc_m0_pad;
    }

    static auto MakeDescriptor_M0(const std::vector<index_t>& shape,
                                  const std::vector<index_t>& stride,
                                  index_t gridSize,
                                  index_t blockSize)
    {
        auto tupleOfShape  = generate_tuple([&](auto I) { return shape[I]; }, Number<Dim>{});
        auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number<Dim>{});

        // nd desc - [s0, s1, s2, ...]
        const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride);

        // merge nd to 1d desc - [s0 * s1 * ...]
        if constexpr(Dim > 1)
        {
            const auto desc_m0 = transform_tensor_descriptor(
                desc,
                make_tuple(make_merge_transform(tupleOfShape)),
                make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<Dim>{})),
                make_tuple(Sequence<0>{}));

            return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize);
        }
        else
            return PadDescriptor_M0_1d(desc, gridSize, blockSize);
    }

    using GridDesc_M0      = decltype(MakeDescriptor_M0({1, 1}, {1, 1}, 1, 1));
    using GridwiseUEltwise = GridwiseUnaryElementwise_1D<ADataType, BDataType, GridDesc_M0,
                                                         ElementwiseFunctor, ScalarPerVector>;

    struct Argument : public BaseArgument
    {
        Argument(const ADataType* p_a,
                 BDataType* p_b,
                 const std::vector<index_t>& shape,
                 const std::vector<index_t>& stride_a,
                 const std::vector<index_t>& stride_b,
                 ElementwiseFunctor functor)
            : p_a_(p_a), p_b_(p_b), shape_(shape), functor_(functor), blockSize_(256)
        // FIXME - Calculate the grid size by number of CU in the future
        {
            index_t tensor_size =
                std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>{});
            gridSize_       = GridwiseUEltwise::CalculateGridSize(tensor_size);
            a_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_a, gridSize_, blockSize_);
            b_grid_desc_m0_ = MakeDescriptor_M0(shape, stride_b, gridSize_, blockSize_);
        }

        const ADataType* p_a_;
        BDataType* p_b_;
        std::vector<int> shape_;
        GridDesc_M0 a_grid_desc_m0_;
        GridDesc_M0 b_grid_desc_m0_;
        ElementwiseFunctor functor_;
        index_t blockSize_;
        index_t gridSize_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto kernel = kernel_unary_elementwise_1d<GridwiseUEltwise, ADataType,
                                                            BDataType, GridDesc_M0,
                                                            ElementwiseFunctor>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(arg.gridSize_),
                                                        dim3(arg.blockSize_),
                                                        0,
                                                        arg.p_a_,
                                                        arg.p_b_,
                                                        arg.a_grid_desc_m0_,
                                                        arg.b_grid_desc_m0_,
                                                        arg.functor_);
            return elapsed_time;
        }

        // polymorphic
        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if(pArg == nullptr)
            return false;

        if(pArg->shape_.back() % ScalarPerVector != 0)
            return false;

        return true;
    };

    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                      void* p_b,
                                                      std::vector<index_t> shape,
                                                      std::vector<index_t> stride_a,
                                                      std::vector<index_t> stride_b,
                                                      ElementwiseFunctor functor)
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<BDataType*>(p_b),
                                          shape, stride_a, stride_b, functor);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceBinaryElementwise" << "<" << "ScalarPerVector = " << ScalarPerVector << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
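The padding in PadDescriptor_M0_1d rounds the flattened length up to a multiple of gridSize * blockSize * ScalarPerVector, so every grid-wide loop iteration covers full vectors. A small standalone illustration of that arithmetic; the values below are made up and integer_least_multiple is computed by hand (smallest multiple of loop_step that is >= m0):

    #include <cstdio>

    int main()
    {
        const int gridSize = 120, blockSize = 256, scalarPerVector = 8;
        const int loop_step = gridSize * blockSize * scalarPerVector; // 245760

        const int m0 = 1000000; // flattened tensor length
        const int least_multiple = ((m0 + loop_step - 1) / loop_step) * loop_step; // 1228800
        const int pad            = least_multiple - m0;                            // 228800

        std::printf("loop_step=%d padded_len=%d pad=%d\n", loop_step, least_multiple, pad);
    }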
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp (+20 -3)

-#ifndef GEMM_SPECIALIZATION
-#define GEMM_SPECIALIZATION
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

 namespace ck {
 namespace tensor_operation {
...
@@ -17,7 +19,22 @@ enum struct GemmSpecialization
     MNKPadding,
 };

+inline std::string getGemmSpecializationString(const GemmSpecialization& s)
+{
+    switch(s)
+    {
+    case GemmSpecialization::Default: return "Default";
+    case GemmSpecialization::MPadding: return "MPadding";
+    case GemmSpecialization::NPadding: return "NPadding";
+    case GemmSpecialization::KPadding: return "KPadding";
+    case GemmSpecialization::MNPadding: return "MNPadding";
+    case GemmSpecialization::MKPadding: return "MKPadding";
+    case GemmSpecialization::NKPadding: return "NKPadding";
+    case GemmSpecialization::MNKPadding: return "MNKPadding";
+    default: return "Unrecognized specialization!";
+    }
+}
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp (+111 -94)

-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_REDUCTION_OPERATOR_MAPPING_HPP
-#define CK_REDUCTION_OPERATOR_MAPPING_HPP
-
-#include "reduction_operator.hpp"
-#include "reduction_enums.hpp"
-#include "element_wise_operation.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+// FIXME: can it be replaced with ck::Tuple?
+#include <tuple>

 namespace ck {
...
@@ -37,77 +16,69 @@ namespace ck {
 // The boolean member "indexable" are also provided in reduce_binary_operactor for
 // easier checking by the upper-layer codes in the kernels.

-template <typename T, ReduceTensorOp Op>
+template <ReduceTensorOp Op>
 struct reduce_binary_operator;

-template <typename T>
-struct reduce_binary_operator<T, ReduceTensorOp::ADD>
+template <>
+struct reduce_binary_operator<ReduceTensorOp::ADD>
 {
-    using opType   = reduce::Add<T>;
-    using dataType = T;
+    using opType = reduce::Add;

     static constexpr bool indexable = false;
 };

 [The specializations for MUL, MIN, MAX, AMAX, AVG, NORM1 and NORM2 change the same way:
  the typename T parameter and the dataType alias are removed and opType drops its <T>
  argument (reduce::Mul, Min, Max, AMax, Add, Add, Add respectively); indexable keeps its
  value (true for MIN/MAX/AMAX, false for the others).]
...
@@ -115,55 +86,101 @@ struct reduce_binary_operator<T, ReduceTensorOp::NORM2>
 // The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
 // functor classes.
 // The two unary functors are called before and afer the Reduction is executed respectively

-template <typename T, ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
+template <ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
 struct reduce_unary_operator
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
+    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        (void)reduceLength;
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
+    };
 };

-template <typename T, bool IsFirstReduce>
-struct reduce_unary_operator<T, ReduceTensorOp::AVG, IsFirstReduce, true>
+template <bool IsFirstReduce>
+struct reduce_unary_operator<ReduceTensorOp::AVG, IsFirstReduce, true>
 {
-    using InElementwiseOperation  = tensor_operation::element_wise::UnaryIdentic<T, T>;
-    using AccElementwiseOperation = tensor_operation::element_wise::UnaryIdentic<T, T, true>;
+    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
+    using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide;
+
+    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
+    GetElementwiseOperator(int32_t reduceLength)
+    {
+        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength});
+    };
 };

 [The remaining specializations are reworked in the same pattern, each losing the typename T
  parameter, switching to the de-templated elementwise functors, and gaining a static
  GetElementwiseOperator(int32_t reduceLength) that returns the pair of functors
  (ignoring reduceLength where no division is needed):
    NORM1 <true, IsLastReduce>:  UnaryAbs    + PassThrough
    AMAX  <true, IsLastReduce>:  UnaryAbs    + PassThrough
    NORM2 <true, false>:         UnarySquare + PassThrough
    NORM2 <true, true>:          UnarySquare + UnarySqrt
    NORM2 <false, true>:         PassThrough + UnarySqrt]

-} // end of namespace ck
+} // namespace ck
-
-#endif
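A short sketch of how the reworked mapping is consumed: the reduction enum is translated into a binary accumulation functor plus pre/post unary functors, with AVG getting a divide-by-length as its post-op. The snippet assumes the surrounding CK headers are available; reduceLength is an arbitrary example value:

    using ck::ReduceTensorOp;

    // Binary accumulation functor for ADD after this change: no longer parameterized on the data type.
    using AddOp = typename ck::reduce_binary_operator<ReduceTensorOp::ADD>::opType;

    // Pre-/post-reduction elementwise ops for AVG when this is both the first and last reduction:
    // PassThrough on the way in, UnaryDivide{reduceLength} on the way out.
    using AvgMap = ck::reduce_unary_operator<ReduceTensorOp::AVG, true, true>;

    const int32_t reduceLength = 1024;
    auto [in_op, acc_op] = AvgMap::GetElementwiseOperator(reduceLength);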
include/ck/tensor_operation/gpu/device/tensor_layout.hpp (+3 -0)

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

 namespace ck {
...
include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp (+142 -83)

-/*******************************************************************************
- * MIT License
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
- * (full MIT permission and warranty text, identical to the header removed from
- *  reduction_operator_mapping.hpp above)
- *******************************************************************************/
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

-#include "data_type.hpp"
+#include "ck/utility/data_type.hpp"

 namespace ck {
 namespace tensor_operation {
-namespace binary_element_wise {
+namespace element_wise {

-template <typename Y, typename X1, typename X2>
-struct Add;
-
-template <>
-struct Add<double, double, double>
-{
-    __host__ __device__ constexpr void
-    operator()(double& dst, const double& src1, const double& src2) const
-    {
-        dst = src1 + src2;
-    }
-};
-
-[The old per-type class-template specializations Add<float, float, float>,
- Add<half_t, half_t, half_t> and Add<bhalf_t, bhalf_t, bhalf_t> (the last converting
- through float), plus the analogous Substract forward declaration and its double/float/
- half_t/bhalf_t specializations, are all removed in the same way.]
+struct Add
+{
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const { y = x0 + x1; }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const { y = x0 + x1; }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
+    { y = type_convert<half_t>(x0) + x1; };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const { y = x0 + x1; };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp + x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
+    }
+};
+
+struct Subtract
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const { y = x0 - x1; };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const { y = x0 - x1; };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const { y = x0 - x1; };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
+    {
+        const float x1_tmp = ck::type_convert<float>(x0);
+        const float x2_tmp = ck::type_convert<float>(x1);
+        const float y_tmp  = x1_tmp - x2_tmp;
+        y                  = ck::type_convert<bhalf_t>(y_tmp);
+    }
+};
+
+struct Bilinear
+{
+    Bilinear(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename Y, typename X0, typename X1>
+    __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float, float, float>(float& y, const float& x0, const float& x1) const
+    { y = alpha_ * x0 + beta_ * x1; }
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t, float, half_t>(half_t& y, const float& x0, const half_t& x1) const
+    { y = type_convert<half_t>(alpha_ * x0 + beta_ * ck::type_convert<float>(x1)); };
+
+    float alpha_;
+    float beta_;
+};
+
+struct AddRelu
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        const float a = x0 + x1;
+        y             = a > 0.0f ? a : 0.0f;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        const double a = x0 + x1;
+        y              = a > 0.0 ? a : 0.0;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        const half_t a = x0 + x1;
+        y              = a > type_convert<half_t>(0.0f) ? a : type_convert<half_t>(0.0f);
+    };
+};
+
+struct AddHardswish
+{
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<float>(float& y, const float& x0, const float& x1) const
+    {
+        float a = x0 + x1;
+        float b = a + float{3};
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
+        y       = c;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<double>(double& y, const double& x0, const double& x1) const
+    {
+        double a = x0 + x1;
+        double b = a + 3.0;
+        double c = (b > 0) * (b > 6.0 ? 6.0 : b) * a * 0.166667;
+        y        = c;
+    };
+
+    template <>
+    __host__ __device__ constexpr void
+    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
+    {
+        float a = x0 + x1;
+        float b = a + 3.0f;
+        float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f;
+        y       = c;
+    };
+};

-} // namespace binary_element_wise
+} // namespace element_wise
 } // namespace tensor_operation
 } // namespace ck
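The new functors are plain callables, so the same code paths run on host and device. A quick host-side sketch of the Add and Bilinear ops, assuming these headers are compiled with a HIP-capable compiler so the __host__ __device__ markers resolve; the numeric values are illustrative:

    using namespace ck::tensor_operation::element_wise;

    float y = 0.f;
    Add{}(y, 1.5f, 2.5f);                     // picks the <float> specialization, y = 4.0f

    Bilinear bilinear{/*alpha=*/2.0f, /*beta=*/0.5f};
    float z = 0.f;
    bilinear(z, 3.0f, 4.0f);                  // z = 2*3 + 0.5*4 = 8.0f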
include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
View file @
5d015452
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#pragma once
#include "data_type.hpp"
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
namespace
ck
{
namespace
ck
{
namespace
tensor_operation
{
namespace
tensor_operation
{
namespace
element_wise
{
namespace
element_wise
{
struct
PassThrough
// Need to ensure compiler will fail if there is no matching candidate, instead of compiler
{
// siliently do implicit type conversion
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
;
}
//
// Method 1:
__host__
__device__
void
operator
()(
half_t
&
y
,
const
half_t
&
x
)
const
{
y
=
x
;
}
//
// struct ExampleElementwiseOp
__host__
__device__
void
operator
()(
bhalf_t
&
y
,
const
bhalf_t
&
x
)
const
{
y
=
x
;
}
// {
// template<typename Y, typename X>
__host__
__device__
void
operator
()(
int32_t
&
y
,
const
int32_t
&
x
)
const
{
y
=
x
;
}
// __host__ __device__ constexpr void
// operator()(Y&, const X) const;
__host__
__device__
void
operator
()(
int8_t
&
y
,
const
int8_t
&
x
)
const
{
y
=
x
;
}
//
// template<>
__host__
__device__
void
operator
()(
double
&
y
,
const
double
&
x
)
const
{
y
=
x
;
}
// __host__ __device__ constexpr void
};
// operator()<half_t, half_t>(half_t& y, const half_t& x) const
// {
struct
Add
// }
{
// };
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
//
{
// Method 2:
y
=
x0
+
x1
;
//
}
// template <typename Y, typename X>
// struct ExampleElementwiseOp;
__host__
__device__
constexpr
void
//
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
// template <>
{
// struct ExampleElementwiseOp<float, ck::bhalf_t>
// FIXME - Use float (acc type) bias in the future.
// {
y
=
x0
+
x1
;
// __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
}
// {
};
// }
// };
struct
AlphaBetaAdd
{
AlphaBetaAdd
(
float
alpha
,
float
beta
)
:
alpha_
(
alpha
),
beta_
(
beta
)
{}
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
y
=
alpha_
*
x0
+
beta_
*
x1
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
// FIXME - Let x0 be acc type
y
=
static_cast
<
half_t
>
(
alpha_
*
static_cast
<
float
>
(
x0
)
+
beta_
*
static_cast
<
float
>
(
x1
));
}
float
alpha_
;
float
beta_
;
};
struct
AddRelu
{
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
const
float
a
=
x0
+
x1
;
y
=
a
>
0
?
a
:
0
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
const
half_t
a
=
x0
+
x1
;
y
=
a
>
0
?
a
:
0
;
}
};
struct
AddHardswish
{
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
y
=
c
;
}
__host__
__device__
constexpr
void
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
y
=
c
;
}
};
struct
AddReluAdd
struct
AddReluAdd
{
{
__host__
__device__
constexpr
void
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
half_t
a
=
x0
+
x1
;
half_t
a
=
x0
+
x1
;
half_t
b
=
a
>
0
?
a
:
0
;
half_t
b
=
a
>
0
?
a
:
0
;
y
=
b
+
x2
;
y
=
b
+
x2
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
b
=
a
>
0
?
a
:
0
;
...
@@ -110,8 +69,9 @@ struct AddReluAdd
...
@@ -110,8 +69,9 @@ struct AddReluAdd
y
=
c
;
y
=
c
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
,
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
b
=
a
>
0
?
a
:
0
;
...
@@ -122,8 +82,14 @@ struct AddReluAdd
...
@@ -122,8 +82,14 @@ struct AddReluAdd
struct
AddHardswishAdd
struct
AddHardswishAdd
{
{
__host__
__device__
constexpr
void
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
operator
()(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
b
=
a
+
float
{
3
};
...
@@ -132,8 +98,9 @@ struct AddHardswishAdd
...
@@ -132,8 +98,9 @@ struct AddHardswishAdd
y
=
d
;
y
=
d
;
}
}
__host__
__device__
constexpr
void
template
<
>
operator
()(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
{
float
a
=
x0
+
x1
;
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
b
=
a
+
float
{
3
};
...
@@ -143,208 +110,95 @@ struct AddHardswishAdd
...
@@ -143,208 +110,95 @@ struct AddHardswishAdd
}
}
};
};
struct
Normalize
// C = A * B
{
// E = FastGelu(C + D0 + D1)
Normalize
(
float
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
struct
AddAddFastGelu
__host__
__device__
constexpr
void
operator
()(
float
&
y
,
const
float
&
x
,
const
float
&
mean
,
const
float
&
mean_square
,
const
float
&
gamma
,
const
float
&
beta
)
const
{
float
variance
=
mean_square
-
(
mean
*
mean
);
y
=
((
x
-
mean
)
/
sqrtf
(
variance
+
epsilon_
))
*
gamma
+
beta
;
}
float
epsilon_
;
};
// Unary operators are usually called element-wisely before/after the reduction is executed on the
// elements. They are needed for easy implementation of reduction types of AVG, NRM1, NRM2
template
<
typename
Y
,
typename
X
,
bool
HasDividing
=
false
>
struct
UnaryIdentic
;
template
<
>
struct
UnaryIdentic
<
float
,
float
,
false
>
{
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
template
<
typename
E
,
typename
C
,
typename
D0
,
typename
D1
>
__host__
__device__
void
operator
()(
E
&
,
const
C
&
,
const
D0
&
,
const
D1
&
)
const
;
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
y
=
x
;
};
};
template
<
>
template
<
>
struct
UnaryIdentic
<
float
,
float
,
true
>
__host__
__device__
void
operator
()
<
half_t
,
float
,
half_t
,
half_t
>
(
half_t
&
e
,
{
const
float
&
c
,
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
divider_
=
divider
;
};
const
half_t
&
d0
,
const
half_t
&
d1
)
const
__host__
__device__
void
operator
()(
float
&
y
,
const
float
&
x
)
const
{
{
y
=
x
/
type_convert
<
float
>
(
divider_
);
// Fast GeLU
};
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
const
auto
fast_gelu
=
[
&
](
float
x
)
{
const
float
u
=
float
(
2
)
*
x
*
(
float
(
0.035677
)
*
x
*
x
+
float
(
0.797885
));
const
float
emu
=
exp
(
-
u
);
const
float
cdf
=
float
(
0.5
)
+
float
(
0.5
)
*
(
float
(
2
)
/
(
float
(
1
)
+
emu
)
-
float
(
1
));
return
x
*
cdf
;
};
int32_t
divider_
=
1
;
const
float
y
=
fast_gelu
(
c
+
float
(
d0
)
+
float
(
d1
));
};
template
<
>
e
=
type_convert
<
half_t
>
(
y
);
struct
UnaryIdentic
<
half_t
,
half_t
,
false
>
}
{
__host__
__device__
UnaryIdentic
(
const
int32_t
divider
=
1
)
{
(
void
)
divider
;
};
__host__
__device__
void
operator
()(
half_t
&
y
,
const
half_t
&
x
)
const
{
y
=
x
;
};
};
};
-template <>
-struct UnaryIdentic<double, double, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<double, double, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const
-    {
-        y = x / type_convert<double>(divider_);
-    };
-
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnaryIdentic<int32_t, int32_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x; };
-};
-
-template <>
-struct UnaryIdentic<int32_t, int32_t, true>
-{
-    __host__ __device__ UnaryIdentic(const int32_t divider = 1) { divider_ = divider; };
-
-    __host__ __device__ void operator()(int32_t& y, const int32_t& x) const { y = x / divider_; };
-
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnaryIdentic<int8_t, int8_t, false>
-{
-    __host__ __device__ UnaryIdentic(const int8_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const { y = x; };
-};
-
-template <typename Y, typename X, bool HasDividing = false>
-struct UnarySquare;
-
-template <>
-struct UnarySquare<float, float, false>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(float& y, const float& x) const { y = x * x; };
-};
-
-template <>
-struct UnarySquare<float, float, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-
-    __host__ __device__ void operator()(float& y, const float& x) const
-    {
-        y = x * x / type_convert<float>(divider_);
-    };
-
-    int32_t divider_ = 1;
-};
-
-template <>
-struct UnarySquare<double, double, false>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const { y = x * x; };
-};
-
-template <>
-struct UnarySquare<double, double, true>
-{
-    __host__ __device__ UnarySquare(const int32_t divider = 1) { divider_ = divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const
-    {
-        y = x * x / type_convert<double>(divider_);
-    };
-
-    int32_t divider_ = 1;
-};
+struct Normalize
+{
+    // FIXME: is double absolutely necessary?
+    Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {}
+
+    template <typename T>
+    __host__ __device__ constexpr void operator()(T& y,
+                                                  const T& x,
+                                                  const T& mean,
+                                                  const T& mean_square,
+                                                  const T& gamma,
+                                                  const T& beta) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float>(float& y,
+                                                         const float& x,
+                                                         const float& mean,
+                                                         const float& mean_square,
+                                                         const float& gamma,
+                                                         const float& beta) const
+    {
+        using ck::math::sqrt;
+
+        float variance = mean_square - (mean * mean);
+        y = ((x - mean) / sqrt(variance + type_convert<float>(epsilon_))) * gamma + beta;
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<double>(double& y,
+                                                          const double& x,
+                                                          const double& mean,
+                                                          const double& mean_square,
+                                                          const double& gamma,
+                                                          const double& beta) const
+    {
+        using ck::math::sqrt;
+
+        double variance = mean_square - (mean * mean);
+        y = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta;
+    };
+
+    // FIXME: is double absolutely necessary?
+    double epsilon_;
+};
-template <typename Y, typename X>
-struct UnaryAbs;
-
-template <>
-struct UnaryAbs<float, float>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(float& y, const float& x) const { y = abs(x); };
-};
-
-template <>
-struct UnaryAbs<half_t, half_t>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(half_t& y, const half_t& x) const { y = __habs(x); };
-};
-
-template <>
-struct UnaryAbs<double, double>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const { y = abs(x); };
-};
-
-template <>
-struct UnaryAbs<int8_t, int8_t>
-{
-    __host__ __device__ UnaryAbs(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(int8_t& y, const int8_t& x) const
-    {
-        int8_t sgn = x >> (8 - 1);
-        y          = (x ^ sgn) - sgn;
-    };
-};
-
-template <typename Y, typename X>
-struct UnarySqrt;
-
-template <>
-struct UnarySqrt<float, float>
-{
-    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(float& y, const float& x) const { y = sqrtf(x); };
-};
-
-template <>
-struct UnarySqrt<double, double>
-{
-    __host__ __device__ UnarySqrt(const int32_t divider = 1) { (void)divider; };
-
-    __host__ __device__ void operator()(double& y, const double& x) const { y = sqrt(x); };
-};
+template <typename Y, typename X>
+struct UnaryTypeConvert;
+
+template <>
+struct UnaryTypeConvert<float, ck::bhalf_t>
+{
+    __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
+    {
+        y = ck::type_convert<float, ck::bhalf_t>(x);
+    }
+};
+
+template <>
+struct UnaryTypeConvert<ck::bhalf_t, float>
+{
+    __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const
+    {
+        y = ck::type_convert<ck::bhalf_t, float>(x);
+    }
+};

 } // namespace element_wise
...
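For orientation, here is a minimal host-side sketch (not part of this commit) of how the Normalize functor added above is meant to be applied per element: it folds the two reduction results, mean and mean of squares, into a layer-norm style update. The numeric values are invented, and the snippet assumes it is compiled with hipcc with this element-wise header on the include path.

// Illustrative only: exercise element_wise::Normalize on scalar inputs (values are made up).
#include <cstdio>
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

int main()
{
    ck::tensor_operation::element_wise::Normalize normalize{1e-4};

    const float x = 2.0f, mean = 1.0f, mean_square = 1.5f; // E[x] and E[x^2] of some window
    const float gamma = 1.0f, beta = 0.0f;
    float y = 0.0f;

    // y = (x - mean) / sqrt(mean_square - mean*mean + epsilon) * gamma + beta
    normalize(y, x, mean, mean_square, gamma, beta);

    std::printf("normalized x = %f\n", y); // ~1.414 for these inputs
}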
include/ck/tensor_operation/gpu/element/element_wise_reduce_operation.hpp
deleted 100644 → 0

-#pragma once
-
-#include "data_type.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace element_wise {
-
-} // namespace element_wise
-} // namespace tensor_operation
-} // namespace ck
include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"

namespace ck {
namespace tensor_operation {
namespace element_wise {

struct PassThrough
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, half_t>::value || is_same<T, bhalf_t>::value ||
                          is_same<T, int32_t>::value || is_same<T, int8_t>::value,
                      "Data type is not supported by this operation!");

        y = x;
    };
};

struct Scale
{
    __host__ __device__ Scale(float scale) : scale_(scale) {}

    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        y = scale_ * x;
    };

    float scale_;
};

struct UnaryDivide
{
    __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider) {}

    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

        y = x / type_convert<T>(divider_);
    };

    int32_t divider_ = 1;
};

struct UnarySquare
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        y = x * x;
    };
};

struct UnaryAbs
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
                          is_same<T, int8_t>::value,
                      "Data type is not supported by this operation!");

        y = ck::math::abs(x);
    };
};

struct UnarySqrt
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        y = ck::math::sqrt(x);
    };
};

struct Relu
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
                          is_same<T, int8_t>::value,
                      "Data type is not supported by this operation!");
        y = x > 0 ? x : 0;
    }

    template <>
    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
    {
        float x_f32 = ck::type_convert<float>(x);
        float y_f32 = x_f32 > 0 ? x_f32 : 0;
        y           = ck::type_convert<bhalf_t>(y_f32);
    }
};

// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
struct FastGelu
{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
        const float emu = exp(-u);
        const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));

        y = x * cdf;
    }
};

} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
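As a quick stand-alone sanity check (not CK code) that the exp() form used by FastGelu above matches the tanh form quoted in its comment: 2/(1+e^-u) - 1 equals tanh(u/2), and u/2 = 0.797885*(x + 0.044715*x^3) with the constants from the header. Everything below apart from those constants is illustrative.

// Compare the exp()-based FastGelu expression with the tanh-based GELU approximation.
#include <cmath>
#include <cstdio>

float fast_gelu_exp_form(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
    return x * cdf;
}

float gelu_tanh_form(float x)
{
    // y = 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
    return 0.5f * x * (1.f + std::tanh(0.797885f * (x + 0.044715f * x * x * x)));
}

int main()
{
    const float xs[] = {-2.f, -0.5f, 0.f, 0.5f, 2.f};
    for(float x : xs)
        std::printf("x=%+.2f  exp-form=%+.6f  tanh-form=%+.6f\n",
                    x, fast_gelu_exp_form(x), gelu_tanh_form(x));
}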
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp

-#ifndef UTILITY_BLOCK_TO_CTILE_MAP
-#define UTILITY_BLOCK_TO_CTILE_MAP
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once

-#include "utility/math.hpp"
-#include "utility/number.hpp"
-#include "tensor_description/tensor_adaptor.hpp"
-#include "tensor_description/multi_index_transform_helper.hpp"
+#include "ck/utility/math.hpp"
+#include "ck/utility/number.hpp"
+#include "ck/tensor_description/tensor_adaptor.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"

 namespace ck
 {

@@ -485,5 +487,3 @@ __host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx,
 }
 } // namespace ck
-
-#endif // UTILITY_BLOCK_TO_CTILE_MAP
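The hunk above touches DefaultValidCTileIndex; as a rough stand-alone illustration of the concept (deliberately simplified, and not the M01-swizzled mapping CK actually implements in this header), a block-to-C-tile map turns a 1D block id into an (m0, n0) output-tile coordinate and rejects ids that fall outside the tile grid:

// Simplified illustration only: linear block id -> (m0, n0) C tile, row-major over the grid.
#include <cstdio>

struct NaiveBlockToCTileMap
{
    int m_blocks, n_blocks;

    bool valid(int block_id) const { return block_id < m_blocks * n_blocks; }

    void map(int block_id, int& m0, int& n0) const
    {
        m0 = block_id / n_blocks;
        n0 = block_id % n_blocks;
    }
};

int main()
{
    // e.g. M=512, N=768 with 256x128 output tiles -> a 2x6 tile grid
    NaiveBlockToCTileMap tile_map{2, 6};
    const int ids[] = {0, 7, 12};
    for(int bid : ids)
    {
        if(!tile_map.valid(bid))
        {
            std::printf("block %d: outside the tile grid\n", bid);
            continue;
        }
        int m0, n0;
        tile_map.map(bid, m0, n0);
        std::printf("block %d -> tile (%d, %d)\n", bid, m0, n0);
    }
}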
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp

-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-#define CK_GRIDWISE_2D_REDUCTION_MULTIBLOCK_HPP
-
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_blockwise.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 namespace ck
 {

@@ -171,15 +147,15 @@ struct GridwiseReduction_mk_to_m_multiblock
                                AccDataType beta,
                                OutDataType* const __restrict__ p_out_value_global)
     {
-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

         // LDS
         __shared__ AccDataType p_reduce_work_buffer[BlockSize];

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());

         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -191,7 +167,7 @@ struct GridwiseReduction_mk_to_m_multiblock
         StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

         const index_t thread_local_id = get_thread_local_1d_id();
         const index_t block_global_id = get_block_1d_id();

@@ -358,12 +334,12 @@ struct GridwiseReduction_mk_to_m_multiblock
         __shared__ AccDataType p_reduce_work_val_buffer[BlockSize];
         __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize];

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

         auto out_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(

@@ -418,7 +394,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                        thread_k_cluster_id * KThreadSliceSize));

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });

@@ -459,7 +435,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                                                    in_thread_idx_buf);

             static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
-                AccDataType tmpValue   = zeroVal;
+                AccDataType tmpValue   = identityVal;
                 IndexDataType tmpIndex = 0;

                 static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {

@@ -512,7 +488,7 @@ struct GridwiseReduction_mk_to_m_multiblock
                     in_thread_val_buf(Number<offset>{}));
             });

-            AccDataType tmpValue   = zeroVal;
+            AccDataType tmpValue   = identityVal;
             IndexDataType tmpIndex = 0;

             static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {

@@ -635,4 +611,3 @@ struct GridwiseReduction_mk_to_m_multiblock
 };

 } // namespace ck
-#endif
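A stand-alone sketch (not CK code) of what the zeroVal to identityVal change above is about: each reduction seeds its accumulator, and pads out-of-bounds reads, with the identity element of its own operation (0 for add, -inf for max, and so on), so padded lanes can never change the result. GetIdentityValue<T> supplies that per-operation, per-type value.

// Illustrative only: pad a reduction with the operation's identity element.
#include <algorithm>
#include <cstdio>
#include <limits>

template <typename Op>
float reduce_with_identity(const float* p, int n, int padded_n, Op op, float identity)
{
    float acc = identity;
    for(int i = 0; i < padded_n; ++i)
        acc = op(acc, i < n ? p[i] : identity); // out-of-bounds lanes read the identity value
    return acc;
}

int main()
{
    float v[3] = {-1.f, 4.f, 2.f};

    float sum = reduce_with_identity(v, 3, 8, [](float a, float b) { return a + b; }, 0.f);
    float mx  = reduce_with_identity(
        v, 3, 8, [](float a, float b) { return std::max(a, b); },
        -std::numeric_limits<float>::infinity());

    std::printf("sum=%f max=%f\n", sum, mx); // 5 and 4, unaffected by the padding
}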
include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp

-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2021 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-#define CK_GRIDWISE_2D_REDUCTION_THREADWISE_HPP
-
-#include "data_type.hpp"
-#include "reduction_common.hpp"
-#include "reduction_operator.hpp"
-#include "reduction_functions_accumulate.hpp"
-#include "reduction_functions_threadwise.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "element_wise_operation.hpp"
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

 namespace ck
 {

@@ -135,12 +112,12 @@ struct GridwiseReduction_mk_to_m_threadwise
                               ReduceOperation,
                               PropagateNan>;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
         auto dst_global_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_out_value_global, out_grid_desc_m.GetElementSpaceSize());

@@ -149,7 +126,7 @@ struct GridwiseReduction_mk_to_m_threadwise
         StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true> accu_value_buf;

-        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = zeroVal; });
+        static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; });

         const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

@@ -276,12 +253,12 @@ struct GridwiseReduction_mk_to_m_threadwise
         (void)acc_elementwise_op;

-        const auto zeroVal = ReduceOperation::GetReductionZeroVal();
+        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

         const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_value_global,
             in_grid_desc_m_k.GetElementSpaceSize(),
-            type_convert<InDataType>(zeroVal));
+            ReduceOperation::template GetIdentityValue<InDataType>());
         const auto in_global_idx_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize());

@@ -303,7 +280,7 @@ struct GridwiseReduction_mk_to_m_threadwise
         StaticBuffer<AddressSpaceEnum::Vgpr, IndexDataType, MThreadSliceSize, true> accu_index_buf;

         static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
-            accu_value_buf(I) = zeroVal;
+            accu_value_buf(I) = identityVal;
             accu_index_buf(I) = 0;
         });

@@ -495,4 +472,3 @@ struct GridwiseReduction_mk_to_m_threadwise
 };

 } // namespace ck
-#endif
include/ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

-#include "cluster_descriptor.hpp"
-#include "data_type.hpp"
-#include "element_wise_operation.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

 namespace ck
 {
...
include/ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once

-#include "cluster_descriptor.hpp"
-#include "data_type.hpp"
-#include "element_wise_operation.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_description/cluster_descriptor.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"

 namespace ck
 {
...
include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp

+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
 #ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
 #define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp
0 → 100644

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck
{

template <typename GridwiseGemm,
          typename FloatAB, typename FloatC, typename FloatC0, typename FloatC1,
          typename ReducePtrsGlobal,
          typename AElementwiseOperation, typename BElementwiseOperation,
          typename CElementwiseOperation, typename C1ElementwiseOperation,
          typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations,
          typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1,
          typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename ReduceGridDescriptor_MBlock_MPerBlock,
          typename Block2CTileMap,
          bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
        kernel_gemm_bias_add_reduce_xdl_cshuffle_v1(
            const FloatAB* __restrict__ p_a_grid,
            const FloatAB* __restrict__ p_b_grid,
            FloatC* __restrict__ p_c_grid,
            const FloatC0* __restrict__ p_bias_grid,
            const FloatC1* __restrict__ p_d0_grid,
            ReducePtrsGlobal p_reduces_grid,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const CElementwiseOperation c_element_op,
            const C1ElementwiseOperation c1_element_op,
            const ReduceInElementwiseOperations reduce_in_element_ops,
            const ReduceAccElementwiseOperations reduce_out_element_ops,
            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
            const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
                c_grid_desc_mblock_mperblock_nblock_nperblock,
            const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
                c0_grid_desc_mblock_mperblock_nblock_nperblock,
            const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
                c1_grid_desc_mblock_mperblock_nblock_nperblock,
            const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
            const Block2CTileMap block_2_ctile_map)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    GridwiseGemm::template Run<HasMainKBlockLoop>(
        p_a_grid, p_b_grid, p_c_grid, p_bias_grid, p_d0_grid, p_reduces_grid, p_shared,
        a_element_op, b_element_op, c_element_op, c1_element_op,
        reduce_in_element_ops, reduce_out_element_ops,
        a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1,
        c_grid_desc_mblock_mperblock_nblock_nperblock,
        c0_grid_desc_mblock_mperblock_nblock_nperblock,
        c1_grid_desc_mblock_mperblock_nblock_nperblock,
        reduce_grid_desc_mblock_mperblock,
        block_2_ctile_map);
#else
    ignore = p_a_grid;
    ignore = p_b_grid;
    ignore = p_c_grid;
    ignore = p_bias_grid;
    ignore = p_d0_grid;
    ignore = p_reduces_grid;
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = c_element_op;
    ignore = c1_element_op;
    ignore = reduce_in_element_ops;
    ignore = reduce_out_element_ops;
    ignore = a_grid_desc_ak0_m_ak1;
    ignore = b_grid_desc_bk0_n_bk1;
    ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
    ignore = c0_grid_desc_mblock_mperblock_nblock_nperblock;
    ignore = c1_grid_desc_mblock_mperblock_nblock_nperblock;
    ignore = reduce_grid_desc_mblock_mperblock;
    ignore = block_2_ctile_map;
#endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
}
template
<
typename
FloatAB
,
typename
FloatGemmAcc
,
typename
FloatCShuffle
,
typename
FloatC
,
typename
FloatC0
,
typename
FloatC1
,
typename
FloatReduceAcc
,
typename
ReducePtrsGlobal
,
typename
AElementwiseOperation
,
typename
BElementwiseOperation
,
typename
CElementwiseOperation
,
typename
C1ElementwiseOperation
,
typename
ReduceOperations
,
typename
ReduceInElementwiseOperations
,
typename
ReduceAccElementwiseOperations
,
InMemoryDataOperationEnum
CGlobalMemoryDataOperation
,
typename
ReduceGlobalMemoryDataOperation
,
typename
AGridDesc_AK0_M_AK1
,
typename
BGridDesc_BK0_N_BK1
,
typename
CGridDesc_M_N
,
typename
C0GridDesc_M_N
,
typename
C1GridDesc_M_N
,
typename
ReduceGridDesc_M
,
index_t
NumGemmKPrefetchStage
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
KPerBlock
,
index_t
AK1Value
,
index_t
BK1Value
,
index_t
MPerXdl
,
index_t
NPerXdl
,
index_t
MXdlPerWave
,
index_t
NXdlPerWave
,
typename
ABlockTransferThreadClusterLengths_AK0_M_AK1
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
index_t
ABlockTransferSrcVectorDim
,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_AK1
,
bool
AThreadTransferSrcResetCoordinateAfterRun
,
index_t
ABlockLdsExtraM
,
typename
BBlockTransferThreadClusterLengths_BK0_N_BK1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
typename
BBlockTransferSrcAccessOrder
,
index_t
BBlockTransferSrcVectorDim
,
index_t
BBlockTransferSrcScalarPerVector
,
index_t
BBlockTransferDstScalarPerVector_BK1
,
bool
BThreadTransferSrcResetCoordinateAfterRun
,
index_t
BBlockLdsExtraN
,
index_t
CShuffleMXdlPerWavePerShuffle
,
index_t
CShuffleNXdlPerWavePerShuffle
,
typename
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
index_t
CShuffleBlockTransferScalarPerVector_NPerBlock
,
typename
CReduceThreadClusterLengths_MPerBlock_NPerBlock
,
index_t
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
index_t
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
,
LoopScheduler
LoopSched
>
struct
GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
I4
=
Number
<
4
>
{};
static
constexpr
auto
I5
=
Number
<
5
>
{};
static
constexpr
auto
I6
=
Number
<
6
>
{};
static
constexpr
auto
I7
=
Number
<
7
>
{};
// K1 should be Number<...>
static
constexpr
auto
AK0
=
Number
<
KPerBlock
/
AK1Value
>
{};
static
constexpr
auto
BK0
=
Number
<
KPerBlock
/
BK1Value
>
{};
static
constexpr
auto
AK1
=
Number
<
AK1Value
>
{};
static
constexpr
auto
BK1
=
Number
<
BK1Value
>
{};
using
ThisThreadBlock
=
ThisThreadBlock
<
BlockSize
>
;
using
GridwiseGemmPipe
=
GridwiseGemmPipeline_v1
<
NumGemmKPrefetchStage
>
;
__host__
__device__
static
constexpr
auto
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
()
{
// A matrix in LDS memory, dst of blockwise copy
return
make_naive_tensor_descriptor
(
make_tuple
(
AK0
,
Number
<
MPerBlock
>
{},
AK1
),
make_tuple
(
Number
<
MPerBlock
+
ABlockLdsExtraM
>
{}
*
AK1
,
AK1
,
I1
));
}
__host__
__device__
static
constexpr
auto
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1
()
{
// B matrix in LDS memory, dst of blockwise copy
return
make_naive_tensor_descriptor
(
make_tuple
(
BK0
,
Number
<
NPerBlock
>
{},
BK1
),
make_tuple
(
Number
<
NPerBlock
+
BBlockLdsExtraN
>
{}
*
BK1
,
BK1
,
I1
));
}
__host__
__device__
static
constexpr
auto
GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
()
{
constexpr
index_t
MWave
=
MPerBlock
/
(
MXdlPerWave
*
MPerXdl
);
constexpr
index_t
NWave
=
NPerBlock
/
(
NXdlPerWave
*
NPerXdl
);
constexpr
auto
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
CShuffleMXdlPerWavePerShuffle
*
MWave
*
MPerXdl
>
{},
I1
,
Number
<
CShuffleNXdlPerWavePerShuffle
*
NWave
*
NPerXdl
>
{}));
return
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
;
}
__host__
__device__
static
constexpr
index_t
GetSharedMemoryNumberOfByte
()
{
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_desc_ak0_m_ak1
=
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
();
constexpr
auto
b_block_desc_bk0_n_bk1
=
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1
();
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
AK1
,
BK1
);
constexpr
auto
a_block_space_size_aligned
=
math
::
integer_least_multiple
(
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
(),
max_lds_align
);
constexpr
auto
b_block_space_size_aligned
=
math
::
integer_least_multiple
(
b_block_desc_bk0_n_bk1
.
GetElementSpaceSize
(),
max_lds_align
);
// LDS allocation for C shuffle in LDS
constexpr
auto
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
=
GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
();
constexpr
auto
c_block_size
=
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
();
return
math
::
max
((
a_block_space_size_aligned
+
b_block_space_size_aligned
)
*
sizeof
(
FloatAB
),
c_block_size
*
sizeof
(
FloatCShuffle
));
}
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
template
<
typename
Block2CTileMap
>
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AGridDesc_AK0_M_AK1
&
a_grid_desc_ak0_m_ak1
,
const
BGridDesc_BK0_N_BK1
&
b_grid_desc_bk0_n_bk1
,
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
const
Block2CTileMap
&
block_2_ctile_map
)
{
// static_assert(is_known_at_compile_time<remove_cv_t<decltype(AK1)>>::value &&
// is_known_at_compile_time<remove_cv_t<decltype(BK1)>>::value,
// "wrong! K1 need to be known at compile-time");
static_assert
((
MPerBlock
%
(
MPerXdl
*
MXdlPerWave
)
==
0
)
&&
(
NPerBlock
%
(
NXdlPerWave
*
NPerXdl
))
==
0
,
"Invalid tuning param!"
);
const
auto
M
=
a_grid_desc_ak0_m_ak1
.
GetLength
(
I1
);
const
auto
N
=
b_grid_desc_bk0_n_bk1
.
GetLength
(
I1
);
const
auto
K
=
a_grid_desc_ak0_m_ak1
.
GetLength
(
I0
)
*
a_grid_desc_ak0_m_ak1
.
GetLength
(
I2
);
if
(
!
(
M
==
c_grid_desc_m_n
.
GetLength
(
I0
)
&&
N
==
c_grid_desc_m_n
.
GetLength
(
I1
)))
return
false
;
if
(
!
(
M
%
MPerBlock
==
0
&&
N
%
NPerBlock
==
0
&&
K
%
KPerBlock
==
0
))
return
false
;
// check gridwise gemm pipeline
const
auto
num_k_loop
=
K
/
KPerBlock
;
if
(
!
GridwiseGemmPipe
::
IsSupported
(
num_k_loop
))
{
return
false
;
}
if
(
!
block_2_ctile_map
.
CheckValidity
(
c_grid_desc_m_n
))
{
return
false
;
}
// TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
return
true
;
}
__host__
__device__
static
constexpr
bool
CalculateHasMainKBlockLoop
(
index_t
K
)
{
const
index_t
num_loop
=
K
/
KPerBlock
;
return
GridwiseGemmPipe
::
CalculateHasMainLoop
(
num_loop
);
}
template
<
typename
CGridDesc_M_N_
>
__host__
__device__
static
constexpr
auto
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
const
CGridDesc_M_N_
&
c_grid_desc_m_n
)
{
const
auto
M
=
c_grid_desc_m_n
.
GetLength
(
I0
);
const
auto
N
=
c_grid_desc_m_n
.
GetLength
(
I1
);
const
auto
MBlock
=
M
/
MPerBlock
;
const
auto
NBlock
=
N
/
NPerBlock
;
const
auto
c_grid_desc_mblock_mperblock_nblock_nperblock
=
transform_tensor_descriptor
(
c_grid_desc_m_n
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
MBlock
,
Number
<
MPerBlock
>
{})),
make_unmerge_transform
(
make_tuple
(
NBlock
,
Number
<
NPerBlock
>
{}))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{},
Sequence
<
2
,
3
>
{}));
return
c_grid_desc_mblock_mperblock_nblock_nperblock
;
}
__host__
__device__
static
constexpr
auto
MakeReduceGridDescriptor_MBlock_MPerBlock
(
const
ReduceGridDesc_M
&
d_grid_desc_m
)
{
const
auto
M
=
d_grid_desc_m
.
GetLength
(
I0
);
const
auto
MBlock
=
M
/
MPerBlock
;
const
auto
reduce_grid_desc_mblock_mperblock
=
transform_tensor_descriptor
(
d_grid_desc_m
,
make_tuple
(
make_unmerge_transform
(
make_tuple
(
MBlock
,
Number
<
MPerBlock
>
{}))),
make_tuple
(
Sequence
<
0
>
{}),
make_tuple
(
Sequence
<
0
,
1
>
{}));
return
reduce_grid_desc_mblock_mperblock
;
}
// return block_id to C matrix tile idx (m0, n0) mapping
__host__
__device__
static
constexpr
auto
MakeDefaultBlock2CTileMap
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
{
return
BlockToCTileMap_M00_N0_M01Adapt
<
MPerBlock
,
NPerBlock
,
CGridDesc_M_N
>
(
c_grid_desc_m_n
);
}
using
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
CGridDesc_M_N
{}))
>
;
using
C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
C0GridDesc_M_N
{}))
>
;
using
C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
=
remove_cvref_t
<
decltype
(
MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
(
C1GridDesc_M_N
{}))
>
;
using
ReduceGridDescriptor_MBlock_MPerBlock
=
remove_cvref_t
<
decltype
(
MakeReduceGridDescriptor_MBlock_MPerBlock
(
ReduceGridDesc_M
{}))
>
;
using
DefaultBlock2CTileMap
=
remove_cvref_t
<
decltype
(
MakeDefaultBlock2CTileMap
(
CGridDesc_M_N
{}))
>
;
template
<
bool
HasMainKBlockLoop
,
typename
Block2CTileMap
>
__device__
static
void
Run
(
const
FloatAB
*
__restrict__
p_a_grid
,
const
FloatAB
*
__restrict__
p_b_grid
,
FloatC
*
__restrict__
p_c_grid
,
const
FloatC0
*
__restrict__
p_bias_grid
,
const
FloatC1
*
__restrict__
p_d0_grid
,
ReducePtrsGlobal
p_reduces_grid
,
void
*
__restrict__
p_shared
,
const
AElementwiseOperation
&
a_element_op
,
const
BElementwiseOperation
&
b_element_op
,
const
CElementwiseOperation
&
c_element_op
,
const
C1ElementwiseOperation
&
c1_element_op
,
const
ReduceInElementwiseOperations
&
reduce_in_element_ops
,
const
ReduceAccElementwiseOperations
&
reduce_out_element_ops
,
const
AGridDesc_AK0_M_AK1
&
a_grid_desc_ak0_m_ak1
,
const
BGridDesc_BK0_N_BK1
&
b_grid_desc_bk0_n_bk1
,
const
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
&
c_grid_desc_mblock_mperblock_nblock_nperblock
,
const
C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
&
c0_grid_desc_mblock_mperblock_nblock_nperblock
,
const
C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
&
c1_grid_desc_mblock_mperblock_nblock_nperblock
,
const
ReduceGridDescriptor_MBlock_MPerBlock
&
reduce_grid_desc_mblock_mperblock
,
const
Block2CTileMap
&
block_2_ctile_map
)
{
const
auto
a_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_a_grid
,
a_grid_desc_ak0_m_ak1
.
GetElementSpaceSize
());
const
auto
b_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_b_grid
,
b_grid_desc_bk0_n_bk1
.
GetElementSpaceSize
());
auto
c_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_c_grid
,
c_grid_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
());
auto
c0_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_bias_grid
,
c0_grid_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
());
auto
c1_grid_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_d0_grid
,
c1_grid_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
());
// divide block work by [M, N]
const
auto
block_work_idx
=
block_2_ctile_map
.
CalculateBottomIndex
(
make_multi_index
(
get_block_1d_id
()));
if
(
!
block_2_ctile_map
.
ValidCTileIndex
(
block_work_idx
,
make_tuple
(
c_grid_desc_mblock_mperblock_nblock_nperblock
.
GetLength
(
I0
),
c_grid_desc_mblock_mperblock_nblock_nperblock
.
GetLength
(
I2
))))
{
return
;
}
// HACK: this force m/n_block_data_idx_on_grid into SGPR
const
index_t
m_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I0
]
*
MPerBlock
);
const
index_t
n_block_data_idx_on_grid
=
__builtin_amdgcn_readfirstlane
(
block_work_idx
[
I1
]
*
NPerBlock
);
// lds max alignment
constexpr
auto
max_lds_align
=
math
::
lcm
(
AK1
,
BK1
);
// A matrix in LDS memory, dst of blockwise copy
constexpr
auto
a_block_desc_ak0_m_ak1
=
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1
();
// B matrix in LDS memory, dst of blockwise copy
constexpr
auto
b_block_desc_bk0_n_bk1
=
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1
();
// A matrix blockwise copy
auto
a_blockwise_copy
=
ThreadGroupTensorSliceTransfer_v4r1
<
ThisThreadBlock
,
AElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
InMemoryDataOperationEnum
::
Set
,
Sequence
<
AK0
,
MPerBlock
,
AK1
>
,
ABlockTransferThreadClusterLengths_AK0_M_AK1
,
ABlockTransferThreadClusterArrangeOrder
,
FloatAB
,
FloatAB
,
decltype
(
a_grid_desc_ak0_m_ak1
),
decltype
(
a_block_desc_ak0_m_ak1
),
ABlockTransferSrcAccessOrder
,
Sequence
<
1
,
0
,
2
>
,
ABlockTransferSrcVectorDim
,
2
,
ABlockTransferSrcScalarPerVector
,
ABlockTransferDstScalarPerVector_AK1
,
1
,
1
,
AThreadTransferSrcResetCoordinateAfterRun
,
true
,
NumGemmKPrefetchStage
>
(
a_grid_desc_ak0_m_ak1
,
make_multi_index
(
0
,
m_block_data_idx_on_grid
,
0
),
a_element_op
,
a_block_desc_ak0_m_ak1
,
make_multi_index
(
0
,
0
,
0
),
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
// B matrix blockwise copy
auto
b_blockwise_copy
=
ThreadGroupTensorSliceTransfer_v4r1
<
ThisThreadBlock
,
BElementwiseOperation
,
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
InMemoryDataOperationEnum
::
Set
,
Sequence
<
BK0
,
NPerBlock
,
BK1
>
,
BBlockTransferThreadClusterLengths_BK0_N_BK1
,
BBlockTransferThreadClusterArrangeOrder
,
FloatAB
,
FloatAB
,
decltype
(
b_grid_desc_bk0_n_bk1
),
decltype
(
b_block_desc_bk0_n_bk1
),
BBlockTransferSrcAccessOrder
,
Sequence
<
1
,
0
,
2
>
,
BBlockTransferSrcVectorDim
,
2
,
BBlockTransferSrcScalarPerVector
,
BBlockTransferDstScalarPerVector_BK1
,
1
,
1
,
BThreadTransferSrcResetCoordinateAfterRun
,
true
,
NumGemmKPrefetchStage
>
(
b_grid_desc_bk0_n_bk1
,
make_multi_index
(
0
,
n_block_data_idx_on_grid
,
0
),
b_element_op
,
b_block_desc_bk0_n_bk1
,
make_multi_index
(
0
,
0
,
0
),
ck
::
tensor_operation
::
element_wise
::
PassThrough
{});
// GEMM definition
// c_mtx += transpose(a_mtx) * b_mtx
// a_mtx[K0PerBlock, MPerBlock] is in LDS
// b_mtx[K0PerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
// sanity check
constexpr
index_t
KPack
=
math
::
max
(
math
::
lcm
(
AK1
,
BK1
),
MfmaSelector
<
FloatAB
,
MPerXdl
,
NPerXdl
>::
selected_mfma
.
k_per_blk
);
auto
blockwise_gemm
=
BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector
<
BlockSize
,
FloatAB
,
FloatGemmAcc
,
decltype
(
a_block_desc_ak0_m_ak1
),
decltype
(
b_block_desc_bk0_n_bk1
),
MPerXdl
,
NPerXdl
,
MXdlPerWave
,
NXdlPerWave
,
KPack
,
LoopSched
>
();
auto
c_thread_buf
=
blockwise_gemm
.
GetCThreadBuffer
();
// LDS allocation for A and B: be careful of alignment
constexpr
auto
a_block_space_size_aligned
=
math
::
integer_least_multiple
(
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
(),
max_lds_align
);
auto
a_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
FloatAB
*>
(
p_shared
),
a_block_desc_ak0_m_ak1
.
GetElementSpaceSize
());
auto
b_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
FloatAB
*>
(
p_shared
)
+
a_block_space_size_aligned
,
b_block_desc_bk0_n_bk1
.
GetElementSpaceSize
());
constexpr
auto
a_block_slice_copy_step
=
make_multi_index
(
KPerBlock
/
AK1
,
0
,
0
);
constexpr
auto
b_block_slice_copy_step
=
make_multi_index
(
KPerBlock
/
BK1
,
0
,
0
);
// gridwise GEMM pipeline
const
auto
gridwise_gemm_pipeline
=
GridwiseGemmPipeline_v1_Selector
<
NumGemmKPrefetchStage
,
LoopSched
>
();
const
index_t
num_k_block_main_loop
=
__builtin_amdgcn_readfirstlane
(
(
a_grid_desc_ak0_m_ak1
.
GetLength
(
I0
)
*
a_grid_desc_ak0_m_ak1
.
GetLength
(
I2
))
/
KPerBlock
);
gridwise_gemm_pipeline
.
template
Run
<
HasMainKBlockLoop
>(
a_grid_desc_ak0_m_ak1
,
a_block_desc_ak0_m_ak1
,
a_blockwise_copy
,
a_grid_buf
,
a_block_buf
,
a_block_slice_copy_step
,
b_grid_desc_bk0_n_bk1
,
b_block_desc_bk0_n_bk1
,
b_blockwise_copy
,
b_grid_buf
,
b_block_buf
,
b_block_slice_copy_step
,
blockwise_gemm
,
c_thread_buf
,
num_k_block_main_loop
);
// shuffle C + reduction + write out
{
static_assert
(
MXdlPerWave
%
CShuffleMXdlPerWavePerShuffle
==
0
&&
NXdlPerWave
%
CShuffleNXdlPerWavePerShuffle
==
0
,
"wrong!"
);
constexpr
index_t
MWave
=
MPerBlock
/
(
MXdlPerWave
*
MPerXdl
);
constexpr
index_t
NWave
=
NPerBlock
/
(
NXdlPerWave
*
NPerXdl
);
// TODO: hacky, fix it!
constexpr
auto
c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2
=
blockwise_gemm
.
GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2
();
// TODO: hacky, fix it!
// c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
constexpr
auto
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
=
blockwise_gemm
.
GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2
();
constexpr
auto
M0
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I0
);
constexpr
auto
N0
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I1
);
constexpr
auto
M1
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I2
);
constexpr
auto
N1
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I3
);
constexpr
auto
M2
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I4
);
constexpr
auto
M3
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I5
);
constexpr
auto
M4
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I6
);
constexpr
auto
N2
=
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp
.
GetLength
(
I7
);
constexpr
auto
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
=
GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
();
auto
c_shuffle_block_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
static_cast
<
FloatCShuffle
*>
(
p_shared
),
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
());
constexpr
auto
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2
=
transform_tensor_descriptor
(
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
,
make_tuple
(
make_freeze_transform
(
I0
),
make_unmerge_transform
(
make_tuple
(
Number
<
CShuffleMXdlPerWavePerShuffle
>
{},
// M0 (MXdlPerWave) per shuffle
M1
,
// M1 = MWave
M2
,
// M2 * M3 * M4 = MPerXdl
M3
,
M4
)),
make_freeze_transform
(
I0
),
make_unmerge_transform
(
make_tuple
(
Number
<
CShuffleNXdlPerWavePerShuffle
>
{},
// N0 (NXdlPerWave) per shuffle
N1
,
// N1 = NWave
N2
))),
// N2 = NPerXdl
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
,
2
,
4
,
5
,
6
>
{},
Sequence
<>
{},
Sequence
<
1
,
3
,
7
>
{}));
// calculate origin of thread output tensor on global memory
// blockwise GEMM c matrix starting index
const
auto
c_thread_mtx_on_block
=
blockwise_gemm
.
CalculateCThreadOriginDataIndex
(
I0
,
I0
,
I0
,
I0
);
const
index_t
m_thread_data_on_block
=
c_thread_mtx_on_block
[
I0
];
const
index_t
n_thread_data_on_block
=
c_thread_mtx_on_block
[
I1
];
const
auto
m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
M0
,
M1
,
M2
,
M3
,
M4
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
m_thread_data_on_block_idx
=
m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor
.
CalculateBottomIndex
(
make_multi_index
(
m_thread_data_on_block
));
const
auto
n_thread_data_on_block_to_n0_n1_n2_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
N0
,
N1
,
N2
))),
make_tuple
(
Sequence
<
0
,
1
,
2
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
n_thread_data_on_block_idx
=
n_thread_data_on_block_to_n0_n1_n2_adaptor
.
CalculateBottomIndex
(
make_multi_index
(
n_thread_data_on_block
));
// shuffle: threadwise copy C from VGPR to LDS
auto
c_thread_copy_vgpr_to_lds
=
ThreadwiseTensorSliceTransfer_v1r3
<
FloatGemmAcc
,
FloatCShuffle
,
decltype
(
c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2
),
decltype
(
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2
),
ck
::
tensor_operation
::
element_wise
::
PassThrough
,
Sequence
<
CShuffleMXdlPerWavePerShuffle
,
CShuffleNXdlPerWavePerShuffle
,
I1
,
I1
,
M2
,
I1
,
M4
,
I1
>
,
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
,
7
,
1
,
InMemoryDataOperationEnum
::
Set
,
1
,
true
>
{
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
make_multi_index
(
0
,
0
,
m_thread_data_on_block_idx
[
I1
],
n_thread_data_on_block_idx
[
I1
],
m_thread_data_on_block_idx
[
I2
],
m_thread_data_on_block_idx
[
I3
],
m_thread_data_on_block_idx
[
I4
],
n_thread_data_on_block_idx
[
I2
]),
ck
::
tensor_operation
::
element_wise
::
PassThrough
{}};
// space filling curve for threadwise C in VGPR
constexpr
auto
sfc_c_vgpr
=
SpaceFillingCurve
<
Sequence
<
MXdlPerWave
,
NXdlPerWave
,
1
,
1
,
M2
,
1
,
M4
,
1
>
,
Sequence
<
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
,
Sequence
<
CShuffleMXdlPerWavePerShuffle
,
CShuffleNXdlPerWavePerShuffle
,
1
,
1
,
M2
,
1
,
M4
,
1
>>
{};
// space filling curve for shuffled blockwise C in global mem
constexpr
auto
sfc_c_global
=
SpaceFillingCurve
<
Sequence
<
1
,
MPerBlock
,
1
,
NPerBlock
>
,
Sequence
<
0
,
2
,
1
,
3
>
,
Sequence
<
1
,
CShuffleMXdlPerWavePerShuffle
*
MWave
*
MPerXdl
,
1
,
CShuffleNXdlPerWavePerShuffle
*
NWave
*
NPerXdl
>>
{};
// TODO: this should be implemented as a blockwise reduction
// LDS c_reduce_block_desc_mperblock_nperblock
constexpr
auto
c_reduce_block_desc_mperblock_nperblock
=
transform_tensor_descriptor
(
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
,
make_tuple
(
make_freeze_transform
(
I0
),
make_pass_through_transform
(
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetLength
(
I1
)),
make_freeze_transform
(
I0
),
make_pass_through_transform
(
c_shuffle_block_desc_mblock_mperblock_nblock_nperblock
.
GetLength
(
I3
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{},
Sequence
<
3
>
{}),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<>
{},
Sequence
<
1
>
{}));
static_assert
(
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I0
)
*
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I1
)
==
BlockSize
,
"wrong!"
);
static_assert
((
CShuffleMXdlPerWavePerShuffle
*
MWave
*
MPerXdl
)
%
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I0
)
==
0
&&
(
CShuffleNXdlPerWavePerShuffle
*
NWave
*
NPerXdl
)
%
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I1
)
==
0
,
"wrong!"
);
constexpr
index_t
mreduce_per_thread
=
(
CShuffleMXdlPerWavePerShuffle
*
MWave
*
MPerXdl
)
/
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I0
);
constexpr
index_t
nreduce_per_thread
=
(
CShuffleNXdlPerWavePerShuffle
*
NWave
*
NPerXdl
)
/
CReduceThreadClusterLengths_MPerBlock_NPerBlock
::
At
(
I1
);
constexpr
auto
c_reduce_thread_lengths_mperblock_nperblock
=
Sequence
<
mreduce_per_thread
,
nreduce_per_thread
>
{};
// VGPR c_reduce_thread_desc_mperblock_nperblock
constexpr
auto
c_reduce_thread_desc_mperblock_nperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
mreduce_per_thread
>
{},
Number
<
nreduce_per_thread
>
{}));
// VGPR reduce_thread_desc_mperblock
constexpr
auto
reduce_thread_desc_mperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
mreduce_per_thread
>
{}));
// VGPR reduce_thread_desc_mblock_mperblock
constexpr
auto
reduce_thread_desc_mblock_mperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
mreduce_per_thread
>
{}));
auto
c_reduce_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatReduceAcc
>
(
c_reduce_thread_desc_mperblock_nperblock
.
GetElementSpaceSize
());
// reduce: threadwise copy from LDS to VGPR
constexpr
auto
c_reduce_thread_cluster_desc
=
make_cluster_descriptor
(
CReduceThreadClusterLengths_MPerBlock_NPerBlock
{},
Sequence
<
1
,
0
>
{});
const
auto
c_reduce_thread_cluster_idx
=
c_reduce_thread_cluster_desc
.
CalculateBottomIndex
(
make_multi_index
(
get_thread_local_1d_id
()));
const
auto
c_reduce_thread_data_idx_begin
=
c_reduce_thread_cluster_idx
*
c_reduce_thread_lengths_mperblock_nperblock
;
auto
c_reduce_thread_copy_lds_to_vgpr
=
ThreadwiseTensorSliceTransfer_v2
<
FloatCShuffle
,
FloatReduceAcc
,
decltype
(
c_reduce_block_desc_mperblock_nperblock
),
decltype
(
c_reduce_thread_desc_mperblock_nperblock
),
decltype
(
c_reduce_thread_lengths_mperblock_nperblock
),
Sequence
<
0
,
1
>
,
1
,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
1
,
true
>
{
c_reduce_block_desc_mperblock_nperblock
,
c_reduce_thread_data_idx_begin
};
auto
reduce_tuple_thread_copy_vgpr_to_global
=
generate_tuple
(
[
&
](
auto
I
)
{
auto
p_reduce_grid
=
p_reduces_grid
[
I
];
auto
reduce_acc_element_op
=
reduce_out_element_ops
[
I
];
return
ThreadwiseTensorSliceTransfer_v1r3
<
FloatReduceAcc
,
remove_pointer_t
<
decltype
(
p_reduce_grid
)
>
,
decltype
(
reduce_thread_desc_mblock_mperblock
),
decltype
(
reduce_grid_desc_mblock_mperblock
),
decltype
(
reduce_acc_element_op
),
Sequence
<
1
,
mreduce_per_thread
>
,
Sequence
<
0
,
1
>
,
1
,
CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock
,
ReduceGlobalMemoryDataOperation
::
At
(
I
),
1
,
false
>
{
reduce_grid_desc_mblock_mperblock
,
make_multi_index
(
block_work_idx
[
I0
],
// mblock
c_reduce_thread_data_idx_begin
[
I0
]),
// mperblock
reduce_acc_element_op
};
},
Number
<
p_reduces_grid
.
Size
()
>
{});
// c0 and c1
constexpr
auto
c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
mreduce_per_thread
>
{},
I1
,
Number
<
nreduce_per_thread
>
{}));
constexpr
auto
c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock
=
c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock
;
auto
c01_thread_buf
=
make_static_buffer
<
AddressSpaceEnum
::
Vgpr
,
FloatReduceAcc
>
(
c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock
.
GetElementSpaceSize
());
auto
c0_thread_copy_global_to_vgpr
=
ThreadwiseTensorSliceTransfer_v2
<
FloatC0
,
FloatReduceAcc
,
decltype
(
c0_grid_desc_mblock_mperblock_nblock_nperblock
),
decltype
(
c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock
),
Sequence
<
I1
,
mreduce_per_thread
,
I1
,
nreduce_per_thread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
1
,
true
>
(
c0_grid_desc_mblock_mperblock_nblock_nperblock
,
make_multi_index
(
I0
,
m_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I0
],
I0
,
n_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I1
]));
auto
c1_thread_copy_global_to_vgpr
=
ThreadwiseTensorSliceTransfer_v2
<
FloatC1
,
FloatReduceAcc
,
decltype
(
c1_grid_desc_mblock_mperblock_nblock_nperblock
),
decltype
(
c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock
),
Sequence
<
I1
,
mreduce_per_thread
,
I1
,
nreduce_per_thread
>
,
Sequence
<
0
,
1
,
2
,
3
>
,
3
,
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
1
,
true
>
(
c1_grid_desc_mblock_mperblock_nblock_nperblock
,
make_multi_index
(
I0
,
m_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I0
],
I0
,
n_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I1
]));
constexpr
auto
c_reduce_thread_desc_mblock_mperblock_nblock_nperblock
=
make_naive_tensor_descriptor_packed
(
make_tuple
(
I1
,
Number
<
mreduce_per_thread
>
{},
I1
,
Number
<
nreduce_per_thread
>
{}));
auto
c_reduce_thread_copy_vgpr_to_global
=
ThreadwiseTensorSliceTransfer_v1r3
<
FloatReduceAcc
,
FloatC
,
decltype
(
c_reduce_thread_desc_mblock_mperblock_nblock_nperblock
),
decltype
(
c_grid_desc_mblock_mperblock_nblock_nperblock
),
tensor_operation
::
element_wise
::
PassThrough
,
Sequence
<
I1
,
mreduce_per_thread
,
I1
,
nreduce_per_thread
>
,
// SliceLengths
Sequence
<
0
,
1
,
2
,
3
>
,
// DimAccessOrder
3
,
// DstVectorDim
CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock
,
InMemoryDataOperationEnum
::
Set
,
1
,
true
>
{
c_grid_desc_mblock_mperblock_nblock_nperblock
,
make_multi_index
(
I0
,
m_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I0
],
I0
,
n_block_data_idx_on_grid
+
c_reduce_thread_data_idx_begin
[
I1
]),
tensor_operation
::
element_wise
::
PassThrough
{}};
constexpr
index_t
num_access
=
sfc_c_vgpr
.
GetNumOfAccess
();
static_assert
(
num_access
==
sfc_c_global
.
GetNumOfAccess
(),
"wrong!"
);
static_for
<
0
,
num_access
,
1
>
{}([
&
](
auto
access_id
)
{
// each thread write its data from VGPR to LDS
c_thread_copy_vgpr_to_lds
.
Run
(
c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
sfc_c_vgpr
.
GetIndexTupleOfNumber
(
access_id
),
c_thread_buf
,
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2
,
c_shuffle_block_buf
);
// make sure it's safe to write to LDS
block_sync_lds
();
{
c_reduce_thread_copy_lds_to_vgpr
.
Run
(
c_reduce_block_desc_mperblock_nperblock
,
c_shuffle_block_buf
,
c_reduce_thread_desc_mperblock_nperblock
,
make_tuple
(
I0
,
I0
),
c_reduce_thread_buf
);
c0_thread_copy_global_to_vgpr
.
Run
(
c0_grid_desc_mblock_mperblock_nblock_nperblock
,
c0_grid_buf
,
c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c01_thread_buf
);
// c = activation(c + bias)
static_for
<
0
,
c_reduce_thread_desc_mperblock_nperblock
.
GetElementSize
(),
1
>
{}(
[
&
](
auto
i
)
{
FloatReduceAcc
out
;
c_element_op
(
out
,
c_reduce_thread_buf
(
i
)
+
c01_thread_buf
(
i
));
c_reduce_thread_buf
(
i
)
=
out
;
});
c1_thread_copy_global_to_vgpr
.
Run
(
c1_grid_desc_mblock_mperblock_nblock_nperblock
,
c1_grid_buf
,
c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
c01_thread_buf
);
// c = c + c1_functior(c1)
static_for
<
0
,
c_reduce_thread_desc_mperblock_nperblock
.
GetElementSize
(),
1
>
{}(
[
&
](
auto
i
)
{
c1_element_op
(
c01_thread_buf
(
i
),
c01_thread_buf
(
i
));
c_reduce_thread_buf
(
i
)
+=
c01_thread_buf
(
i
);
});
                    c_reduce_thread_copy_vgpr_to_global.Run(
                        c_reduce_thread_desc_mblock_mperblock_nblock_nperblock,
                        make_tuple(I0, I0, I0, I0),
                        c_reduce_thread_buf,
                        c_grid_desc_mblock_mperblock_nblock_nperblock,
                        c_grid_buf);

                    static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) {
                        auto& p_reduce_grid = p_reduces_grid[In];

                        auto reduce_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                            p_reduce_grid, reduce_grid_desc_mblock_mperblock.GetElementSpaceSize());

                        auto reduce_thread_buf =
                            make_static_buffer<AddressSpaceEnum::Vgpr, FloatReduceAcc>(
                                reduce_thread_desc_mperblock.GetElementSpaceSize());

                        auto& reduce_in_element_op = reduce_in_element_ops[In];

                        auto& reduce_thread_copy_vgpr_to_global =
                            reduce_tuple_thread_copy_vgpr_to_global(In);

                        using ReduceOperation = remove_cvref_t<decltype(ReduceOperations{}[In])>;

                        using ThreadwiseReduce =
                            ThreadwiseReduction<FloatReduceAcc,
                                                decltype(c_reduce_thread_desc_mperblock_nperblock),
                                                decltype(reduce_thread_desc_mperblock),
                                                ReduceOperation,
                                                false>;

                        // Global write Gemm shuffle + reduction
                        const auto reduce_identityVal =
                            ReduceOperation::template GetIdentityValue<FloatReduceAcc>();

                        static_for<0, mreduce_per_thread, 1>{}(
                            [&](auto I) { reduce_thread_buf(I) = reduce_identityVal; });

                        // reduce in VGPR
                        static_for<0, mreduce_per_thread, 1>{}([&](auto im) {
                            static_for<0, nreduce_per_thread, 1>{}([&](auto in) {
                                constexpr auto offset =
                                    Number<c_reduce_thread_desc_mperblock_nperblock.CalculateOffset(
                                        make_tuple(im, in))>{};

                                reduce_in_element_op(c_reduce_thread_buf(offset),
                                                     c_reduce_thread_buf(offset));
                            });
                        });

                        ThreadwiseReduce::Reduce(c_reduce_thread_buf, reduce_thread_buf);

                        // copy from VGPR to Global
                        reduce_thread_copy_vgpr_to_global.Run(reduce_thread_desc_mblock_mperblock,
                                                              make_tuple(I0, I0),
                                                              reduce_thread_buf,
                                                              reduce_grid_desc_mblock_mperblock,
                                                              reduce_grid_buf);

                        if constexpr(access_id < num_access - 1)
                        {
                            constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);

                            reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow(
                                reduce_grid_desc_mblock_mperblock,
                                make_tuple(c_global_step[I0], c_global_step[I1]));
                        }
                    });
                }

                if constexpr(access_id < num_access - 1)
                {
                    constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id);

                    // move on C
                    c_reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow(
                        c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);

                    // move on C0
                    c0_thread_copy_global_to_vgpr.MoveSrcSliceWindow(
                        c0_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);

                    // move on C1
                    c1_thread_copy_global_to_vgpr.MoveSrcSliceWindow(
                        c1_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step);
                }
            });
        } // Reduction
    }
};

} // namespace ck
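For orientation, the fused epilogue reconstructed above performs three element-wise steps per output tile and then a per-row reduction: c = activation(c + bias), then c += c1_op(c1), then each row of the per-thread tile is folded into a reduction buffer that is written to its own global output. The scalar sketch below restates that data flow on plain host arrays; relu_like, row_sum, M and N are hypothetical stand-ins for the kernel's element-wise functors and tile descriptors, and the optional reduce_in_element_op (e.g. squaring for a sum-of-squares reduction) is only mentioned in a comment.

// Host-side sketch of the fused epilogue, for illustration only.
// relu_like, row_sum, M, N and the flat float arrays are assumptions; the
// kernel performs the same per-element math on thread-local VGPR tiles.
#include <vector>

static float relu_like(float x) { return x > 0.f ? x : 0.f; } // stands in for c_element_op

void fused_epilogue(std::vector<float>& c,        // M*N GEMM accumulator
                    const std::vector<float>& c0, // M*N bias, already broadcast
                    const std::vector<float>& c1, // M*N residual input
                    std::vector<float>& row_sum,  // M per-row reduction outputs
                    int M, int N)
{
    for(int m = 0; m < M; ++m)
    {
        float acc = 0.f; // identity value of the reduction (0 for an Add reduction)
        for(int n = 0; n < N; ++n)
        {
            float v = relu_like(c[m * N + n] + c0[m * N + n]); // c = activation(c + bias)
            v += c1[m * N + n];                                // c = c + c1_functor(c1)
            c[m * N + n] = v;
            // reduce_in_element_op (e.g. Square) would be applied to v here before accumulating
            acc += v; // ThreadwiseReduction::Reduce over the free (N) dimension
        }
        row_sum[m] = acc;
    }
}

In the kernel the same arithmetic runs once per access_id tile, with the two loops replaced by static_for over mreduce_per_thread and nreduce_per_thread and the row results written out by reduce_thread_copy_vgpr_to_global.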
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp  View file @ 5d015452

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

-#include "common_header.hpp"
-#include "multi_index_transform_helper.hpp"
-#include "tensor_descriptor.hpp"
-#include "tensor_descriptor_helper.hpp"
-#include "tensor_operation/gpu/grid/block_to_ctile_map.hpp"
-#include "blockwise_gemm_dl_v2r3.hpp"
-#include "blockwise_tensor_slice_transfer_v5r1.hpp"
-#include "threadwise_tensor_slice_transfer.hpp"
-#include "threadwise_tensor_slice_set.hpp"
-#include "element_wise_operation.hpp"
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/multi_index_transform_helper.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
+#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

namespace ck {
...
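The include changes above follow the commit's move from directory-relative includes to paths rooted at the project's top-level include/ directory (plus the newly required gridwise_gemm_pipeline_v1.hpp). A hypothetical consumer translation unit would then only need that one directory on its include path; the file name and compile line below are illustrative assumptions, not part of the diff.

// example_consumer.cpp (hypothetical) -- built with something like:
//   hipcc -I<path-to-composable_kernel>/include -c example_consumer.cpp
// After this commit, only the ck/-prefixed spellings are expected to resolve.
#include "ck/utility/common_header.hpp"
#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"

int main() { return 0; } // nothing to run; this only exercises the include paths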
include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp  View file @ 5d015452

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP
...
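Unlike gridwise_gemm_dl_v1r3.hpp, which now relies on #pragma once, this header keeps its classic CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP include guard; both idioms prevent double inclusion. A minimal guard skeleton with a hypothetical macro name, for comparison:

// my_header.hpp (hypothetical) -- the include-guard idiom kept by this file.
#ifndef MY_HEADER_HPP
#define MY_HEADER_HPP

// ... declarations go here; the guard makes a repeated #include a no-op ...

#endif // MY_HEADER_HPP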