gaoqiong / MIGraphX · Commits

Commit 78a300ff, authored Oct 07, 2022 by Alan Turner
Update tuning method
Parent: dea0555f

Changes: 159 · Showing 19 changed files with 5892 additions and 0 deletions (+5892, -0)
All changed files are new and live under deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/:

  gpu/device/device_reduce_common.hpp                                          +142  -0
  gpu/device/device_reduce_multiblock.hpp                                      +513  -0
  gpu/device/device_reduce_threadwise.hpp                                      +376  -0
  gpu/device/device_softmax.hpp                                                +68   -0
  gpu/device/device_sparse_embedding3_forward_layernorm.hpp                    +210  -0
  gpu/device/gemm_specialization.hpp                                           +58   -0
  gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp  +1015 -0
  gpu/device/impl/device_permute_impl.hpp                                      +282  -0
  gpu/device/impl/device_softmax_impl.hpp                                      +272  -0
  gpu/device/matrix_padder.hpp                                                 +382  -0
  gpu/device/reduction_operator_mapping.hpp                                    +186  -0
  gpu/device/tensor_layout.hpp                                                 +417  -0
  gpu/device/tensor_specialization.hpp                                         +28   -0
  gpu/element/binary_element_wise_operation.hpp                                +270  -0
  gpu/element/element_wise_operation.hpp                                       +303  -0
  gpu/element/unary_element_wise_operation.hpp                                 +252  -0
  gpu/grid/block_to_ctile_map.hpp                                              +533  -0
  gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp                       +321  -0
  gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp                       +264  -0

Too many changes to show: to preserve performance only 159 of 159+ files are displayed.
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/device_reduce_common.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <vector>
#include <cassert>

#include "ck/utility/common_header.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_operator.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

// here, inLengths[] is already shuffled so that lengths of invariant dims are included before those
// of reduce dims
template <index_t Rank, int NumReduceDim>
std::pair<long_index_t, long_index_t> get_2d_lengths(const std::vector<index_t>& inLengths)
{
    static_assert(Rank <= 6, "bigger Rank size not supported!");

    long_index_t invariant_total_length = 1;
    long_index_t reduce_total_length    = 1;

    constexpr int NumInvariantDim = Rank - NumReduceDim;

    for(int i = NumInvariantDim; i < Rank; i++)
        reduce_total_length *= inLengths[i];

    for(int i = 0; i < NumInvariantDim; i++)
        invariant_total_length *= inLengths[i];

    return std::make_pair(invariant_total_length, reduce_total_length);
};

template <index_t Rank, int NumReduceDim>
std::pair<long_index_t, long_index_t> get_2d_lengths(const std::array<index_t, Rank>& inLengths)
{
    static_assert(Rank <= 6, "bigger Rank size not supported!");

    long_index_t invariant_total_length = 1;
    long_index_t reduce_total_length    = 1;

    constexpr int NumInvariantDim = Rank - NumReduceDim;

    for(int i = NumInvariantDim; i < Rank; i++)
        reduce_total_length *= inLengths[i];

    for(int i = 0; i < NumInvariantDim; i++)
        invariant_total_length *= inLengths[i];

    return std::make_pair(invariant_total_length, reduce_total_length);
};

// helper functions using variadic template arguments
template <index_t... Ns>
auto make_tuple_from_array_and_index_seq(const std::vector<index_t>& lengths, Sequence<Ns...>)
{
    return make_tuple(static_cast<index_t>(lengths[Ns])...);
};

template <index_t arraySize>
auto make_tuple_from_array(const std::vector<index_t>& lengths, Number<arraySize>)
{
    static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions");

    constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{};

    return make_tuple_from_array_and_index_seq(lengths, index_seq);
};

template <index_t Rank, index_t NumReduceDim>
std::vector<index_t> shuffle_tensor_dimensions(const std::vector<index_t>& origLengthsStrides,
                                               const std::vector<int>& reduceDims)
{
    std::vector<index_t> newLengthsStrides;

    assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size());

    int reduceFlag = 0;

    // flag the bits for the reduceDims
    for(int i = 0; i < NumReduceDim; i++)
    {
        reduceFlag |= 1 << reduceDims[i];
    };

    // collect invariant dimensions
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
            newLengthsStrides.push_back(origLengthsStrides[i]);
        };

    // collect reduce dimensions
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) > 0)
        {
            newLengthsStrides.push_back(origLengthsStrides[i]);
        };

    return newLengthsStrides;
};

template <index_t Rank, index_t NumReduceDim>
std::array<index_t, Rank>
shuffle_tensor_dimensions(const std::array<index_t, Rank>& origLengthsStrides,
                          const std::array<int, NumReduceDim>& reduceDims)
{
    std::array<index_t, Rank> newLengthsStrides;

    int reduceFlag = 0;

    // flag the bits for the reduceDims
    for(int i = 0; i < NumReduceDim; i++)
    {
        reduceFlag |= 1 << reduceDims[i];
    };

    // collect invariant dimensions
    int pos = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
        {
            newLengthsStrides[pos++] = origLengthsStrides[i];
        };

    // collect reduce dimensions
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) > 0)
        {
            newLengthsStrides[pos++] = origLengthsStrides[i];
        };

    return newLengthsStrides;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
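To make the dimension-shuffling convention above concrete, here is a minimal standalone sketch that replays the same bit-flag logic on plain int vectors. It is an illustration only: shuffle_dims is a hypothetical stand-in for shuffle_tensor_dimensions, and the Rank-4 tensor with reduceDims = {1, 3} is an assumed example, not something from the commit.

// Standalone illustration: invariant dims are moved to the front, reduce dims to the back,
// and get_2d_lengths then collapses the shuffled shape into (invariant, reduce) totals.
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

static std::vector<int> shuffle_dims(const std::vector<int>& lengths,
                                     const std::vector<int>& reduceDims)
{
    int reduceFlag = 0;
    for(int d : reduceDims)
        reduceFlag |= 1 << d; // flag the bits of the reduced dimensions

    std::vector<int> shuffled;
    for(std::size_t i = 0; i < lengths.size(); i++) // invariant dims first
        if((reduceFlag & (1 << i)) == 0)
            shuffled.push_back(lengths[i]);
    for(std::size_t i = 0; i < lengths.size(); i++) // reduce dims last
        if((reduceFlag & (1 << i)) != 0)
            shuffled.push_back(lengths[i]);
    return shuffled;
}

int main()
{
    const std::vector<int> lengths    = {8, 16, 32, 64}; // e.g. N, C, H, W
    const std::vector<int> reduceDims = {1, 3};          // reduce C and W

    const auto shuffled = shuffle_dims(lengths, reduceDims); // {8, 32, 16, 64}

    long invariant_total = 1, reduce_total = 1;
    const std::size_t numInvariant = lengths.size() - reduceDims.size();
    for(std::size_t i = 0; i < numInvariant; i++)
        invariant_total *= shuffled[i];
    for(std::size_t i = numInvariant; i < shuffled.size(); i++)
        reduce_total *= shuffled[i];

    std::cout << invariant_total << " x " << reduce_total << "\n"; // prints "256 x 1024"
    assert(invariant_total == 8 * 32 && reduce_total == 16 * 64);
}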
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/common_header.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation, typename AccElementwiseOperation,
          InMemoryDataOperationEnum OutMemoryDataOperation,
          bool PropagateNan, bool OutputIndex, bool HaveIndexInputIfOutputIndex,
          index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize,
          index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceMultiBlock : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");
    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize,
                  "Invalid thread cluster size assignments!");

    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t numSrcDim = Rank;
    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim = (NumInvariantDim == 0);

    // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added
    // later
    static constexpr bool use_multiblock =
        (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd);

    static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType<OutMemoryDataOperation,
                                                                      OutDataType>::value,
                  "The OutDataType must support the specified OutMemoryDataOperation!");

    static_assert(!use_multiblock || (use_multiblock && !OutputIndex),
                  "MultiBlock reduction can only be used when outputing index is not required");

    static_assert(ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation),
                  "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!");

    static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
                                    const std::vector<index_t>& inStrides,
                                    int blkGroupSize,
                                    int numBlockTileIteration)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration;
        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
                                    const std::vector<index_t>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));
        return (out_grid_desc_m_padded);
    };

    static auto MakeDst1dDescriptorForBufferSet(const std::vector<index_t>& outLengths,
                                                const std::vector<index_t>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto length = out_grid_desc_m.GetLength(Number<0>{});

        const auto pad = math::integer_least_multiple(length, BlockSize) - length;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(length, pad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));
        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> outLengths,
                 const std::vector<index_t> outStrides,
                 const std::vector<int> reduceDims,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 const IndexDataType* in_index_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_index_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              in_index_dev_{in_index_dev},
              out_dev_{out_dev},
              out_index_dev_{out_index_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            if constexpr(use_multiblock)
            {
                int iterations = 1;
                while(true)
                {
                    int testBlkGroupSize =
                        (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                        (K_BlockTileSize * iterations);

                    // we want the blkGroupSize be not more than 128
                    if(testBlkGroupSize <= 128)
                        break;

                    iterations++;
                };

                blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                               (K_BlockTileSize * iterations);

                numBlockTileIteration = iterations;
            }
            else
            {
                blkGroupSize = 1;
                numBlockTileIteration =
                    (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;
            };

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize * blkGroupSize;

            gridSize_pre =
                math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize;
        }

        std::vector<index_t> inLengths_;
        std::vector<index_t> inStrides_;
        std::vector<index_t> outLengths_;
        std::vector<index_t> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        const IndexDataType* in_index_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_index_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        index_t invariant_lowest_length;
        index_t reduce_lowest_length;
        long_index_t invariant_total_length;
        long_index_t reduce_total_length;

        int blkGroupSize;
        int numBlockTileIteration;
        size_t gridSize;

        size_t gridSize_pre;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = DeviceReduceMultiBlock::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
            const auto out_grid_desc_m =
                DeviceReduceMultiBlock::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);
            const auto out_grid_desc_m_2 = DeviceReduceMultiBlock::MakeDst1dDescriptorForBufferSet(
                arg.outLengths_, arg.outStrides_);

            using InGridDesc_M_K  = decltype(in_grid_desc_m_k);
            using OutGridDesc_M   = decltype(out_grid_desc_m);
            using OutGridDesc_M_2 = decltype(out_grid_desc_m_2);

            using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock<
                InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
                OutMemoryDataOperation, PropagateNan,
                BlockSize, MThreadClusterSize, KThreadClusterSize,
                MThreadSliceSize, KThreadSliceSize,
                InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            const auto kernel_main = kernel_reduce_multiblock<
                GridwiseReduce, OutputIndex, HaveIndexInput,
                InDataType, OutDataType, AccDataType, int32_t,
                InGridDesc_M_K, OutGridDesc_M,
                InElementwiseOperation, AccElementwiseOperation>;

            float avg_time = 0;

            if constexpr(use_multiblock)
            {
                const auto identityVal =
                    ck::reduce::GetIdentityValueForInMemoryDataOperation<OutDataType>(
                        OutMemoryDataOperation);

                const auto kernel_pre =
                    kernel_buffer_set_value<BlockSize, OutDataType, OutGridDesc_M_2>;

                avg_time += launch_and_time_kernel(stream_config, kernel_pre,
                                                   dim3(arg.gridSize_pre), dim3(BlockSize), 0,
                                                   out_grid_desc_m_2, arg.out_dev_, identityVal);
            };

            avg_time += launch_and_time_kernel(stream_config, kernel_main,
                                               dim3(arg.gridSize), dim3(BlockSize), 0,
                                               in_grid_desc_m_k, out_grid_desc_m,
                                               arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                               arg.blkGroupSize, arg.numBlockTileIteration,
                                               arg.alpha_, arg.in_dev_, arg.in_index_dev_,
                                               arg.beta_, arg.out_dev_, arg.out_index_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    static bool IsSupportedArgument(const Argument* pArg)
    {
        if constexpr(use_multiblock)
        {
            if(static_cast<float>(pArg->beta_) != 0.0f)
                return (false);
        };

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        if constexpr(use_multiblock)
        {
            // blkGroupSize of 1 should be handled by Blockwise path using
            // InMemoryDataOperationEnum::Set
            if(pArg->blkGroupSize == 1)
                return (false);

            // This is very strong restriction, but needed to avoid some failure
            if(pArg->invariant_lowest_length % M_BlockTileSize != 0)
                return (false);
        }
        else
        {
            // cases with very small reduce_total_length should be handled by ThreadWise kernel
            // if(pArg->reduce_total_length / KThreadSliceSize < 2)
            //     return (false);
        };

        return (true);
    }

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> inLengths,
                        const std::vector<index_t> inStrides,
                        const std::vector<index_t> outLengths,
                        const std::vector<index_t> outStrides,
                        const std::vector<int> reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        const void* in_index_dev,
                        void* out_dev,
                        void* out_index_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<const IndexDataType*>(in_index_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_index_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set ? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
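The multi-block Argument constructor above searches for the smallest per-block iteration count such that no more than 128 workgroups cooperate (via atomic adds) on one output row. The sketch below replays that search with plain host integers; K_BlockTileSize = 128 and a reduction length of one million are assumed purely for illustration.

// Standalone illustration of the blkGroupSize / numBlockTileIteration selection in
// DeviceReduceMultiBlock::Argument.
#include <cstdio>

int main()
{
    const long reduce_total_length = 1000000;
    const int K_BlockTileSize      = 128;

    // Grow the per-block iteration count until at most 128 blocks cooperate on one output row.
    int iterations = 1;
    while(true)
    {
        int testBlkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                               (K_BlockTileSize * iterations);
        if(testBlkGroupSize <= 128)
            break;
        iterations++;
    }

    const int blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) /
                             (K_BlockTileSize * iterations);

    // With one iteration, ceil(1000000 / 128) = 7813 blocks per row would be needed; the loop
    // settles on iterations = 62, so each block covers 62 * 128 = 7936 elements and
    // blkGroupSize drops to 127 (<= 128).
    std::printf("iterations=%d blkGroupSize=%d\n", iterations, blkGroupSize);
}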
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/device_reduce_threadwise.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType, typename AccDataType, typename OutDataType,
          index_t Rank, index_t NumReduceDim,
          typename ReduceOperation, typename InElementwiseOperation, typename AccElementwiseOperation,
          bool PropagateNan, bool OutputIndex, bool HaveIndexInputIfOutputIndex,
          index_t BlockSize, index_t MThreadSliceSize, index_t KThreadSliceSize,
          index_t InSrcVectorDim, index_t InSrcVectorSize, index_t OutDstVectorSize>
struct DeviceReduceThreadWise : public DeviceReduce<InElementwiseOperation, AccElementwiseOperation>
{
    static_assert(Rank <= 6, "Bigger Rank size is not supported!");

    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) &&
                      (MThreadSliceSize % OutDstVectorSize == 0),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    using IndexDataType = int32_t;

    static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex;

    static constexpr index_t NumInvariantDim = Rank - NumReduceDim;

    static constexpr index_t numSrcDim = Rank;
    static constexpr index_t numDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim;
    static constexpr bool reduceAllDim = (NumInvariantDim == 0);

    static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize;
    static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize;

    static auto MakeSrc2dDescriptor(const std::vector<index_t>& inLengths,
                                    const std::vector<index_t>& inStrides)
    {
        const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number<numSrcDim>{});
        const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number<numSrcDim>{});

        const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides);

        const auto in_grid_desc_m_k = [&]() {
            if constexpr(reduceAllDim)
            {
                const auto one_dim_inDesc = transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(tupleSrcLengths)),
                    make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}),
                    make_tuple(Sequence<0>{}));

                return transform_tensor_descriptor(
                    one_dim_inDesc,
                    make_tuple(make_unmerge_transform(
                        make_tuple(1, one_dim_inDesc.GetLength(Number<0>{})))),
                    make_tuple(Sequence<0>{}),
                    make_tuple(Sequence<0, 1>{}));
            }
            else
            {
                using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type;
                using ReduceDims = typename arithmetic_sequence_gen<NumInvariantDim, Rank, 1>::type;

                const auto reduceDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, ReduceDims{});
                const auto invariantDimLengths =
                    make_tuple_from_array_and_index_seq(inLengths, InvariantDims{});

                return transform_tensor_descriptor(
                    inDesc,
                    make_tuple(make_merge_transform(invariantDimLengths),
                               make_merge_transform(reduceDimLengths)),
                    make_tuple(InvariantDims{}, ReduceDims{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));
            }
        }();

        const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{});
        const auto reduceLength    = in_grid_desc_m_k.GetLength(Number<1>{});

        const auto inPad_M =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;
        const auto inPad_K =
            math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength;

        auto in_grid_desc_m_k_padded = transform_tensor_descriptor(
            in_grid_desc_m_k,
            make_tuple(make_right_pad_transform(invariantLength, inPad_M),
                       make_right_pad_transform(reduceLength, inPad_K)),
            make_tuple(Sequence<0>{}, Sequence<1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return (in_grid_desc_m_k_padded);
    };

    static auto MakeDst1dDescriptor(const std::vector<index_t>& outLengths,
                                    const std::vector<index_t>& outStrides)
    {
        const auto tupleDstLengths = make_tuple_from_array(outLengths, Number<numDstDim>{});
        const auto tupleDstStrides = make_tuple_from_array(outStrides, Number<numDstDim>{});

        auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides);

        auto out_grid_desc_m = transform_tensor_descriptor(
            outDesc,
            make_tuple(make_merge_transform(tupleDstLengths)),
            make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}),
            make_tuple(Sequence<0>{}));

        const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{});

        const auto outPad =
            math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength;

        auto out_grid_desc_m_padded = transform_tensor_descriptor(
            out_grid_desc_m,
            make_tuple(make_right_pad_transform(invariantLength, outPad)),
            make_tuple(Sequence<0>{}),
            make_tuple(Sequence<0>{}));
        return (out_grid_desc_m_padded);
    };

    struct Argument : public BaseArgument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> outLengths,
                 const std::vector<index_t> outStrides,
                 const std::vector<int> reduceDims,
                 float alpha,
                 float beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 IndexDataType* out_index_dev,
                 const InElementwiseOperation in_elementwise_op,
                 const AccElementwiseOperation acc_elementwise_op)
            : outLengths_{outLengths},
              outStrides_{outStrides},
              in_dev_{in_dev},
              out_dev_{out_dev},
              out_index_dev_{out_index_dev},
              in_elementwise_op_{in_elementwise_op},
              acc_elementwise_op_{acc_elementwise_op}
        {
            inLengths_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inLengths, reduceDims);
            inStrides_ = shuffle_tensor_dimensions<Rank, NumReduceDim>(inStrides, reduceDims);

            alpha_ = type_convert<AccDataType>(alpha);
            beta_  = type_convert<AccDataType>(beta);

            std::tie(invariant_total_length, reduce_total_length) =
                get_2d_lengths<Rank, NumReduceDim>(inLengths_);

            if constexpr(NumInvariantDim == 0)
                invariant_lowest_length = 1;
            else
                invariant_lowest_length = inLengths_[NumInvariantDim - 1];

            reduce_lowest_length = inLengths_[Rank - 1];

            numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize;

            gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) /
                       M_BlockTileSize;
        }

        std::vector<index_t> inLengths_;
        std::vector<index_t> inStrides_;
        std::vector<index_t> outLengths_;
        std::vector<index_t> outStrides_;

        AccDataType alpha_;
        AccDataType beta_;

        const InDataType* in_dev_;
        OutDataType* out_dev_;
        IndexDataType* out_index_dev_;

        InElementwiseOperation in_elementwise_op_;
        AccElementwiseOperation acc_elementwise_op_;

        index_t invariant_lowest_length;
        index_t reduce_lowest_length;
        long_index_t invariant_total_length;
        long_index_t reduce_total_length;

        int numBlockTileIteration;
        size_t gridSize;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k =
                DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_);
            const auto out_grid_desc_m =
                DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_);

            using InGridDesc_M_K = decltype(in_grid_desc_m_k);
            using OutGridDesc_M  = decltype(out_grid_desc_m);

            float avg_time = 0;

            using GridwiseReduce = GridwiseReduction_mk_to_m_threadwise<
                InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M,
                ReduceOperation, InElementwiseOperation, AccElementwiseOperation,
                InMemoryDataOperationEnum::Set, PropagateNan,
                BlockSize, MThreadSliceSize, KThreadSliceSize,
                InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>;

            const auto kernel = kernel_reduce_threadwise<
                GridwiseReduce, OutputIndex, HaveIndexInput,
                InDataType, OutDataType, AccDataType, IndexDataType,
                InGridDesc_M_K, OutGridDesc_M,
                InElementwiseOperation, AccElementwiseOperation>;

            avg_time = launch_and_time_kernel(stream_config, kernel,
                                              dim3(arg.gridSize), dim3(BlockSize), 0,
                                              in_grid_desc_m_k, out_grid_desc_m,
                                              arg.in_elementwise_op_, arg.acc_elementwise_op_,
                                              arg.alpha_, arg.in_dev_, nullptr,
                                              arg.beta_, arg.out_dev_, arg.out_index_dev_);

            return (avg_time);
        };

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* pArg = dynamic_cast<const Argument*>(p_arg);

        if constexpr(InSrcVectorDim == 0)
        {
            if constexpr(NumInvariantDim == 0)
            {
                return (false);
            }
            else
            {
                if(pArg->inStrides_[NumInvariantDim - 1] != 1)
                    return (false);

                if(pArg->invariant_lowest_length % InSrcVectorSize != 0)
                    return (false);
            };
        }
        else
        {
            if(pArg->inStrides_[Rank - 1] != 1)
                return (false);

            if(pArg->reduce_lowest_length % InSrcVectorSize != 0)
                return (false);
        };

        // To improve
        if(pArg->invariant_lowest_length % OutDstVectorSize != 0)
            return (false);

        // cases with big reduce_total_length should be handled by Blockwise kernel
        if(pArg->reduce_total_length / KThreadSliceSize >= 32)
            return (false);

        return (true);
    };

    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> inLengths,
                        const std::vector<index_t> inStrides,
                        const std::vector<index_t> outLengths,
                        const std::vector<index_t> outStrides,
                        const std::vector<int> reduceDims,
                        float alpha,
                        float beta,
                        const void* in_dev,
                        const void* in_index_dev,
                        void* out_dev,
                        void* out_index_dev,
                        const InElementwiseOperation in_elementwise_op,
                        const AccElementwiseOperation acc_elementwise_op) override
    {
        (void)in_index_dev;

        return std::make_unique<Argument>(inLengths, inStrides, outLengths, outStrides, reduceDims,
                                          alpha, beta,
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          static_cast<IndexDataType*>(out_index_dev),
                                          in_elementwise_op, acc_elementwise_op);
    };

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    };

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceThreadWise<" << BlockSize << ",";
        str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << 1 << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
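The IsSupportedArgument checks of the two reducers above partition the problem space by reduction length: the thread-wise kernel only accepts small reductions, and larger ones are expected to fall through to the block-wise / multi-block path. A minimal host-side sketch of that hand-off follows; KThreadSliceSize = 8 and the sample lengths are assumptions for illustration only.

// Standalone illustration of the size-based dispatch encoded in the IsSupportedArgument checks.
#include <cstdio>
#include <initializer_list>

int main()
{
    const int KThreadSliceSize = 8;

    for(long reduce_total_length : {64L, 255L, 256L, 4096L})
    {
        const bool threadwise_ok = (reduce_total_length / KThreadSliceSize) < 32;
        std::printf("reduce_total_length=%ld -> %s\n",
                    reduce_total_length,
                    threadwise_ok ? "DeviceReduceThreadWise" : "block-wise / multi-block kernel");
    }
    // With KThreadSliceSize = 8 the cut-over sits at 32 * 8 = 256 elements per output value.
}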
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/device_softmax.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <memory>
#include <vector>

#include "ck/tensor_operation/gpu/device/device_base.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank>
struct DeviceSoftmax : public BaseOperator
{
    //
    // @brief      Makes a pointer to Argument class.
    //
    // @param[in]  inLengths           Input tensor extent(s) from high to low dimension
    // @param[in]  inStrides           Input tensor stride(s) from high to low dimension
    // @param[in]  reduceDims          The dimension(s) the normalization operation is applied
    // @param[in]  alpha               Typeless pointer in host memory storing the alpha scaling
    //                                 value as type AccDataType
    // @param[in]  beta                Typeless pointer in host memory storing the beta scaling
    //                                 value as type AccDataType
    // @param[in]  in_dev              Typeless const pointer in device memory storing the input
    //                                 tensor
    // @param      out_dev             Typeless pointer in device memory storing the output tensor
    // @param[in]  in_elementwise_op   The input elementwise operation.
    // @param[in]  acc_elementwise_op  The accumulation elementwise operation.
    //
    // @return     Unique pointer to the Argument class.
    //
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const std::vector<index_t> inLengths,
                        const std::vector<index_t> inStrides,
                        const std::vector<int> reduceDims,
                        const void* alpha,
                        const void* beta,
                        const void* in_dev,
                        void* out_dev,
                        InElementwiseOp in_elementwise_op,
                        AccElementwiseOp acc_elementwise_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;

    virtual index_t GetRank() const = 0;

    virtual index_t GetNumReduceDim() const = 0;
};

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank>
using DeviceSoftmaxPtr = std::unique_ptr<
    DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;

} // namespace device
} // namespace tensor_operation
} // namespace ck
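DeviceSoftmax above only declares the operator interface: a concrete instance hands out a type-erased Argument via MakeArgumentPointer and a matching Invoker via MakeInvokerPointer, so callers never depend on the kernel's template parameters. The sketch below mirrors that pattern with toy types so it can compile on its own; every name in it (ToyOperator, ToyScale, BaseArg, BaseInv) is illustrative and is not part of the composable_kernel API.

// Standalone sketch of the operator / argument / invoker pattern, using a trivial "scale by
// alpha" computation in place of a real device softmax kernel.
#include <iostream>
#include <memory>
#include <vector>

struct BaseArg { virtual ~BaseArg() = default; };
struct BaseInv { virtual ~BaseInv() = default; virtual float Run(const BaseArg*) = 0; };

struct ToyOperator
{
    virtual ~ToyOperator() = default;
    virtual std::unique_ptr<BaseArg> MakeArgumentPointer(const std::vector<float>& in,
                                                         std::vector<float>& out,
                                                         float alpha) = 0;
    virtual std::unique_ptr<BaseInv> MakeInvokerPointer() = 0;
};

// One concrete "instance": scales the input by alpha (stands in for a tuned kernel instance).
struct ToyScale : ToyOperator
{
    struct Arg : BaseArg
    {
        const std::vector<float>* in; std::vector<float>* out; float alpha;
    };
    struct Inv : BaseInv
    {
        float Run(const BaseArg* p) override
        {
            const auto* a = dynamic_cast<const Arg*>(p);
            for(std::size_t i = 0; i < a->in->size(); ++i)
                (*a->out)[i] = a->alpha * (*a->in)[i];
            return 0.0f; // a real invoker would return the measured kernel time
        }
    };
    std::unique_ptr<BaseArg> MakeArgumentPointer(const std::vector<float>& in,
                                                 std::vector<float>& out,
                                                 float alpha) override
    {
        auto arg = std::make_unique<Arg>();
        arg->in = &in; arg->out = &out; arg->alpha = alpha;
        return arg;
    }
    std::unique_ptr<BaseInv> MakeInvokerPointer() override { return std::make_unique<Inv>(); }
};

int main()
{
    std::vector<float> in{1.f, 2.f, 3.f}, out(3);
    std::unique_ptr<ToyOperator> op = std::make_unique<ToyScale>();
    auto arg     = op->MakeArgumentPointer(in, out, 2.0f);
    auto invoker = op->MakeInvokerPointer();
    invoker->Run(arg.get());
    std::cout << out[0] << " " << out[1] << " " << out[2] << "\n"; // prints "2 4 6"
}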
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/device_sparse_embedding3_forward_layernorm.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename EmbType, typename IndexType,
          typename GammaDataType, typename BetaDataType,
          typename AccDataType, typename OutType,
          ck::index_t BlockSize,
          ck::index_t DimClusterSize, ck::index_t RowClusterSize,
          ck::index_t DimPerBlock, ck::index_t RowPerBlock,
          ck::index_t DimThreadSize, ck::index_t RowVectorSize>
struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator
{
    static auto MakeOutputDescriptor(const index_t index_length, const index_t rows)
    {
        return make_naive_tensor_descriptor_packed(make_tuple(index_length, rows));
    }

    struct Argument : public BaseArgument
    {
        Argument(OutType* p_out,
                 const EmbType* p_emb_a,
                 const EmbType* p_emb_b,
                 const EmbType* p_emb_c,
                 const IndexType* p_index_a,
                 const IndexType* p_index_b,
                 const IndexType* p_index_c,
                 const GammaDataType* p_gamma,
                 const BetaDataType* p_beta,
                 const ck::index_t NumRows,
                 const ck::index_t EmbeddingDim,
                 const ck::index_t IndexLength,
                 const AccDataType epsilon)
            : p_out_(p_out),
              p_emb_a_(p_emb_a),
              p_emb_b_(p_emb_b),
              p_emb_c_(p_emb_c),
              p_index_a_(p_index_a),
              p_index_b_(p_index_b),
              p_index_c_(p_index_c),
              p_gamma_(p_gamma),
              p_beta_(p_beta),
              NumRows_(NumRows),
              EmbeddingDim_(EmbeddingDim),
              IndexLength_(IndexLength),
              epsilon_(epsilon)
        {
            grid_size_ = (IndexLength + DimClusterSize - 1) / DimClusterSize;
        }

        OutType* p_out_;
        const EmbType* p_emb_a_;
        const EmbType* p_emb_b_;
        const EmbType* p_emb_c_;
        const IndexType* p_index_a_;
        const IndexType* p_index_b_;
        const IndexType* p_index_c_;
        const GammaDataType* p_gamma_;
        const BetaDataType* p_beta_;
        ck::index_t NumRows_;
        ck::index_t EmbeddingDim_;
        ck::index_t IndexLength_;
        AccDataType epsilon_;
        size_t grid_size_;
    };

    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(void* p_out,
                                                              const void* p_emb_a,
                                                              const void* p_emb_b,
                                                              const void* p_emb_c,
                                                              const void* p_index_a,
                                                              const void* p_index_b,
                                                              const void* p_index_c,
                                                              const void* p_gamma,
                                                              const void* p_beta,
                                                              ck::index_t NumRows,
                                                              ck::index_t EmbeddingDim,
                                                              ck::index_t IndexLength,
                                                              const AccDataType epsilon)
    {
        return std::make_unique<Argument>(reinterpret_cast<OutType*>(p_out),
                                          reinterpret_cast<const EmbType*>(p_emb_a),
                                          reinterpret_cast<const EmbType*>(p_emb_b),
                                          reinterpret_cast<const EmbType*>(p_emb_c),
                                          reinterpret_cast<const IndexType*>(p_index_a),
                                          reinterpret_cast<const IndexType*>(p_index_b),
                                          reinterpret_cast<const IndexType*>(p_index_c),
                                          reinterpret_cast<const GammaDataType*>(p_gamma),
                                          reinterpret_cast<const BetaDataType*>(p_beta),
                                          NumRows,
                                          EmbeddingDim,
                                          IndexLength,
                                          epsilon);
    }

    using GridwiseSparseEmbedding = GridwiseSparseEmbedding3ForwardLayernorm<
        EmbType, IndexType, GammaDataType, BetaDataType, AccDataType, OutType,
        decltype(MakeOutputDescriptor(1, 1)),
        BlockSize, DimClusterSize, RowClusterSize, DimPerBlock, RowPerBlock,
        DimThreadSize, RowVectorSize>;

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            auto out_desc = MakeOutputDescriptor(arg.IndexLength_, arg.EmbeddingDim_);

            const auto kernel_main = kernel_sparse_embedding3_forward_layernorm<
                GridwiseSparseEmbedding, EmbType, IndexType, GammaDataType, BetaDataType,
                AccDataType, OutType, decltype(out_desc)>;

            float avg_time = 0;
            avg_time += launch_and_time_kernel(stream_config, kernel_main,
                                               dim3(arg.grid_size_), dim3(BlockSize), 0,
                                               arg.p_out_,
                                               arg.p_emb_a_, arg.p_emb_b_, arg.p_emb_c_,
                                               arg.p_index_a_, arg.p_index_b_, arg.p_index_c_,
                                               arg.p_gamma_, arg.p_beta_,
                                               out_desc, arg.epsilon_);
            return (avg_time);
        }

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        };
    };

    static bool IsSupportedArgument(const Argument* p_arg)
    {
        return (RowPerBlock == p_arg->EmbeddingDim_) && (p_arg->NumRows_ % DimPerBlock == 0);
    }

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(dynamic_cast<const Argument*>(p_arg));
    }

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer()
    {
        return std::make_unique<Invoker>();
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceSparseEmbedding3ForwardLayernorm_" << BlockSize << "_" <<
            DimClusterSize << "x" << RowClusterSize << "_" <<
            DimPerBlock << "x" << RowPerBlock << "_" <<
            DimThreadSize << "x" << RowVectorSize;
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
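Two small host-side computations above decide how this operator is used: grid_size_ is a ceiling division of IndexLength by DimClusterSize, and IsSupportedArgument only accepts a problem when RowPerBlock matches the embedding width and NumRows divides evenly into DimPerBlock. The sketch below spells both out; the tile sizes and problem sizes are assumptions chosen for the example, not values taken from the commit.

// Standalone illustration of the launch-size and support checks.
#include <cstdio>

int main()
{
    // Hypothetical instance configuration.
    const int DimClusterSize = 8;
    const int DimPerBlock    = 32;
    const int RowPerBlock    = 768;

    // Hypothetical problem.
    const int NumRows      = 30528; // rows in each embedding table
    const int EmbeddingDim = 768;   // row width
    const int IndexLength  = 4096;  // number of lookups

    const long grid_size = (IndexLength + DimClusterSize - 1) / DimClusterSize; // 512 blocks

    const bool supported = (RowPerBlock == EmbeddingDim) && (NumRows % DimPerBlock == 0);

    std::printf("grid_size=%ld supported=%s\n", grid_size, supported ? "yes" : "no");
}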
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

namespace ck {
namespace tensor_operation {
namespace device {

enum struct GemmSpecialization
{
    // Gemm
    Default,
    MPadding,
    NPadding,
    KPadding,
    MNPadding,
    MKPadding,
    NKPadding,
    MNKPadding,
    // Gemm + Gemm
    OPadding,
    MOPadding,
    NOPadding,
    KOPadding,
    MNOPadding,
    MKOPadding,
    NKOPadding,
    MNKOPadding,
};

inline std::string getGemmSpecializationString(const GemmSpecialization& s)
{
    switch(s)
    {
    case GemmSpecialization::Default: return "Default";
    case GemmSpecialization::MPadding: return "MPadding";
    case GemmSpecialization::NPadding: return "NPadding";
    case GemmSpecialization::KPadding: return "KPadding";
    case GemmSpecialization::MNPadding: return "MNPadding";
    case GemmSpecialization::MKPadding: return "MKPadding";
    case GemmSpecialization::NKPadding: return "NKPadding";
    case GemmSpecialization::MNKPadding: return "MNKPadding";
    case GemmSpecialization::OPadding: return "OPadding";
    case GemmSpecialization::MOPadding: return "MOPadding";
    case GemmSpecialization::NOPadding: return "NOPadding";
    case GemmSpecialization::KOPadding: return "KOPadding";
    case GemmSpecialization::MNOPadding: return "MNOPadding";
    case GemmSpecialization::MKOPadding: return "MKOPadding";
    case GemmSpecialization::NKOPadding: return "NKOPadding";
    case GemmSpecialization::MNKOPadding: return "MNKOPadding";
    default: return "Unrecognized specialization!";
    }
}

} // namespace device
} // namespace tensor_operation
} // namespace ck
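A minimal usage sketch for the enum above, assuming the composable_kernel install/include directory is on the compiler's include path; note that the header itself pulls in no <string>, so the translation unit has to provide it before the include.

// Usage sketch: map a GemmSpecialization value to its printable name.
#include <iostream>
#include <string>

#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

int main()
{
    using ck::tensor_operation::device::GemmSpecialization;
    using ck::tensor_operation::device::getGemmSpecializationString;

    // Padding flags are chosen per problem shape; MNKPadding pads all three GEMM dimensions.
    const GemmSpecialization spec = GemmSpecialization::MNKPadding;

    std::cout << getGemmSpecializationString(spec) << std::endl; // prints "MNKPadding"
}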
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp (new file, mode 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/io.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

namespace {

template <index_t NumDTensor>
struct ComputePtrOffsetOfStridedBatch
{
    ComputePtrOffsetOfStridedBatch() = default;

    ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
                                   index_t BatchStrideB,
                                   Array<ck::index_t, NumDTensor> BatchStrideDs,
                                   index_t BatchStrideE)
        : BatchStrideA_(BatchStrideA),
          BatchStrideB_(BatchStrideB),
          BatchStrideDs_(BatchStrideDs),
          BatchStrideE_(BatchStrideE)
    {
    }

    __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideA_);
    }

    __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideB_);
    }

    __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const
    {
        Array<long_index_t, NumDTensor> ds_offset;
        static_for<0, NumDTensor, 1>{}([&](auto i) {
            ds_offset(i) = g_idx * static_cast<long_index_t>(BatchStrideDs_[i]);
        });
        return ds_offset;
    }

    __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
    {
        return g_idx * static_cast<long_index_t>(BatchStrideE_);
    }

    index_t BatchStrideA_;
    index_t BatchStrideB_;
    Array<ck::index_t, NumDTensor> BatchStrideDs_;
    index_t BatchStrideE_;
};

/*
 * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM.
 *
 * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix
 * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly
 * strided batched, but we can easily extend to other layouts. The returned offset can be either \p
 * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB
 * limitations.
 *
 * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and
 * returns the 2D index of the tile that it computes. \see
 * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run().
 *
 * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
 * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid
 * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link
 * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
 * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
 * pointer offset into \p ComputePtrOffsetOfStridedBatch.
 *
 * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
 * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
 * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion).
 *
 */
template <typename GridwiseGemm,
          typename ABDataType,
          typename DsPointer,
          typename EDataType,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CDEElementwiseOperation,
          typename AGridDesc_AK0_M_AK1,
          typename BGridDesc_BK0_N_BK1,
          typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
          typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
          typename Block2ETileMap,
          typename ComputePtrOffsetOfBatch,
          bool HasMainKBlockLoop>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
#endif
        kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle(
            const ABDataType* __restrict__ p_a_grid,
            const ABDataType* __restrict__ p_b_grid,
            DsPointer p_ds_grid,
            EDataType* __restrict__ p_e_grid,
            const AElementwiseOperation a_element_op,
            const BElementwiseOperation b_element_op,
            const CDEElementwiseOperation cde_element_op,
            const index_t batch_count,
            const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
            const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
            const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
                ds_grid_desc_mblock_mperblock_nblock_nperblock,
            const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
                e_grid_desc_mblock_mperblock_nblock_nperblock_,
            const Block2ETileMap block_2_ctile_map,
            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
{
#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
    // offset base pointer for each work-group
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
    const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);

    const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
    const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
    const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane(
        static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));

    const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);

    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    DsPointer p_ds_grid_grp;

    static constexpr index_t NumDTensor =
        DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();

    static_for<0, NumDTensor, 1>{}(
        [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });

    GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
                                                  p_b_grid + b_batch_offset,
                                                  p_ds_grid_grp,
                                                  p_e_grid + e_batch_offset,
                                                  p_shared,
                                                  a_element_op,
                                                  b_element_op,
                                                  cde_element_op,
                                                  a_grid_desc_ak0_m_ak1,
                                                  b_grid_desc_bk0_n_bk1,
                                                  ds_grid_desc_mblock_mperblock_nblock_nperblock,
                                                  e_grid_desc_mblock_mperblock_nblock_nperblock_,
                                                  block_2_ctile_map);
#else
    ignore = p_a_grid;
    ignore = p_b_grid;
    ignore = p_ds_grid;
    ignore = p_e_grid;
    ignore = batch_count;
    ignore = a_grid_desc_ak0_m_ak1;
    ignore = b_grid_desc_bk0_n_bk1;
    ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
    ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_;
    ignore = a_element_op;
    ignore = b_element_op;
    ignore = cde_element_op;
    ignore = compute_ptr_offset_of_batch;
    ignore = block_2_ctile_map;
#endif
}

} // namespace
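Before the device operator itself, a standalone host-side sketch of the batch-offset bookkeeping performed by ComputePtrOffsetOfStridedBatch and the kernel prologue above: each workgroup derives its batch index from its block id and then shifts the A/B/E base pointers by batch * stride. The grid size, batch count, and strides below are assumed example values, not numbers from the commit.

// Standalone illustration (plain host code) of the block-id to batch-offset mapping.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main()
{
    const int grid_size   = 240; // total workgroups launched
    const int batch_count = 16;  // grouped-conv groups
    const int num_blocks_per_batch = grid_size / batch_count; // 15

    const std::int64_t BatchStrideA = 1 << 20; // elements between consecutive batches of A
    const std::int64_t BatchStrideE = 1 << 18;

    for(int block_id : {0, 14, 15, 239})
    {
        const int g_idx = block_id / num_blocks_per_batch;
        const std::int64_t a_offset = g_idx * BatchStrideA; // added to p_a_grid
        const std::int64_t e_offset = g_idx * BatchStrideE; // added to p_e_grid
        std::printf("block %3d -> batch %2d, A offset %lld, E offset %lld\n",
                    block_id, g_idx,
                    static_cast<long long>(a_offset),
                    static_cast<long long>(e_offset));
    }
    // Using 64-bit offsets (long_index_t in the header) keeps per-batch pointers valid even when
    // a whole tensor exceeds the 2 GB range of 32-bit indexing, as the comment above notes.
}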
// Conv backward data multiple D:
// input : output image A: [G, N, K, Ho, Wo]
// input : weight B: [G, K, C, Y, X],
// input : D0, D1, ... : [G, N, K, Ho, Wo]
// output : input image E: [G, N, C, Hi, Wi]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D0, D1, ...)
template
<
index_t
NDimSpatial
,
typename
ALayout
,
// output image
typename
BLayout
,
// weight
typename
DsLayout
,
// bias
typename
ELayout
,
// input image
typename
ADataType
,
// output image
typename
BDataType
,
// weight
typename
AccDataType
,
typename
CShuffleDataType
,
typename
DsDataType
,
// bias
typename
EDataType
,
// input image
typename
AElementwiseOp
,
// output image
typename
BElementwiseOp
,
// weight
typename
CDEElementwiseOp
,
// C, bias, and input image
ConvolutionBackwardDataSpecialization
ConvBackwardDataSpecialization
,
bool
DoPadGemmM
,
bool
DoPadGemmN
,
index_t
NumGemmKPrefetchStage
,
index_t
BlockSize
,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
KPerBlock
,
index_t
AK1
,
index_t
BK1
,
index_t
MPerXDL
,
index_t
NPerXDL
,
index_t
MXdlPerWave
,
index_t
NXdlPerWave
,
typename
ABlockTransferThreadClusterLengths_AK0_M_AK1
,
typename
ABlockTransferThreadClusterArrangeOrder
,
typename
ABlockTransferSrcAccessOrder
,
index_t
ABlockTransferSrcVectorDim
,
index_t
ABlockTransferSrcScalarPerVector
,
index_t
ABlockTransferDstScalarPerVector_AK1
,
index_t
ABlockLdsExtraM
,
typename
BBlockTransferThreadClusterLengths_BK0_N_BK1
,
typename
BBlockTransferThreadClusterArrangeOrder
,
typename
BBlockTransferSrcAccessOrder
,
index_t
BBlockTransferSrcVectorDim
,
index_t
BBlockTransferSrcScalarPerVector
,
index_t
BBlockTransferDstScalarPerVector_BK1
,
index_t
BBlockLdsExtraN
,
index_t
CShuffleMXdlPerWavePerShuffle
,
index_t
CShuffleNXdlPerWavePerShuffle
,
typename
CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
,
index_t
CDEBlockTransferScalarPerVector_NPerBlock
,
LoopScheduler
LoopSched
=
make_default_loop_scheduler
()>
struct
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
:
public
DeviceGroupedConvBwdDataMultipleD
<
NDimSpatial
,
ALayout
,
// output image
BLayout
,
// weight
DsLayout
,
// bias
ELayout
,
// input image
ADataType
,
// output image
BDataType
,
// weight
DsDataType
,
// bias
EDataType
,
// input image
AElementwiseOp
,
BElementwiseOp
,
CDEElementwiseOp
>
{
// FIXME
static_assert
(
NDimSpatial
==
2
,
"wrong! only implemented for 2D now"
);
using
DeviceOp
=
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
;
static
constexpr
index_t
NumDTensor
=
DsDataType
::
Size
();
// TODO make A/B datatype different
using
ABDataType
=
ADataType
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
static
constexpr
auto
transform_conv_to_gemm
=
TransformConvBwdDataToGemm_v1
<
NDimSpatial
,
ConvBackwardDataSpecialization
,
AK1
,
BK1
,
MPerBlock
,
NPerBlock
,
DoPadGemmM
,
DoPadGemmN
>
{};
    static auto GetDummyABDsEGridDescriptor()
    {
        const std::array<index_t, NDimSpatial + 3> dummy_tensor_lengths = {1};
        const std::array<index_t, NDimSpatial + 3> dummy_tensor_strides = {1};
        const std::array<index_t, NDimSpatial> dummy_spatial_lengths    = {1};

        const auto a_grid_desc_ak0_m_ak1 =
            transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
                dummy_tensor_lengths, dummy_tensor_strides, dummy_tensor_lengths,
                dummy_tensor_strides, dummy_tensor_lengths, dummy_tensor_strides,
                dummy_spatial_lengths, dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths, dummy_spatial_lengths);

        const auto b_grid_desc_bk0_n_bk1 =
            transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
                dummy_tensor_lengths, dummy_tensor_strides, dummy_tensor_lengths,
                dummy_tensor_strides, dummy_tensor_lengths, dummy_tensor_strides,
                dummy_spatial_lengths, dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths, dummy_spatial_lengths);

        const auto ds_grid_desc_m_n = generate_tuple(
            [&](auto i) {
                using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

                return transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
                    dummy_tensor_lengths, dummy_tensor_strides, dummy_tensor_lengths,
                    dummy_tensor_strides, dummy_tensor_lengths, dummy_tensor_strides,
                    dummy_spatial_lengths, dummy_spatial_lengths, dummy_spatial_lengths,
                    dummy_spatial_lengths, dummy_spatial_lengths);
            },
            Number<NumDTensor>{});

        const auto e_grid_desc_m_n =
            transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
                dummy_tensor_lengths, dummy_tensor_strides, dummy_tensor_lengths,
                dummy_tensor_strides, dummy_tensor_lengths, dummy_tensor_strides,
                dummy_spatial_lengths, dummy_spatial_lengths, dummy_spatial_lengths,
                dummy_spatial_lengths, dummy_spatial_lengths);

        return make_tuple(
            a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n);
    }
    // GridwiseGemm
    using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<
        ABDataType, // TODO: distinguish A/B datatype
        AccDataType, CShuffleDataType, DsDataType, EDataType,
        AElementwiseOp, BElementwiseOp, CDEElementwiseOp,
        InMemoryDataOperationEnum::Set, NumGemmKPrefetchStage, BlockSize,
        MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL,
        MXdlPerWave, NXdlPerWave,
        ABlockTransferThreadClusterLengths_AK0_M_AK1,
        ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder,
        ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_AK1, false, ABlockLdsExtraM,
        BBlockTransferThreadClusterLengths_BK0_N_BK1,
        BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder,
        BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN,
        CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle,
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
        CDEBlockTransferScalarPerVector_NPerBlock, LoopSched>;

    template <typename Desc_K0_M_K1>
    static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& desc_k0_m_k1)
    {
        const auto grid_desc_m_k = transform_tensor_descriptor(
            desc_k0_m_k1,
            make_tuple(make_pass_through_transform(desc_k0_m_k1.GetLength(I1)),
                       make_merge_transform(
                           make_tuple(desc_k0_m_k1.GetLength(I0), desc_k0_m_k1.GetLength(I2)))),
            make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        return grid_desc_m_k;
    }

    // desc
    using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor());

    using AGridDesc_AK0_M_AK1 = remove_cvref_t<tuple_element_t<0, ABDsEGridDesc>>;
    using BGridDesc_BK0_N_BK1 = remove_cvref_t<tuple_element_t<1, ABDsEGridDesc>>;
    using DsGridDesc_M_N      = remove_cvref_t<tuple_element_t<2, ABDsEGridDesc>>;
    using EGridDesc_M_N       = remove_cvref_t<tuple_element_t<3, ABDsEGridDesc>>;

    using AGridDesc_M_K = decltype(transform_k0_m_k1_to_m_k(AGridDesc_AK0_M_AK1{}));
    using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{}));

    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}));
    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype(
        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}));

    // block-to-e-tile map
    using Block2ETileMap =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
    // Argument
    struct Argument : public BaseArgument
    {
        Argument(const void* p_a,                                 // output image
                 const void* p_b,                                 // weight
                 const std::array<const void*, NumDTensor>& p_ds, // bias
                 void* p_e,                                       // input image
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths,
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_lengths,
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_strides,
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths,
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads,
                 const AElementwiseOp& a_element_op,
                 const BElementwiseOp& b_element_op,
                 const CDEElementwiseOp& cde_element_op)
            : p_a_grid_{static_cast<const ADataType*>(p_a)},
              p_b_grid_{static_cast<const BDataType*>(p_b)},
              p_ds_grid_{},
              p_e_grid_{static_cast<EDataType*>(p_e)},
              num_group_{a_g_n_k_wos_lengths[0]},
              num_gemm_{},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              cde_element_op_{cde_element_op},
              a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths},
              a_g_n_k_wos_strides_{a_g_n_k_wos_strides},
              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
              b_g_k_c_xs_strides_{b_g_k_c_xs_strides},
              ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths},
              ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides},
              e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths},
              e_g_n_c_wis_strides_{e_g_n_c_wis_strides},
              conv_filter_strides_{conv_filter_strides},
              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
              input_right_pads_{input_right_pads}
        {
            // populate Ds pointer
            static_for<0, NumDTensor, 1>{}([&](auto i) {
                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;

                p_ds_grid_(i) = static_cast<const DDataType*>(p_ds[i]);
            });

            // A/B/Ds/E Batch Stride
            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0];
            compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0];

            static_for<0, NumDTensor, 1>{}([&](auto i) {
                compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0];
            });

            // problem definition
            const index_t Y = b_g_k_c_xs_lengths[3];
            const index_t X = b_g_k_c_xs_lengths[4];

            const index_t ConvStrideH = conv_filter_strides_[0];
            const index_t ConvStrideW = conv_filter_strides_[1];

            const index_t ConvDilationH = conv_filter_dilations_[0];
            const index_t ConvDilationW = conv_filter_dilations_[1];

            const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
            const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

            const auto YTilde = ConvStrideH / GcdStrideDilationH;
            const auto XTilde = ConvStrideW / GcdStrideDilationW;

            // number of GEMM
            num_gemm_ = YTilde * XTilde;

            for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde)
            {
                for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde)
                {
                    // check slice is valid
                    const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde);
                    const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde);

                    if(YDotSlice * XDotSlice <= 0)
                    {
                        continue;
                    }

                    const auto a_grid_desc_ak0_m_ak1 =
                        transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1<ALayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths,
                            b_g_k_c_xs_strides, e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations, input_left_pads,
                            input_right_pads, {i_ytilde, i_xtilde});

                    const auto b_grid_desc_bk0_n_bk1 =
                        transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1<BLayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths,
                            b_g_k_c_xs_strides, e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations, input_left_pads,
                            input_right_pads, {i_ytilde, i_xtilde});

                    DsGridDesc_M_N ds_grid_desc_m_n;

                    // populate Ds desc
                    static_for<0, NumDTensor, 1>{}([&](auto i) {
                        using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

                        ds_grid_desc_m_n(i) =
                            transform_conv_to_gemm.template MakeCDescriptor_M_N<DLayout>(
                                a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths,
                                b_g_k_c_xs_strides, ds_g_n_c_wis_lengths[i],
                                ds_g_n_c_wis_strides[i], conv_filter_strides,
                                conv_filter_dilations, input_left_pads, input_right_pads,
                                {i_ytilde, i_xtilde});
                    });

                    const auto e_grid_desc_m_n =
                        transform_conv_to_gemm.template MakeCDescriptor_M_N<ELayout>(
                            a_g_n_k_wos_lengths, a_g_n_k_wos_strides, b_g_k_c_xs_lengths,
                            b_g_k_c_xs_strides, e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                            conv_filter_strides, conv_filter_dilations, input_left_pads,
                            input_right_pads, {i_ytilde, i_xtilde});

                    // desc for problem definition
                    const auto a_grid_desc_m_k = transform_k0_m_k1_to_m_k(a_grid_desc_ak0_m_ak1);
                    const auto b_grid_desc_n_k = transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1);

                    a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k);
                    b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k);
                    ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n);
                    e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n);

                    // desc for blockwise copy
                    a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1);
                    b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1);

                    // block-to-e-tile-map
                    auto block_2_etile_map =
                        GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n);

                    block_2_etile_map_container_.push_back(block_2_etile_map);

                    if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, b_grid_desc_n_k,
                                                   ds_grid_desc_m_n, e_grid_desc_m_n,
                                                   block_2_etile_map))
                    {
                        ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
                            GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                ds_grid_desc_m_n));

                        e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back(
                            GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                                e_grid_desc_m_n));
                    }
                }
            }
        }

        void Print() const
        {
            for(index_t i = 0; i < num_gemm_; i++)
            {
                std::cout << "a_grid_desc_ak0_m_ak1_container_"
                          << a_grid_desc_ak0_m_ak1_container_[i] << std::endl;
                std::cout << "b_grid_desc_bk0_n_bk1_container_"
                          << b_grid_desc_bk0_n_bk1_container_[i] << std::endl;
                static_for<0, NumDTensor, 1>{}([&](auto j) {
                    std::cout << "ds_grid_desc_mblock_mperblock_nblock_nperblock_container_"
                              << ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i][j]
                              << std::endl;
                });
                std::cout << "e_grid_desc_mblock_mperblock_nblock_nperblock_container_"
                          << e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i]
                          << std::endl;
            }
        }

        // pointers
        const ADataType* p_a_grid_;
        const BDataType* p_b_grid_;
        typename GridwiseGemm::DsGridPointer p_ds_grid_;
        EDataType* p_e_grid_;

        // tensor descriptor for problem definition
        index_t num_group_;
        index_t num_gemm_;
        std::vector<AGridDesc_M_K> a_grid_desc_m_k_container_;
        std::vector<BGridDesc_N_K> b_grid_desc_n_k_container_;
        std::vector<DsGridDesc_M_N> ds_grid_desc_m_n_container_;
        std::vector<EGridDesc_M_N> e_grid_desc_m_n_container_;

        // tensor descriptor for block-wise copy
        std::vector<AGridDesc_AK0_M_AK1> a_grid_desc_ak0_m_ak1_container_;
        std::vector<BGridDesc_BK0_N_BK1> b_grid_desc_bk0_n_bk1_container_;
        std::vector<DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
            ds_grid_desc_mblock_mperblock_nblock_nperblock_container_;
        std::vector<EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
            e_grid_desc_mblock_mperblock_nblock_nperblock_container_;

        // block-to-e-tile map
        std::vector<Block2ETileMap> block_2_etile_map_container_;

        // for computing batch offset
        ComputePtrOffsetOfStridedBatch<NumDTensor> compute_ptr_offset_of_batch_;

        // element-wise op
        AElementwiseOp a_element_op_;
        BElementwiseOp b_element_op_;
        CDEElementwiseOp cde_element_op_;

        // for checking IsSupportedArgument()
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_lengths_;
        std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_lengths_;
        std::array<index_t, NDimSpatial + 3> b_g_k_c_xs_strides_;
        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_lengths_;
        std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor> ds_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_lengths_;
        std::array<index_t, NDimSpatial + 3> e_g_n_c_wis_strides_;
        std::array<index_t, NDimSpatial> conv_filter_strides_;
        std::array<index_t, NDimSpatial> conv_filter_dilations_;
        std::array<index_t, NDimSpatial> input_left_pads_;
        std::array<index_t, NDimSpatial> input_right_pads_;
    };
    // Invoker
    struct Invoker : public BaseInvoker
    {
        using Argument = DeviceOp::Argument;

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            if(stream_config.log_level_ > 0)
            {
                arg.Print();
            }

            float ave_time = 0;

            for(index_t i = 0; i < arg.num_gemm_; i++)
            {
                if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                                arg.b_grid_desc_n_k_container_[i],
                                                arg.ds_grid_desc_m_n_container_[i],
                                                arg.e_grid_desc_m_n_container_[i],
                                                arg.block_2_etile_map_container_[i]))
                {
                    throw std::runtime_error("wrong! device_op has invalid setting");
                }

                const index_t grid_size =
                    arg.block_2_etile_map_container_[i].CalculateGridSize(
                        arg.e_grid_desc_m_n_container_[i]) *
                    arg.num_group_;

                const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1);

                auto launch_kernel = [&](auto has_main_k_block_loop) {
                    constexpr bool has_main_loop = has_main_k_block_loop.value;

                    const auto kernel = kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle<
                        GridwiseGemm,
                        ADataType, // TODO: distinguish A/B datatype
                        typename GridwiseGemm::DsGridPointer,
                        EDataType,
                        AElementwiseOp,
                        BElementwiseOp,
                        CDEElementwiseOp,
                        DeviceOp::AGridDesc_AK0_M_AK1,
                        DeviceOp::BGridDesc_BK0_N_BK1,
                        DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
                        Block2ETileMap,
                        ComputePtrOffsetOfStridedBatch<NumDTensor>,
                        has_main_loop>;

                    return launch_and_time_kernel(
                        stream_config,
                        kernel,
                        dim3(grid_size),
                        dim3(BlockSize),
                        0,
                        arg.p_a_grid_,
                        arg.p_b_grid_,
                        arg.p_ds_grid_,
                        arg.p_e_grid_,
                        arg.a_element_op_,
                        arg.b_element_op_,
                        arg.cde_element_op_,
                        arg.a_g_n_k_wos_lengths_[0], // Group count
                        arg.a_grid_desc_ak0_m_ak1_container_[i],
                        arg.b_grid_desc_bk0_n_bk1_container_[i],
                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i],
                        arg.block_2_etile_map_container_[i],
                        arg.compute_ptr_offset_of_batch_);
                };

                if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK))
                {
                    ave_time += launch_kernel(integral_constant<bool, true>{});
                }
                else
                {
                    ave_time += launch_kernel(integral_constant<bool, false>{});
                }
            }

            return ave_time;
        }

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };
    static bool IsSupportedArgument(const Argument& arg)
    {
        const index_t ConvK = arg.b_g_k_c_xs_lengths_[1];
        const index_t ConvC = arg.b_g_k_c_xs_lengths_[2];

        // Specialization
        if constexpr(ConvBackwardDataSpecialization ==
                     ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0)
        {
            // check if it's a 1x1, stride=1, pad=0 conv
            for(int i = 0; i < NDimSpatial; i++)
            {
                if(!(arg.b_g_k_c_xs_lengths_[3 + i] == 1 && arg.conv_filter_strides_[i] == 1 &&
                     arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0))
                {
                    return false;
                }
            }
        }

        // vector load for A matrix from global memory to LDS
        if constexpr(is_same_v<ALayout, tensor_layout::convolution::GNHWK>)
        {
            if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // vector load for B matrix from global memory to LDS
        if constexpr(is_same_v<BLayout, tensor_layout::convolution::GKYXC>)
        {
            if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // vector store for Ds
        bool ds_valid = true;

        static_for<0, NumDTensor, 1>{}([&](auto i) {
            using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

            if constexpr(is_same_v<DLayout, tensor_layout::convolution::GNHWC> ||
                         is_same_v<DLayout, tensor_layout::convolution::NHWGC> ||
                         is_same_v<DLayout, tensor_layout::convolution::G_NHW_C> ||
                         is_same_v<DLayout, tensor_layout::convolution::GC> ||
                         is_same_v<DLayout, tensor_layout::convolution::G_C>)
            {
                // vector load D matrix from global memory
                if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
                {
                    ds_valid = false;
                }
            }
            else
            {
                ds_valid = false;
            }
        });

        if(!ds_valid)
        {
            return false;
        }

        // vector store for E
        if constexpr(is_same_v<ELayout, tensor_layout::convolution::GNHWC>)
        {
            // vector store C matrix into global memory
            if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0))
            {
                return false;
            }
        }
        else
        {
            return false;
        }

        // Gridwise GEMM size
        for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++)
        {
            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i],
                                            arg.b_grid_desc_n_k_container_[i],
                                            arg.ds_grid_desc_m_n_container_[i],
                                            arg.e_grid_desc_m_n_container_[i],
                                            arg.block_2_etile_map_container_[i]))
            {
                return false;
            }
        }

        return true;
    }

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
    }
    static auto
    MakeArgument(const void* p_a,                                                 // output image
                 const void* p_b,                                                 // weight
                 const std::array<const void*, NumDTensor>& p_ds,                 // bias
                 void* p_e,                                                       // input image
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
                 const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_lengths, // bias
                 const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                     ds_g_n_c_wis_strides,                                        // bias
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
                 const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
                 const std::array<index_t, NDimSpatial>& conv_filter_strides,
                 const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                 const std::array<index_t, NDimSpatial>& input_left_pads,
                 const std::array<index_t, NDimSpatial>& input_right_pads,
                 const AElementwiseOp& a_element_op,
                 const BElementwiseOp& b_element_op,
                 const CDEElementwiseOp& cde_element_op)
    {
        return Argument{p_a, p_b, p_ds, p_e,
                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                        b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                        ds_g_n_c_wis_lengths, ds_g_n_c_wis_strides,
                        e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                        conv_filter_strides, conv_filter_dilations,
                        input_left_pads, input_right_pads,
                        a_element_op, b_element_op, cde_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }
    std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,                                 // output image
                        const void* p_b,                                 // weight
                        const std::array<const void*, NumDTensor>& p_ds, // bias
                        void* p_e,                                       // input image
                        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output image
                        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides, // output image
                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_lengths,  // weight
                        const std::array<index_t, NDimSpatial + 3>& b_g_k_c_xs_strides,  // weight
                        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                            ds_g_n_c_wis_lengths, // bias
                        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>&
                            ds_g_n_c_wis_strides,                                        // bias
                        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_lengths, // input image
                        const std::array<index_t, NDimSpatial + 3>& e_g_n_c_wis_strides, // input image
                        const std::array<index_t, NDimSpatial>& conv_filter_strides,
                        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
                        const std::array<index_t, NDimSpatial>& input_left_pads,
                        const std::array<index_t, NDimSpatial>& input_right_pads,
                        const AElementwiseOp& a_element_op,
                        const BElementwiseOp& b_element_op,
                        const CDEElementwiseOp& cde_element_op) override
    {
        return std::make_unique<Argument>(p_a, p_b, p_ds, p_e,
                                          a_g_n_k_wos_lengths, a_g_n_k_wos_strides,
                                          b_g_k_c_xs_lengths, b_g_k_c_xs_strides,
                                          ds_g_n_c_wis_lengths, ds_g_n_c_wis_strides,
                                          e_g_n_c_wis_lengths, e_g_n_c_wis_strides,
                                          conv_filter_strides, conv_filter_dilations,
                                          input_left_pads, input_right_pads,
                                          a_element_op, b_element_op, cde_element_op);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>(Invoker{});
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1"
            << "<"
            << BlockSize << ", "
            << MPerBlock << ", "
            << NPerBlock << ", "
            << KPerBlock << ", "
            << AK1 << ", "
            << BK1 << ", "
            << getConvBackwardDataSpecializationString(ConvBackwardDataSpecialization)
            << ">";

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
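For reference, the number of GEMM launches that the Argument constructor above queues up follows directly from the filter stride/dilation decomposition: the backward-data pass is split into YTilde * XTilde independent GEMM slices. A minimal host-side sketch of that arithmetic, written against the standard library (std::gcd instead of ck::math::gcd, hypothetical function name), is:

#include <numeric> // std::gcd

// Sketch only: mirrors the YTilde/XTilde computation in Argument's constructor above.
// ConvStrideH/W and ConvDilationH/W are assumed to be the 2D filter strides/dilations.
int NumberOfGemms(int ConvStrideH, int ConvStrideW, int ConvDilationH, int ConvDilationW)
{
    const int GcdStrideDilationH = std::gcd(ConvStrideH, ConvDilationH);
    const int GcdStrideDilationW = std::gcd(ConvStrideW, ConvDilationW);

    const int YTilde = ConvStrideH / GcdStrideDilationH; // filter rows covered per slice
    const int XTilde = ConvStrideW / GcdStrideDilationW; // filter cols covered per slice

    return YTilde * XTilde; // num_gemm_ above; e.g. stride 2x2, dilation 1x1 -> 4 GEMMs
}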
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <array>
#include <memory>
#include <utility>

#include "ck/utility/math.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_permute.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

// Swap last 2 dimensions
// input shape:  [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], d[NumDim-1]]
//                                                    ^^^^^^^^^^^^^^^^^^^^^^^^
// output shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-1], d[NumDim-2]]
//                                                    ^^^^^^^^^^^^^^^^^^^^^^^^
template <index_t NumDim,
          typename InDataType,
          typename OutDataType,
          typename ElementwiseOperation,
          index_t BlockSize,
          index_t NPerBlock,
          index_t HPerBlock,
          index_t WPerBlock,
          index_t InBlockLdsExtraW,
          typename InBlockTransferThreadClusterLengths,
          typename InBlockTransferThreadClusterArrangeOrder,
          index_t SrcVectorDim,
          index_t DstVectorDim,
          index_t SrcScalarPerVector,
          index_t DstScalarPerVector>
struct DevicePermuteImpl : DevicePermute<NumDim, InDataType, OutDataType, ElementwiseOperation>
{
    using BaseType = DevicePermute<NumDim, InDataType, OutDataType, ElementwiseOperation>;
    using typename BaseType::Lengths;
    using typename BaseType::Strides;

    static_assert(3 <= NumDim, "Only accept at least 3D dimension tensor");
    static_assert((NumDim - 2) <= SrcVectorDim && SrcVectorDim < NumDim);
    static_assert((NumDim - 2) <= DstVectorDim && DstVectorDim < NumDim);
    static_assert(SrcVectorDim != DstVectorDim);

    template <index_t N = NumDim>
    static auto ConvertArrayToTuple(const std::array<index_t, NumDim>& array)
    {
        static_assert(1 <= N && N <= NumDim);

        return generate_tuple([&](auto I) { return array[I]; }, Number<N>{});
    }

    static auto MakeDescriptor_N_H_W(const Lengths& lengths, const Strides& stride)
    {
        // create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2],
        // d[NumDim-1]]
        const auto desc =
            make_naive_tensor_descriptor(ConvertArrayToTuple(lengths), ConvertArrayToTuple(stride));

        // merge nd to 3d descriptor, shape: [(d[0] * d[1] * d[2] * ... * d[NumDim-3]), d[NumDim-2],
        // d[NumDim-1]]
        // => [N, H, W]
        const index_t H       = *std::next(rbegin(lengths));
        const index_t W       = *rbegin(lengths);
        const auto desc_n_h_w = transform_tensor_descriptor(
            desc,
            make_tuple(make_merge_transform(ConvertArrayToTuple<NumDim - 2>(lengths)),
                       make_pass_through_transform(H),
                       make_pass_through_transform(W)),
            make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<NumDim - 2>{}),
                       Sequence<NumDim - 2>{},
                       Sequence<NumDim - 1>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));

        return PadTensorDescriptor(
            desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence<true, true, true>{});
    }

    using InGridDesc  = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1}));
    using OutGridDesc = InGridDesc;

    using GridwisePermute = GridwisePermute<
        InGridDesc,
        OutGridDesc,
        InDataType,
        OutDataType,
        ElementwiseOperation,
        BlockSize,
        NPerBlock,
        HPerBlock,
        WPerBlock,
        InBlockLdsExtraW,
        InBlockTransferThreadClusterLengths,
        InBlockTransferThreadClusterArrangeOrder,
        SrcVectorDim - (NumDim - 3), // calculate new SrcVectorDim for the merged descriptor
        DstVectorDim - (NumDim - 3), // calculate new DstVectorDim for the merged descriptor
        SrcScalarPerVector,
        DstScalarPerVector>;

    using Block2TileMap = typename GridwisePermute::DefaultBlock2TileMap;

    struct Argument : public BaseArgument
    {
        Argument(const Lengths& in_lengths,
                 const Strides& in_strides,
                 const Lengths& out_lengths,
                 const Strides& out_strides,
                 const void* in_dev_buffer,
                 void* out_dev_buffer,
                 ElementwiseOperation elementwise_op)
            : in_dev_buffer_(static_cast<const InDataType*>(in_dev_buffer)),
              out_dev_buffer_(static_cast<OutDataType*>(out_dev_buffer)),
              in_grid_desc_(MakeDescriptor_N_H_W(in_lengths, in_strides)),
              out_grid_desc_(MakeDescriptor_N_H_W(out_lengths, out_strides)),
              in_lengths_(in_lengths),
              in_strides_(in_strides),
              out_lengths_(out_lengths),
              out_strides_(out_strides),
              elementwise_op_(elementwise_op),
              block_2_tile_map_(GridwisePermute::MakeDefaultBlock2TileMap(in_grid_desc_))
        {
        }

        const InDataType* in_dev_buffer_;
        OutDataType* out_dev_buffer_;
        InGridDesc in_grid_desc_;
        OutGridDesc out_grid_desc_;
        Lengths in_lengths_;
        Strides in_strides_;
        Lengths out_lengths_;
        Strides out_strides_;
        ElementwiseOperation elementwise_op_;
        Block2TileMap block_2_tile_map_;
    };

    struct Invoker : BaseInvoker
    {
        static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_desc_);

            const auto kernel = kernel_nd_permute<GridwisePermute,
                                                  InGridDesc,
                                                  OutGridDesc,
                                                  InDataType,
                                                  OutDataType,
                                                  ElementwiseOperation,
                                                  Block2TileMap>;

            float elapsed_time = launch_and_time_kernel(stream_config,
                                                        kernel,
                                                        dim3(grid_size),
                                                        dim3(BlockSize),
                                                        0,
                                                        arg.in_grid_desc_,
                                                        arg.out_grid_desc_,
                                                        arg.in_dev_buffer_,
                                                        arg.out_dev_buffer_,
                                                        arg.elementwise_op_,
                                                        arg.block_2_tile_map_);
            return elapsed_time;
        }

        float Run(const BaseArgument* arg,
                  const StreamConfig& stream_config = StreamConfig{}) override final
        {
            const auto* const argument = dynamic_cast<const Argument*>(arg);
            if(!argument)
            {
                return NAN;
            }

            return Run(*argument, stream_config);
        }
    };

    static bool IsSupportedArgument(const Argument& arg)
    {
        constexpr auto GetPaddedLength = [](index_t length, index_t tile_length) {
            return math::integer_divide_ceil(length, tile_length) * tile_length;
        };

        constexpr auto IsScalarPerVectorValid =
            [](index_t length, index_t stride, index_t scalar_per_vector) {
                if(stride == 1 && length % scalar_per_vector == 0)
                {
                    return true;
                }
                else if(stride != 1 && scalar_per_vector == 1)
                {
                    return true;
                }

                return false;
            };

        return IsScalarPerVectorValid(arg.in_lengths_[SrcVectorDim],
                                      arg.in_strides_[SrcVectorDim],
                                      SrcScalarPerVector) &&
               IsScalarPerVectorValid(
                   GetPaddedLength(arg.in_lengths_[SrcVectorDim],
                                   (SrcVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)),
                   arg.in_strides_[SrcVectorDim],
                   SrcScalarPerVector) &&
               IsScalarPerVectorValid(arg.out_lengths_[DstVectorDim],
                                      arg.out_strides_[DstVectorDim],
                                      DstScalarPerVector) &&
               IsScalarPerVectorValid(
                   GetPaddedLength(arg.out_lengths_[DstVectorDim],
                                   (DstVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)),
                   arg.in_strides_[DstVectorDim],
                   DstScalarPerVector) &&
               GridwisePermute::CheckValidity(arg.in_grid_desc_, arg.out_grid_desc_);
    }

    // override methods inherited from 'BaseOperator'
    bool IsSupportedArgument(const BaseArgument* arg) override final
    {
        const auto* const argument = dynamic_cast<const Argument*>(arg);
        if(!argument)
        {
            return false;
        }

        return IsSupportedArgument(*argument);
    }

    // override methods inherited from 'DevicePermute'
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const Lengths& in_lengths,
                                                      const Strides& in_strides,
                                                      const Lengths& out_lengths,
                                                      const Strides& out_strides,
                                                      const void* in_dev_buffer,
                                                      void* out_dev_buffer,
                                                      ElementwiseOperation elementwise_op) override final
    {
        return std::make_unique<Argument>(in_lengths,
                                          in_strides,
                                          out_lengths,
                                          out_strides,
                                          in_dev_buffer,
                                          out_dev_buffer,
                                          elementwise_op);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override final
    {
        return std::make_unique<Invoker>();
    }

    // other constructor methods
    template <typename... Args>
    static std::enable_if_t<std::is_constructible_v<Argument, Args...>, Argument>
    MakeArgument(Args&&... args) noexcept(std::is_nothrow_constructible_v<Argument, Args...>)
    {
        return Argument{std::forward<Args>(args)...};
    }

    static std::enable_if_t<std::is_default_constructible_v<Invoker>, Invoker>
    MakeInvoker() noexcept(std::is_nothrow_default_constructible_v<Invoker>)
    {
        return Invoker{};
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
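DevicePermuteImpl above folds every dimension except the last two into a single N before handing a [N, H, W] problem to GridwisePermute, which then swaps H and W. A minimal standalone sketch of that folding step, assuming row-major extents ordered outermost to innermost and using a hypothetical helper name, is:

#include <array>
#include <cstddef>
#include <functional>
#include <numeric>

// Sketch only: computes the merged [N, H, W] view built by MakeDescriptor_N_H_W.
// 'lengths' is assumed to hold the NumDim tensor extents, outermost to innermost.
template <std::size_t NumDim>
std::array<long, 3> MergeToNHW(const std::array<long, NumDim>& lengths)
{
    static_assert(NumDim >= 3, "at least 3 dimensions, as required by DevicePermuteImpl");

    const long N =
        std::accumulate(lengths.begin(), lengths.end() - 2, 1L, std::multiplies<long>{});
    const long H = lengths[NumDim - 2];
    const long W = lengths[NumDim - 1];

    return {N, H, W}; // the kernel then writes the output with H and W swapped
}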
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iostream>
#include <sstream>

#include "ck/utility/reduction_operator.hpp"
#include "ck/tensor_operation/gpu/device/device_base.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename InDataType,
          typename AccDataType,
          typename OutDataType,
          typename InElementwiseOp,
          typename AccElementwiseOp,
          index_t Rank,
          index_t NumReduceDim,
          index_t BlockSize,
          index_t MThreadClusterSize,
          index_t KThreadClusterSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          index_t OutDstVectorSize>
struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
                                                AccDataType,
                                                OutDataType,
                                                InElementwiseOp,
                                                AccElementwiseOp,
                                                Rank>
{
    static constexpr index_t kRank         = Rank;
    static constexpr index_t kNumReduceDim = NumReduceDim;

    virtual index_t GetRank() const override { return kRank; }

    virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }

    // Used for freeloading of some handy functions from DeviceReduceMultiBlock
    using Reduction = DeviceReduceMultiBlock<InDataType,
                                             AccDataType,
                                             OutDataType,
                                             Rank,
                                             NumReduceDim,
                                             reduce::Add,
                                             InElementwiseOp,
                                             AccElementwiseOp,
                                             InMemoryDataOperationEnum::Set,
                                             false, // PropagateNan
                                             false, // OutputIndex
                                             false, // HaveIndexInputIfOutputIndex
                                             BlockSize,
                                             MThreadClusterSize,
                                             KThreadClusterSize,
                                             MThreadSliceSize,
                                             KThreadSliceSize,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1>; // OutDstVectorSize

    using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));

    using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
                                                            OutDataType,
                                                            AccDataType,
                                                            GridDesc_M_K,
                                                            BlockSize,
                                                            MThreadClusterSize,
                                                            KThreadClusterSize,
                                                            MThreadSliceSize,
                                                            KThreadSliceSize,
                                                            InSrcVectorDim,
                                                            InSrcVectorSize,
                                                            OutDstVectorSize,
                                                            false>;

    using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk<InDataType,
                                                              OutDataType,
                                                              AccDataType,
                                                              GridDesc_M_K,
                                                              BlockSize,
                                                              MThreadClusterSize,
                                                              KThreadClusterSize,
                                                              MThreadSliceSize,
                                                              KThreadSliceSize,
                                                              InSrcVectorDim,
                                                              InSrcVectorSize,
                                                              OutDstVectorSize,
                                                              true>;

    struct Argument : public Reduction::Argument
    {
        Argument(const std::vector<index_t> inLengths,
                 const std::vector<index_t> inStrides,
                 const std::vector<index_t> reduceDims,
                 AccDataType alpha,
                 AccDataType beta,
                 const InDataType* in_dev,
                 OutDataType* out_dev,
                 InElementwiseOp in_elementwise_op,
                 AccElementwiseOp acc_elementwise_op)
            : Reduction::Argument(inLengths,
                                  inStrides,
                                  {},
                                  {},
                                  reduceDims,
                                  0.0f, // alpha
                                  0.0f, // beta
                                  in_dev,
                                  nullptr,
                                  out_dev,
                                  nullptr,
                                  in_elementwise_op,
                                  acc_elementwise_op),
              // FIXME: The base class DeviceReduceMultiBlock::Argument only supports alpha/beta of
              // float32 precision. Make it support any data type so the fields can be removed.
              alpha_(alpha),
              beta_(beta)
        {
            // std::cout << "blkGroupSize= " << this->blkGroupSize
            //           << ", numBlockTileIteration= " << this->numBlockTileIteration
            //           << ", gridSize=" << this->gridSize
            //           << ", invariant_total_length=" << this->invariant_total_length << std::endl;
        }

        AccDataType alpha_;
        AccDataType beta_;
    };

    struct Invoker : public BaseInvoker
    {
        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
            const auto in_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
            const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
                arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);

            bool sweep_once =
                in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;

            const auto kernel_main = sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>
                                                : kernel_softmax<GridwiseSoftmaxGeneric,
                                                                 InDataType,
                                                                 OutDataType,
                                                                 AccDataType,
                                                                 GridDesc_M_K>;

            float avg_time = 0;

            avg_time += launch_and_time_kernel(stream_config,
                                               kernel_main,
                                               dim3(arg.gridSize),
                                               dim3(BlockSize),
                                               0,
                                               in_grid_desc_m_k,
                                               out_grid_desc_m_k,
                                               arg.blkGroupSize,
                                               arg.numBlockTileIteration,
                                               arg.alpha_,
                                               arg.in_dev_,
                                               arg.beta_,
                                               arg.out_dev_);

            return (avg_time);
        }

        float Run(const BaseArgument* p_arg,
                  const StreamConfig& stream_config = StreamConfig{}) override
        {
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };

    bool IsSupportedArgument(const BaseArgument* p_arg) override
    {
        const Argument* p_arg_ = dynamic_cast<const Argument*>(p_arg);

        if(!Reduction::IsSupportedArgument(p_arg_))
        {
            return false;
        }

        if(p_arg_->inLengths_[Rank - 1] % OutDstVectorSize != 0)
        {
            return false;
        }

        return true;
    }

    //
    // @brief      Makes a pointer to the Argument class.
    //
    // @param[in]  inLengths           Input tensor extent(s) from high to low dimension
    // @param[in]  inStrides           Input tensor stride(s) from high to low dimension
    // @param[in]  reduceDims          The dimension(s) the normalization operation is applied to
    // @param[in]  alpha               Typeless pointer in host memory storing the alpha scaling
    //                                 value as type AccDataType
    // @param[in]  beta                Typeless pointer in host memory storing the beta scaling
    //                                 value as type AccDataType
    // @param[in]  in_dev              Typeless const pointer in device memory storing the input
    //                                 tensor
    // @param      out_dev             Typeless pointer in device memory storing the output tensor
    // @param[in]  in_elementwise_op   The input elementwise operation.
    // @param[in]  acc_elementwise_op  The accumulation elementwise operation.
    //
    // @return     Unique pointer to the Argument class.
    //
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
                                                      const std::vector<index_t> inStrides,
                                                      const std::vector<int> reduceDims,
                                                      const void* alpha,
                                                      const void* beta,
                                                      const void* in_dev,
                                                      void* out_dev,
                                                      InElementwiseOp in_elementwise_op,
                                                      AccElementwiseOp acc_elementwise_op) override
    {
        return std::make_unique<Argument>(inLengths,
                                          inStrides,
                                          reduceDims,
                                          *static_cast<const AccDataType*>(alpha),
                                          *static_cast<const AccDataType*>(beta),
                                          static_cast<const InDataType*>(in_dev),
                                          static_cast<OutDataType*>(out_dev),
                                          in_elementwise_op,
                                          acc_elementwise_op);
    }

    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
    {
        return std::make_unique<Invoker>();
    }

    std::string GetTypeString() const override
    {
        auto str = std::stringstream();

        // clang-format off
        str << "DeviceReduceSoftmax<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "InSrcVectorDim_" << InSrcVectorDim
            << "_InSrcVectorSize_" << InSrcVectorSize
            << "_OutDstVectorSize_" << OutDstVectorSize << ">";
        // clang-format on

        return str.str();
    }
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
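As a sanity check for DeviceSoftmaxImpl, a host reference is handy. The sketch below assumes the usual CK softmax convention out = alpha * softmax(in) + beta * out along the reduced innermost dimension; that blending convention is an assumption here, not something stated in this header. Note also that the Invoker above selects the sweep-once kernel exactly when the reduce length fits in one pass, i.e. when it is at most KThreadClusterSize * KThreadSliceSize.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Host-side reference sketch (assumption: out = alpha * softmax(in) + beta * out per row).
void ReferenceSoftmaxRows(const std::vector<float>& in, std::vector<float>& out,
                          int rows, int cols, float alpha, float beta)
{
    for(int m = 0; m < rows; ++m)
    {
        const float* x = in.data() + static_cast<std::size_t>(m) * cols;
        float* y       = out.data() + static_cast<std::size_t>(m) * cols;

        const float row_max = *std::max_element(x, x + cols); // subtract max for stability

        float sum = 0.f;
        for(int k = 0; k < cols; ++k)
            sum += std::exp(x[k] - row_max);

        for(int k = 0; k < cols; ++k)
            y[k] = alpha * (std::exp(x[k] - row_max) / sum) + beta * y[k];
    }
}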
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/matrix_padder.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

template <typename TensorDesc,
          typename TileLengths, // Tuple<...>
          typename DoPads>      // Sequence<bool, bool, ...>
__host__ __device__ constexpr auto
PadTensorDescriptor(const TensorDesc& desc, const TileLengths& tile_lengths, DoPads)
{
    constexpr index_t num_dim = DoPads::Size();

    static_assert(num_dim == TileLengths::Size() && num_dim == TensorDesc::GetNumOfDimension(),
                  "wrong! inconsistent # of dimensions");

    // transforms
    const auto transforms = generate_tuple(
        [&](auto idim) {
            const auto MRaw     = desc.GetLength(idim);
            const auto MPerTile = tile_lengths[idim];

            const auto M    = math::integer_divide_ceil(MRaw, MPerTile) * MPerTile;
            const auto MPad = M - MRaw;

            const bool DoPadM = DoPads::At(idim);

            const auto MTransform = conditional_expr<DoPadM>(make_right_pad_transform(MRaw, MPad),
                                                             make_pass_through_transform(MRaw));

            return MTransform;
        },
        Number<num_dim>{});

    // lower dimension Id
    const auto lower_dimss =
        generate_tuple([&](auto idim) { return Sequence<idim.value>{}; }, Number<num_dim>{});

    // upper dimension Id
    const auto upper_dimss = lower_dimss;

    return transform_tensor_descriptor(desc, transforms, lower_dimss, upper_dimss);
}

// M/N/K/OPerTileType could be index_t or Number<>
template <GemmSpecialization GemmSpec,
          typename MPerTileType,
          typename NPerTileType,
          typename KPerTileType,
          typename OPerTileType>
struct GemmGemmPadder
{
    // TODO: hard to scale; use mask instead
    static constexpr bool PadM =
        GemmSpec == GemmSpecialization::MPadding || GemmSpec == GemmSpecialization::MNPadding ||
        GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::MNKPadding ||
        GemmSpec == GemmSpecialization::MOPadding || GemmSpec == GemmSpecialization::MNOPadding ||
        GemmSpec == GemmSpecialization::MKOPadding || GemmSpec == GemmSpecialization::MNKOPadding;
    static constexpr bool PadN =
        GemmSpec == GemmSpecialization::NPadding || GemmSpec == GemmSpecialization::MNPadding ||
        GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding ||
        GemmSpec == GemmSpecialization::NOPadding || GemmSpec == GemmSpecialization::MNOPadding ||
        GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding;
    static constexpr bool PadK =
        GemmSpec == GemmSpecialization::KPadding || GemmSpec == GemmSpecialization::MKPadding ||
        GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding ||
        GemmSpec == GemmSpecialization::KOPadding || GemmSpec == GemmSpecialization::MKOPadding ||
        GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding;
    static constexpr bool PadO =
        GemmSpec == GemmSpecialization::OPadding || GemmSpec == GemmSpecialization::MOPadding ||
        GemmSpec == GemmSpecialization::NOPadding || GemmSpec == GemmSpecialization::KOPadding ||
        GemmSpec == GemmSpecialization::MNOPadding || GemmSpec == GemmSpecialization::MKOPadding ||
        GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding;

    // A[M, K]
    template <typename ADesc_MRaw_KRaw>
    __host__ __device__ constexpr auto
    PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const
    {
        return PadTensorDescriptor(
            a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence<PadM, PadK>{});
    }

    // B[K, N]
    template <typename BDesc_NRaw_KRaw>
    __host__ __device__ constexpr auto
    PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const
    {
        return PadTensorDescriptor(
            b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence<PadN, PadK>{});
    }

    // B1[Gemm1N, Gemm1K] = B1[O, N]
    template <typename B1Desc_NRaw_KRaw>
    __host__ __device__ constexpr auto
    PadB1Descriptor_N_K(const B1Desc_NRaw_KRaw& b1_desc_nraw_kraw) const
    {
        return PadTensorDescriptor(
            b1_desc_nraw_kraw, make_tuple(OPerTile_, NPerTile_), Sequence<PadO, PadN>{});
    }

    // C[M, Gemm1N] = C[M, O]
    template <typename CDesc_MRaw_NRaw>
    __host__ __device__ constexpr auto
    PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const
    {
        return PadTensorDescriptor(
            c_desc_mraw_nraw, make_tuple(MPerTile_, OPerTile_), Sequence<PadM, PadO>{});
    }

    MPerTileType MPerTile_;
    NPerTileType NPerTile_;
    KPerTileType KPerTile_;
    OPerTileType OPerTile_;
};

// M/N/KPerTileType could be index_t or Number<>
template <GemmSpecialization GemmSpec,
          typename MPerTileType,
          typename NPerTileType,
          typename KPerTileType>
struct GemmPadder
{
    static constexpr bool PadM =
        (GemmSpec == GemmSpecialization::MPadding || GemmSpec == GemmSpecialization::MNPadding ||
         GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::MNKPadding);
    static constexpr bool PadN =
        (GemmSpec == GemmSpecialization::NPadding || GemmSpec == GemmSpecialization::MNPadding ||
         GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding);
    static constexpr bool PadK =
        (GemmSpec == GemmSpecialization::KPadding || GemmSpec == GemmSpecialization::MKPadding ||
         GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding);

    template <typename ADesc_MRaw_KRaw>
    __host__ __device__ constexpr auto
    PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const
    {
        return PadTensorDescriptor(
            a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence<PadM, PadK>{});
    }

    template <typename BDesc_NRaw_KRaw>
    __host__ __device__ constexpr auto
    PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const
    {
        return PadTensorDescriptor(
            b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence<PadN, PadK>{});
    }

    template <typename CDesc_MRaw_NRaw>
    __host__ __device__ constexpr auto
    PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const
    {
        return PadTensorDescriptor(
            c_desc_mraw_nraw, make_tuple(MPerTile_, NPerTile_), Sequence<PadM, PadN>{});
    }

    MPerTileType MPerTile_;
    NPerTileType NPerTile_;
    KPerTileType KPerTile_;
};

// Alias of GemmPadder; to deprecate
template <GemmSpecialization GemmSpec,
          typename MPerTileType,
          typename NPerTileType,
          typename KPerTileType>
struct MatrixPadder : public GemmPadder<GemmSpec, MPerTileType, NPerTileType, KPerTileType>
{
};

// M/N/KPerTileType could be index_t or Number<>
template <bool PadM,
          bool PadN,
          bool PadK,
          typename MPerTileType,
          typename NPerTileType,
          typename KPerTileType>
struct GemmPadder_v2
{
    template <typename ADesc_MRaw_KRaw>
    __host__ __device__ constexpr auto
    PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const
    {
        return PadTensorDescriptor(
            a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence<PadM, PadK>{});
    }

    template <typename BDesc_NRaw_KRaw>
    __host__ __device__ constexpr auto
    PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const
    {
        return PadTensorDescriptor(
            b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence<PadN, PadK>{});
    }

    template <typename CDesc_MRaw_NRaw>
    __host__ __device__ constexpr auto
    PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const
    {
        return PadTensorDescriptor(
            c_desc_mraw_nraw, make_tuple(MPerTile_, NPerTile_), Sequence<PadM, PadN>{});
    }

    MPerTileType MPerTile_;
    NPerTileType NPerTile_;
    KPerTileType KPerTile_;
};

// M/N/KPerTileType could be index_t or Number<>
template <bool PadM,
          bool PadN,
          bool PadK,
          typename MPerTileType,
          typename NPerTileType,
          typename KPerTileType>
struct MatrixPadder_v2
{
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};
    static constexpr auto I2 = Number<2>{};
    static constexpr auto I3 = Number<3>{};

    template <typename ADesc_MRaw_KRaw>
    __host__ __device__ constexpr auto
    PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const
    {
        const auto MRaw = a_desc_mraw_kraw.GetLength(I0);
        const auto KRaw = a_desc_mraw_kraw.GetLength(I1);

        const auto M = math::integer_divide_ceil(MRaw, MPerTile_) * MPerTile_;
        const auto K = math::integer_divide_ceil(KRaw, KPerTile_) * KPerTile_;

        const auto MPad = M - MRaw;
        const auto KPad = K - KRaw;

        if constexpr(PadM && PadK)
        {
            // pad both M and K
            return transform_tensor_descriptor(
                a_desc_mraw_kraw,
                make_tuple(make_right_pad_transform(MRaw, MPad),
                           make_right_pad_transform(KRaw, KPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr(PadM && (!PadK))
        {
            // pad M, but not K
            return transform_tensor_descriptor(
                a_desc_mraw_kraw,
                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(KRaw)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr((!PadM) && PadK)
        {
            // pad K, but not M
            return transform_tensor_descriptor(
                a_desc_mraw_kraw,
                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else
        {
            // not pad M or K
            return a_desc_mraw_kraw;
        }
    }

    template <typename BDesc_NRaw_KRaw>
    __host__ __device__ constexpr auto
    PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const
    {
        const auto NRaw = b_desc_nraw_kraw.GetLength(I0);
        const auto KRaw = b_desc_nraw_kraw.GetLength(I1);

        const auto N = math::integer_divide_ceil(NRaw, NPerTile_) * NPerTile_;
        const auto K = math::integer_divide_ceil(KRaw, KPerTile_) * KPerTile_;

        const auto NPad = N - NRaw;
        const auto KPad = K - KRaw;

        if constexpr(PadN && PadK)
        {
            // pad both N and K
            return transform_tensor_descriptor(
                b_desc_nraw_kraw,
                make_tuple(make_right_pad_transform(NRaw, NPad),
                           make_right_pad_transform(KRaw, KPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr(PadN && (!PadK))
        {
            // pad N, but not K
            return transform_tensor_descriptor(
                b_desc_nraw_kraw,
                make_tuple(make_right_pad_transform(NRaw, NPad), make_pass_through_transform(KRaw)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr((!PadN) && PadK)
        {
            // pad K, but not N
            return transform_tensor_descriptor(
                b_desc_nraw_kraw,
                make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else
        {
            // not pad N or K
            return b_desc_nraw_kraw;
        }
    }

    template <typename CDesc_MRaw_NRaw>
    __host__ __device__ constexpr auto
    PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const
    {
        const auto MRaw = c_desc_mraw_nraw.GetLength(I0);
        const auto NRaw = c_desc_mraw_nraw.GetLength(I1);

        const auto M = math::integer_divide_ceil(MRaw, MPerTile_) * MPerTile_;
        const auto N = math::integer_divide_ceil(NRaw, NPerTile_) * NPerTile_;

        const auto MPad = M - MRaw;
        const auto NPad = N - NRaw;

        if constexpr(PadM && PadN)
        {
            // pad M and N
            return transform_tensor_descriptor(
                c_desc_mraw_nraw,
                make_tuple(make_right_pad_transform(MRaw, MPad),
                           make_right_pad_transform(NRaw, NPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr(PadM && (!PadN))
        {
            // pad M, but not N
            return transform_tensor_descriptor(
                c_desc_mraw_nraw,
                make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else if constexpr((!PadM) && PadN)
        {
            // pad N, but not M
            return transform_tensor_descriptor(
                c_desc_mraw_nraw,
                make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}));
        }
        else
        {
            // not pad M or N
            return c_desc_mraw_nraw;
        }
    }

    MPerTileType MPerTile_;
    NPerTileType NPerTile_;
    KPerTileType KPerTile_;
};

} // namespace device
} // namespace tensor_operation
} // namespace ck
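All of the padders above apply the same rounding rule per dimension: round the raw extent up to the next multiple of the tile length and right-pad by the difference. A standalone sketch of that arithmetic (plain C++, hypothetical function name) is:

// Sketch only: the pad amount computed by PadTensorDescriptor / MatrixPadder_v2 for one dimension.
constexpr int RightPadAmount(int raw_length, int per_tile)
{
    const int padded = ((raw_length + per_tile - 1) / per_tile) * per_tile; // integer_divide_ceil
    return padded - raw_length; // e.g. raw 1000, tile 256 -> padded 1024, pad 24
}

static_assert(RightPadAmount(1000, 256) == 24, "1000 rounds up to 1024 with 256-wide tiles");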
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
// FIXME: can it be replaced with ck::Tuple?
#include <tuple>
namespace
ck
{
// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
// respective functor classes.
// The boolean member "indexable" are also provided in reduce_binary_operactor for
// easier checking by the upper-layer codes in the kernels.
template
<
ReduceTensorOp
Op
>
struct
reduce_binary_operator
;
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
ADD
>
{
using
opType
=
reduce
::
Add
;
static
constexpr
bool
indexable
=
false
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MUL
>
{
using
opType
=
reduce
::
Mul
;
static
constexpr
bool
indexable
=
false
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MIN
>
{
using
opType
=
reduce
::
Min
;
static
constexpr
bool
indexable
=
true
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
MAX
>
{
using
opType
=
reduce
::
Max
;
static
constexpr
bool
indexable
=
true
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
AMAX
>
{
using
opType
=
reduce
::
AMax
;
static
constexpr
bool
indexable
=
true
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
AVG
>
{
using
opType
=
reduce
::
Add
;
static
constexpr
bool
indexable
=
false
;
};
template
<
>
struct
reduce_binary_operator
<
ReduceTensorOp
::
NORM1
>
{
using
opType
=
reduce
::
Add
;
static
constexpr
bool
indexable
=
false
;
};
template
<>
struct reduce_binary_operator<ReduceTensorOp::NORM2>
{
    using opType = reduce::Add;

    static constexpr bool indexable = false;
};

// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
// functor classes.
// The two unary functors are called before and after the Reduction is executed respectively
template <ReduceTensorOp Op, bool IsFirstReduce, bool IsLastReduce>
struct reduce_unary_operator
{
    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

template <bool IsFirstReduce>
struct reduce_unary_operator<ReduceTensorOp::AVG, IsFirstReduce, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
    using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength});
    };
};

template <bool IsLastReduce>
struct reduce_unary_operator<ReduceTensorOp::NORM1, true, IsLastReduce>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

template <bool IsLastReduce>
struct reduce_unary_operator<ReduceTensorOp::AMAX, true, IsLastReduce>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnaryAbs;
    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

template <>
struct reduce_unary_operator<ReduceTensorOp::NORM2, true, false>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
    using AccElementwiseOperation = tensor_operation::element_wise::PassThrough;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

template <>
struct reduce_unary_operator<ReduceTensorOp::NORM2, true, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::UnarySquare;
    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

template <>
struct reduce_unary_operator<ReduceTensorOp::NORM2, false, true>
{
    using InElementwiseOperation  = tensor_operation::element_wise::PassThrough;
    using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt;

    static std::tuple<InElementwiseOperation, AccElementwiseOperation>
    GetElementwiseOperator(int32_t reduceLength)
    {
        (void)reduceLength;
        return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{});
    };
};

} // namespace ck
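For readers unfamiliar with how these traits are consumed: a NORM2 reduction is realized as UnarySquare applied to each loaded value, an Add-based accumulation, and UnarySqrt applied to the accumulated result. The host-only sketch below is illustrative and not part of this commit; the names ending in "Sketch" are invented here to reproduce that three-stage composition with plain functors.

// Standalone illustration (not part of this header): a NORM2 reduction expressed as the same
// three-stage composition selected by the traits above -- square each input, accumulate with Add,
// then take the square root of the accumulated value.
#include <cmath>
#include <cstdio>
#include <vector>

struct UnarySquareSketch { void operator()(float& y, const float& x) const { y = x * x; } };
struct UnarySqrtSketch   { void operator()(float& y, const float& x) const { y = std::sqrt(x); } };

int main()
{
    std::vector<float> in{3.0f, 4.0f};

    UnarySquareSketch pre_op;  // plays the role of InElementwiseOperation for NORM2, IsFirstReduce == true
    UnarySqrtSketch post_op;   // plays the role of AccElementwiseOperation for NORM2, IsLastReduce == true

    float acc = 0.0f; // identity value of an Add reduction
    for(float x : in)
    {
        float t;
        pre_op(t, x); // square before accumulating
        acc += t;     // Add-based accumulation
    }

    float result;
    post_op(result, acc); // sqrt after the reduction
    std::printf("NORM2 = %f\n", result); // prints 5.0 for the inputs {3, 4}
    return 0;
}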
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

namespace ck {
namespace tensor_layout {

struct BaseTensorLayout
{
};

namespace gemm {

struct RowMajor : public BaseTensorLayout { static constexpr const char* name = "RowMajor"; };
struct ColumnMajor : public BaseTensorLayout { static constexpr const char* name = "ColumnMajor"; };

} // namespace gemm

namespace convolution {

// input tensor
// packed NCW/NCHW/NCDHW
struct NCW : public BaseTensorLayout { static constexpr const char* name = "NCW"; };
struct NCHW : public BaseTensorLayout { static constexpr const char* name = "NCHW"; };
struct NCDHW : public BaseTensorLayout { static constexpr const char* name = "NCDHW"; };

// packed GNCW/GNCHW/GNCDHW
struct GNCW : public BaseTensorLayout { static constexpr const char* name = "GNCW"; };
struct GNCHW : public BaseTensorLayout { static constexpr const char* name = "GNCHW"; };
struct GNCDHW : public BaseTensorLayout { static constexpr const char* name = "GNCDHW"; };

// input tensor
// packed NWC/NHWC/NDHWC
struct NWC : public BaseTensorLayout { static constexpr const char* name = "NWC"; };
struct NHWC : public BaseTensorLayout { static constexpr const char* name = "NHWC"; };
struct NDHWC : public BaseTensorLayout { static constexpr const char* name = "NDHWC"; };

// input tensor
// packed GNWC/GNHWC/GNDHWC
struct GNWC : public BaseTensorLayout { static constexpr const char* name = "GNWC"; };
struct GNHWC : public BaseTensorLayout { static constexpr const char* name = "GNHWC"; };
struct GNDHWC : public BaseTensorLayout { static constexpr const char* name = "GNDHWC"; };

// for input bias
struct GC : public BaseTensorLayout { static constexpr const char* name = "GC"; };

// input tensor
// packed NWGC/NHWGC/NDHWGC
struct NWGC : public BaseTensorLayout { static constexpr const char* name = "NWGC"; };
struct NHWGC : public BaseTensorLayout { static constexpr const char* name = "NHWGC"; };
struct NDHWGC : public BaseTensorLayout { static constexpr const char* name = "NDHWGC"; };

// input tensor
// strided layout
struct G_NW_C : public BaseTensorLayout { static constexpr const char* name = "G_NW_C"; };
struct G_NHW_C : public BaseTensorLayout { static constexpr const char* name = "G_NHW_C"; };
struct G_NDHW_C : public BaseTensorLayout { static constexpr const char* name = "G_NDHW_C"; };

// for input bias
struct G_C : public BaseTensorLayout { static constexpr const char* name = "G_C"; };

// weight tensor
// packed KCX/KCYX/KCZYX
struct KCX : public BaseTensorLayout { static constexpr const char* name = "KCX"; };
struct KCYX : public BaseTensorLayout { static constexpr const char* name = "KCYX"; };
struct KCZYX : public BaseTensorLayout { static constexpr const char* name = "KCZYX"; };

// weight tensor
// packed GKCX/GKCYX/GKCZYX
struct GKCX : public BaseTensorLayout { static constexpr const char* name = "GKCX"; };
struct GKCYX : public BaseTensorLayout { static constexpr const char* name = "GKCYX"; };
struct GKCZYX : public BaseTensorLayout { static constexpr const char* name = "GKCZYX"; };

// weight tensor
// packed KXC/KYXC/KZYXC
struct KXC : public BaseTensorLayout { static constexpr const char* name = "KXC"; };
struct KYXC : public BaseTensorLayout { static constexpr const char* name = "KYXC"; };
struct KZYXC : public BaseTensorLayout { static constexpr const char* name = "KZYXC"; };

// weight tensor
// packed GKXC/GKYXC/GKZYXC
struct GKXC : public BaseTensorLayout { static constexpr const char* name = "GKXC"; };
struct GKYXC : public BaseTensorLayout { static constexpr const char* name = "GKYXC"; };
struct GKZYXC : public BaseTensorLayout { static constexpr const char* name = "GKZYXC"; };

// weight tensor
// packed KXGC/KYXGC/KZYXGC
struct KXGC : public BaseTensorLayout { static constexpr const char* name = "KXGC"; };
struct KYXGC : public BaseTensorLayout { static constexpr const char* name = "KYXGC"; };
struct KZYXGC : public BaseTensorLayout { static constexpr const char* name = "KZYXGC"; };

// weight tensor
// strided
struct G_K_X_C : public BaseTensorLayout { static constexpr const char* name = "G_K_X_C"; };
struct G_K_YX_C : public BaseTensorLayout { static constexpr const char* name = "G_K_YX_C"; };
struct G_K_ZYX_C : public BaseTensorLayout { static constexpr const char* name = "G_K_ZYX_C"; };

// output tensor
// packed NKW/NKHW/NKDHW
struct NKW : public BaseTensorLayout { static constexpr const char* name = "NKW"; };
struct NKHW : public BaseTensorLayout { static constexpr const char* name = "NKHW"; };
struct NKDHW : public BaseTensorLayout { static constexpr const char* name = "NKDHW"; };

// output tensor
// packed GNKW/GNKHW/GNKDHW
struct GNKW : public BaseTensorLayout { static constexpr const char* name = "GNKW"; };
struct GNKHW : public BaseTensorLayout { static constexpr const char* name = "GNKHW"; };
struct GNKDHW : public BaseTensorLayout { static constexpr const char* name = "GNKDHW"; };

// output tensor
// packed NWK/NHWK/NDHWK
struct NWK : public BaseTensorLayout { static constexpr const char* name = "NWK"; };
struct NHWK : public BaseTensorLayout { static constexpr const char* name = "NHWK"; };
struct NDHWK : public BaseTensorLayout { static constexpr const char* name = "NDHWK"; };

// output tensor
// packed GNWK/GNHWK/GNDHWK
struct GNWK : public BaseTensorLayout { static constexpr const char* name = "GNWK"; };
struct GNHWK : public BaseTensorLayout { static constexpr const char* name = "GNHWK"; };
struct GNDHWK : public BaseTensorLayout { static constexpr const char* name = "GNDHWK"; };

// for output bias
struct GK : public BaseTensorLayout { static constexpr const char* name = "GK"; };

// output tensor
// packed NWGK/NHWGK/NDHWGK
struct NWGK : public BaseTensorLayout { static constexpr const char* name = "NWGK"; };
struct NHWGK : public BaseTensorLayout { static constexpr const char* name = "NHWGK"; };
struct NDHWGK : public BaseTensorLayout { static constexpr const char* name = "NDHWGK"; };

// output tensor
// strided layout
struct G_NW_K : public BaseTensorLayout { static constexpr const char* name = "G_NW_K"; };
struct G_NHW_K : public BaseTensorLayout { static constexpr const char* name = "G_NHW_K"; };
struct G_NDHW_K : public BaseTensorLayout { static constexpr const char* name = "G_NDHW_K"; };

// for output bias
struct G_K : public BaseTensorLayout { static constexpr const char* name = "G_K"; };

// K-reduced output tensor (packed)
struct GNW : public BaseTensorLayout { static constexpr const char* name = "GNW"; };
struct GNHW : public BaseTensorLayout { static constexpr const char* name = "GNHW"; };
struct GNDHW : public BaseTensorLayout { static constexpr const char* name = "GNDHW"; };

// K-reduced output tensor (packed)
struct NWG : public BaseTensorLayout { static constexpr const char* name = "NWG"; };
struct NHWG : public BaseTensorLayout { static constexpr const char* name = "NHWG"; };
struct NDHWG : public BaseTensorLayout { static constexpr const char* name = "NDHWG"; };

// K-reduced output tensor (strided)
struct G_NW : public BaseTensorLayout { static constexpr const char* name = "G_NW"; };
struct G_NHW : public BaseTensorLayout { static constexpr const char* name = "G_NHW"; };
struct G_NDHW : public BaseTensorLayout { static constexpr const char* name = "G_NDHW"; };

} // namespace convolution

template <typename Layout,
          typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
std::ostream& operator<<(std::ostream& os, const Layout&)
{
    os << Layout::name;
    return os;
}

} // namespace tensor_layout
} // namespace ck
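The SFINAE-guarded operator<< above is what lets any layout tag be streamed directly; device operation instances typically rely on it when assembling their description strings. A minimal host-side usage sketch, assuming this header is on the include path (illustrative, not part of the commit):

#include <iostream>
#include <sstream>

#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

int main()
{
    using ck::tensor_layout::gemm::RowMajor;
    using ck::tensor_layout::convolution::NHWGC;

    std::ostringstream oss;
    // operator<< is found via ADL and resolves through the is_base_of guard
    oss << RowMajor{} << "/" << NHWGC{};
    std::cout << oss.str() << std::endl; // prints "RowMajor/NHWGC"
    return 0;
}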
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

namespace ck {
namespace tensor_operation {
namespace device {

enum struct TensorSpecialization
{
    Default,
    Packed
};

inline std::string getTensorSpecializationString(const TensorSpecialization& s)
{
    switch(s)
    {
    case TensorSpecialization::Default: return "Default";
    case TensorSpecialization::Packed: return "Packed";
    default: return "Unrecognized specialization!";
    }
}

} // namespace device
} // namespace tensor_operation
} // namespace ck
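A small usage sketch for the helper above, assuming the header is on the include path (illustrative only, not part of the commit):

#include <iostream>
#include <string>

#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"

int main()
{
    using ck::tensor_operation::device::TensorSpecialization;
    using ck::tensor_operation::device::getTensorSpecializationString;

    // maps the enum value to the display string used in operation descriptions
    std::cout << getTensorSpecializationString(TensorSpecialization::Packed) << std::endl; // "Packed"
    return 0;
}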
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
element_wise
{
struct
Add
{
template
<
typename
Y
,
typename
X0
,
typename
X1
>
__host__
__device__
constexpr
void
operator
()(
Y
&
y
,
const
X0
&
x0
,
const
X1
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
y
=
x0
+
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
y
=
x0
+
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
half_t
&
x1
)
const
{
y
=
x0
+
type_convert
<
half_t
>
(
x1
);
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
)
const
{
y
=
type_convert
<
half_t
>
(
x0
)
+
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
x0
+
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
bhalf_t
>
(
bhalf_t
&
y
,
const
bhalf_t
&
x0
,
const
bhalf_t
&
x1
)
const
{
const
float
x1_tmp
=
ck
::
type_convert
<
float
>
(
x0
);
const
float
x2_tmp
=
ck
::
type_convert
<
float
>
(
x1
);
const
float
y_tmp
=
x1_tmp
+
x2_tmp
;
y
=
ck
::
type_convert
<
bhalf_t
>
(
y_tmp
);
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
int8_t
>
(
int8_t
&
y
,
const
int8_t
&
x0
,
const
int8_t
&
x1
)
const
{
y
=
x0
+
x1
;
};
};
struct
Subtract
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
y
=
x0
-
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
y
=
x0
-
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
x0
-
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
bhalf_t
>
(
bhalf_t
&
y
,
const
bhalf_t
&
x0
,
const
bhalf_t
&
x1
)
const
{
const
float
x1_tmp
=
ck
::
type_convert
<
float
>
(
x0
);
const
float
x2_tmp
=
ck
::
type_convert
<
float
>
(
x1
);
const
float
y_tmp
=
x1_tmp
-
x2_tmp
;
y
=
ck
::
type_convert
<
bhalf_t
>
(
y_tmp
);
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
int8_t
>
(
int8_t
&
y
,
const
int8_t
&
x0
,
const
int8_t
&
x1
)
const
{
y
=
x0
-
x1
;
};
};
struct
Bilinear
{
Bilinear
(
float
alpha
,
float
beta
)
:
alpha_
(
alpha
),
beta_
(
beta
){};
template
<
typename
Y
,
typename
X0
,
typename
X1
>
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
y
=
alpha_
*
x0
+
beta_
*
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
y
=
type_convert
<
half_t
>
(
alpha_
)
*
x0
+
type_convert
<
half_t
>
(
beta_
)
*
x1
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
)
const
{
y
=
type_convert
<
half_t
>
(
alpha_
*
x0
+
beta_
*
ck
::
type_convert
<
float
>
(
x1
));
};
float
alpha_
;
float
beta_
;
};
struct
AddRelu
{
template
<
typename
Y
,
typename
X0
,
typename
X1
>
__host__
__device__
constexpr
void
operator
()(
Y
&
y
,
const
X0
&
x0
,
const
X1
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
const
float
a
=
x0
+
x1
;
y
=
a
>
0.0
f
?
a
:
0.0
f
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
,
double
,
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
const
double
a
=
x0
+
x1
;
y
=
a
>
0.0
?
a
:
0.0
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
const
half_t
a
=
x0
+
x1
;
y
=
a
>
type_convert
<
half_t
>
(
0.0
f
)
?
a
:
type_convert
<
half_t
>
(
0.0
f
);
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
)
const
{
const
float
a
=
x0
+
x1
;
y
=
a
>
type_convert
<
half_t
>
(
0.0
f
)
?
a
:
type_convert
<
half_t
>
(
0.0
f
);
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
half_t
>
(
float
&
y
,
const
float
&
x0
,
const
half_t
&
x1
)
const
{
const
float
a
=
x0
+
type_convert
<
float
>
(
x1
);
y
=
a
>
0.0
f
?
a
:
0.0
f
;
};
};
struct
AddHardswish
{
template
<
typename
T
>
__host__
__device__
constexpr
void
operator
()(
T
&
y
,
const
T
&
x0
,
const
T
&
x1
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
6.0
f
?
6.0
f
:
b
)
*
a
*
0.166667
f
;
y
=
c
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
>
(
double
&
y
,
const
double
&
x0
,
const
double
&
x1
)
const
{
double
a
=
x0
+
x1
;
double
b
=
a
+
3.0
;
double
c
=
(
b
>
0
)
*
(
b
>
6.0
?
6.0
:
b
)
*
a
*
0.166667
;
y
=
c
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
3.0
f
;
float
c
=
(
b
>
0
)
*
(
b
>
6.0
f
?
6.0
f
:
b
)
*
a
*
0.166667
f
;
y
=
c
;
};
};
// C = A * B
// E = FastGelu(C + D)
struct
AddFastGelu
{
// Fast GeLU
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
__host__
__device__
static
constexpr
float
GetFastGeLU
(
float
x
)
{
const
float
u
=
2.
f
*
x
*
(
0.035677
f
*
x
*
x
+
0.797885
f
);
const
float
emu
=
exp
(
-
u
);
const
float
cdf
=
0.5
f
+
0.5
f
*
(
2.
f
/
(
1.
f
+
emu
)
-
1.
f
);
return
x
*
cdf
;
}
template
<
typename
T
>
static
inline
constexpr
bool
is_valid_param_type_v
=
std
::
is_same_v
<
T
,
float
>
||
std
::
is_same_v
<
T
,
half_t
>
||
std
::
is_same_v
<
T
,
bhalf_t
>
||
std
::
is_same_v
<
T
,
int32_t
>
||
std
::
is_same_v
<
T
,
int8_t
>
;
template
<
typename
E
,
typename
C
,
typename
D
>
__host__
__device__
constexpr
void
operator
()(
E
&
e
,
const
C
&
c
,
const
D
&
d
)
const
{
static_assert
(
is_valid_param_type_v
<
E
>
&&
is_valid_param_type_v
<
C
>
&&
is_valid_param_type_v
<
D
>
);
const
float
y
=
GetFastGeLU
(
type_convert
<
float
>
(
c
)
+
type_convert
<
float
>
(
d
));
e
=
type_convert
<
E
>
(
y
);
}
template
<
typename
D
>
__host__
__device__
constexpr
void
operator
()(
float
&
e
,
const
float
&
c
,
const
D
&
d
)
const
{
static_assert
(
is_valid_param_type_v
<
D
>
);
e
=
GetFastGeLU
(
c
+
type_convert
<
float
>
(
d
));
}
};
}
// namespace element_wise
}
// namespace tensor_operation
}
// namespace ck
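GetFastGeLU above evaluates the tanh-based FastGelu approximation through an equivalent exp form, using the identity tanh(t) = 2/(1 + exp(-2t)) - 1, with sqrt(2/pi) and 0.044715 folded into the 0.797885f and 0.035677f constants. The host-only sketch below is not part of the commit; it just compares the two algebraic forms numerically.

// Standalone check (illustrative): exp-based form as in GetFastGeLU vs. the tanh form quoted
// in the comment above. Both should agree to within float rounding.
#include <cmath>
#include <cstdio>

static float fast_gelu_exp_form(float x)
{
    const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
    const float emu = std::exp(-u);
    const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
    return x * cdf;
}

static float fast_gelu_tanh_form(float x)
{
    return 0.5f * x * (1.f + std::tanh(0.797885f * (x + 0.044715f * x * x * x)));
}

int main()
{
    for(float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f})
        std::printf("x=% .2f  exp-form=% .6f  tanh-form=% .6f\n",
                    x,
                    fast_gelu_exp_form(x),
                    fast_gelu_tanh_form(x));
    return 0;
}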
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
element_wise
{
// Need to ensure the compiler will fail if there is no matching candidate, instead of the compiler
// silently doing an implicit type conversion
//
// Method 1:
//
// struct ExampleElementwiseOp
// {
// template<typename Y, typename X>
// __host__ __device__ constexpr void
// operator()(Y&, const X) const;
//
// template<>
// __host__ __device__ constexpr void
// operator()<half_t, half_t>(half_t& y, const half_t& x) const
// {
// }
// };
//
// Method 2:
//
// template <typename Y, typename X>
// struct ExampleElementwiseOp;
//
// template <>
// struct ExampleElementwiseOp<float, ck::bhalf_t>
// {
// __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const
// {
// }
// };
struct
AddReluAdd
{
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
half_t
a
=
x0
+
x1
;
half_t
b
=
a
>
0
?
a
:
0
;
y
=
b
+
x2
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
c
=
b
+
x2
;
y
=
c
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
,
half_t
>
(
half_t
&
y
,
const
float
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
c
=
b
+
x2
;
y
=
c
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
bhalf_t
,
float
,
bhalf_t
,
bhalf_t
>
(
bhalf_t
&
y
,
const
float
&
x0
,
const
bhalf_t
&
x1
,
const
bhalf_t
&
x2
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
>
0
?
a
:
0
;
float
c
=
b
+
x2
;
y
=
c
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
int8_t
,
int8_t
,
int8_t
,
int8_t
>
(
int8_t
&
y
,
const
int8_t
&
x0
,
const
int8_t
&
x1
,
const
int8_t
&
x2
)
const
{
int32_t
a
=
x0
+
x1
;
int32_t
b
=
a
>
0
?
a
:
0
;
int32_t
c
=
b
+
x2
;
y
=
c
;
}
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
template
<
>
__host__
__device__
constexpr
void
operator
()
<
int4_t
,
int8_t
,
int4_t
,
int4_t
>
(
int4_t
&
y
,
const
int8_t
&
x0
,
const
int4_t
&
x1
,
const
int4_t
&
x2
)
const
{
int32_t
a
=
x0
+
x1
;
int32_t
b
=
a
>
0
?
a
:
0
;
int32_t
c
=
b
+
x2
;
y
=
c
;
}
#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
};
struct
AddHardswishAdd
{
template
<
typename
Y
,
typename
X0
,
typename
X1
,
typename
X2
>
__host__
__device__
constexpr
void
operator
()(
Y
&
,
const
X0
&
,
const
X1
&
,
const
X2
&
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x0
,
const
float
&
x1
,
const
float
&
x2
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
float
d
=
c
+
x2
;
y
=
d
;
}
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
half_t
,
half_t
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x0
,
const
half_t
&
x1
,
const
half_t
&
x2
)
const
{
float
a
=
x0
+
x1
;
float
b
=
a
+
float
{
3
};
float
c
=
(
b
>
0
)
*
(
b
>
float
{
6
}
?
float
{
6
}
:
b
)
*
a
*
float
{
0.166667
};
float
d
=
c
+
x2
;
y
=
d
;
}
};
// C = A * B
// E = C + D0 + D1
struct
AddAdd
{
template
<
typename
E
,
typename
C
,
typename
D0
,
typename
D1
>
__host__
__device__
void
operator
()(
E
&
e
,
const
C
&
c
,
const
D0
&
d0
,
const
D1
&
d1
)
const
{
// Only support floating so far
static_assert
(
is_same
<
E
,
half_t
>::
value
||
is_same
<
E
,
float
>::
value
||
is_same
<
E
,
double
>::
value
,
"Data type is not supported by this operation!"
);
static_assert
(
is_same
<
C
,
half_t
>::
value
||
is_same
<
C
,
float
>::
value
||
is_same
<
C
,
double
>::
value
,
"Data type is not supported by this operation!"
);
static_assert
(
is_same
<
D0
,
half_t
>::
value
||
is_same
<
D0
,
float
>::
value
||
is_same
<
D0
,
double
>::
value
,
"Data type is not supported by this operation!"
);
static_assert
(
is_same
<
D1
,
half_t
>::
value
||
is_same
<
D1
,
float
>::
value
||
is_same
<
D1
,
double
>::
value
,
"Data type is not supported by this operation!"
);
const
C
y
=
c
+
type_convert
<
C
>
(
d0
)
+
type_convert
<
C
>
(
d1
);
e
=
type_convert
<
E
>
(
y
);
}
};
// C = A * B
// E = FastGelu(C + D0 + D1)
struct
AddAddFastGelu
{
// Fast GeLU
// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
__host__
__device__
static
constexpr
float
GetFastGeLU
(
float
x
)
{
const
float
u
=
2.
f
*
x
*
(
0.035677
f
*
x
*
x
+
0.797885
f
);
const
float
emu
=
exp
(
-
u
);
const
float
cdf
=
0.5
f
+
0.5
f
*
(
2.
f
/
(
1.
f
+
emu
)
-
1.
f
);
return
x
*
cdf
;
}
template
<
typename
T
>
static
inline
constexpr
bool
is_valid_param_type_v
=
std
::
is_same_v
<
T
,
float
>
||
std
::
is_same_v
<
T
,
half_t
>
||
std
::
is_same_v
<
T
,
bhalf_t
>
||
std
::
is_same_v
<
T
,
int32_t
>
||
std
::
is_same_v
<
T
,
int8_t
>
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
||
std
::
is_same_v
<
T
,
ck
::
int4_t
>
#endif
;
template
<
typename
E
,
typename
C
,
typename
D0
,
typename
D1
>
__host__
__device__
constexpr
void
operator
()(
E
&
e
,
const
C
&
c
,
const
D0
&
d0
,
const
D1
&
d1
)
const
{
static_assert
(
is_valid_param_type_v
<
E
>
&&
is_valid_param_type_v
<
C
>
&&
is_valid_param_type_v
<
D0
>
&&
is_valid_param_type_v
<
D1
>
);
const
float
y
=
GetFastGeLU
(
type_convert
<
float
>
(
c
)
+
type_convert
<
float
>
(
d0
)
+
type_convert
<
float
>
(
d1
));
e
=
type_convert
<
E
>
(
y
);
}
};
struct
Normalize
{
// FIXME: is double absolutely necessary?
Normalize
(
double
epsilon
=
1e-4
)
:
epsilon_
(
epsilon
)
{}
template
<
typename
T1
,
typename
T2
,
typename
T3
>
__host__
__device__
constexpr
void
operator
()(
T1
&
y
,
const
T1
&
x
,
const
T2
&
mean
,
const
T2
&
mean_square
,
const
T3
&
gamma
,
const
T3
&
beta
)
const
;
template
<
>
__host__
__device__
constexpr
void
operator
()
<
half_t
,
float
,
half_t
>
(
half_t
&
y
,
const
half_t
&
x
,
const
float
&
mean
,
const
float
&
mean_square
,
const
half_t
&
gamma
,
const
half_t
&
beta
)
const
{
using
ck
::
math
::
sqrt
;
float
variance
=
mean_square
-
(
mean
*
mean
);
float
tmp_x
=
type_convert
<
float
>
(
x
);
float
tmp_gamma
=
type_convert
<
float
>
(
gamma
);
float
tmp_beta
=
type_convert
<
float
>
(
beta
);
float
tmp_y
=
((
tmp_x
-
mean
)
/
sqrt
(
variance
+
type_convert
<
float
>
(
epsilon_
)))
*
tmp_gamma
+
tmp_beta
;
y
=
type_convert
<
half_t
>
(
tmp_y
);
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
float
,
float
,
float
>
(
float
&
y
,
const
float
&
x
,
const
float
&
mean
,
const
float
&
mean_square
,
const
float
&
gamma
,
const
float
&
beta
)
const
{
using
ck
::
math
::
sqrt
;
float
variance
=
mean_square
-
(
mean
*
mean
);
y
=
((
x
-
mean
)
/
sqrt
(
variance
+
type_convert
<
float
>
(
epsilon_
)))
*
gamma
+
beta
;
};
template
<
>
__host__
__device__
constexpr
void
operator
()
<
double
,
double
,
double
>
(
double
&
y
,
const
double
&
x
,
const
double
&
mean
,
const
double
&
mean_square
,
const
double
&
gamma
,
const
double
&
beta
)
const
{
using
ck
::
math
::
sqrt
;
double
variance
=
mean_square
-
(
mean
*
mean
);
y
=
((
x
-
mean
)
/
sqrt
(
variance
+
epsilon_
))
*
gamma
+
beta
;
};
// FIXME: is double absolutely necessary?
double
epsilon_
;
};
template
<
typename
Y
,
typename
X
>
struct
UnaryTypeConvert
;
template
<
>
struct
UnaryTypeConvert
<
float
,
ck
::
bhalf_t
>
{
__host__
__device__
void
operator
()(
float
&
y
,
ck
::
bhalf_t
&
x
)
const
{
y
=
ck
::
type_convert
<
float
,
ck
::
bhalf_t
>
(
x
);
}
};
template
<
>
struct
UnaryTypeConvert
<
ck
::
bhalf_t
,
float
>
{
__host__
__device__
void
operator
()(
ck
::
bhalf_t
&
y
,
float
&
x
)
const
{
y
=
ck
::
type_convert
<
ck
::
bhalf_t
,
float
>
(
x
);
}
};
}
// namespace element_wise
}
// namespace tensor_operation
}
// namespace ck
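The Normalize functor above consumes a precomputed mean and mean-of-squares and applies (x - mean) / sqrt(variance + epsilon) * gamma + beta, with variance = mean_square - mean * mean. The host-only sketch below (not part of the commit) reproduces that contract on a small array.

// Standalone illustration of the Normalize contract: compute mean and mean-of-squares first,
// then normalize each element with the same formula used by Normalize::operator().
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> x{1.0f, 2.0f, 3.0f, 4.0f};
    const float gamma = 1.0f, beta = 0.0f, epsilon = 1e-4f;

    float mean = 0.f, mean_square = 0.f;
    for(float v : x)
    {
        mean += v;
        mean_square += v * v;
    }
    mean /= x.size();
    mean_square /= x.size();

    const float variance = mean_square - mean * mean; // same variance formula as Normalize

    for(float v : x)
    {
        const float y = (v - mean) / std::sqrt(variance + epsilon) * gamma + beta;
        std::printf("% .6f\n", y);
    }
    return 0;
}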
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/utility/data_type.hpp"
#include "ck/utility/math_v2.hpp"

namespace ck {
namespace tensor_operation {
namespace element_wise {

struct PassThrough
{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<double, double>(double& y, const double& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<half_t, half_t>(half_t& y, const half_t& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<int32_t, int32_t>(int32_t& y, const int32_t& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
    {
        y = type_convert<bhalf_t>(x);
    }

    template <>
    __host__ __device__ void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const { y = x; }

    template <>
    __host__ __device__ void operator()<int8_t, int32_t>(int8_t& y, const int32_t& x) const
    {
        y = type_convert<int8_t>(x);
    }

#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
    template <>
    __host__ __device__ void operator()<int4_t, int4_t>(int4_t& y, const int4_t& x) const { y = x; }
#endif
};

struct UnaryConvert
{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const
    {
        y = type_convert<Y>(x);
    }
};

struct Scale
{
    __host__ __device__ Scale(float scale) : scale_(scale) {}

    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        y = scale_ * x;
    };

    float scale_;
};

struct ScaleAndResetNaNToMinusInfinity
{
    __host__ __device__ ScaleAndResetNaNToMinusInfinity(float scale) : scale_(scale) {}

    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        y = ck::math::isnan(x) ? -ck::NumericLimits<float>::Infinity() : scale_ * x;
    };

    float scale_;
};

struct UnaryDivide
{
    __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider) {}

    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, int32_t>::value,
                      "Data type is not supported by this operation!");

        y = x / type_convert<T>(divider_);
    };

    int32_t divider_ = 1;
};

struct UnarySquare
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same_v<T, float> || is_same_v<T, double> || is_same_v<T, int32_t> ||
                          is_same_v<T, int8_t>
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
                          || is_same_v<T, int4_t>
#endif
                      ,
                      "Data type is not supported by this operation!");

        y = x * x;
    };
};

struct UnaryAbs
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
                          is_same<T, int8_t>::value,
                      "Data type is not supported by this operation!");

        y = ck::math::abs(x);
    };
};

struct UnarySqrt
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value,
                      "Data type is not supported by this operation!");

        y = ck::math::sqrt(x);
    };
};

struct Relu
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, half_t>::value || is_same<T, int32_t>::value ||
                          is_same<T, int8_t>::value,
                      "Data type is not supported by this operation!");
        y = x > 0 ? x : 0;
    }

    template <>
    __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
    {
        float x_f32 = ck::type_convert<float>(x);
        float y_f32 = x_f32 > 0 ? x_f32 : 0;
        y           = ck::type_convert<bhalf_t>(y_f32);
    }
};

// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
struct FastGelu
{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
        const float emu = exp(-u);
        const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
        y               = x * cdf;
    }
};

// https://paperswithcode.com/method/gelu
// y = 0.5*x*(1+erf(x/sqrt(2)))
struct Gelu
{
    template <typename Y, typename X>
    __host__ __device__ void operator()(Y& y, const X& x) const;

    template <>
    __host__ __device__ void operator()<float, float>(float& y, const float& x) const
    {
        y = 0.5f * x * (1.f + erf(float(0.70710678118f * x)));
    }

    template <>
    __host__ __device__ void
    operator()<ck::half_t, ck::half_t>(ck::half_t& y, const ck::half_t& x) const
    {
        y = ck::half_t(0.5) * x * (ck::half_t(1) + ck::half_t(erf(float(0.70710678118f * x))));
    }
};

struct Sigmoid
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        static_assert(is_same<T, float>::value || is_same<T, double>::value ||
                          is_same<T, ck::half_t>::value,
                      "Data type is not supported by this operation!");
        y = 1 / (ck::type_convert<T>(1) + exp(-x));
    };

    int32_t divider_ = 1;
};

} // namespace element_wise
} // namespace tensor_operation
} // namespace ck
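These functors follow the pattern described in element_wise_operation.hpp: the primary operator() template is declared but never defined, and only the supported type pairs are specialized, so an unsupported combination fails at build time instead of silently going through an implicit conversion. A standard-C++ host sketch of the same idea (not part of the commit; it uses a free function template with namespace-scope specializations so the sketch stays strictly portable):

#include <cstdio>

// declared on purpose without a definition; only supported pairs are specialized below
template <typename Y, typename X>
void pass_through(Y& y, const X& x);

template <>
void pass_through<float, float>(float& y, const float& x) { y = x; }

template <>
void pass_through<double, double>(double& y, const double& x) { y = x; }

int main()
{
    float yf = 0.f;
    pass_through(yf, 1.5f); // OK: the <float, float> specialization exists
    // double yd = 0.0; pass_through(yd, 1.5f); // would fail to link: <double, float> never defined
    std::printf("%f\n", yf);
    return 0;
}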
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/math.hpp"
#include "ck/utility/number.hpp"
#include "ck/tensor_description/tensor_adaptor.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
namespace
ck
{
// Rows of column-vectors
template
<
index_t
MPerBlock
,
index_t
NPerBlock
,
typename
CGridDesc_M_N
,
bool
DeviceCTileIndexCheck
=
false
>
struct
BlockToCTileMap_M00_N0_M01
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
__device__
BlockToCTileMap_M00_N0_M01
()
=
default
;
__host__
__device__
BlockToCTileMap_M00_N0_M01
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
1
)
:
M01_
(
M01
),
underlying_map_
(
GetBlockToCTileMap
(
c_grid_desc_m_n
,
M01
))
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01_
);
const
index_t
grid_size
=
M00
*
M01_
*
N0
;
return
grid_size
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
return
underlying_map_
.
CalculateBottomIndex
(
idx_top
);
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
c_tile_idx
,
const
CTileDim
&
c_tile_dim
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
DefaultValidCTileIndex
(
c_tile_idx
,
c_tile_dim
);
else
return
true
;
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
true
;
// validity check moved to kernel
const
index_t
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
if
(
M0
%
M01_
==
0
)
{
return
true
;
}
else
{
return
false
;
}
}
private:
__host__
__device__
static
constexpr
auto
GetBlockToCTileMap
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
)
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01
);
const
auto
m00_n0_m01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_insert_transform
(
1
),
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_pass_through_transform
(
make_tuple
(
N0
))),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
3
>
{},
Sequence
<
2
>
{}));
const
auto
cblockid_to_m00_n0_m01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
1
,
M00
,
N0
,
M01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
cblockid_to_m0_n0_block_cluster_adaptor
=
chain_tensor_adaptors
(
m00_n0_m01_to_m0_n0_block_cluster_adaptor
,
cblockid_to_m00_n0_m01_block_cluster_adaptor
);
return
cblockid_to_m0_n0_block_cluster_adaptor
;
}
index_t
M01_
;
using
UnderlyingMap
=
decltype
(
GetBlockToCTileMap
(
CGridDesc_M_N
{},
1
));
UnderlyingMap
underlying_map_
;
};
// Rows of column-vectors
// This C-tile map dynamically adjusts M01 when C-tile index is out of range
template
<
index_t
MPerBlock
,
index_t
NPerBlock
,
typename
CGridDesc_M_N
>
struct
BlockToCTileMap_M00_N0_M01Adapt
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
__device__
BlockToCTileMap_M00_N0_M01Adapt
()
=
default
;
__host__
__device__
BlockToCTileMap_M00_N0_M01Adapt
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
8
)
:
M01_
(
M01
),
c_grid_desc_m_n_
(
c_grid_desc_m_n
)
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
index_t
grid_size
=
M0
*
N0
;
return
grid_size
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
auto
block_1d_id
=
idx_top
[
I0
];
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n_
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n_
.
GetLength
(
I1
),
NPerBlock
);
block_1d_id
=
block_1d_id
%
(
M0
*
N0
);
// swallow batch index
index_t
idx_N0
=
block_1d_id
%
N0
;
index_t
idx_M0
=
block_1d_id
/
N0
;
const
auto
M01_adapt
=
(
idx_M0
<
M0
-
M0
%
M01_
)
?
M01_
:
M0
%
M01_
;
index_t
idx_M00
=
idx_M0
/
M01_
;
index_t
idx_M01
=
idx_M0
%
M01_
;
index_t
idx_N0_M01_local
=
idx_N0
+
idx_M01
*
N0
;
return
make_tuple
(
idx_N0_M01_local
%
M01_adapt
+
idx_M00
*
M01_
,
idx_N0_M01_local
/
M01_adapt
);
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
/* c_tile_idx */
,
const
CTileDim
&
/* c_tile_dim */
)
const
{
return
true
;
// always valid provided that user gets grid size from CalculateGridSize()
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
/* c_grid_desc_m_n */
)
const
{
return
true
;
}
private:
index_t
M01_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
};
// 2D slices of column-vectors in 3D space
// This C-tile map dynamically adjusts M01 when C-tile index is out of range
template
<
index_t
MPerBlock
,
index_t
NPerBlock
,
typename
CGridDesc_M_N
>
struct
BlockToCTileMap_KSplit_M00_N0_M01Adapt
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
__device__
BlockToCTileMap_KSplit_M00_N0_M01Adapt
()
=
default
;
__host__
__device__
BlockToCTileMap_KSplit_M00_N0_M01Adapt
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
8
,
index_t
KSplit
=
1
)
:
M01_
(
M01
),
KSplit_
(
KSplit
),
c_grid_desc_m_n_
(
c_grid_desc_m_n
)
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
index_t
grid_size
=
M0
*
N0
*
KSplit_
;
return
grid_size
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
auto
block_1d_id
=
idx_top
[
I0
];
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n_
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n_
.
GetLength
(
I1
),
NPerBlock
);
const
index_t
idx_ksplit
=
block_1d_id
/
(
M0
*
N0
);
block_1d_id
=
block_1d_id
%
(
M0
*
N0
);
index_t
idx_N0
=
block_1d_id
%
N0
;
index_t
idx_M0
=
block_1d_id
/
N0
;
const
auto
M01_adapt
=
(
idx_M0
<
M0
-
M0
%
M01_
)
?
M01_
:
M0
%
M01_
;
index_t
idx_M00
=
idx_M0
/
M01_
;
index_t
idx_M01
=
idx_M0
%
M01_
;
index_t
idx_N0_M01_local
=
idx_N0
+
idx_M01
*
N0
;
return
make_tuple
(
idx_ksplit
,
idx_N0_M01_local
%
M01_adapt
+
idx_M00
*
M01_
,
idx_N0_M01_local
/
M01_adapt
);
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
/* c_tile_idx */
,
const
CTileDim
&
/* c_tile_dim */
)
const
{
return
true
;
// always valid provided that user gets grid size from CalculateGridSize()
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
/* c_grid_desc_m_n */
)
const
{
return
true
;
}
private:
index_t
M01_
;
index_t
KSplit_
;
CGridDesc_M_N
c_grid_desc_m_n_
;
};
// Blocks of row-vectors
template
<
index_t
MPerBlock
,
index_t
NPerBlock
,
typename
CGridDesc_M_N
,
bool
DeviceCTileIndexCheck
=
false
>
struct
BlockToCTileMap_M00_N00_M01_N01
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
__device__
BlockToCTileMap_M00_N00_M01_N01
()
=
default
;
__host__
__device__
BlockToCTileMap_M00_N00_M01_N01
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
1
,
index_t
N01
=
1
)
:
M01_
(
M01
),
N01_
(
N01
),
underlying_map_
(
GetBlockToCTileMap
(
c_grid_desc_m_n
,
M01
,
N01
))
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01_
);
const
auto
N00
=
math
::
integer_divide_ceil
(
N0
,
N01_
);
const
index_t
grid_size
=
M00
*
M01_
*
N00
*
N01_
;
return
grid_size
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
return
underlying_map_
.
CalculateBottomIndex
(
idx_top
);
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
c_tile_idx
,
const
CTileDim
&
c_tile_dim
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
DefaultValidCTileIndex
(
c_tile_idx
,
c_tile_dim
);
else
return
true
;
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
true
;
// validity check moved to kernel
const
index_t
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
index_t
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
if
(
M0
%
M01_
==
0
&&
N0
%
N01_
==
0
)
{
return
true
;
}
else
{
return
false
;
}
}
private:
__host__
__device__
static
constexpr
auto
GetBlockToCTileMap
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
,
index_t
N01
)
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01
);
const
auto
N00
=
math
::
integer_divide_ceil
(
N0
,
N01
);
const
auto
m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_insert_transform
(
1
),
// swallow the carry from lower dimensions
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_unmerge_transform
(
make_tuple
(
N00
,
N01
))),
make_tuple
(
Sequence
<>
{},
Sequence
<
0
>
{},
Sequence
<
1
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
3
>
{},
Sequence
<
2
,
4
>
{}));
const
auto
cblockid_to_m00_m01_n00_n01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
1
,
M00
,
N00
,
M01
,
N01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
cblockid_to_m0_n0_block_cluster_adaptor
=
chain_tensor_adaptors
(
m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
,
cblockid_to_m00_m01_n00_n01_block_cluster_adaptor
);
return
cblockid_to_m0_n0_block_cluster_adaptor
;
}
index_t
M01_
,
N01_
;
using
UnderlyingMap
=
decltype
(
GetBlockToCTileMap
(
CGridDesc_M_N
{},
1
,
1
));
UnderlyingMap
underlying_map_
;
};
// 2D slices of row-vectors in 3D space
template
<
index_t
MPerBlock
,
index_t
NPerBlock
,
typename
CGridDesc_M_N
,
bool
DeviceCTileIndexCheck
=
false
>
struct
BlockToCTileMap_KSplit_M00_N00_M01_N01
{
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
auto
I2
=
Number
<
2
>
{};
static
constexpr
auto
I3
=
Number
<
3
>
{};
__host__
BlockToCTileMap_KSplit_M00_N00_M01_N01
()
=
default
;
__host__
BlockToCTileMap_KSplit_M00_N00_M01_N01
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
=
1
,
index_t
N01
=
1
,
index_t
KSplit
=
1
)
:
M01_
(
M01
),
N01_
(
N01
),
KSplit_
(
KSplit
),
underlying_map_
(
GetBlockToCTileMap
(
c_grid_desc_m_n
,
M01
,
N01
,
KSplit
))
{
}
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01_
);
const
auto
N00
=
math
::
integer_divide_ceil
(
N0
,
N01_
);
const
index_t
grid_size
=
M00
*
M01_
*
N00
*
N01_
*
KSplit_
;
return
grid_size
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
return
underlying_map_
.
CalculateBottomIndex
(
idx_top
);
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
c_tile_idx
,
const
CTileDim
&
c_tile_dim
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
DefaultValidCTileIndex
(
c_tile_idx
,
c_tile_dim
);
else
return
true
;
}
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
if
constexpr
(
DeviceCTileIndexCheck
)
return
true
;
// validity check moved to kernel
const
index_t
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
index_t
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
if
(
M0
%
M01_
==
0
&&
N0
%
N01_
==
0
)
{
return
true
;
}
else
{
return
false
;
}
}
private:
__host__
static
constexpr
auto
GetBlockToCTileMap
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
index_t
M01
,
index_t
N01
,
index_t
KSplit
)
{
const
auto
M0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I0
),
MPerBlock
);
const
auto
N0
=
math
::
integer_divide_ceil
(
c_grid_desc_m_n
.
GetLength
(
I1
),
NPerBlock
);
const
auto
M00
=
math
::
integer_divide_ceil
(
M0
,
M01
);
const
auto
N00
=
math
::
integer_divide_ceil
(
N0
,
N01
);
const
auto
ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_pass_through_transform
(
KSplit
),
make_unmerge_transform
(
make_tuple
(
M00
,
M01
)),
make_unmerge_transform
(
make_tuple
(
N00
,
N01
))),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
>
{},
Sequence
<
2
>
{}),
make_tuple
(
Sequence
<
0
>
{},
Sequence
<
1
,
3
>
{},
Sequence
<
2
,
4
>
{}));
const
auto
c_blockid_to_ksplit_m00_m01_n00_n01_block_cluster_adaptor
=
make_single_stage_tensor_adaptor
(
make_tuple
(
make_merge_transform
(
make_tuple
(
KSplit
,
M00
,
N00
,
M01
,
N01
))),
make_tuple
(
Sequence
<
0
,
1
,
2
,
3
,
4
>
{}),
make_tuple
(
Sequence
<
0
>
{}));
const
auto
c_blockid_to_ksplit_m0_n0_block_cluster_adaptor
=
chain_tensor_adaptors
(
ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor
,
c_blockid_to_ksplit_m00_m01_n00_n01_block_cluster_adaptor
);
return
c_blockid_to_ksplit_m0_n0_block_cluster_adaptor
;
}
index_t
M01_
,
N01_
,
KSplit_
;
using
UnderlyingMap
=
decltype
(
GetBlockToCTileMap
(
CGridDesc_M_N
{},
1
,
1
,
1
));
UnderlyingMap
underlying_map_
;
};
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
DefaultValidCTileIndex
(
const
CTileIdx
&
c_tile_idx
,
const
CTileDim
&
c_tile_dim
)
{
bool
is_valid
=
false
;
const
index_t
m_block
=
c_tile_dim
[
Number
<
0
>
{}];
const
index_t
n_block
=
c_tile_dim
[
Number
<
1
>
{}];
if
constexpr
(
CTileIdx
::
Size
()
==
2
)
{
const
index_t
m_block_idx
=
c_tile_idx
[
Number
<
0
>
{}];
const
index_t
n_block_idx
=
c_tile_idx
[
Number
<
1
>
{}];
if
(
0
<=
m_block_idx
&&
m_block_idx
<
m_block
&&
0
<=
n_block_idx
&&
n_block_idx
<
n_block
)
{
is_valid
=
true
;
}
}
else
if
constexpr
(
CTileIdx
::
Size
()
==
3
)
{
const
index_t
ksplit_idx
=
c_tile_idx
[
Number
<
0
>
{}];
const
index_t
m_block_idx
=
c_tile_idx
[
Number
<
1
>
{}];
const
index_t
n_block_idx
=
c_tile_idx
[
Number
<
2
>
{}];
if
(
0
<=
m_block_idx
&&
m_block_idx
<
m_block
&&
0
<=
n_block_idx
&&
n_block_idx
<
n_block
)
{
is_valid
=
true
;
}
ignore
=
ksplit_idx
;
}
return
is_valid
;
}
// This wrapper class is for grouped gemm where it subtracts blockIdx by a value so that the
// workgroups assigned to a given gemm problem have top index offsetted to range [0,
// grid_size_per_gemm]
template
<
typename
UnderlyingBlockToCTileMap
>
struct
OffsettedBlockToCTileMap
{
using
underlying_type
=
UnderlyingBlockToCTileMap
;
OffsettedBlockToCTileMap
(
UnderlyingBlockToCTileMap
block_to_ctile_map
,
index_t
block_start
)
{
block_to_ctile_map_
=
block_to_ctile_map
;
block_start_
=
block_start
;
}
template
<
typename
TopIdx
>
__host__
__device__
constexpr
auto
CalculateBottomIndex
(
const
TopIdx
&
idx_top
)
const
{
return
block_to_ctile_map_
.
CalculateBottomIndex
(
make_multi_index
(
idx_top
[
Number
<
0
>
{}]
-
block_start_
));
}
template
<
typename
CTileIdx
,
typename
CTileDim
>
__host__
__device__
bool
ValidCTileIndex
(
const
CTileIdx
&
c_tile_idx
,
const
CTileDim
&
c_tile_dim
)
const
{
return
block_to_ctile_map_
.
ValidCTileIndex
(
c_tile_idx
,
c_tile_dim
);
}
template
<
typename
CGridDesc_M_N
>
__host__
bool
CheckValidity
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
return
block_to_ctile_map_
.
CheckValidity
(
c_grid_desc_m_n
);
}
template
<
typename
CGridDesc_M_N
>
__host__
constexpr
index_t
CalculateGridSize
(
const
CGridDesc_M_N
&
c_grid_desc_m_n
)
const
{
return
block_to_ctile_map_
.
CalculateGridSize
(
c_grid_desc_m_n
);
}
UnderlyingBlockToCTileMap
block_to_ctile_map_
;
index_t
block_start_
;
};
}
// namespace ck
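The M00_N0_M01Adapt maps above swizzle the 1D block id so that groups of M01 consecutive M-tiles visit nearby N-tiles, shrinking the group for the tail rows so every block id stays in range. The host-only sketch below (illustrative values, not part of the commit) reproduces that index arithmetic so the traversal order is visible.

// Standalone reproduction of the CalculateBottomIndex arithmetic in
// BlockToCTileMap_M00_N0_M01Adapt, run on the host for a small grid.
#include <cstdio>

int main()
{
    const int M0 = 5, N0 = 3, M01 = 2; // tile counts and grouping factor (illustrative values)

    for(int block_1d_id = 0; block_1d_id < M0 * N0; ++block_1d_id)
    {
        const int idx_N0 = block_1d_id % N0;
        const int idx_M0 = block_1d_id / N0;

        // the last (M0 % M01) rows form a smaller group so no index falls out of range
        const int M01_adapt = (idx_M0 < M0 - M0 % M01) ? M01 : M0 % M01;

        const int idx_M00          = idx_M0 / M01;
        const int idx_M01          = idx_M0 % M01;
        const int idx_N0_M01_local = idx_N0 + idx_M01 * N0;

        const int m_tile = idx_N0_M01_local % M01_adapt + idx_M00 * M01;
        const int n_tile = idx_N0_M01_local / M01_adapt;

        std::printf("block %2d -> C tile (%d, %d)\n", block_1d_id, m_tile, n_tile);
    }
    return 0;
}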
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace
ck
{
template
<
typename
GridwiseMultipleReduction
,
index_t
NumReduction
,
typename
InDataType
,
typename
OutDataTypePointerTuple
,
typename
AccDataType
,
typename
InGridDesc_M_K
,
typename
OutGridDesc_M_Tuple
,
typename
InElementwiseOperationTuple
,
typename
AccElementwiseOperationTuple
>
__global__
void
kernel_multiple_reduce_multiblock
(
const
InGridDesc_M_K
in_grid_desc_m_k
,
const
OutGridDesc_M_Tuple
out_grid_desc_m_tuple
,
const
InElementwiseOperationTuple
in_elementwise_op_tuple
,
const
AccElementwiseOperationTuple
acc_elementwise_op_tuple
,
index_t
block_group_size
,
index_t
num_k_block_tile_iteration
,
Array
<
AccDataType
,
NumReduction
>
alpha_values
,
const
InDataType
*
const
__restrict__
p_in_value_global
,
Array
<
AccDataType
,
NumReduction
>
beta_values
,
OutDataTypePointerTuple
p_out_value_global_tuple
)
{
GridwiseMultipleReduction
::
Run
(
in_grid_desc_m_k
,
out_grid_desc_m_tuple
,
in_elementwise_op_tuple
,
acc_elementwise_op_tuple
,
block_group_size
,
num_k_block_tile_iteration
,
alpha_values
,
p_in_value_global
,
beta_values
,
p_out_value_global_tuple
);
};
template
<
index_t
NumReduction
,
typename
InDataType
,
typename
OutDataTypePointerTuple
,
typename
AccDataType
,
typename
InGridDesc_M_K
,
typename
OutGridDesc_M_Tuple
,
typename
ReduceOperation
,
typename
InElementwiseOperationTuple
,
typename
AccElementwiseOperationTuple
,
InMemoryDataOperationEnum
OutMemoryDataOperation
,
bool
PropagateNan
,
index_t
BlockSize
,
index_t
MThreadClusterSize
,
index_t
KThreadClusterSize
,
index_t
MThreadSliceSize
,
index_t
KThreadSliceSize
,
index_t
InSrcVectorDim
,
index_t
InSrcVectorSize
,
typename
OutDstVectorSizeSeq
>
struct
GridwiseMultipleReduction_mk_to_m_multiblock
{
static_assert
(((
InSrcVectorDim
==
0
&&
MThreadSliceSize
%
InSrcVectorSize
==
0
)
||
(
InSrcVectorDim
==
1
&&
KThreadSliceSize
%
InSrcVectorSize
==
0
)),
"Invalid thread slice sizes and/or vector sizes configuration, please check!"
);
static_assert
(
NumReduction
==
OutDataTypePointerTuple
::
Size
()
&&
NumReduction
==
OutGridDesc_M_Tuple
::
Size
()
&&
NumReduction
==
OutDstVectorSizeSeq
::
Size
()
&&
NumReduction
==
InElementwiseOperationTuple
::
Size
()
&&
NumReduction
==
AccElementwiseOperationTuple
::
Size
(),
"All tuple should have the same size as the number of Reductions!"
);
static
constexpr
bool
reorder_thread_cluster
=
(
InSrcVectorDim
==
0
);
using
ThreadClusterLengths_M_K
=
Sequence
<
MThreadClusterSize
,
KThreadClusterSize
>
;
using
ThreadBufferDimAccessOrder
=
typename
conditional
<
reorder_thread_cluster
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
;
using
ThreadClusterArrangeOrder
=
typename
conditional
<
reorder_thread_cluster
,
Sequence
<
1
,
0
>
,
Sequence
<
0
,
1
>>::
type
;
static
constexpr
auto
thread_cluster_desc
=
make_cluster_descriptor
(
ThreadClusterLengths_M_K
{},
ThreadClusterArrangeOrder
{});
using
ThreadReduceSrcDesc_M_K
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{},
Number
<
KThreadSliceSize
>
{})));
using
ThreadReduceDstDesc_M
=
decltype
(
make_naive_tensor_descriptor_packed
(
make_tuple
(
Number
<
MThreadSliceSize
>
{})));
using
BlockwiseReduce
=
PartitionedBlockwiseReduction
<
AccDataType
,
BlockSize
,
ThreadClusterLengths_M_K
,
ThreadClusterArrangeOrder
,
ReduceOperation
,
PropagateNan
>
;
using
ThreadwiseReduce
=
ThreadwiseReduction
<
AccDataType
,
ThreadReduceSrcDesc_M_K
,
ThreadReduceDstDesc_M
,
ReduceOperation
,
PropagateNan
>
;
using
PassThroughOp
=
tensor_operation
::
element_wise
::
PassThrough
;
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
static
constexpr
index_t
M_BlockTileSize
=
MThreadClusterSize
*
MThreadSliceSize
;
static
constexpr
index_t
K_BlockTileSize
=
KThreadClusterSize
*
KThreadSliceSize
;
using
Accumulation
=
detail
::
AccumulateWithNanCheck
<
PropagateNan
,
ReduceOperation
,
AccDataType
>
;
__device__
static
void
Run
(
const
InGridDesc_M_K
&
in_grid_desc_m_k
,
const
OutGridDesc_M_Tuple
&
out_grid_desc_m_tuple
,
const
InElementwiseOperationTuple
&
in_elementwise_op_tuple
,
const
AccElementwiseOperationTuple
&
acc_elementwise_op_tuple
,
index_t
block_group_size
,
index_t
num_k_block_tile_iteration
,
Array
<
AccDataType
,
NumReduction
>
alpha_values
,
const
InDataType
*
const
__restrict__
p_in_value_global
,
Array
<
AccDataType
,
NumReduction
>
beta_values
,
OutDataTypePointerTuple
p_out_value_global_tuple
)
{
const
auto
identityVal
=
ReduceOperation
::
template
GetIdentityValue
<
AccDataType
>();
// LDS, reused by all reductions
__shared__
AccDataType
p_reduce_work_buffer
[
BlockSize
];
const
auto
in_global_val_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_in_value_global
,
in_grid_desc_m_k
.
GetElementSpaceSize
(),
ReduceOperation
::
template
GetIdentityValue
<
InDataType
>());
auto
out_global_val_buf_tuple
=
generate_tuple
(
[
&
](
auto
iR
)
{
return
make_dynamic_buffer
<
AddressSpaceEnum
::
Global
>
(
p_out_value_global_tuple
[
iR
],
out_grid_desc_m_tuple
[
iR
].
GetElementSpaceSize
());
},
Number
<
NumReduction
>
{});
auto
reduce_work_buf
=
make_dynamic_buffer
<
AddressSpaceEnum
::
Lds
>
(
p_reduce_work_buffer
,
BlockSize
);
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
in_thread_buf
;
auto
in_thread_buf_tuple
=
generate_tuple
(
[
&
](
auto
iR
)
{
(
void
)
iR
;
return
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
*
KThreadSliceSize
,
true
>
{};
},
Number
<
NumReduction
>
{});
auto
accu_value_buf_tuple
=
generate_tuple
(
[
&
](
auto
iR
)
{
(
void
)
iR
;
return
StaticBuffer
<
AddressSpaceEnum
::
Vgpr
,
AccDataType
,
MThreadSliceSize
,
true
>
{};
},
Number
<
NumReduction
>
{});
static_for
<
0
,
NumReduction
,
1
>
{}([
&
](
auto
iR
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}(
[
&
](
auto
J
)
{
accu_value_buf_tuple
(
iR
)(
J
)
=
identityVal
;
});
});
const
index_t
thread_local_id
=
get_thread_local_1d_id
();
const
index_t
block_global_id
=
get_block_1d_id
();
const
index_t
blkgroup_id
=
block_global_id
/
block_group_size
;
const
index_t
block_local_id
=
block_global_id
%
block_group_size
;
const
auto
thread_cluster_idx
=
thread_cluster_desc
.
CalculateBottomIndex
(
make_multi_index
(
thread_local_id
));
const
auto
thread_m_cluster_id
=
thread_cluster_idx
[
I0
];
const
auto
thread_k_cluster_id
=
thread_cluster_idx
[
I1
];
const
index_t
reduceSizePerBlock
=
K_BlockTileSize
*
num_k_block_tile_iteration
;
        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
                                             ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
                                             false>(
                in_grid_desc_m_k,
                make_multi_index(blkgroup_id * M_BlockTileSize +
                                     thread_m_cluster_id * MThreadSliceSize,
                                 block_local_id * reduceSizePerBlock +
                                     thread_k_cluster_id * KThreadSliceSize));

        constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize);

        index_t reducedTiles = 0;

        do
        {
            threadwise_src_load.Run(in_grid_desc_m_k,
                                    in_global_val_buf,
                                    thread_buffer_desc,
                                    make_tuple(I0, I0),
                                    in_thread_buf);

            static_for<0, NumReduction, 1>{}([&](auto iR) {
                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                    // do element-wise pre-reduction operation
                    static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                        constexpr auto offset =
                            thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
                        in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number<offset>{}),
                                                    in_thread_buf(Number<offset>{}));
                    });
                });

                ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR));
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedTiles++;
        } while(reducedTiles < num_k_block_tile_iteration);
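        // At this point each thread holds, per reduction, MThreadSliceSize partial
        // accumulations over its share of K. The epilogue below combines them across
        // the K thread cluster through LDS (BlockwiseReduce), applies the per-reduction
        // acc elementwise op and alpha scale, optionally blends in the prior output
        // scaled by beta, and stores the result.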
        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        static_for<0, NumReduction, 1>{}([&](auto iR) {
            using OutDataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
            using OutDataType        = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf_tuple(iR)(I));
            });

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                if(thread_k_cluster_id == 0)
                {
                    acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I),
                                                 accu_value_buf_tuple(iR)(I));

                    accu_value_buf_tuple(iR)(I) *= alpha_values[iR];
                }
            });

            if(thread_k_cluster_id == 0)
            {
                if(block_group_size == 0 && !float_equal_zero{}(beta_values[iR]))
                {
                    StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                        priorDstValueBuf;

                    auto threadwise_dst_load =
                        ThreadwiseTensorSliceTransfer_v2<OutDataType,
                                                         OutDataType,
                                                         decltype(out_grid_desc_m_tuple[iR]),
                                                         decltype(reduced_data_desc),
                                                         Sequence<MThreadSliceSize>,
                                                         Sequence<0>,
                                                         0,
                                                         OutDstVectorSizeSeq::At(iR),
                                                         1,
                                                         false>(
                            out_grid_desc_m_tuple[iR],
                            make_multi_index(blkgroup_id * M_BlockTileSize +
                                             thread_m_cluster_id * MThreadSliceSize));

                    threadwise_dst_load.Run(out_grid_desc_m_tuple[iR],
                                            out_global_val_buf_tuple(iR),
                                            reduced_data_desc,
                                            make_tuple(I0),
                                            priorDstValueBuf);

                    static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                        accu_value_buf_tuple(iR)(I) +=
                            type_convert<AccDataType>(priorDstValueBuf[I]) * beta_values[iR];
                    });
                };

                auto threadwise_dst_store =
                    ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                       OutDataType,
                                                       decltype(reduced_data_desc),
                                                       decltype(out_grid_desc_m_tuple[iR]),
                                                       PassThroughOp,
                                                       Sequence<MThreadSliceSize>,
                                                       Sequence<0>,
                                                       0,
                                                       OutDstVectorSizeSeq::At(iR),
                                                       OutMemoryDataOperation,
                                                       1,
                                                       true>(
                        out_grid_desc_m_tuple[iR],
                        make_multi_index(blkgroup_id * M_BlockTileSize +
                                         thread_m_cluster_id * MThreadSliceSize),
                        PassThroughOp{});

                threadwise_dst_store.Run(reduced_data_desc,
                                         make_tuple(I0),
                                         accu_value_buf_tuple[iR],
                                         out_grid_desc_m_tuple[iR],
                                         out_global_val_buf_tuple(iR));
            };
        });
    };
};

} // namespace ck
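For reference, the per-row math that both grid kernels in this commit implement can be written as a small single-threaded sketch. Here reduce_op, in_op, acc_op and identity stand in for the ReduceOperation, the elementwise-operation tuples and the identity value used above; the function name and signature are illustrative, not part of the library:

// Minimal CPU reference sketch of one of the NumReduction outputs:
// out[m] = alpha * acc_op(reduce_k(in_op(in[m][k]))) (+ beta * prior out[m] if beta != 0)
#include <cstddef>
#include <vector>

template <typename Acc, typename In, typename Out,
          typename ReduceOp, typename InOp, typename AccOp>
void reference_reduce_2d(const std::vector<In>& in,  // M x K, row-major
                         std::vector<Out>& out,      // M
                         std::size_t M, std::size_t K,
                         ReduceOp reduce_op, Acc identity,
                         InOp in_op, AccOp acc_op,
                         Acc alpha, Acc beta)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        Acc acc = identity;
        for(std::size_t k = 0; k < K; ++k)
            acc = reduce_op(acc, in_op(static_cast<Acc>(in[m * K + k])));
        acc = acc_op(acc) * alpha;
        if(beta != Acc{0})
            acc += static_cast<Acc>(out[m]) * beta; // blend with prior output
        out[m] = static_cast<Out>(acc);
    }
}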
deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/utility/reduction_common.hpp"
#include "ck/utility/reduction_operator.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
namespace ck {

template <typename GridwiseMultipleReduction,
          index_t NumReduction,
          typename InDataType,
          typename OutDataTypePointerTuple,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M_Tuple,
          typename InElementwiseOperationTuple,
          typename AccElementwiseOperationTuple>
__global__ void
kernel_multiple_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k,
                                  const OutGridDesc_M_Tuple out_grid_desc_m_tuple,
                                  const InElementwiseOperationTuple in_elementwise_op_tuple,
                                  const AccElementwiseOperationTuple acc_elementwise_op_tuple,
                                  Array<AccDataType, NumReduction> alpha_values,
                                  const InDataType* const __restrict__ p_in_value_global,
                                  Array<AccDataType, NumReduction> beta_values,
                                  OutDataTypePointerTuple p_out_value_global_tuple)
{
    GridwiseMultipleReduction::Run(in_grid_desc_m_k,
                                   out_grid_desc_m_tuple,
                                   in_elementwise_op_tuple,
                                   acc_elementwise_op_tuple,
                                   alpha_values,
                                   p_in_value_global,
                                   beta_values,
                                   p_out_value_global_tuple);
};

template <index_t NumReduction,
          typename InDataType,
          typename OutDataTypePointerTuple,
          typename AccDataType,
          typename InGridDesc_M_K,
          typename OutGridDesc_M_Tuple,
          typename ReduceOperation,
          typename InElementwiseOperationTuple,
          typename AccElementwiseOperationTuple,
          InMemoryDataOperationEnum OutMemoryDataOperation,
          bool PropagateNan,
          index_t BlockSize,
          index_t MThreadSliceSize,
          index_t KThreadSliceSize,
          index_t InSrcVectorDim,
          index_t InSrcVectorSize,
          typename OutDstVectorSizeSeq>
struct GridwiseMultipleReduction_mk_to_m_threadwise
{
    static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) ||
                   (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)),
                  "Invalid thread slice sizes and/or vector sizes configuration, please check!");

    static_assert(NumReduction == OutDataTypePointerTuple::Size() &&
                      NumReduction == OutGridDesc_M_Tuple::Size() &&
                      NumReduction == OutDstVectorSizeSeq::Size() &&
                      NumReduction == InElementwiseOperationTuple::Size() &&
                      NumReduction == AccElementwiseOperationTuple::Size(),
                  "All tuple should have the same size as the number of Reductions!");
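    // For example, InSrcVectorDim == 1 with KThreadSliceSize == 8 and InSrcVectorSize == 4
    // satisfies the first assert; the second requires every tuple passed in (output
    // pointers, descriptors, vector sizes, elementwise ops) to hold exactly NumReduction
    // entries.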
    static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0);

    using ThreadBufferDimAccessOrder =
        typename conditional<reorder_thread_cluster, Sequence<1, 0>, Sequence<0, 1>>::type;

    using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
        make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{})));
    using ThreadReduceDstDesc_M =
        decltype(make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{})));

    using ThreadwiseReduce = ThreadwiseReduction<AccDataType,
                                                 ThreadReduceSrcDesc_M_K,
                                                 ThreadReduceDstDesc_M,
                                                 ReduceOperation,
                                                 PropagateNan>;

    using PassThroughOp = tensor_operation::element_wise::PassThrough;

    static constexpr auto I0 = Number<0>{};

    using Accumulation =
        detail::AccumulateWithNanCheck<PropagateNan, ReduceOperation, AccDataType>;
    __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k,
                               const OutGridDesc_M_Tuple& out_grid_desc_m_tuple,
                               const InElementwiseOperationTuple& in_elementwise_op_tuple,
                               const AccElementwiseOperationTuple& acc_elementwise_op_tuple,
                               Array<AccDataType, NumReduction> alpha_values,
                               const InDataType* const __restrict__ p_in_value_global,
                               Array<AccDataType, NumReduction> beta_values,
                               OutDataTypePointerTuple p_out_value_global_tuple)
    {
        const auto identityVal = ReduceOperation::template GetIdentityValue<AccDataType>();

        const auto in_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
            p_in_value_global,
            in_grid_desc_m_k.GetElementSpaceSize(),
            ReduceOperation::template GetIdentityValue<InDataType>());

        auto out_global_val_buf_tuple = generate_tuple(
            [&](auto iR) {
                return make_dynamic_buffer<AddressSpaceEnum::Global>(
                    p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize());
            },
            Number<NumReduction>{});

        StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize * KThreadSliceSize, true>
            in_thread_buf;

        auto in_thread_buf_tuple = generate_tuple(
            [&](auto iR) {
                (void)iR;
                return StaticBuffer<AddressSpaceEnum::Vgpr,
                                    AccDataType,
                                    MThreadSliceSize * KThreadSliceSize,
                                    true>{};
            },
            Number<NumReduction>{});

        auto accu_value_buf_tuple = generate_tuple(
            [&](auto iR) {
                (void)iR;
                return StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MThreadSliceSize, true>{};
            },
            Number<NumReduction>{});

        static_for<0, NumReduction, 1>{}([&](auto iR) {
            static_for<0, MThreadSliceSize, 1>{}(
                [&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; });
        });

        const index_t thread_global_1d_id = get_thread_global_1d_id();

        const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{});

        using ThreadBufferLengths = Sequence<MThreadSliceSize, KThreadSliceSize>;
        constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed(
            make_tuple(Number<MThreadSliceSize>{}, Number<KThreadSliceSize>{}));

        auto threadwise_src_load =
            ThreadwiseTensorSliceTransfer_v2<InDataType,
                                             AccDataType,
                                             InGridDesc_M_K,
                                             decltype(thread_buffer_desc),
                                             ThreadBufferLengths,
                                             ThreadBufferDimAccessOrder,
                                             InSrcVectorDim,
                                             InSrcVectorSize,
                                             1,
                                             false>(
                in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0));

        constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize);

        index_t reducedLength = 0;

        do
        {
            threadwise_src_load.Run(in_grid_desc_m_k,
                                    in_global_val_buf,
                                    thread_buffer_desc,
                                    make_tuple(I0, I0),
                                    in_thread_buf);

            static_for<0, NumReduction, 1>{}([&](auto iR) {
                static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
                    // do element-wise pre-reduction operation
                    static_for<0, KThreadSliceSize, 1>{}([&](auto iK) {
                        constexpr auto offset =
                            thread_buffer_desc.CalculateOffset(make_tuple(iM, iK));
                        in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number<offset>{}),
                                                    in_thread_buf(Number<offset>{}));
                    });
                });

                ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR));
            });

            threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step);

            reducedLength += KThreadSliceSize;
        } while(reducedLength < toReduceLength);
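        // Unlike the multiblock variant, each thread here owns MThreadSliceSize whole
        // rows and sweeps the entire K extent in KThreadSliceSize-wide steps, so no
        // LDS or cross-thread combine is needed before the epilogue below.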
        constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{};

        static_for<0, NumReduction, 1>{}([&](auto iR) {
            using OutDataTypePointer = remove_cvref_t<decltype(OutDataTypePointerTuple{}[iR])>;
            using OutDataType        = remove_cvref_t<remove_pointer_t<OutDataTypePointer>>;

            static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I),
                                             accu_value_buf_tuple(iR)(I));

                accu_value_buf_tuple(iR)(I) *= alpha_values[iR];
            });

            if(!float_equal_zero{}(beta_values[iR]))
            {
                StaticBuffer<AddressSpaceEnum::Vgpr, OutDataType, MThreadSliceSize, true>
                    priorDstValueBuf;

                auto threadwise_dst_load =
                    ThreadwiseTensorSliceTransfer_v2<OutDataType,
                                                     OutDataType,
                                                     decltype(out_grid_desc_m_tuple[iR]),
                                                     decltype(reduced_data_desc),
                                                     Sequence<MThreadSliceSize>,
                                                     Sequence<0>,
                                                     0,
                                                     OutDstVectorSizeSeq::At(iR),
                                                     1,
                                                     false>(
                        out_grid_desc_m_tuple[iR],
                        make_multi_index(thread_global_1d_id * MThreadSliceSize));

                threadwise_dst_load.Run(out_grid_desc_m_tuple[iR],
                                        out_global_val_buf_tuple(iR),
                                        reduced_data_desc,
                                        make_tuple(I0),
                                        priorDstValueBuf);

                static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
                    accu_value_buf_tuple(iR)(I) +=
                        type_convert<AccDataType>(priorDstValueBuf[I]) * beta_values[iR];
                });
            };

            auto threadwise_dst_store =
                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
                                                   OutDataType,
                                                   decltype(reduced_data_desc),
                                                   decltype(out_grid_desc_m_tuple[iR]),
                                                   PassThroughOp,
                                                   Sequence<MThreadSliceSize>,
                                                   Sequence<0>,
                                                   0,
                                                   OutDstVectorSizeSeq::At(iR),
                                                   OutMemoryDataOperation,
                                                   1,
                                                   true>(
                    out_grid_desc_m_tuple[iR],
                    make_multi_index(thread_global_1d_id * MThreadSliceSize),
                    PassThroughOp{});

            threadwise_dst_store.Run(reduced_data_desc,
                                     make_tuple(I0),
                                     accu_value_buf_tuple[iR],
                                     out_grid_desc_m_tuple[iR],
                                     out_global_val_buf_tuple(iR));
        });
    };
};

} // namespace ck
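Because each thread of this kernel owns MThreadSliceSize consecutive output rows, a host-side launch only needs enough BlockSize-thread workgroups to cover all M rows. A minimal sketch of that grid-size calculation (the function name and parameters are illustrative; this is not the library's launch helper):

#include <cstddef>

// Illustrative only: number of BlockSize-thread workgroups needed so that every
// one of the M output rows is owned by exactly one thread, given that each
// thread handles MThreadSliceSize rows (as in the kernel above).
inline std::size_t grid_size_for_threadwise_reduce(std::size_t M,
                                                   std::size_t BlockSize,
                                                   std::size_t MThreadSliceSize)
{
    const std::size_t rows_per_block = BlockSize * MThreadSliceSize;
    return (M + rows_per_block - 1) / rows_per_block; // ceil(M / rows_per_block)
}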